From d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 11 Dec 2023 02:45:28 +0000 Subject: [PATCH] add boot partition size --- kernel/block/bio.c | 1276 +++++++++++++++++++-------------------------------------- 1 files changed, 425 insertions(+), 851 deletions(-) diff --git a/kernel/block/bio.c b/kernel/block/bio.c index ec51fed..6d6e7b9 100644 --- a/kernel/block/bio.c +++ b/kernel/block/bio.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - * */ #include <linux/mm.h> #include <linux/swap.h> @@ -29,6 +16,8 @@ #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/blk-cgroup.h> +#include <linux/highmem.h> +#include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #include <trace/events/block.h> @@ -245,7 +234,14 @@ void bio_uninit(struct bio *bio) { - bio_disassociate_task(bio); +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif + if (bio_integrity(bio)) + bio_integrity_free(bio); bio_crypt_free_ctx(bio); } @@ -331,7 +327,7 @@ /** * bio_chain - chain bio completions * @bio: the target bio - * @parent: the @bio's parent bio + * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have @@ -362,7 +358,7 @@ if (!bio) break; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -420,19 +416,19 @@ * submit the previously allocated bio for IO before attempting to allocate * a new one. Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under generic_make_request() (i.e. any block + * Note that when running under submit_bio_noacct() (i.e. any block * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent + * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have + * submit_bio_noacct() would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * RETURNS: @@ -452,9 +448,7 @@ if (nr_iovecs > UIO_MAXIOV) return NULL; - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); front_pad = 0; inline_vecs = nr_iovecs; } else { @@ -463,14 +457,14 @@ nr_iovecs > 0)) return NULL; /* - * generic_make_request() converts recursion to iteration; this + * submit_bio_noacct() converts recursion to iteration; this * means if we're running beneath it, any bios we allocate and * submit will not be submitted (and thus freed) until after we * return. * * This exposes us to a potential deadlock if we allocate * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate + * underneath submit_bio_noacct(). If we were to allocate * multiple bios (say a stacking block driver that was splitting * bios), we would deadlock if we exhausted the mempool's * reserve. @@ -551,6 +545,99 @@ EXPORT_SYMBOL(zero_fill_bio_iter); /** + * bio_truncate - truncate the bio to small size of @new_size + * @bio: the bio to be truncated + * @new_size: new size for truncating the bio + * + * Description: + * Truncate the bio to new size of @new_size. If bio_op(bio) is + * REQ_OP_READ, zero the truncated part. This function should only + * be used for handling corner cases, such as bio eod. + */ +void bio_truncate(struct bio *bio, unsigned new_size) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned int done = 0; + bool truncated = false; + + if (new_size >= bio->bi_iter.bi_size) + return; + + if (bio_op(bio) != REQ_OP_READ) + goto exit; + + bio_for_each_segment(bv, bio, iter) { + if (done + bv.bv_len > new_size) { + unsigned offset; + + if (!truncated) + offset = new_size - done; + else + offset = 0; + zero_user(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); + truncated = true; + } + done += bv.bv_len; + } + + exit: + /* + * Don't touch bvec table here and make it really immutable, since + * fs bio user has to retrieve all pages via bio_for_each_segment_all + * in its .end_bio() callback. + * + * It is enough to truncate bio by updating .bi_size since we can make + * correct bvec with the updated .bi_size for drivers. + */ + bio->bi_iter.bi_size = new_size; +} + +/** + * guard_bio_eod - truncate a BIO to fit the block device + * @bio: bio to truncate + * + * This allows us to do IO even on the odd last sectors of a device, even if the + * block size is some multiple of the physical sector size. + * + * We'll just truncate the bio to the size of the device, and clear the end of + * the buffer head manually. Truly out-of-range accesses will turn into actual + * I/O errors, this only handles the "we need to be able to do I/O at the final + * sector" case. + */ +void guard_bio_eod(struct bio *bio) +{ + sector_t maxsector; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); + + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + bio_truncate(bio, maxsector << 9); +} + +/** * bio_put - release a reference to a bio * @bio: bio to release reference to * @@ -573,15 +660,6 @@ } } EXPORT_SYMBOL(bio_put); - -inline int bio_phys_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_phys_segments; -} -EXPORT_SYMBOL(bio_phys_segments); /** * __bio_clone_fast - clone a bio that shares the original bio's biovec @@ -613,7 +691,8 @@ bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); } EXPORT_SYMBOL(__bio_clone_fast); @@ -635,133 +714,162 @@ __bio_clone_fast(b, bio); - bio_crypt_clone(b, bio, gfp_mask); + if (bio_crypt_clone(b, bio, gfp_mask) < 0) + goto err_put; if (bio_integrity(bio) && - bio_integrity_clone(b, bio, gfp_mask) < 0) { - bio_put(b); - return NULL; - } + bio_integrity_clone(b, bio, gfp_mask) < 0) + goto err_put; return b; + +err_put: + bio_put(b); + return NULL; } EXPORT_SYMBOL(bio_clone_fast); -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset) +const char *bio_devname(struct bio *bio, char *buf) { - int retried_segments = 0; + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + +static inline bool page_is_mergeable(const struct bio_vec *bv, + struct page *page, unsigned int len, unsigned int off, + bool *same_page) +{ + size_t bv_end = bv->bv_offset + bv->bv_len; + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; + phys_addr_t page_addr = page_to_phys(page); + + if (vec_end_addr + 1 != page_addr + off) + return false; + if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) + return false; + + *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); + if (*same_page) + return true; + return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); +} + +/* + * Try to merge a page into a segment, while obeying the hardware segment + * size limit. This is not for normal read/write bios, but for passthrough + * or Zone Append operations that we can't split. + */ +static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, + struct page *page, unsigned len, + unsigned offset, bool *same_page) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; + + if ((addr1 | mask) != (addr2 | mask)) + return false; + if (bv->bv_len + len > queue_max_segment_size(q)) + return false; + return __bio_try_merge_page(bio, page, len, offset, same_page); +} + +/** + * bio_add_hw_page - attempt to add a page to a bio with hw constraints + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * @max_sectors: maximum number of sectors that can be added + * @same_page: return if the segment has been merged inside the same page + * + * Add a page to a bio while respecting the hardware max_sectors, max_segment + * and gap limitations. + */ +int bio_add_hw_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + unsigned int max_sectors, bool *same_page) +{ struct bio_vec *bvec; - /* - * cloned bio must not modify vec list - */ - if (unlikely(bio_flagged(bio, BIO_CLONED))) + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) + if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) return 0; - /* - * For filesystems with a blocksize smaller than the pagesize - * we will often be called with the same page as last time and - * a consecutive offset. Optimize this special case. - */ if (bio->bi_vcnt > 0) { - struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page == prev->bv_page && - offset == prev->bv_offset + prev->bv_len) { - prev->bv_len += len; - bio->bi_iter.bi_size += len; - goto done; - } + if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + return len; /* - * If the queue doesn't support SG gaps and adding this - * offset would create a gap, disallow it. + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. */ - if (bvec_gap_to_prev(q, prev, offset)) + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (bvec_gap_to_prev(q, bvec, offset)) return 0; } - if (bio_full(bio)) + if (bio_full(bio, len)) return 0; - /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ + if (bio->bi_vcnt >= queue_max_segments(q)) + return 0; + bvec = &bio->bi_io_vec[bio->bi_vcnt]; bvec->bv_page = page; bvec->bv_len = len; bvec->bv_offset = offset; bio->bi_vcnt++; - bio->bi_phys_segments++; bio->bi_iter.bi_size += len; - - /* - * Perform a recount if the number of segments is greater - * than queue_max_segments(q). - */ - - while (bio->bi_phys_segments > queue_max_segments(q)) { - - if (retried_segments) - goto failed; - - retried_segments = 1; - blk_recount_segments(q, bio); - } - - /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio_clear_flag(bio, BIO_SEG_VALID); - - done: return len; +} - failed: - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - bio->bi_vcnt--; - bio->bi_iter.bi_size -= len; - blk_recount_segments(q, bio); - return 0; +/** + * bio_add_pc_page - attempt to add page to passthrough bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by passthrough bios. + */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset) +{ + bool same_page = false; + return bio_add_hw_page(q, bio, page, len, offset, + queue_max_hw_sectors(q), &same_page); } EXPORT_SYMBOL(bio_add_pc_page); /** * __bio_try_merge_page - try appending data to an existing bvec. * @bio: destination bio - * @page: page to add + * @page: start page to add * @len: length of the data to add - * @off: offset of the data in @page + * @off: offset of the data relative to @page + * @same_page: return if the segment has been merged inside the same page * * Try to add the data at @page + @off to the last bvec of @bio. This is a - * a useful optimisation for file systems with a block size smaller than the + * useful optimisation for file systems with a block size smaller than the * page size. + * + * Warn if (@len, @off) crosses pages in case that @same_page is true. * * Return %true on success or %false on failure. */ bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off) + unsigned int len, unsigned int off, bool *same_page) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return false; @@ -769,7 +877,11 @@ if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) { + if (page_is_mergeable(bv, page, len, off, same_page)) { + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; + } bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -780,11 +892,11 @@ EXPORT_SYMBOL_GPL(__bio_try_merge_page); /** - * __bio_add_page - add page to a bio in a new segment + * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio - * @page: page to add - * @len: length of the data to add - * @off: offset of the data in @page + * @page: start page to add + * @len: length of the data to add, may cross pages + * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. @@ -795,7 +907,7 @@ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - WARN_ON_ONCE(bio_full(bio)); + WARN_ON_ONCE(bio_full(bio, len)); bv->bv_page = page; bv->bv_offset = off; @@ -810,26 +922,72 @@ EXPORT_SYMBOL_GPL(__bio_add_page); /** - * bio_add_page - attempt to add page to bio + * bio_add_page - attempt to add page(s) to bio * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset + * @page: start page to add + * @len: vec entry length, may cross pages + * @offset: vec entry offset relative to @page, may cross pages * - * Attempt to add a page to the bio_vec maplist. This will only fail + * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - if (!__bio_try_merge_page(bio, page, len, offset)) { - if (bio_full(bio)) + bool same_page = false; + + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (bio_full(bio, len)) return 0; __bio_add_page(bio, page, len, offset); } return len; } EXPORT_SYMBOL(bio_add_page); + +void bio_release_pages(struct bio *bio, bool mark_dirty) +{ + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + + if (bio_flagged(bio, BIO_NO_PAGE_REF)) + return; + + bio_for_each_segment_all(bvec, bio, iter_all) { + if (mark_dirty && !PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); + put_page(bvec->bv_page); + } +} +EXPORT_SYMBOL_GPL(bio_release_pages); + +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +{ + const struct bio_vec *bv = iter->bvec; + unsigned int len; + size_t size; + + if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) + return -EINVAL; + + len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); + size = bio_add_page(bio, bv->bv_page, len, + bv->bv_offset + iter->iov_offset); + if (unlikely(size != len)) + return -EINVAL; + iov_iter_advance(iter, size); + return 0; +} + +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio @@ -839,71 +997,142 @@ * Pins pages from *iter and appends them to @bio's bvec array. The * pages will have to be released using put_page() when done. * For multi-segment *iter, this function only adds pages from the - * the next non-empty segment of the iov iterator. + * next non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx; + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; + bool same_page = false; + ssize_t size, left; + unsigned len, i; size_t offset; - ssize_t size; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; - idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; - /* - * Deep magic below: We need to walk the pinned pages backwards - * because we are abusing the space allocated for the bio_vecs - * for the page array. Because the bio_vecs are larger than the - * page pointers by definition this will always work. But it also - * means we can't use bio_add_page, so any changes to it's semantics - * need to be reflected here as well. - */ - bio->bi_iter.bi_size += size; - bio->bi_vcnt += nr_pages; + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; - while (idx--) { - bv[idx].bv_page = pages[idx]; - bv[idx].bv_len = PAGE_SIZE; - bv[idx].bv_offset = 0; + len = min_t(size_t, PAGE_SIZE - offset, left); + + if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + put_page(page); + } else { + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } + __bio_add_page(bio, page, len, offset); + } + offset = 0; } - - bv[0].bv_offset += offset; - bv[0].bv_len -= offset; - bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size; iov_iter_advance(iter, size); return 0; } +static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct request_queue *q = bio->bi_disk->queue; + unsigned int max_append_sectors = queue_max_zone_append_sectors(q); + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + ssize_t size, left; + unsigned len, i; + size_t offset; + int ret = 0; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; + bool same_page = false; + + len = min_t(size_t, PAGE_SIZE - offset, left); + if (bio_add_hw_page(q, bio, page, len, offset, + max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); + ret = -EINVAL; + break; + } + if (same_page) + put_page(page); + offset = 0; + } + + iov_iter_advance(iter, size - left); + return ret; +} + /** - * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to - * @iter: iov iterator describing the region to be mapped + * @iter: iov iterator describing the region to be added * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. + * This takes either an iterator pointing to user memory, or one pointing to + * kernel pages (BVEC iterator). If we're adding user pages, we pin them and + * map them into the kernel. On IO completion, the caller should put those + * pages. If we're adding kernel pages, and the caller told us it's safe to + * do so, we just have to add the pages to the bio directly. We don't grab an + * extra reference to those pages (the user should already have that), and we + * don't put the page on IO completion. The caller needs to check if the bio is + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be + * released. + * * The function tries, but does not guarantee, to pin as many pages as - * fit into the bio, or are requested in *iter, whatever is smaller. - * If MM encounters an error pinning the requested pages, it stops. - * Error is returned only if 0 pages could be pinned. + * fit into the bio, or are requested in @iter, whatever is smaller. If + * MM encounters an error pinning the requested pages, it stops. Error + * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - unsigned short orig_vcnt = bio->bi_vcnt; + const bool is_bvec = iov_iter_is_bvec(iter); + int ret; + + if (WARN_ON_ONCE(bio->bi_vcnt)) + return -EINVAL; do { - int ret = __bio_iov_iter_get_pages(bio, iter); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (WARN_ON_ONCE(is_bvec)) + return -EINVAL; + ret = __bio_iov_append_get_pages(bio, iter); + } else { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - if (unlikely(ret)) - return bio->bi_vcnt > orig_vcnt ? 0 : ret; - - } while (iov_iter_count(iter) && !bio_full(bio)); - - return 0; + if (is_bvec) + bio_set_flag(bio, BIO_NO_PAGE_REF); + return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -926,12 +1155,21 @@ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - wait_for_completion_io(&done); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&done, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); } @@ -1043,523 +1281,15 @@ } EXPORT_SYMBOL(bio_list_copy_data); -struct bio_map_data { - int is_our_pages; - struct iov_iter iter; - struct iovec iov[]; -}; - -static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - if (data->nr_segs > UIO_MAXIOV) - return NULL; - - bmd = kmalloc(sizeof(struct bio_map_data) + - sizeof(struct iovec) * data->nr_segs, gfp_mask); - if (!bmd) - return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); - bmd->iter = *data; - bmd->iter.iov = bmd->iov; - return bmd; -} - -/** - * bio_copy_from_iter - copy all pages from iov_iter to bio - * @bio: The &struct bio which describes the I/O as destination - * @iter: iov_iter as source - * - * Copy all pages from iov_iter to bio. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) -{ - int i; - struct bio_vec *bvec; - - bio_for_each_segment_all(bvec, bio, i) { - ssize_t ret; - - ret = copy_page_from_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - iter); - - if (!iov_iter_count(iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - -/** - * bio_copy_to_iter - copy all pages from bio to iov_iter - * @bio: The &struct bio which describes the I/O as source - * @iter: iov_iter as destination - * - * Copy all pages from bio to iov_iter. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) -{ - int i; - struct bio_vec *bvec; - - bio_for_each_segment_all(bvec, bio, i) { - ssize_t ret; - - ret = copy_page_to_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - &iter); - - if (!iov_iter_count(&iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); - -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user_iov() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - int ret = 0; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free - * and return -EINTR so user space doesn't expect any data. - */ - if (!current->mm) - ret = -EINTR; - else if (bio_data_dir(bio) == READ) - ret = bio_copy_to_iter(bio, bmd->iter); - if (bmd->is_our_pages) - bio_free_pages(bio); - } - kfree(bmd); - bio_put(bio); - return ret; -} - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct page *page; - struct bio *bio; - int i = 0, ret; - int nr_pages; - unsigned int len = iter->count; - unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; - - bmd = bio_alloc_map_data(iter, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - /* - * We need to do a deep copy of the iov_iter including the iovecs. - * The caller provided iov might point to an on-stack or otherwise - * shortlived one. - */ - bmd->is_our_pages = map_data ? 0 : 1; - - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { - if (!map_data) - __free_page(page); - break; - } - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - if (map_data) - map_data->offset += bio->bi_iter.bi_size; - - /* - * success - */ - if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = bio_copy_from_iter(bio, iter); - if (ret) - goto cleanup; - } else { - if (bmd->is_our_pages) - zero_fill_bio(bio); - iov_iter_advance(iter, bio->bi_iter.bi_size); - } - - bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - return bio; -cleanup: - if (!map_data) - bio_free_pages(bio); - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - int j; - struct bio *bio; - int ret; - struct bio_vec *bvec; - - if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (iov_iter_count(iter)) { - struct page **pages; - ssize_t bytes; - size_t offs, added = 0; - int npages; - - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(q))) { - ret = -EINVAL; - j = 0; - } else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - unsigned short prev_bi_vcnt = bio->bi_vcnt; - - if (n > bytes) - n = bytes; - - if (!bio_add_pc_page(q, bio, page, n, offs)) - break; - - /* - * check if vector was merged with previous - * drop page reference if needed - */ - if (bio->bi_vcnt == prev_bi_vcnt) - put_page(page); - - added += n; - bytes -= n; - offs = 0; - } - iov_iter_advance(iter, added); - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - put_page(pages[j++]); - kvfree(pages); - /* couldn't stuff something into bio? */ - if (bytes) - break; - } - - bio_set_flag(bio, BIO_USER_MAPPED); - - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - return bio; - - out_unmap: - bio_for_each_segment_all(bvec, bio, j) { - put_page(bvec->bv_page); - } - bio_put(bio); - return ERR_PTR(ret); -} - -static void __bio_unmap_user(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - /* - * make sure we dirty pages we wrote to - */ - bio_for_each_segment_all(bvec, bio, i) { - if (bio_data_dir(bio) == READ) - set_page_dirty_lock(bvec->bv_page); - - put_page(bvec->bv_page); - } - - bio_put(bio); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - __bio_unmap_user(bio); - bio_put(bio); -} - -static void bio_map_kern_endio(struct bio *bio) -{ - bio_put(bio); -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} -EXPORT_SYMBOL(bio_map_kern); - -static void bio_copy_kern_endio(struct bio *bio) -{ - bio_free_pages(bio); - bio_put(bio); -} - -static void bio_copy_kern_endio_read(struct bio *bio) -{ - char *p = bio->bi_private; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); - p += bvec->bv_len; - } - - bio_copy_kern_endio(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - struct bio *bio; - void *p = data; - int nr_pages = 0; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) - goto cleanup; - - if (!reading) - memcpy(page_address(page), p, bytes); - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) - break; - - len -= bytes; - p += bytes; - } - - if (reading) { - bio->bi_end_io = bio_copy_kern_endio_read; - bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; - } - - return bio; - -cleanup: - bio_free_pages(bio); - bio_put(bio); - return ERR_PTR(-ENOMEM); -} /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions @@ -1593,22 +1323,12 @@ void bio_set_pages_dirty(struct bio *bio) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { if (!PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); } -} -EXPORT_SYMBOL_GPL(bio_set_pages_dirty); - -static void bio_release_pages(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) - put_page(bvec->bv_page); } /* @@ -1643,8 +1363,7 @@ while ((bio = next) != NULL) { next = bio->bi_private; - bio_set_pages_dirty(bio); - bio_release_pages(bio); + bio_release_pages(bio, true); bio_put(bio); } } @@ -1653,14 +1372,14 @@ { struct bio_vec *bvec; unsigned long flags; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) goto defer; } - bio_release_pages(bio); + bio_release_pages(bio, false); bio_put(bio); return; defer: @@ -1670,49 +1389,6 @@ spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } -EXPORT_SYMBOL_GPL(bio_check_pages_dirty); - -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part) -{ - const int sgrp = op_stat_group(op); - int cpu = part_stat_lock(); - - part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, sectors[sgrp], sectors); - part_inc_in_flight(q, part, op_is_write(op)); - - part_stat_unlock(); -} -EXPORT_SYMBOL(generic_start_io_acct); - -void generic_end_io_acct(struct request_queue *q, int req_op, - struct hd_struct *part, unsigned long start_time) -{ - unsigned long duration = jiffies - start_time; - const int sgrp = op_stat_group(req_op); - int cpu = part_stat_lock(); - - part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_round_stats(q, cpu, part); - part_dec_in_flight(q, part, op_is_write(req_op)); - - part_stat_unlock(); -} -EXPORT_SYMBOL(generic_end_io_acct); - -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void bio_flush_dcache_pages(struct bio *bi) -{ - struct bio_vec bvec; - struct bvec_iter iter; - - bio_for_each_segment(bvec, bi, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL(bio_flush_dcache_pages); -#endif static inline bool bio_remaining_done(struct bio *bio) { @@ -1752,10 +1428,6 @@ again: if (!bio_remaining_done(bio)) return; - - if (!blk_crypto_endio(bio)) - return; - if (!bio_integrity_endio(bio)) return; @@ -1776,8 +1448,7 @@ } if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio, - blk_status_to_errno(bio->bi_status)); + trace_block_bio_complete(bio->bi_disk->queue, bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1800,8 +1471,8 @@ * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point - * to @bio's bi_io_vec; it is the caller's responsibility to ensure that - * @bio is not freed before the split. + * to @bio's bi_io_vec. It is the caller's responsibility to ensure that + * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) @@ -1810,6 +1481,10 @@ BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); + + /* Zone append commands cannot be split */ + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return NULL; split = bio_clone_fast(bio, gfp, bs); if (!split) @@ -1821,7 +1496,6 @@ bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); - bio->bi_iter.bi_done = 0; if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); @@ -1846,10 +1520,7 @@ if (offset == 0 && size == bio->bi_iter.bi_size) return; - bio_clear_flag(bio, BIO_SEG_VALID); - bio_advance(bio, offset << 9); - bio->bi_iter.bi_size = size; if (bio_integrity(bio)) @@ -1968,106 +1639,6 @@ } EXPORT_SYMBOL(bioset_init_from_src); -#ifdef CONFIG_BLK_CGROUP - -#ifdef CONFIG_MEMCG -/** - * bio_associate_blkcg_from_page - associate a bio with the page's blkcg - * @bio: target bio - * @page: the page to lookup the blkcg from - * - * Associate @bio with the blkcg from @page's owning memcg. This works like - * every other associate function wrt references. - */ -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) -{ - struct cgroup_subsys_state *blkcg_css; - - if (unlikely(bio->bi_css)) - return -EBUSY; - if (!page->mem_cgroup) - return 0; - blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, - &io_cgrp_subsys); - bio->bi_css = blkcg_css; - return 0; -} -#endif /* CONFIG_MEMCG */ - -/** - * bio_associate_blkcg - associate a bio with the specified blkcg - * @bio: target bio - * @blkcg_css: css of the blkcg to associate - * - * Associate @bio with the blkcg specified by @blkcg_css. Block layer will - * treat @bio as if it were issued by a task which belongs to the blkcg. - * - * This function takes an extra reference of @blkcg_css which will be put - * when @bio is released. The caller must own @bio and is responsible for - * synchronizing calls to this function. - */ -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) -{ - if (unlikely(bio->bi_css)) - return -EBUSY; - css_get(blkcg_css); - bio->bi_css = blkcg_css; - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_blkcg); - -/** - * bio_associate_blkg - associate a bio with the specified blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * Associate @bio with the blkg specified by @blkg. This is the queue specific - * blkcg information associated with the @bio, a reference will be taken on the - * @blkg and will be freed when the bio is freed. - */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - if (!blkg_try_get(blkg)) - return -ENODEV; - bio->bi_blkg = blkg; - return 0; -} - -/** - * bio_disassociate_task - undo bio_associate_current() - * @bio: target bio - */ -void bio_disassociate_task(struct bio *bio) -{ - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } -} - -/** - * bio_clone_blkcg_association - clone blkcg association from src to dst bio - * @dst: destination bio - * @src: source bio - */ -void bio_clone_blkcg_association(struct bio *dst, struct bio *src) -{ - if (src->bi_css) - WARN_ON(bio_associate_blkcg(dst, src->bi_css)); -} -EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); -#endif /* CONFIG_BLK_CGROUP */ - static void __init biovec_init_slabs(void) { int i; @@ -2093,6 +1664,9 @@ bio_slab_nr = 0; bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), GFP_KERNEL); + + BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); + if (!bio_slabs) panic("bio: can't allocate bios\n"); -- Gitblit v1.6.2