From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] fs/buffer: backport mainline buffer.c changes

---
 kernel/fs/buffer.c | 347 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 172 insertions(+), 175 deletions(-)

diff --git a/kernel/fs/buffer.c b/kernel/fs/buffer.c
index b9164e5..5f6fd1f 100644
--- a/kernel/fs/buffer.c
+++ b/kernel/fs/buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/fs/buffer.c
  *
@@ -47,6 +48,8 @@
 #include <linux/sched/mm.h>
 #include <trace/events/block.h>
 #include <linux/fscrypt.h>
+
+#include "internal.h"
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -120,14 +123,6 @@
 }
 EXPORT_SYMBOL(__wait_on_buffer);
 
-static void
-__clear_page_buffers(struct page *page)
-{
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	put_page(page);
-}
-
 static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
 	if (!test_bit(BH_Quiet, &bh->b_state))
@@ -178,7 +173,7 @@
 	unlock_buffer(bh);
 	put_bh(bh);
 }
-EXPORT_SYMBOL(end_buffer_write_sync);
+EXPORT_SYMBOL_NS(end_buffer_write_sync, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
  * Various filesystems appear to want __find_get_block to be non-blocking.
@@ -246,10 +241,6 @@
 	return ret;
 }
 
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
 	unsigned long flags;
@@ -275,7 +266,7 @@
 	 * decide that the page is now completely done.
 	 */
 	first = page_buffers(page);
-	flags = bh_uptodate_lock_irqsave(first);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -288,7 +279,7 @@
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bh_uptodate_unlock_irqrestore(first, flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 
 	/*
	 * If none of the buffers had errors and they are all
@@ -300,7 +291,48 @@
 	return;
 
 still_busy:
-	bh_uptodate_unlock_irqrestore(first, flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
+	return;
+}
+
+struct decrypt_bh_ctx {
+	struct work_struct work;
+	struct buffer_head *bh;
+};
+
+static void decrypt_bh(struct work_struct *work)
+{
+	struct decrypt_bh_ctx *ctx =
+		container_of(work, struct decrypt_bh_ctx, work);
+	struct buffer_head *bh = ctx->bh;
+	int err;
+
+	err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
+					       bh_offset(bh));
+	end_buffer_async_read(bh, err == 0);
+	kfree(ctx);
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
+{
+	/* Decrypt if needed */
+	if (uptodate &&
+	    fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
+		struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+
+		if (ctx) {
+			INIT_WORK(&ctx->work, decrypt_bh);
+			ctx->bh = bh;
+			fscrypt_enqueue_decrypt_work(&ctx->work);
+			return;
+		}
+		uptodate = 0;
+	}
+	end_buffer_async_read(bh, uptodate);
 }
 
 /*
@@ -327,7 +359,7 @@
 	}
 
 	first = page_buffers(page);
-	flags = bh_uptodate_lock_irqsave(first);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
 
@@ -339,12 +371,13 @@
 		}
 		tmp = tmp->b_this_page;
 	}
-	bh_uptodate_unlock_irqrestore(first, flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	bh_uptodate_unlock_irqrestore(first, flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
+	return;
 }
 EXPORT_SYMBOL(end_buffer_async_write);
 
@@ -371,7 +404,7 @@
  */
 static void mark_buffer_async_read(struct buffer_head *bh)
 {
-	bh->b_end_io = end_buffer_async_read;
+	bh->b_end_io = end_buffer_async_read_io;
 	set_buffer_async_read(bh);
 }
 
@@ -386,7 +419,7 @@
 {
 	mark_buffer_async_write_endio(bh, end_buffer_async_write);
 }
-EXPORT_SYMBOL(mark_buffer_async_write);
+EXPORT_SYMBOL_NS(mark_buffer_async_write, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 
 /*
@@ -490,7 +523,7 @@
 
 void emergency_thaw_bdev(struct super_block *sb)
 {
-	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+	while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
 		printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
 }
 
@@ -556,7 +589,7 @@
 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 
 /*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
  * dirty.
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
@@ -573,8 +606,8 @@
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
-		radix_tree_tag_set(&mapping->i_pages,
-				page_index(page), PAGECACHE_TAG_DIRTY);
+		__xa_set_mark(&mapping->i_pages, page_index(page),
+				PAGECACHE_TAG_DIRTY);
 	}
 	xa_unlock_irqrestore(&mapping->i_pages, flags);
 }
@@ -641,7 +674,7 @@
 
 	return newly_dirty;
 }
-EXPORT_SYMBOL(__set_page_dirty_buffers);
+EXPORT_SYMBOL_NS(__set_page_dirty_buffers, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
  * Write out and wait upon a list of buffers.
@@ -809,13 +842,13 @@
 	struct buffer_head *bh, *head;
 	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 	long offset;
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg, *old_memcg;
 
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
 	memcg = get_mem_cgroup_from_page(page);
-	memalloc_use_memcg(memcg);
+	old_memcg = set_active_memcg(memcg);
 
 	head = NULL;
 	offset = PAGE_SIZE;
@@ -834,7 +867,7 @@
 		set_bh_page(bh, page, offset);
 	}
 out:
-	memalloc_unuse_memcg();
+	set_active_memcg(old_memcg);
 	mem_cgroup_put(memcg);
 	return head;
 /*
@@ -864,7 +897,7 @@
 		bh = bh->b_this_page;
 	} while (bh);
 	tail->b_this_page = head;
-	attach_page_buffers(page, head);
+	attach_page_private(page, head);
 }
 
 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
@@ -925,7 +958,7 @@
 	struct page *page;
 	struct buffer_head *bh;
 	sector_t end_block;
-	int ret = 0;		/* Will call free_more_memory() */
+	int ret = 0;
 	gfp_t gfp_mask;
 
 	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
@@ -1044,7 +1077,7 @@
  * The relationship between dirty buffers and dirty pages:
  *
  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
  *
  * At all times, the dirtiness of the buffers represents the dirtiness of
  * subsections of the page. If the page has buffers, the page dirty bit is
@@ -1067,9 +1100,9 @@
  * mark_buffer_dirty - mark a buffer_head as needing writeout
  * @bh: the buffer_head to mark dirty
  *
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
  * inode list.
  *
 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
@@ -1108,18 +1141,25 @@
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 }
-EXPORT_SYMBOL(mark_buffer_dirty);
+EXPORT_SYMBOL_NS(mark_buffer_dirty, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 void mark_buffer_write_io_error(struct buffer_head *bh)
 {
+	struct super_block *sb;
+
 	set_buffer_write_io_error(bh);
 	/* FIXME: do we need to set this in both places? */
 	if (bh->b_page && bh->b_page->mapping)
 		mapping_set_error(bh->b_page->mapping, -EIO);
 	if (bh->b_assoc_map)
 		mapping_set_error(bh->b_assoc_map, -EIO);
+	rcu_read_lock();
+	sb = READ_ONCE(bh->b_bdev->bd_super);
+	if (sb)
+		errseq_set(&sb->s_wb_err, -EIO);
+	rcu_read_unlock();
 }
-EXPORT_SYMBOL(mark_buffer_write_io_error);
+EXPORT_SYMBOL_NS(mark_buffer_write_io_error, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
 * Decrement a buffer_head's reference count. If all buffers against a page
@@ -1136,7 +1176,7 @@
 	}
 	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
 }
-EXPORT_SYMBOL(__brelse);
+EXPORT_SYMBOL_NS(__brelse, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
  * bforget() is like brelse(), except it discards any
@@ -1155,7 +1195,7 @@
 	}
 	__brelse(bh);
 }
-EXPORT_SYMBOL(__bforget);
+EXPORT_SYMBOL_NS(__bforget, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 static struct buffer_head *__bread_slow(struct buffer_head *bh)
 {
@@ -1224,6 +1264,15 @@
 	int i;
 
 	check_irqs_on();
+	/*
+	 * the refcount of buffer_head in bh_lru prevents dropping the
+	 * attached page(i.e., try_to_free_buffers) so it could cause
+	 * failing page migration.
+	 * Skip putting upcoming bh into bh_lru until migration is done.
+	 */
+	if (lru_cache_disabled())
+		return;
+
 	bh_lru_lock();
 	b = this_cpu_ptr(&bh_lrus);
 
@@ -1327,7 +1376,7 @@
 			brelse(bh);
 	}
 }
-EXPORT_SYMBOL(__breadahead);
+EXPORT_SYMBOL_NS(__breadahead, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
 		      gfp_t gfp)
@@ -1362,8 +1411,17 @@
 		bh = __bread_slow(bh);
 	return bh;
 }
-EXPORT_SYMBOL(__bread_gfp);
+EXPORT_SYMBOL_NS(__bread_gfp, ANDROID_GKI_VFS_EXPORT_ONLY);
 
+static void __invalidate_bh_lrus(struct bh_lru *b)
+{
+	int i;
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+}
 /*
  * invalidate_bh_lrus() is called rarely - but not only at unmount.
  * This doesn't race because it runs in each cpu either in irq
@@ -1372,33 +1430,43 @@
 static void invalidate_bh_lru(void *arg)
 {
 	struct bh_lru *b = &get_cpu_var(bh_lrus);
-	int i;
 
-	for (i = 0; i < BH_LRU_SIZE; i++) {
-		brelse(b->bhs[i]);
-		b->bhs[i] = NULL;
-	}
+	__invalidate_bh_lrus(b);
 	put_cpu_var(bh_lrus);
 }
 
-static bool has_bh_in_lru(int cpu, void *dummy)
+bool has_bh_in_lru(int cpu, void *dummy)
 {
 	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
 	int i;
 
 	for (i = 0; i < BH_LRU_SIZE; i++) {
 		if (b->bhs[i])
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 void invalidate_bh_lrus(void)
 {
-	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
+	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
+
+/*
+ * It's called from workqueue context so we need a bh_lru_lock to close
+ * the race with preemption/irq.
+ */
+void invalidate_bh_lrus_cpu(void)
+{
+	struct bh_lru *b;
+
+	bh_lru_lock();
+	b = this_cpu_ptr(&bh_lrus);
+	__invalidate_bh_lrus(b);
+	bh_lru_unlock();
+}
 
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset)
@@ -1505,7 +1573,7 @@
 out:
 	return;
 }
-EXPORT_SYMBOL(block_invalidatepage);
+EXPORT_SYMBOL_NS(block_invalidatepage, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 
 /*
@@ -1538,10 +1606,10 @@
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
-	attach_page_buffers(page, head);
+	attach_page_private(page, head);
 	spin_unlock(&page->mapping->private_lock);
 }
-EXPORT_SYMBOL(create_empty_buffers);
+EXPORT_SYMBOL_NS(create_empty_buffers, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /**
  * clean_bdev_aliases: clean a range of buffers in block device
@@ -1615,7 +1683,7 @@
 			break;
 	}
 }
-EXPORT_SYMBOL(clean_bdev_aliases);
+EXPORT_SYMBOL_NS(clean_bdev_aliases, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
  * Size is a power-of-two in the range 512..PAGE_SIZE,
@@ -1873,7 +1941,7 @@
 		bh = bh->b_this_page;
 	} while (bh != head);
 }
-EXPORT_SYMBOL(page_zero_new_buffers);
+EXPORT_SYMBOL_NS(page_zero_new_buffers, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 static void
 iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
@@ -1918,7 +1986,7 @@
 		 */
 		set_buffer_new(bh);
 		set_buffer_unwritten(bh);
-		/* FALLTHRU */
+		fallthrough;
 	case IOMAP_MAPPED:
 		if ((iomap->flags & IOMAP_F_NEW) ||
 		    offset >= i_size_read(inode))
@@ -2089,40 +2157,6 @@
 }
 EXPORT_SYMBOL(block_write_begin);
 
-int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
-		struct page *page)
-{
-	loff_t old_size = inode->i_size;
-	bool i_size_changed = false;
-
-	/*
-	 * No need to use i_size_read() here, the i_size cannot change under us
-	 * because we hold i_rwsem.
-	 *
-	 * But it's important to update i_size while still holding page lock:
-	 * page writeout could otherwise come in and zero beyond i_size.
-	 */
-	if (pos + copied > inode->i_size) {
-		i_size_write(inode, pos + copied);
-		i_size_changed = true;
-	}
-
-	unlock_page(page);
-	put_page(page);
-
-	if (old_size < pos)
-		pagecache_isize_extended(inode, old_size, pos);
-	/*
-	 * Don't mark the inode dirty under page lock. First, it unnecessarily
-	 * makes the holding time of page lock longer. Second, it forces lock
-	 * ordering of page lock and transaction start for journaling
-	 * filesystems.
-	 */
-	if (i_size_changed)
-		mark_inode_dirty(inode);
-	return copied;
-}
-
 int block_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
@@ -2163,8 +2197,38 @@
 		loff_t pos, unsigned len, unsigned copied,
 		struct page *page, void *fsdata)
 {
+	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
+	bool i_size_changed = false;
+
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-	return __generic_write_end(mapping->host, pos, copied, page);
+
+	/*
+	 * No need to use i_size_read() here, the i_size cannot change under us
+	 * because we hold i_rwsem.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		i_size_changed = true;
+	}
+
+	unlock_page(page);
+	put_page(page);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+	return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
 
@@ -2211,7 +2275,7 @@
 
 	return ret;
 }
-EXPORT_SYMBOL(block_is_partially_uptodate);
+EXPORT_SYMBOL_NS(block_is_partially_uptodate, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
  * Generic "read page" function for block devices that have the normal
@@ -2314,7 +2378,7 @@
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	void *fsdata;
+	void *fsdata = NULL;
 	int err;
 
 	err = inode_newsize_ok(inode, size);
@@ -2340,7 +2404,7 @@
 	struct inode *inode = mapping->host;
 	unsigned int blocksize = i_blocksize(inode);
 	struct page *page;
-	void *fsdata;
+	void *fsdata = NULL;
 	pgoff_t index, curidx;
 	loff_t curpos;
 	unsigned zerofrom, offset, len;
@@ -2371,7 +2435,7 @@
 
 		balance_dirty_pages_ratelimited(mapping);
 
-		if (unlikely(fatal_signal_pending(current))) {
+		if (fatal_signal_pending(current)) {
 			err = -EINTR;
 			goto out;
 		}
@@ -2529,7 +2593,7 @@
 		bh->b_this_page = head;
 		bh = bh->b_this_page;
 	} while (bh != head);
-	attach_page_buffers(page, head);
+	attach_page_private(page, head);
 	spin_unlock(&page->mapping->private_lock);
 }
 
@@ -2970,69 +3034,6 @@
 	bio_put(bio);
 }
 
-/*
- * This allows us to do IO even on the odd last sectors
- * of a device, even if the block size is some multiple
- * of the physical sector size.
- *
- * We'll just truncate the bio to the size of the device,
- * and clear the end of the buffer head manually.
- *
- * Truly out-of-range accesses will turn into actual IO
- * errors, this only handles the "we need to be able to
- * do IO at the final sector" case.
- */
-void guard_bio_eod(int op, struct bio *bio)
-{
-	sector_t maxsector;
-	struct bio_vec *bvec = bio_last_bvec_all(bio);
-	unsigned truncated_bytes;
-	struct hd_struct *part;
-
-	rcu_read_lock();
-	part = __disk_get_part(bio->bi_disk, bio->bi_partno);
-	if (part)
-		maxsector = part_nr_sects_read(part);
-	else
-		maxsector = get_capacity(bio->bi_disk);
-	rcu_read_unlock();
-
-	if (!maxsector)
-		return;
-
-	/*
-	 * If the *whole* IO is past the end of the device,
-	 * let it through, and the IO layer will turn it into
-	 * an EIO.
-	 */
-	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
-		return;
-
-	maxsector -= bio->bi_iter.bi_sector;
-	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
-		return;
-
-	/* Uhhuh. We've got a bio that straddles the device size! */
-	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
-
-	/*
-	 * The bio contains more than one segment which spans EOD, just return
-	 * and let IO layer turn it into an EIO
-	 */
-	if (truncated_bytes > bvec->bv_len)
-		return;
-
-	/* Truncate the bio.. */
-	bio->bi_iter.bi_size -= truncated_bytes;
-	bvec->bv_len -= truncated_bytes;
-
-	/* ..and clear the end of the buffer for reads */
-	if (op == REQ_OP_READ) {
-		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
-				truncated_bytes);
-	}
-}
-
 static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 			 enum rw_hint write_hint, struct writeback_control *wbc)
 {
@@ -3050,18 +3051,9 @@
 	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
 		clear_buffer_write_io_error(bh);
 
-	/*
-	 * from here on down, it's all bio -- do the initial mapping,
-	 * submit_bio -> generic_make_request may further map this bio around
-	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
 	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
-
-	if (wbc) {
-		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, bh->b_page, bh->b_size);
-	}
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio_set_dev(bio, bh->b_bdev);
@@ -3073,14 +3065,19 @@
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
 
-	/* Take care of bh's that straddle the end of the device */
-	guard_bio_eod(op, bio);
-
 	if (buffer_meta(bh))
 		op_flags |= REQ_META;
 	if (buffer_prio(bh))
 		op_flags |= REQ_PRIO;
 	bio_set_op_attrs(bio, op, op_flags);
+
+	/* Take care of bh's that straddle the end of the device */
+	guard_bio_eod(bio);
+
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+	}
 
 	submit_bio(bio);
 	return 0;
@@ -3145,7 +3142,7 @@
 		unlock_buffer(bh);
 	}
 }
-EXPORT_SYMBOL(ll_rw_block);
+EXPORT_SYMBOL_NS(ll_rw_block, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 void write_dirty_buffer(struct buffer_head *bh, int op_flags)
 {
@@ -3198,7 +3195,7 @@
 {
 	return __sync_dirty_buffer(bh, REQ_SYNC);
 }
-EXPORT_SYMBOL(sync_dirty_buffer);
+EXPORT_SYMBOL_NS(sync_dirty_buffer, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 /*
  * try_to_free_buffers() checks if all the buffers on this particular page
@@ -3247,7 +3244,7 @@
 		bh = next;
 	} while (bh != head);
 	*buffers_to_free = head;
-	__clear_page_buffers(page);
+	detach_page_private(page);
 	return 1;
 failed:
 	return 0;
@@ -3367,7 +3364,7 @@
 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
-		buffer_head_init_locks(ret);
+		spin_lock_init(&ret->b_uptodate_lock);
 		preempt_disable();
 		__this_cpu_inc(bh_accounting.nr);
 		recalc_bh_state();
-- 
Gitblit v1.6.2
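Note on the pattern used above (illustrative, not part of the patch): the new
decrypt_bh() recovers its decrypt_bh_ctx from the embedded work_struct with
container_of(), the usual kernel idiom for carrying extra state through a
callback API that only hands back a pointer to the member. A minimal userspace
sketch of the same idiom, with hypothetical names (struct work, decrypt_ctx,
decrypt_work_fn) standing in for the kernel types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Recover the enclosing struct from a pointer to one of its members. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Toy stand-in for the kernel's work_struct: just a callback slot. */
struct work {
	void (*func)(struct work *work);
};

/* Hypothetical analogue of the patch's decrypt_bh_ctx. */
struct decrypt_ctx {
	struct work work;	/* embedded member, like decrypt_bh_ctx.work */
	int block_nr;		/* stands in for the buffer_head */
};

static void decrypt_work_fn(struct work *work)
{
	/* The same recovery step decrypt_bh() performs. */
	struct decrypt_ctx *ctx = container_of(work, struct decrypt_ctx, work);

	printf("decrypting block %d\n", ctx->block_nr);
	free(ctx);	/* mirrors the kfree(ctx) in decrypt_bh() */
}

int main(void)
{
	struct decrypt_ctx *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return 1;
	ctx->work.func = decrypt_work_fn;
	ctx->block_nr = 42;
	/* A real workqueue would invoke this callback asynchronously. */
	ctx->work.func(&ctx->work);
	return 0;
}

Embedding the work item in the context struct costs one allocation per request,
which is why end_buffer_async_read_io() above allocates with GFP_ATOMIC and, on
failure, falls back to completing the read with uptodate = 0 rather than
sleeping in the I/O completion path.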