From 10ebd8556b7990499c896a550e3d416b444211e6 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 02:23:07 +0000 Subject: [PATCH] add led --- kernel/fs/orangefs/file.c | 463 +++++++++++++++++++++++++-------------------------------- 1 files changed, 207 insertions(+), 256 deletions(-) diff --git a/kernel/fs/orangefs/file.c b/kernel/fs/orangefs/file.c index a5a2fe7..af375e0 100644 --- a/kernel/fs/orangefs/file.c +++ b/kernel/fs/orangefs/file.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. * * See COPYING in top-level directory. */ @@ -44,15 +45,19 @@ /* * Post and wait for the I/O upcall to finish */ -static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, - loff_t *offset, struct iov_iter *iter, - size_t total_size, loff_t readahead_size) +ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr, + int *index_return, struct file *file) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; struct orangefs_kernel_op_s *new_op = NULL; - int buffer_index = -1; + int buffer_index; ssize_t ret; + size_t copy_amount; + int open_for_read; + int open_for_write; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) @@ -84,6 +89,42 @@ new_op->upcall.req.io.buf_index = buffer_index; new_op->upcall.req.io.count = total_size; new_op->upcall.req.io.offset = *offset; + if (type == ORANGEFS_IO_WRITE && wr) { + new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); + new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); + } + /* + * Orangefs has no open, and orangefs checks file permissions + * on each file access. Posix requires that file permissions + * be checked on open and nowhere else. Orangefs-through-the-kernel + * needs to seem posix compliant. + * + * The VFS opens files, even if the filesystem provides no + * method. We can see if a file was successfully opened for + * read and or for write by looking at file->f_mode. + * + * When writes are flowing from the page cache, file is no + * longer available. We can trust the VFS to have checked + * file->f_mode before writing to the page cache. + * + * The mode of a file might change between when it is opened + * and IO commences, or it might be created with an arbitrary mode. + * + * We'll make sure we don't hit EACCES during the IO stage by + * using UID 0. Some of the time we have access without changing + * to UID 0 - how to check? + */ + if (file) { + open_for_write = file->f_mode & FMODE_WRITE; + open_for_read = file->f_mode & FMODE_READ; + } else { + open_for_write = 1; + open_for_read = 0; /* not relevant? */ + } + if ((type == ORANGEFS_IO_WRITE) && open_for_write) + new_op->upcall.uid = 0; + if ((type == ORANGEFS_IO_READ) && open_for_read) + new_op->upcall.uid = 0; gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): offset: %llu total_size: %zd\n", @@ -128,7 +169,6 @@ */ if (ret == -EAGAIN && op_state_purged(new_op)) { orangefs_bufmap_put(buffer_index); - buffer_index = -1; if (type == ORANGEFS_IO_WRITE) iov_iter_revert(iter, total_size); gossip_debug(GOSSIP_FILE_DEBUG, @@ -168,7 +208,10 @@ * trigger the write. */ case OP_VFS_STATE_INPROGR: - ret = total_size; + if (type == ORANGEFS_IO_READ) + ret = -EINTR; + else + ret = total_size; break; default: gossip_err("%s: unexpected op state :%d:.\n", @@ -204,8 +247,25 @@ * can futher be kernel-space or user-space addresses. * or it can pointers to struct page's */ + + /* + * When reading, readahead_size will only be zero when + * we're doing O_DIRECT, otherwise we got here from + * orangefs_readpage. + * + * If we got here from orangefs_readpage we want to + * copy either a page or the whole file into the io + * vector, whichever is smaller. + */ + if (readahead_size) + copy_amount = + min(new_op->downcall.resp.io.amt_complete, + (__s64)PAGE_SIZE); + else + copy_amount = new_op->downcall.resp.io.amt_complete; + ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, - new_op->downcall.resp.io.amt_complete); + copy_amount); if (ret < 0) { gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", __func__, (long)ret); @@ -223,250 +283,112 @@ out: if (buffer_index >= 0) { - orangefs_bufmap_put(buffer_index); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): PUT buffer_index %d\n", - __func__, handle, buffer_index); - buffer_index = -1; + if ((readahead_size) && (type == ORANGEFS_IO_READ)) { + /* readpage */ + *index_return = buffer_index; + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: hold on to buffer_index :%d:\n", + __func__, buffer_index); + } else { + /* O_DIRECT */ + orangefs_bufmap_put(buffer_index); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): PUT buffer_index %d\n", + __func__, handle, buffer_index); + } } op_release(new_op); return ret; } -/* - * Common entry point for read/write/readv/writev - * This function will dispatch it to either the direct I/O - * or buffered I/O path depending on the mount options and/or - * augmented/extended metadata attached to the file. - * Note: File extended attributes override any mount options. - */ -static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, - loff_t *offset, struct iov_iter *iter) +int orangefs_revalidate_mapping(struct inode *inode) { - struct inode *inode = file->f_mapping->host; struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; - size_t count = iov_iter_count(iter); - ssize_t total_count = 0; - ssize_t ret = -EINVAL; + struct address_space *mapping = inode->i_mapping; + unsigned long *bitlock = &orangefs_inode->bitlock; + int ret; - gossip_debug(GOSSIP_FILE_DEBUG, - "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", - __func__, - handle, - (int)count); - - if (type == ORANGEFS_IO_WRITE) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): proceeding with offset : %llu, " - "size %d\n", - __func__, - handle, - llu(*offset), - (int)count); - } - - if (count == 0) { - ret = 0; - goto out; - } - - while (iov_iter_count(iter)) { - size_t each_count = iov_iter_count(iter); - size_t amt_complete; - - /* how much to transfer in this loop iteration */ - if (each_count > orangefs_bufmap_size_query()) - each_count = orangefs_bufmap_size_query(); - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): size of each_count(%d)\n", - __func__, - handle, - (int)each_count); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): BEFORE wait_for_io: offset is %d\n", - __func__, - handle, - (int)*offset); - - ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): return from wait_for_io:%d\n", - __func__, - handle, - (int)ret); - - if (ret < 0) - goto out; - - *offset += ret; - total_count += ret; - amt_complete = ret; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): AFTER wait_for_io: offset is %d\n", - __func__, - handle, - (int)*offset); - - /* - * if we got a short I/O operations, - * fall out and return what we got so far - */ - if (amt_complete < each_count) + while (1) { + ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); + if (ret) + return ret; + spin_lock(&inode->i_lock); + if (test_bit(1, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!time_before(jiffies, orangefs_inode->mapping_time)) break; - } /*end while */ + spin_unlock(&inode->i_lock); + return 0; + } + set_bit(1, bitlock); + smp_wmb(); + spin_unlock(&inode->i_lock); + + unmap_mapping_range(mapping, 0, 0, 0); + ret = filemap_write_and_wait(mapping); + if (!ret) + ret = invalidate_inode_pages2(mapping); + + orangefs_inode->mapping_time = jiffies + + orangefs_cache_timeout_msecs*HZ/1000; + + clear_bit(1, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, 1); + + return ret; +} + +static ssize_t orangefs_file_read_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + int ret; + orangefs_stats.reads++; + + down_read(&file_inode(iocb->ki_filp)->i_rwsem); + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + goto out; + + ret = generic_file_read_iter(iocb, iter); out: - if (total_count > 0) - ret = total_count; - if (ret > 0) { - if (type == ORANGEFS_IO_READ) { - file_accessed(file); - } else { - file_update_time(file); - /* - * Must invalidate to ensure write loop doesn't - * prevent kernel from reading updated - * attribute. Size probably changed because of - * the write, and other clients could update - * any other attribute. - */ - orangefs_inode->getattr_time = jiffies - 1; - } - } - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): Value(%d) returned.\n", - __func__, - handle, - (int)ret); - + up_read(&file_inode(iocb->ki_filp)->i_rwsem); return ret; } -/* - * Read data from a specified offset in a file (referenced by inode). - * Data may be placed either in a user or kernel buffer. - */ -ssize_t orangefs_inode_read(struct inode *inode, - struct iov_iter *iter, - loff_t *offset, - loff_t readahead_size) +static ssize_t orangefs_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - size_t count = iov_iter_count(iter); - size_t bufmap_size; - ssize_t ret = -EINVAL; - - orangefs_stats.reads++; - - bufmap_size = orangefs_bufmap_size_query(); - if (count > bufmap_size) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: count is too large (%zd/%zd)!\n", - __func__, count, bufmap_size); - return -EINVAL; - } - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU) %zd@%llu\n", - __func__, - &orangefs_inode->refn.khandle, - count, - llu(*offset)); - - ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter, - count, readahead_size); - if (ret > 0) - *offset += ret; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): Value(%zd) returned.\n", - __func__, - &orangefs_inode->refn.khandle, - ret); - - return ret; -} - -static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t rc = 0; - - BUG_ON(iocb->private); - - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); - - orangefs_stats.reads++; - - rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); - iocb->ki_pos = pos; - - return rc; -} - -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos; - ssize_t rc; - - BUG_ON(iocb->private); - - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); - - inode_lock(file->f_mapping->host); - - /* Make sure generic_write_checks sees an up to date inode size. */ - if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); - if (rc == -ESTALE) - rc = -EIO; - if (rc) { - gossip_err("%s: orangefs_inode_getattr failed, " - "rc:%zd:.\n", __func__, rc); - goto out; - } - } - - rc = generic_write_checks(iocb, iter); - - if (rc <= 0) { - gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", - __func__, rc); - goto out; - } - - /* - * if we are appending, generic_write_checks would have updated - * pos to the end of the file, so we will wait till now to set - * pos... - */ - pos = iocb->ki_pos; - - rc = do_readv_writev(ORANGEFS_IO_WRITE, - file, - &pos, - iter); - if (rc < 0) { - gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", - __func__, rc); - goto out; - } - - iocb->ki_pos = pos; + int ret; orangefs_stats.writes++; -out: + if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + return ret; + } - inode_unlock(file->f_mapping->host); - return rc; + ret = generic_file_write_iter(iocb, iter); + return ret; +} + +static int orangefs_getflags(struct inode *inode, unsigned long *uval) +{ + __u64 val = 0; + int ret; + + ret = orangefs_inode_getxattr(inode, + "user.pvfs2.meta_hint", + &val, sizeof(val)); + if (ret < 0 && ret != -ENODATA) + return ret; + else if (ret == -ENODATA) + val = 0; + *uval = val; + return 0; } /* @@ -474,6 +396,7 @@ */ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + struct inode *inode = file_inode(file); int ret = -ENOTTY; __u64 val = 0; unsigned long uval; @@ -487,20 +410,16 @@ * and append flags */ if (cmd == FS_IOC_GETFLAGS) { - val = 0; - ret = orangefs_inode_getxattr(file_inode(file), - "user.pvfs2.meta_hint", - &val, sizeof(val)); - if (ret < 0 && ret != -ENODATA) + ret = orangefs_getflags(inode, &uval); + if (ret) return ret; - else if (ret == -ENODATA) - val = 0; - uval = val; gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n", (unsigned long long)uval); return put_user(uval, (int __user *)arg); } else if (cmd == FS_IOC_SETFLAGS) { + unsigned long old_uval; + ret = 0; if (get_user(uval, (int __user *)arg)) return -EFAULT; @@ -516,11 +435,17 @@ gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n"); return -EINVAL; } + ret = orangefs_getflags(inode, &old_uval); + if (ret) + return ret; + ret = vfs_ioc_setflags_prepare(inode, old_uval, uval); + if (ret) + return ret; val = uval; gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", (unsigned long long)val); - ret = orangefs_inode_setxattr(file_inode(file), + ret = orangefs_inode_setxattr(inode, "user.pvfs2.meta_hint", &val, sizeof(val), 0); } @@ -532,14 +457,13 @@ { struct file *file = vmf->vma->vm_file; int ret; - - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { - gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n", - __func__, ret); + gossip_err("%s: orangefs_inode_getattr failed, " + "ret:%d:.\n", __func__, ret); return VM_FAULT_SIGBUS; } return filemap_fault(vmf); @@ -548,7 +472,7 @@ static const struct vm_operations_struct orangefs_file_vm_ops = { .fault = orangefs_fault, .map_pages = filemap_map_pages, - .page_mkwrite = filemap_page_mkwrite, + .page_mkwrite = orangefs_page_mkwrite, }; /* @@ -556,14 +480,17 @@ */ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; + + ret = orangefs_revalidate_mapping(file_inode(file)); + if (ret) + return ret; + gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_mmap: called on %s\n", (file ? (char *)file->f_path.dentry->d_name.name : (char *)"Unknown")); - - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; /* set the sequential readahead hint */ vma->vm_flags |= VM_SEQ_READ; @@ -604,8 +531,7 @@ gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); } - truncate_inode_pages(file_inode(file)->i_mapping, - 0); + } return 0; } @@ -622,6 +548,11 @@ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; + + ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, + start, end); + if (ret < 0) + return ret; new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); if (!new_op) @@ -644,7 +575,7 @@ * Change the file pointer position for an instance of an open file. * * \note If .llseek is overriden, we must acquire lock as described in - * Documentation/filesystems/Locking. + * Documentation/filesystems/locking.rst. * * Future upgrade could support SEEK_DATA and SEEK_HOLE but would * require much changes to the FS @@ -660,8 +591,8 @@ * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -704,6 +635,25 @@ return rc; } +static int orangefs_flush(struct file *file, fl_owner_t id) +{ + /* + * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the + * service_operation in orangefs_fsync. + * + * Do not send fsync to OrangeFS server on a close. Do send fsync + * on an explicit fsync call. This duplicates historical OrangeFS + * behavior. + */ + int r; + + r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); + if (r > 0) + return 0; + else + return r; +} + /** ORANGEFS implementation of VFS file operations */ const struct file_operations orangefs_file_operations = { .llseek = orangefs_file_llseek, @@ -713,6 +663,7 @@ .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, .open = generic_file_open, + .flush = orangefs_flush, .release = orangefs_file_release, .fsync = orangefs_fsync, }; -- Gitblit v1.6.2