From 072de836f53be56a70cecf70b43ae43b7ce17376 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 10:08:36 +0000
Subject: [PATCH] mk-rootfs.sh
---
kernel/fs/overlayfs/file.c | 394 +++++++++++++++++++++++++++++++++++++------------------
1 files changed, 264 insertions(+), 130 deletions(-)
diff --git a/kernel/fs/overlayfs/file.c b/kernel/fs/overlayfs/file.c
index 818a8ee..719eec7 100644
--- a/kernel/fs/overlayfs/file.c
+++ b/kernel/fs/overlayfs/file.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#include <linux/cred.h>
@@ -12,7 +9,22 @@
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/splice.h>
+#include <linux/security.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
#include "overlayfs.h"
+
+#define OVL_IOCB_MASK (IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC)
+
+struct ovl_aio_req {
+ struct kiocb iocb;
+ refcount_t ref;
+ struct kiocb *orig_iocb;
+ struct fd fd;
+};
+
+static struct kmem_cache *ovl_aio_request_cachep;
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
{
@@ -34,11 +46,23 @@
struct file *realfile;
const struct cred *old_cred;
int flags = file->f_flags | OVL_OPEN_FLAGS;
+ int acc_mode = ACC_MODE(flags);
+ int err;
+
+ if (flags & O_APPEND)
+ acc_mode |= MAY_APPEND;
old_cred = ovl_override_creds(inode->i_sb);
- realfile = open_with_fake_path(&file->f_path, flags, realinode,
- current_cred());
- ovl_revert_creds(old_cred);
+ err = inode_permission(realinode, MAY_OPEN | acc_mode);
+ if (err) {
+ realfile = ERR_PTR(err);
+ } else if (old_cred && !inode_owner_or_capable(realinode)) {
+ realfile = ERR_PTR(-EPERM);
+ } else {
+ realfile = open_with_fake_path(&file->f_path, flags, realinode,
+ current_cred());
+ }
+ ovl_revert_creds(inode->i_sb, old_cred);
pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
file, file, ovl_whatisit(inode, realinode), file->f_flags,
@@ -115,6 +139,13 @@
static int ovl_real_fdget(const struct file *file, struct fd *real)
{
+ if (d_is_dir(file_dentry(file))) {
+ real->flags = 0;
+ real->file = ovl_dir_real_file(file, false);
+
+ return PTR_ERR_OR_ZERO(real->file);
+ }
+
return ovl_real_fdget_meta(file, real, false);
}
@@ -176,15 +207,15 @@
* limitations that are more strict than ->s_maxbytes for specific
* files, so we use the real file to perform seeks.
*/
- inode_lock(inode);
+ ovl_inode_lock(inode);
real.file->f_pos = file->f_pos;
old_cred = ovl_override_creds(inode->i_sb);
ret = vfs_llseek(real.file, offset, whence);
- ovl_revert_creds(old_cred);
+ ovl_revert_creds(inode->i_sb, old_cred);
file->f_pos = real.file->f_pos;
- inode_unlock(inode);
+ ovl_inode_unlock(inode);
fdput(real);
@@ -213,21 +244,41 @@
touch_atime(&file->f_path);
}
-static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
+static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
{
- int ifl = iocb->ki_flags;
- rwf_t flags = 0;
+ if (refcount_dec_and_test(&aio_req->ref)) {
+ fdput(aio_req->fd);
+ kmem_cache_free(ovl_aio_request_cachep, aio_req);
+ }
+}
- if (ifl & IOCB_NOWAIT)
- flags |= RWF_NOWAIT;
- if (ifl & IOCB_HIPRI)
- flags |= RWF_HIPRI;
- if (ifl & IOCB_DSYNC)
- flags |= RWF_DSYNC;
- if (ifl & IOCB_SYNC)
- flags |= RWF_SYNC;
+static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
+{
+ struct kiocb *iocb = &aio_req->iocb;
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
- return flags;
+ if (iocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(orig_iocb->ki_filp);
+
+ /* Actually acquired in ovl_write_iter() */
+ __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
+ SB_FREEZE_WRITE);
+ file_end_write(iocb->ki_filp);
+ ovl_copyattr(ovl_inode_real(inode), inode);
+ }
+
+ orig_iocb->ki_pos = iocb->ki_pos;
+ ovl_aio_put(aio_req);
+}
+
+static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2)
+{
+ struct ovl_aio_req *aio_req = container_of(iocb,
+ struct ovl_aio_req, iocb);
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ ovl_aio_cleanup_handler(aio_req);
+ orig_iocb->ki_complete(orig_iocb, res, res2);
}
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -244,13 +295,41 @@
if (ret)
return ret;
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ (!real.file->f_mapping->a_ops ||
+ !real.file->f_mapping->a_ops->direct_IO))
+ goto out_fdput;
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb));
- ovl_revert_creds(old_cred);
+ if (is_sync_kiocb(iocb)) {
+ ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
+ iocb_to_rw_flags(iocb->ki_flags,
+ OVL_IOCB_MASK));
+ } else {
+ struct ovl_aio_req *aio_req;
+
+ ret = -ENOMEM;
+ aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ aio_req->fd = real;
+ real.flags = 0;
+ aio_req->orig_iocb = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, real.file);
+ aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
+ ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
+ if (ret != -EIOCBQUEUED)
+ ovl_aio_cleanup_handler(aio_req);
+ }
+out:
+ ovl_revert_creds(file_inode(file)->i_sb, old_cred);
ovl_file_accessed(file);
-
+out_fdput:
fdput(real);
return ret;
@@ -263,6 +342,7 @@
struct fd real;
const struct cred *old_cred;
ssize_t ret;
+ int ifl = iocb->ki_flags;
if (!iov_iter_count(iter))
return 0;
@@ -278,16 +358,95 @@
if (ret)
goto out_unlock;
+ ret = -EINVAL;
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ (!real.file->f_mapping->a_ops ||
+ !real.file->f_mapping->a_ops->direct_IO))
+ goto out_fdput;
+
+ if (!ovl_should_sync(OVL_FS(inode->i_sb)))
+ ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ if (is_sync_kiocb(iocb)) {
+ file_start_write(real.file);
+ ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
+ iocb_to_rw_flags(ifl, OVL_IOCB_MASK));
+ file_end_write(real.file);
+ /* Update size */
+ ovl_copyattr(ovl_inode_real(inode), inode);
+ } else {
+ struct ovl_aio_req *aio_req;
+
+ ret = -ENOMEM;
+ aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ file_start_write(real.file);
+ /* Pacify lockdep, same trick as done in aio_write() */
+ __sb_writers_release(file_inode(real.file)->i_sb,
+ SB_FREEZE_WRITE);
+ aio_req->fd = real;
+ real.flags = 0;
+ aio_req->orig_iocb = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, real.file);
+ aio_req->iocb.ki_flags = ifl;
+ aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
+ ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
+ ovl_aio_put(aio_req);
+ if (ret != -EIOCBQUEUED)
+ ovl_aio_cleanup_handler(aio_req);
+ }
+out:
+ ovl_revert_creds(file_inode(file)->i_sb, old_cred);
+out_fdput:
+ fdput(real);
+
+out_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+/*
+ * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
+ * due to lock order inversion between pipe->mutex in iter_file_splice_write()
+ * and file_start_write(real.file) in ovl_write_iter().
+ *
+ * So do everything ovl_write_iter() does and call iter_file_splice_write() on
+ * the real file.
+ */
+static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ struct inode *inode = file_inode(out);
+ struct inode *realinode = ovl_inode_real(inode);
+ ssize_t ret;
+
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(realinode, inode);
+ ret = file_remove_privs(out);
+ if (ret)
+ goto out_unlock;
+
+ ret = ovl_real_fdget(out, &real);
+ if (ret)
+ goto out_unlock;
+
+ old_cred = ovl_override_creds(inode->i_sb);
file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb));
+
+ ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
+
file_end_write(real.file);
- ovl_revert_creds(old_cred);
-
/* Update size */
- ovl_copyattr(ovl_inode_real(inode), inode);
-
+ ovl_copyattr(realinode, inode);
+ ovl_revert_creds(inode->i_sb, old_cred);
fdput(real);
out_unlock:
@@ -302,6 +461,10 @@
const struct cred *old_cred;
int ret;
+ ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
+ if (ret <= 0)
+ return ret;
+
ret = ovl_real_fdget_meta(file, &real, !datasync);
if (ret)
return ret;
@@ -310,7 +473,7 @@
if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fsync_range(real.file, start, end, datasync);
- ovl_revert_creds(old_cred);
+ ovl_revert_creds(file_inode(file)->i_sb, old_cred);
}
fdput(real);
@@ -334,7 +497,7 @@
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = call_mmap(vma->vm_file, vma);
- ovl_revert_creds(old_cred);
+ ovl_revert_creds(file_inode(file)->i_sb, old_cred);
if (ret) {
/* Drop reference count from new vm_file value */
@@ -362,7 +525,7 @@
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fallocate(real.file, mode, offset, len);
- ovl_revert_creds(old_cred);
+ ovl_revert_creds(file_inode(file)->i_sb, old_cred);
/* Update size */
ovl_copyattr(ovl_inode_real(inode), inode);
@@ -384,7 +547,7 @@
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fadvise(real.file, offset, len, advice);
- ovl_revert_creds(old_cred);
+ ovl_revert_creds(file_inode(file)->i_sb, old_cred);
fdput(real);
@@ -395,16 +558,20 @@
unsigned long arg)
{
struct fd real;
- const struct cred *old_cred;
long ret;
ret = ovl_real_fdget(file, &real);
if (ret)
return ret;
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_ioctl(real.file, cmd, arg);
- ovl_revert_creds(old_cred);
+ ret = security_file_ioctl(real.file, cmd, arg);
+ if (!ret) {
+ /*
+ * Don't override creds, since we currently can't safely check
+ * permissions before doing so.
+ */
+ ret = vfs_ioctl(real.file, cmd, arg);
+ }
fdput(real);
@@ -412,11 +579,10 @@
}
static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int iflags)
+ unsigned long arg)
{
long ret;
struct inode *inode = file_inode(file);
- unsigned int old_iflags;
if (!inode_owner_or_capable(inode))
return -EACCES;
@@ -427,10 +593,12 @@
inode_lock(inode);
- /* Check the capability before cred override */
+ /*
+ * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE
+ * capability.
+ */
ret = -EPERM;
- old_iflags = READ_ONCE(inode->i_flags);
- if (((iflags ^ old_iflags) & (S_APPEND | S_IMMUTABLE)) &&
+ if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) &&
!capable(CAP_LINUX_IMMUTABLE))
goto unlock;
@@ -450,64 +618,7 @@
}
-static unsigned int ovl_fsflags_to_iflags(unsigned int flags)
-{
- unsigned int iflags = 0;
-
- if (flags & FS_SYNC_FL)
- iflags |= S_SYNC;
- if (flags & FS_APPEND_FL)
- iflags |= S_APPEND;
- if (flags & FS_IMMUTABLE_FL)
- iflags |= S_IMMUTABLE;
- if (flags & FS_NOATIME_FL)
- iflags |= S_NOATIME;
-
- return iflags;
-}
-
-static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- unsigned int flags;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
- return ovl_ioctl_set_flags(file, cmd, arg,
- ovl_fsflags_to_iflags(flags));
-}
-
-static unsigned int ovl_fsxflags_to_iflags(unsigned int xflags)
-{
- unsigned int iflags = 0;
-
- if (xflags & FS_XFLAG_SYNC)
- iflags |= S_SYNC;
- if (xflags & FS_XFLAG_APPEND)
- iflags |= S_APPEND;
- if (xflags & FS_XFLAG_IMMUTABLE)
- iflags |= S_IMMUTABLE;
- if (xflags & FS_XFLAG_NOATIME)
- iflags |= S_NOATIME;
-
- return iflags;
-}
-
-static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- struct fsxattr fa;
-
- memset(&fa, 0, sizeof(fa));
- if (copy_from_user(&fa, (void __user *) arg, sizeof(fa)))
- return -EFAULT;
-
- return ovl_ioctl_set_flags(file, cmd, arg,
- ovl_fsxflags_to_iflags(fa.fsx_xflags));
-}
-
-static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
long ret;
@@ -517,12 +628,9 @@
ret = ovl_real_ioctl(file, cmd, arg);
break;
- case FS_IOC_SETFLAGS:
- ret = ovl_ioctl_set_fsflags(file, cmd, arg);
- break;
-
case FS_IOC_FSSETXATTR:
- ret = ovl_ioctl_set_fsxflags(file, cmd, arg);
+ case FS_IOC_SETFLAGS:
+ ret = ovl_ioctl_set_flags(file, cmd, arg);
break;
default:
@@ -532,8 +640,8 @@
return ret;
}
-static long ovl_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
+#ifdef CONFIG_COMPAT
+long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
case FS_IOC32_GETFLAGS:
@@ -550,6 +658,7 @@
return ovl_ioctl(file, cmd, arg);
}
+#endif
enum ovl_copyop {
OVL_COPY,
@@ -557,14 +666,14 @@
OVL_DEDUPE,
};
-static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
+static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
- u64 len, unsigned int flags, enum ovl_copyop op)
+ loff_t len, unsigned int flags, enum ovl_copyop op)
{
struct inode *inode_out = file_inode(file_out);
struct fd real_in, real_out;
const struct cred *old_cred;
- ssize_t ret;
+ loff_t ret;
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
@@ -585,15 +694,16 @@
case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len);
+ real_out.file, pos_out, len, flags);
break;
case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
- real_out.file, pos_out, len);
+ real_out.file, pos_out, len,
+ flags);
break;
}
- ovl_revert_creds(old_cred);
+ ovl_revert_creds(file_inode(file_out)->i_sb, old_cred);
/* Update size */
ovl_copyattr(ovl_inode_real(inode_out), inode_out);
@@ -612,26 +722,31 @@
OVL_COPY);
}
-static int ovl_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
- OVL_CLONE);
-}
+ enum ovl_copyop op;
-static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
-{
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ op = OVL_DEDUPE;
+ else
+ op = OVL_CLONE;
+
/*
* Don't copy up because of a dedupe request, this wouldn't make sense
* most of the time (data would be duplicated instead of deduplicated).
*/
- if (!ovl_inode_upper(file_inode(file_in)) ||
- !ovl_inode_upper(file_inode(file_out)))
+ if (op == OVL_DEDUPE &&
+ (!ovl_inode_upper(file_inode(file_in)) ||
+ !ovl_inode_upper(file_inode(file_out))))
return -EPERM;
- return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
- OVL_DEDUPE);
+ return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+ remap_flags, op);
}
const struct file_operations ovl_file_operations = {
@@ -645,9 +760,28 @@
.fallocate = ovl_fallocate,
.fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl,
+#ifdef CONFIG_COMPAT
.compat_ioctl = ovl_compat_ioctl,
+#endif
+ .splice_read = generic_file_splice_read,
+ .splice_write = ovl_splice_write,
.copy_file_range = ovl_copy_file_range,
- .clone_file_range = ovl_clone_file_range,
- .dedupe_file_range = ovl_dedupe_file_range,
+ .remap_file_range = ovl_remap_file_range,
};
+
+int __init ovl_aio_request_cache_init(void)
+{
+ ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
+ sizeof(struct ovl_aio_req),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!ovl_aio_request_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ovl_aio_request_cache_destroy(void)
+{
+ kmem_cache_destroy(ovl_aio_request_cachep);
+}
--
Gitblit v1.6.2