From 244b2c5ca8b14627e4a17755e5922221e121c771 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 09 Oct 2024 06:15:07 +0000
Subject: [PATCH] change system file
---
kernel/fs/file.c | 322 +++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 294 insertions(+), 28 deletions(-)
diff --git a/kernel/fs/file.c b/kernel/fs/file.c
index d6ca500..d6bc739 100644
--- a/kernel/fs/file.c
+++ b/kernel/fs/file.c
@@ -10,6 +10,7 @@
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
+#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
@@ -18,6 +19,10 @@
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/close_range.h>
+#include <net/sock.h>
+
+#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -82,6 +87,21 @@
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
@@ -97,6 +117,7 @@
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
+ nr = ALIGN(nr, BITS_PER_LONG);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal
@@ -158,7 +179,7 @@
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
- synchronize_sched();
+ synchronize_rcu();
spin_lock(&files->file_lock);
if (!new_fdt)
@@ -265,11 +286,34 @@
}
/*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Becuase that's really what it is.
+ */
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+{
+ unsigned int count;
+
+ count = count_open_files(fdt);
+ if (max_fds < NR_OPEN_DEFAULT)
+ max_fds = NR_OPEN_DEFAULT;
+ return ALIGN(min(count, max_fds), BITS_PER_LONG);
+}
+
+/*
* Allocate a new files structure and copy contents from the
* passed in files structure.
* errorp will be valid only when the returned files_struct is NULL.
*/
-struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
@@ -296,7 +340,7 @@
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
- open_files = count_open_files(old_fdt);
+ open_files = sane_fdtable_size(old_fdt, max_fds);
/*
* Check whether we need to allocate a larger fd array and fd set.
@@ -327,7 +371,7 @@
*/
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
- open_files = count_open_files(old_fdt);
+ open_files = sane_fdtable_size(old_fdt, max_fds);
}
copy_fd_bitmaps(new_fdt, old_fdt, open_files);
@@ -540,9 +584,14 @@
return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}
+int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
+{
+ return __alloc_fd(current->files, 0, nofile, flags);
+}
+
int get_unused_fd_flags(unsigned flags)
{
- return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
+ return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);
@@ -608,6 +657,10 @@
rcu_read_unlock_sched();
}
+/*
+ * This consumes the "file" refcount, so callers should treat it
+ * as if they had called fput(file).
+ */
void fd_install(unsigned int fd, struct file *file)
{
__fd_install(current->files, fd, file);
@@ -615,31 +668,162 @@
EXPORT_SYMBOL(fd_install);
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+static struct file *pick_file(struct files_struct *files, unsigned fd)
{
- struct file *file;
+ struct file *file = NULL;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_unlock;
+ fd = array_index_nospec(fd, fdt->max_fds);
file = fdt->fd[fd];
if (!file)
goto out_unlock;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
- spin_unlock(&files->file_lock);
- return filp_close(file, files);
out_unlock:
spin_unlock(&files->file_lock);
- return -EBADF;
+ return file;
+}
+
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+ struct file *file;
+
+ file = pick_file(files, fd);
+ if (!file)
+ return -EBADF;
+
+ return filp_close(file, files);
}
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd: starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+{
+ unsigned int cur_max;
+ struct task_struct *me = current;
+ struct files_struct *cur_fds = me->files, *fds = NULL;
+
+ if (flags & ~CLOSE_RANGE_UNSHARE)
+ return -EINVAL;
+
+ if (fd > max_fd)
+ return -EINVAL;
+
+ rcu_read_lock();
+ cur_max = files_fdtable(cur_fds)->max_fds;
+ rcu_read_unlock();
+
+ /* cap to last valid index into fdtable */
+ cur_max--;
+
+ if (flags & CLOSE_RANGE_UNSHARE) {
+ int ret;
+ unsigned int max_unshare_fds = NR_OPEN_MAX;
+
+ /*
+ * If the requested range is greater than the current maximum,
+ * we're closing everything so only copy all file descriptors
+ * beneath the lowest file descriptor.
+ */
+ if (max_fd >= cur_max)
+ max_unshare_fds = fd;
+
+ ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
+ if (ret)
+ return ret;
+
+ /*
+ * We used to share our file descriptor table, and have now
+ * created a private one, make sure we're using it below.
+ */
+ if (fds)
+ swap(cur_fds, fds);
+ }
+
+ max_fd = min(max_fd, cur_max);
+ while (fd <= max_fd) {
+ struct file *file;
+
+ file = pick_file(cur_fds, fd++);
+ if (!file)
+ continue;
+
+ filp_close(file, cur_fds);
+ cond_resched();
+ }
+
+ if (fds) {
+ /*
+ * We're done closing the files we were supposed to. Time to install
+ * the new file descriptor table and drop the old one.
+ */
+ task_lock(me);
+ me->files = cur_fds;
+ task_unlock(me);
+ put_files_struct(fds);
+ }
+
+ return 0;
+}
+
+/*
+ * See close_fd_get_file() below, this variant assumes current->files->file_lock
+ * is held.
+ */
+int __close_fd_get_file(unsigned int fd, struct file **res)
+{
+ struct files_struct *files = current->files;
+ struct file *file;
+ struct fdtable *fdt;
+
+ fdt = files_fdtable(files);
+ if (fd >= fdt->max_fds)
+ goto out_err;
+ file = fdt->fd[fd];
+ if (!file)
+ goto out_err;
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ __put_unused_fd(files, fd);
+ get_file(file);
+ *res = file;
+ return 0;
+out_err:
+ *res = NULL;
+ return -ENOENT;
+}
+
+/*
+ * variant of close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file, and then
+ * an fput().
+ */
+int close_fd_get_file(unsigned int fd, struct file **res)
+{
+ struct files_struct *files = current->files;
+ int ret;
+
+ spin_lock(&files->file_lock);
+ ret = __close_fd_get_file(fd, res);
+ spin_unlock(&files->file_lock);
+
+ return ret;
+}
void do_close_on_exec(struct files_struct *files)
{
@@ -678,7 +862,7 @@
}
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask, unsigned int refs)
+ unsigned int fd, fmode_t mask, unsigned int refs)
{
for (;;) {
struct file *file;
@@ -732,10 +916,9 @@
}
}
-
-static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
+static struct file *__fget_files(struct files_struct *files, unsigned int fd,
+ fmode_t mask, unsigned int refs)
{
- struct files_struct *files = current->files;
struct file *file;
rcu_read_lock();
@@ -743,6 +926,12 @@
rcu_read_unlock();
return file;
+}
+
+static inline struct file *__fget(unsigned int fd, fmode_t mask,
+ unsigned int refs)
+{
+ return __fget_files(current->files, fd, mask, refs);
}
struct file *fget_many(unsigned int fd, unsigned int refs)
@@ -761,6 +950,18 @@
return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);
+
+struct file *fget_task(struct task_struct *task, unsigned int fd)
+{
+ struct file *file = NULL;
+
+ task_lock(task);
+ if (task->files)
+ file = __fget_files(task->files, fd, 0, 1);
+ task_unlock(task);
+
+ return file;
+}
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -806,16 +1007,30 @@
return __fget_light(fd, 0);
}
+/*
+ * Try to avoid f_pos locking. We only need it if the
+ * file is marked for FMODE_ATOMIC_POS, and it can be
+ * accessed multiple ways.
+ *
+ * Always do it for directories, because pidfd_getfd()
+ * can make a file accessible even if it otherwise would
+ * not be, and for directories this is a correctness
+ * issue, not a "POSIX requirement".
+ */
+static inline bool file_needs_f_pos_lock(struct file *file)
+{
+ return (file->f_mode & FMODE_ATOMIC_POS) &&
+ (file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode));
+}
+
unsigned long __fdget_pos(unsigned int fd)
{
unsigned long v = __fdget(fd);
struct file *file = (struct file *)(v & ~3);
- if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
- if (file_count(file) > 1) {
- v |= FDPUT_POS_UNLOCK;
- mutex_lock(&file->f_pos_lock);
- }
+ if (file && file_needs_f_pos_lock(file)) {
+ v |= FDPUT_POS_UNLOCK;
+ mutex_lock(&file->f_pos_lock);
}
return v;
}
@@ -922,6 +1137,62 @@
return err;
}
+/**
+ * __receive_fd() - Install received file into file descriptor table
+ *
+ * @fd: fd to install into (if negative, a new fd will be allocated)
+ * @file: struct file that was received from another process
+ * @ufd: __user pointer to write new fd number to
+ * @o_flags: the O_* flags to apply to the new fd entry
+ *
+ * Installs a received file into the file descriptor table, with appropriate
+ * checks and count updates. Optionally writes the fd number to userspace, if
+ * @ufd is non-NULL.
+ *
+ * This helper handles its own reference counting of the incoming
+ * struct file.
+ *
+ * Returns newly install fd or -ve on error.
+ */
+int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
+{
+ int new_fd;
+ int error;
+
+ error = security_file_receive(file);
+ if (error)
+ return error;
+
+ if (fd < 0) {
+ new_fd = get_unused_fd_flags(o_flags);
+ if (new_fd < 0)
+ return new_fd;
+ } else {
+ new_fd = fd;
+ }
+
+ if (ufd) {
+ error = put_user(new_fd, ufd);
+ if (error) {
+ if (fd < 0)
+ put_unused_fd(new_fd);
+ return error;
+ }
+ }
+
+ if (fd < 0) {
+ fd_install(new_fd, get_file(file));
+ } else {
+ error = replace_fd(new_fd, file, o_flags);
+ if (error)
+ return error;
+ }
+
+ /* Bump the sock usage counts, if any. */
+ __receive_sock(file);
+ return new_fd;
+}
+
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
@@ -976,7 +1247,7 @@
return ksys_dup3(oldfd, newfd, 0);
}
-int ksys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
int ret = -EBADF;
struct file *file = fget_raw(fildes);
@@ -989,11 +1260,6 @@
fput(file);
}
return ret;
-}
-
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
-{
- return ksys_dup(fildes);
}
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
--
Gitblit v1.6.2