2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/fs/file.c
@@ -10,6 +10,7 @@
 #include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
@@ -18,6 +19,10 @@
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/close_range.h>
+#include <net/sock.h>
+
+#include "internal.h"
 
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -82,6 +87,21 @@
 	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
 }
 
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
 static struct fdtable * alloc_fdtable(unsigned int nr)
 {
 	struct fdtable *fdt;
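Note: the round-up the comment describes is the standard power-of-two ALIGN(). A minimal userspace sketch, assuming the usual simplified form of the macro (the real one in the kernel headers adds typeof casts, but the arithmetic is the same):

```c
#include <assert.h>

/* Simplified power-of-two round-up, as in the kernel's ALIGN() */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* bitmap sizes must land on unsigned-long boundaries (64 here) */
	assert(ALIGN(190u, 64u) == 192u);
	assert(ALIGN(256u, 64u) == 256u);	/* already aligned: unchanged */
	return 0;
}
```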
@@ -97,6 +117,7 @@
 	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
+	nr = ALIGN(nr, BITS_PER_LONG);
 	/*
 	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 	 * had been set lower between the check in expand_files() and here. Deal
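To see why the new ALIGN() is documentation rather than a behavior change on current machines, here is the sizing arithmetic traced for 64-bit values (sizeof(struct file *) == 8, BITS_PER_LONG == 64). A sketch, not kernel code; roundup_pow_of_two() is modeled with a builtin and only handles inputs greater than 1:

```c
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* model of the kernel helper, valid for n > 1 */
static unsigned int roundup_pow_of_two(unsigned int n)
{
	return 1u << (32 - __builtin_clz(n - 1));
}

int main(void)
{
	unsigned int nr = 300;			/* requested fd count */

	nr /= (1024 / 8);			/* 300 / 128 == 2 */
	nr = roundup_pow_of_two(nr + 1);	/* 3 -> 4 */
	nr *= (1024 / 8);			/* 4 * 128 == 512 */
	/* every multiple of 128 is already 64-aligned, so this is a
	 * no-op: the deduction gcc reportedly fails to make */
	nr = ALIGN(nr, 64);
	printf("%u\n", nr);			/* 512 */
	return 0;
}
```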
@@ -158,7 +179,7 @@
 	 * or have finished their rcu_read_lock_sched() section.
 	 */
 	if (atomic_read(&files->count) > 1)
-		synchronize_sched();
+		synchronize_rcu();
 
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
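synchronize_sched() was removed during the RCU flavor consolidation (around v5.0); since then, a single synchronize_rcu() also waits out rcu_read_lock_sched() and preempt-disabled readers, so the substitution preserves the guarantee the comment above relies on. The pairing, sketched rather than compiled, assuming consolidated-RCU semantics:

```c
/* reader side (lock-free fdtable walk), unchanged by this patch: */
rcu_read_lock_sched();
fdt = rcu_dereference_sched(files->fdt);
/* ... fdt stays live until the matching unlock ... */
rcu_read_unlock_sched();

/* updater side: one grace-period primitive now covers rcu_read_lock(),
 * rcu_read_lock_sched() and plain preempt-off sections alike: */
synchronize_rcu();	/* formerly this had to be synchronize_sched() */
/* ... now safe to free the old table: no reader can still see it ... */
```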
@@ -265,11 +286,34 @@
 }
 
 /*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Because that's really what it is.
+ */
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+{
+	unsigned int count;
+
+	count = count_open_files(fdt);
+	if (max_fds < NR_OPEN_DEFAULT)
+		max_fds = NR_OPEN_DEFAULT;
+	return ALIGN(min(count, max_fds), BITS_PER_LONG);
+}
+
+/*
  * Allocate a new files structure and copy contents from the
  * passed in files structure.
  * errorp will be valid only when the returned files_struct is NULL.
  */
-struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
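A quick sanity check of what sane_fdtable_size() yields for a few inputs, modeled in userspace and assuming a 64-bit config (BITS_PER_LONG == 64, NR_OPEN_DEFAULT == BITS_PER_LONG):

```c
#include <assert.h>

#define BITS_PER_LONG	64u
#define NR_OPEN_DEFAULT	64u	/* BITS_PER_LONG on this config */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

static unsigned int sane_fdtable_size(unsigned int count, unsigned int max_fds)
{
	if (max_fds < NR_OPEN_DEFAULT)
		max_fds = NR_OPEN_DEFAULT;
	return ALIGN(MIN(count, max_fds), BITS_PER_LONG);
}

int main(void)
{
	assert(sane_fdtable_size(45, 100) == 64);	/* rounded up to a word  */
	assert(sane_fdtable_size(200, 70) == 128);	/* unaligned 70 -> 128   */
	assert(sane_fdtable_size(10, 3) == 64);		/* close_range(3,...) case:
							 * clamped to one word   */
	return 0;
}
```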
@@ -296,7 +340,7 @@
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
-	open_files = count_open_files(old_fdt);
+	open_files = sane_fdtable_size(old_fdt, max_fds);
 
 	/*
 	 * Check whether we need to allocate a larger fd array and fd set.
@@ -327,7 +371,7 @@
 		 */
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
-		open_files = count_open_files(old_fdt);
+		open_files = sane_fdtable_size(old_fdt, max_fds);
 	}
 
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
@@ -540,9 +584,14 @@
 	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
 }
 
+int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
+{
+	return __alloc_fd(current->files, 0, nofile, flags);
+}
+
 int get_unused_fd_flags(unsigned flags)
 {
-	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
+	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
 }
 EXPORT_SYMBOL(get_unused_fd_flags);
 
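The split lets a caller supply its own descriptor limit instead of sampling RLIMIT_NOFILE at allocation time. A hypothetical caller sketch; 'ctx' and its 'nofile' field are illustrative, not from this patch:

```c
/* Allocate an fd against a limit captured earlier, e.g. when the
 * subsystem context was set up, so a later rlimit change does not
 * affect this allocation path. */
int fd = __get_unused_fd_flags(O_CLOEXEC, ctx->nofile);
if (fd < 0)
	return fd;
fd_install(fd, file);
return fd;
```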
@@ -608,6 +657,10 @@
 	rcu_read_unlock_sched();
 }
 
+/*
+ * This consumes the "file" refcount, so callers should treat it
+ * as if they had called fput(file).
+ */
 void fd_install(unsigned int fd, struct file *file)
 {
 	__fd_install(current->files, fd, file);
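In other words, fd_install() is the commit point of the usual two-step fd publication. Sketch of the common pattern; anon_inode_getfile() is just one representative producer, and 'example_fops'/'priv' are placeholders:

```c
int fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
	return fd;

file = anon_inode_getfile("[example]", &example_fops, priv, O_CLOEXEC);
if (IS_ERR(file)) {
	put_unused_fd(fd);	/* fd was never published: just recycle it */
	return PTR_ERR(file);
}

fd_install(fd, file);		/* ref transferred to the fd table ...    */
return fd;			/* ... so no fput(file) on this path      */
```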
@@ -615,12 +668,9 @@
 
 EXPORT_SYMBOL(fd_install);
 
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+static struct file *pick_file(struct files_struct *files, unsigned fd)
 {
-	struct file *file;
+	struct file *file = NULL;
 	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
@@ -632,14 +682,147 @@
 		goto out_unlock;
 	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-	return filp_close(file, files);
 
 out_unlock:
 	spin_unlock(&files->file_lock);
-	return -EBADF;
+	return file;
+}
+
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+	struct file *file;
+
+	file = pick_file(files, fd);
+	if (!file)
+		return -EBADF;
+
+	return filp_close(file, files);
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
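The refactor moves the common "unhook the fd under file_lock" step into pick_file() so that filp_close(), which can sleep (it may flush data and take blocking locks), always runs with the spinlock dropped; __close_range() below reuses it in a loop. Minimal shape of a hypothetical consumer:

```c
/* close a small set of descriptors with the same helper (sketch) */
for (i = 0; i < n; i++) {
	struct file *file = pick_file(files, fds[i]); /* lock taken and
						       * dropped inside */
	if (!file)
		continue;		/* empty slot: nothing to close   */
	filp_close(file, files);	/* sleepable work, no spinlock held */
}
```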
+
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd: starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+{
+	unsigned int cur_max;
+	struct task_struct *me = current;
+	struct files_struct *cur_fds = me->files, *fds = NULL;
+
+	if (flags & ~CLOSE_RANGE_UNSHARE)
+		return -EINVAL;
+
+	if (fd > max_fd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	cur_max = files_fdtable(cur_fds)->max_fds;
+	rcu_read_unlock();
+
+	/* cap to last valid index into fdtable */
+	cur_max--;
+
+	if (flags & CLOSE_RANGE_UNSHARE) {
+		int ret;
+		unsigned int max_unshare_fds = NR_OPEN_MAX;
+
+		/*
+		 * If the requested range is greater than the current maximum,
+		 * we're closing everything so only copy all file descriptors
+		 * beneath the lowest file descriptor.
+		 */
+		if (max_fd >= cur_max)
+			max_unshare_fds = fd;
+
+		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
+		if (ret)
+			return ret;
+
+		/*
+		 * We used to share our file descriptor table, and have now
+		 * created a private one, make sure we're using it below.
+		 */
+		if (fds)
+			swap(cur_fds, fds);
+	}
+
+	max_fd = min(max_fd, cur_max);
+	while (fd <= max_fd) {
+		struct file *file;
+
+		file = pick_file(cur_fds, fd++);
+		if (!file)
+			continue;
+
+		filp_close(file, cur_fds);
+		cond_resched();
+	}
+
+	if (fds) {
+		/*
+		 * We're done closing the files we were supposed to. Time to
+		 * install the new file descriptor table and drop the old one.
+		 */
+		task_lock(me);
+		me->files = cur_fds;
+		task_unlock(me);
+		put_files_struct(fds);
+	}
+
+	return 0;
+}
+
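For reference, the userspace view of what __close_range() implements. Raw syscall numbers are used here since the glibc close_range() wrapper only arrived later (in glibc 2.34); __NR_close_range is assumed to exist in the installed headers:

```c
#include <linux/close_range.h>	/* CLOSE_RANGE_UNSHARE */
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* daemonize-style: drop everything above stdio */
	syscall(__NR_close_range, 3, ~0U, 0);

	/* same, but detach from a CLONE_FILES-shared table first, so
	 * other tasks sharing the old table keep their descriptors */
	syscall(__NR_close_range, 3, ~0U, CLOSE_RANGE_UNSHARE);
	return 0;
}
```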
+/*
+ * See close_fd_get_file() below, this variant assumes current->files->file_lock
+ * is held.
+ */
+int __close_fd_get_file(unsigned int fd, struct file **res)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+	struct fdtable *fdt;
+
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
+		goto out_err;
+	file = fdt->fd[fd];
+	if (!file)
+		goto out_err;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	__put_unused_fd(files, fd);
+	get_file(file);
+	*res = file;
+	return 0;
+out_err:
+	*res = NULL;
+	return -ENOENT;
+}
+
+/*
+ * Variant of close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() is called on the file,
+ * and then an fput().
+ */
+int close_fd_get_file(unsigned int fd, struct file **res)
+{
+	struct files_struct *files = current->files;
+	int ret;
+
+	spin_lock(&files->file_lock);
+	ret = __close_fd_get_file(fd, res);
+	spin_unlock(&files->file_lock);
+
+	return ret;
+}
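Caller-side sketch of the contract the comment spells out: the helper hands the file back with an extra reference, and the caller owes both a filp_close() and an fput(). Mapping the error to -EBADF is this sketch's choice, not mandated by the helper:

```c
struct file *file;
int ret;

ret = close_fd_get_file(fd, &file);
if (ret < 0)
	return -EBADF;		/* helper itself returns -ENOENT */

ret = filp_close(file, current->files);
fput(file);			/* drop the extra ref the helper took */
return ret;
```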
 
 void do_close_on_exec(struct files_struct *files)
 {
@@ -678,7 +861,7 @@
 }
 
 static inline struct file *__fget_files_rcu(struct files_struct *files,
-	unsigned int fd, fmode_t mask, unsigned int refs)
+		unsigned int fd, fmode_t mask, unsigned int refs)
 {
 	for (;;) {
 		struct file *file;
@@ -732,10 +915,9 @@
 	}
 }
 
-
-static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
+static struct file *__fget_files(struct files_struct *files, unsigned int fd,
+				 fmode_t mask, unsigned int refs)
 {
-	struct files_struct *files = current->files;
 	struct file *file;
 
 	rcu_read_lock();
@@ -743,6 +925,12 @@
 	rcu_read_unlock();
 
 	return file;
+}
+
+static inline struct file *__fget(unsigned int fd, fmode_t mask,
+				  unsigned int refs)
+{
+	return __fget_files(current->files, fd, mask, refs);
 }
 
 struct file *fget_many(unsigned int fd, unsigned int refs)
@@ -761,6 +949,18 @@
 	return __fget(fd, 0, 1);
 }
 EXPORT_SYMBOL(fget_raw);
+
+struct file *fget_task(struct task_struct *task, unsigned int fd)
+{
+	struct file *file = NULL;
+
+	task_lock(task);
+	if (task->files)
+		file = __fget_files(task->files, fd, 0, 1);
+	task_unlock(task);
+
+	return file;
+}
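fget_task() plus the new __fget_files() make cross-task fd lookup possible; interfaces in the pidfd_getfd(2) style are the intended kind of consumer, with task_lock() guarding against task->files going away on exit. A sketch, assuming 'task' is already a properly referenced task_struct:

```c
struct file *file = fget_task(task, fd);	/* NULL if the task has no
						 * files or fd isn't open */
if (!file)
	return -EBADF;
/* ... inspect the file, or install a copy in current's table ... */
fput(file);
```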
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -922,6 +1122,62 @@
 	return err;
 }
 
+/**
+ * __receive_fd() - Install received file into file descriptor table
+ *
+ * @fd: fd to install into (if negative, a new fd will be allocated)
+ * @file: struct file that was received from another process
+ * @ufd: __user pointer to write new fd number to
+ * @o_flags: the O_* flags to apply to the new fd entry
+ *
+ * Installs a received file into the file descriptor table, with appropriate
+ * checks and count updates. Optionally writes the fd number to userspace, if
+ * @ufd is non-NULL.
+ *
+ * This helper handles its own reference counting of the incoming
+ * struct file.
+ *
+ * Returns the newly installed fd or a negative errno on error.
+ */
+int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
+{
+	int new_fd;
+	int error;
+
+	error = security_file_receive(file);
+	if (error)
+		return error;
+
+	if (fd < 0) {
+		new_fd = get_unused_fd_flags(o_flags);
+		if (new_fd < 0)
+			return new_fd;
+	} else {
+		new_fd = fd;
+	}
+
+	if (ufd) {
+		error = put_user(new_fd, ufd);
+		if (error) {
+			if (fd < 0)
+				put_unused_fd(new_fd);
+			return error;
+		}
+	}
+
+	if (fd < 0) {
+		fd_install(new_fd, get_file(file));
+	} else {
+		error = replace_fd(new_fd, file, o_flags);
+		if (error)
+			return error;
+	}
+
+	/* Bump the sock usage counts, if any. */
+	__receive_sock(file);
+	return new_fd;
+}
+
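__receive_fd() is the kernel half of descriptor passing; one of the paths that reaches it is SCM_RIGHTS over an AF_UNIX socket. The classic userspace half looks like this (self-contained sketch):

```c
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one fd passed over a connected AF_UNIX socket. */
static int recv_fd(int sock)
{
	char dummy;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *c;
	int fd = -1;

	if (recvmsg(sock, &msg, 0) < 0)
		return -1;
	c = CMSG_FIRSTHDR(&msg);
	if (c && c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS)
		memcpy(&fd, CMSG_DATA(c), sizeof(fd));
	return fd;	/* the number the kernel installed via __receive_fd() */
}
```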
 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
 	int err = -EBADF;
@@ -976,7 +1232,7 @@
 	return ksys_dup3(oldfd, newfd, 0);
 }
 
-int ksys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
 	struct file *file = fget_raw(fildes);
@@ -989,11 +1245,6 @@
 		fput(file);
 	}
 	return ret;
-}
-
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
-{
-	return ksys_dup(fildes);
 }
 
 int f_dupfd(unsigned int from, struct file *file, unsigned flags)