hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/file.c
@@ -10,6 +10,7 @@
 #include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
@@ -18,6 +19,10 @@
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/close_range.h>
+#include <net/sock.h>
+
+#include "internal.h"
 
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -82,6 +87,21 @@
 	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
 }
 
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
 static struct fdtable * alloc_fdtable(unsigned int nr)
 {
 	struct fdtable *fdt;
@@ -97,6 +117,7 @@
 	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
+	nr = ALIGN(nr, BITS_PER_LONG);
 	/*
 	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 	 * had been set lower between the check in expand_files() and here. Deal
@@ -158,7 +179,7 @@
 	 * or have finished their rcu_read_lock_sched() section.
 	 */
 	if (atomic_read(&files->count) > 1)
-		synchronize_sched();
+		synchronize_rcu();
 
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
@@ -265,11 +286,34 @@
 }
 
 /*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Because that's really what it is.
+ */
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+{
+	unsigned int count;
+
+	count = count_open_files(fdt);
+	if (max_fds < NR_OPEN_DEFAULT)
+		max_fds = NR_OPEN_DEFAULT;
+	return ALIGN(min(count, max_fds), BITS_PER_LONG);
+}
+
+/*
  * Allocate a new files structure and copy contents from the
  * passed in files structure.
  * errorp will be valid only when the returned files_struct is NULL.
  */
-struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
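
For a feel of the alignment arithmetic in sane_fdtable_size() above: with 64-bit longs, an fd count that is not a multiple of BITS_PER_LONG is rounded up to the next one, so the fd bitmaps always stay whole unsigned longs. A small standalone sketch, using userspace stand-ins for the kernel's ALIGN() and BITS_PER_LONG (illustrative only, not part of the patch):

#include <stdio.h>

/* Userspace stand-ins for the kernel macros, for illustration only. */
#define BITS_PER_LONG	(8 * sizeof(long))
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* e.g. a close_range()-derived max_fds of 100 on a 64-bit build */
	unsigned long nr = 100;

	/* prints "100 -> 128" on a 64-bit machine */
	printf("%lu -> %lu\n", nr, (unsigned long)ALIGN(nr, BITS_PER_LONG));
	return 0;
}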
@@ -296,7 +340,7 @@
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
-	open_files = count_open_files(old_fdt);
+	open_files = sane_fdtable_size(old_fdt, max_fds);
 
 	/*
 	 * Check whether we need to allocate a larger fd array and fd set.
@@ -327,7 +371,7 @@
 		 */
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
-		open_files = count_open_files(old_fdt);
+		open_files = sane_fdtable_size(old_fdt, max_fds);
 	}
 
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
@@ -540,9 +584,14 @@
 	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
 }
 
+int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
+{
+	return __alloc_fd(current->files, 0, nofile, flags);
+}
+
 int get_unused_fd_flags(unsigned flags)
 {
-	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
+	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
 }
 EXPORT_SYMBOL(get_unused_fd_flags);
 
@@ -608,6 +657,10 @@
 	rcu_read_unlock_sched();
 }
 
+/*
+ * This consumes the "file" refcount, so callers should treat it
+ * as if they had called fput(file).
+ */
 void fd_install(unsigned int fd, struct file *file)
 {
 	__fd_install(current->files, fd, file);
@@ -615,31 +668,162 @@
 
 EXPORT_SYMBOL(fd_install);
 
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+static struct file *pick_file(struct files_struct *files, unsigned fd)
 {
-	struct file *file;
+	struct file *file = NULL;
 	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
 	if (fd >= fdt->max_fds)
 		goto out_unlock;
+	fd = array_index_nospec(fd, fdt->max_fds);
 	file = fdt->fd[fd];
 	if (!file)
 		goto out_unlock;
 	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-	return filp_close(file, files);
 
 out_unlock:
 	spin_unlock(&files->file_lock);
-	return -EBADF;
+	return file;
+}
+
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+	struct file *file;
+
+	file = pick_file(files, fd);
+	if (!file)
+		return -EBADF;
+
+	return filp_close(file, files);
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd: starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+{
+	unsigned int cur_max;
+	struct task_struct *me = current;
+	struct files_struct *cur_fds = me->files, *fds = NULL;
+
+	if (flags & ~CLOSE_RANGE_UNSHARE)
+		return -EINVAL;
+
+	if (fd > max_fd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	cur_max = files_fdtable(cur_fds)->max_fds;
+	rcu_read_unlock();
+
+	/* cap to last valid index into fdtable */
+	cur_max--;
+
+	if (flags & CLOSE_RANGE_UNSHARE) {
+		int ret;
+		unsigned int max_unshare_fds = NR_OPEN_MAX;
+
+		/*
+		 * If the requested range is greater than the current maximum,
+		 * we're closing everything, so only copy the file descriptors
+		 * beneath the lowest file descriptor.
+		 */
+		if (max_fd >= cur_max)
+			max_unshare_fds = fd;
+
+		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
+		if (ret)
+			return ret;
+
+		/*
+		 * We used to share our file descriptor table, and have now
+		 * created a private one, make sure we're using it below.
+		 */
+		if (fds)
+			swap(cur_fds, fds);
+	}
+
+	max_fd = min(max_fd, cur_max);
+	while (fd <= max_fd) {
+		struct file *file;
+
+		file = pick_file(cur_fds, fd++);
+		if (!file)
+			continue;
+
+		filp_close(file, cur_fds);
+		cond_resched();
+	}
+
+	if (fds) {
+		/*
+		 * We're done closing the files we were supposed to. Time to install
+		 * the new file descriptor table and drop the old one.
+		 */
+		task_lock(me);
+		me->files = cur_fds;
+		task_unlock(me);
+		put_files_struct(fds);
+	}
+
+	return 0;
+}
+
+/*
+ * See close_fd_get_file() below; this variant assumes current->files->file_lock
+ * is held.
+ */
+int __close_fd_get_file(unsigned int fd, struct file **res)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+	struct fdtable *fdt;
+
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
+		goto out_err;
+	file = fdt->fd[fd];
+	if (!file)
+		goto out_err;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	__put_unused_fd(files, fd);
+	get_file(file);
+	*res = file;
+	return 0;
+out_err:
+	*res = NULL;
+	return -ENOENT;
+}
+
+/*
+ * Variant of close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() is called on the file,
+ * followed by an fput().
+ */
+int close_fd_get_file(unsigned int fd, struct file **res)
+{
+	struct files_struct *files = current->files;
+	int ret;
+
+	spin_lock(&files->file_lock);
+	ret = __close_fd_get_file(fd, res);
+	spin_unlock(&files->file_lock);
+
+	return ret;
+}
 
 void do_close_on_exec(struct files_struct *files)
 {
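
The __close_range() helper in the hunk above is the in-kernel backend for the close_range(2) system call; the syscall wrapper itself lives elsewhere in the tree, as upstream. A minimal userspace usage sketch, assuming the toolchain headers expose __NR_close_range and CLOSE_RANGE_UNSHARE. With CLOSE_RANGE_UNSHARE the calling task first gets a private descriptor table via unshare_fd(), so threads still sharing the old table are unaffected:

/* Illustrative only: close every fd >= 3 before exec'ing a helper. */
#include <linux/close_range.h>	/* CLOSE_RANGE_UNSHARE, if the UAPI header is installed */
#include <sys/syscall.h>
#include <unistd.h>

static int close_from(unsigned int first)
{
	/* ~0U means "up to the last possible fd"; the kernel caps it to the fdtable size */
	return syscall(__NR_close_range, first, ~0U, CLOSE_RANGE_UNSHARE);
}

int main(void)
{
	return close_from(3) ? 1 : 0;
}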
@@ -678,7 +862,7 @@
 }
 
 static inline struct file *__fget_files_rcu(struct files_struct *files,
-	unsigned int fd, fmode_t mask, unsigned int refs)
+	unsigned int fd, fmode_t mask, unsigned int refs)
 {
 	for (;;) {
 		struct file *file;
@@ -732,10 +916,9 @@
 	}
 }
 
-
-static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
+static struct file *__fget_files(struct files_struct *files, unsigned int fd,
+				 fmode_t mask, unsigned int refs)
 {
-	struct files_struct *files = current->files;
 	struct file *file;
 
 	rcu_read_lock();
@@ -743,6 +926,12 @@
 	rcu_read_unlock();
 
 	return file;
+}
+
+static inline struct file *__fget(unsigned int fd, fmode_t mask,
+				  unsigned int refs)
+{
+	return __fget_files(current->files, fd, mask, refs);
 }
 
 struct file *fget_many(unsigned int fd, unsigned int refs)
@@ -761,6 +950,18 @@
 	return __fget(fd, 0, 1);
 }
 EXPORT_SYMBOL(fget_raw);
+
+struct file *fget_task(struct task_struct *task, unsigned int fd)
+{
+	struct file *file = NULL;
+
+	task_lock(task);
+	if (task->files)
+		file = __fget_files(task->files, fd, 0, 1);
+	task_unlock(task);
+
+	return file;
+}
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -806,16 +1007,30 @@
 	return __fget_light(fd, 0);
 }
 
+/*
+ * Try to avoid f_pos locking. We only need it if the
+ * file is marked for FMODE_ATOMIC_POS, and it can be
+ * accessed multiple ways.
+ *
+ * Always do it for directories, because pidfd_getfd()
+ * can make a file accessible even if it otherwise would
+ * not be, and for directories this is a correctness
+ * issue, not a "POSIX requirement".
+ */
+static inline bool file_needs_f_pos_lock(struct file *file)
+{
+	return (file->f_mode & FMODE_ATOMIC_POS) &&
+		(file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode));
+}
+
 unsigned long __fdget_pos(unsigned int fd)
 {
 	unsigned long v = __fdget(fd);
 	struct file *file = (struct file *)(v & ~3);
 
-	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
-		if (file_count(file) > 1) {
-			v |= FDPUT_POS_UNLOCK;
-			mutex_lock(&file->f_pos_lock);
-		}
+	if (file && file_needs_f_pos_lock(file)) {
+		v |= FDPUT_POS_UNLOCK;
+		mutex_lock(&file->f_pos_lock);
 	}
 	return v;
 }
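
What file_needs_f_pos_lock() in the hunk above is guarding is the file position shared by every descriptor that refers to the same open file description; when such a file can be reached more than once, f_pos updates have to be serialized. A tiny userspace demonstration of the shared offset (illustrative only, not part of the change; assumes /etc/hostname exists and is readable):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char a[4] = { 0 }, b[4] = { 0 };
	int fd = open("/etc/hostname", O_RDONLY);
	int fd2;

	if (fd < 0)
		return 1;
	fd2 = dup(fd);		/* same struct file, hence same f_pos */

	read(fd, a, 3);		/* advances the shared offset */
	read(fd2, b, 3);	/* continues where fd left off */
	printf("%.3s %.3s\n", a, b);

	close(fd2);
	close(fd);
	return 0;
}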
@@ -922,6 +1137,62 @@
 	return err;
 }
 
+/**
+ * __receive_fd() - Install received file into file descriptor table
+ *
+ * @fd: fd to install into (if negative, a new fd will be allocated)
+ * @file: struct file that was received from another process
+ * @ufd: __user pointer to write new fd number to
+ * @o_flags: the O_* flags to apply to the new fd entry
+ *
+ * Installs a received file into the file descriptor table, with appropriate
+ * checks and count updates. Optionally writes the fd number to userspace, if
+ * @ufd is non-NULL.
+ *
+ * This helper handles its own reference counting of the incoming
+ * struct file.
+ *
+ * Returns the newly installed fd or -ve on error.
+ */
+int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
+{
+	int new_fd;
+	int error;
+
+	error = security_file_receive(file);
+	if (error)
+		return error;
+
+	if (fd < 0) {
+		new_fd = get_unused_fd_flags(o_flags);
+		if (new_fd < 0)
+			return new_fd;
+	} else {
+		new_fd = fd;
+	}
+
+	if (ufd) {
+		error = put_user(new_fd, ufd);
+		if (error) {
+			if (fd < 0)
+				put_unused_fd(new_fd);
+			return error;
+		}
+	}
+
+	if (fd < 0) {
+		fd_install(new_fd, get_file(file));
+	} else {
+		error = replace_fd(new_fd, file, o_flags);
+		if (error)
+			return error;
+	}
+
+	/* Bump the sock usage counts, if any. */
+	__receive_sock(file);
+	return new_fd;
+}
+
 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
 	int err = -EBADF;
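
__receive_fd() in the hunk above is the common helper for installing a struct file that arrived from another process; upstream it is reached from SCM_RIGHTS ancillary-data processing and from pidfd_getfd(), though the exact call sites in this tree may differ. For orientation, a minimal userspace sender that puts a descriptor in flight over a Unix-domain socket (hypothetical send_fd() helper, not part of the patch); the receiving process's recvmsg() is what ultimately ends up in this kernel path:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one open fd to the peer of a connected AF_UNIX socket. */
static int send_fd(int sock, int fd_to_send)
{
	char data = 'x';	/* at least one byte of real payload is required */
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));

	return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}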
@@ -976,7 +1247,7 @@
 	return ksys_dup3(oldfd, newfd, 0);
 }
 
-int ksys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
 	struct file *file = fget_raw(fildes);
@@ -989,11 +1260,6 @@
 		fput(file);
 	}
 	return ret;
-}
-
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
-{
-	return ksys_dup(fildes);
 }
 
 int f_dupfd(unsigned int from, struct file *file, unsigned flags)