2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/seccomp.c
@@ -13,6 +13,7 @@
  * Mode 2 allows user-defined system call filters in the form
  * of Berkeley Packet Filters/Linux Socket Filters.
  */
+#define pr_fmt(fmt) "seccomp: " fmt
 
 #include <linux/refcount.h>
 #include <linux/audit.h>
@@ -28,31 +29,175 @@
 #include <linux/syscalls.h>
 #include <linux/sysctl.h>
 
-/* Not exposed in headers: strictly internal use only. */
-#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
-
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 #include <asm/syscall.h>
 #endif
 
 #ifdef CONFIG_SECCOMP_FILTER
+#include <linux/file.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/capability.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
+#include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
+
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
+
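
The `_WRONG_DIR` alias is cheap to keep because an ioctl number encodes its direction separately from everything else: on most architectures, bits 0-7 hold the command number, bits 8-15 the magic ('!' for seccomp), bits 16-29 the argument size, and bits 30-31 the direction. SECCOMP_IOR(2, __u64) and the corrected SECCOMP_IOW(2, __u64) therefore differ only in the top two bits, which is why both can be matched in seccomp_notify_ioctl() further down. A minimal userspace sketch of the two encodings (the SECCOMP_IOC_MAGIC value mirrors the UAPI header):

    #include <stdio.h>
    #include <linux/ioctl.h>
    #include <linux/types.h>

    /* Mirrors include/uapi/linux/seccomp.h */
    #define SECCOMP_IOC_MAGIC	'!'
    #define SECCOMP_IOR(nr, type)	_IOR(SECCOMP_IOC_MAGIC, nr, type)
    #define SECCOMP_IOW(nr, type)	_IOW(SECCOMP_IOC_MAGIC, nr, type)

    int main(void)
    {
    	/* Same magic, command number and size; only bits 30-31 differ. */
    	printf("broken encoding: %#x\n", (unsigned int)SECCOMP_IOR(2, __u64));
    	printf("fixed  encoding: %#x\n", (unsigned int)SECCOMP_IOW(2, __u64));
    	return 0;
    }
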
+enum notify_state {
+	SECCOMP_NOTIFY_INIT,
+	SECCOMP_NOTIFY_SENT,
+	SECCOMP_NOTIFY_REPLIED,
+};
+
+struct seccomp_knotif {
+	/* The struct pid of the task whose filter triggered the notification */
+	struct task_struct *task;
+
+	/* The "cookie" for this request; this is unique for this filter. */
+	u64 id;
+
+	/*
+	 * The seccomp data. This pointer is valid the entire time this
+	 * notification is active, since it comes from __seccomp_filter which
+	 * eclipses the entire lifecycle here.
+	 */
+	const struct seccomp_data *data;
+
+	/*
+	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
+	 * struct seccomp_knotif is created and starts out in INIT. Once the
+	 * handler reads the notification off of an FD, it transitions to SENT.
+	 * If a signal is received the state transitions back to INIT and
+	 * another message is sent. When the userspace handler replies, state
+	 * transitions to REPLIED.
+	 */
+	enum notify_state state;
+
+	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
+	int error;
+	long val;
+	u32 flags;
+
+	/*
+	 * Signals when this has changed states, such as the listener
+	 * dying, a new seccomp addfd message, or changing to REPLIED
+	 */
+	struct completion ready;
+
+	struct list_head list;
+
+	/* outstanding addfd requests */
+	struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ *      installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ *         is allowed.
+ * @ret: The return value of the installing process. It is set to the fd num
+ *       upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ *              installation, or gone away (either due to successful
+ *              reply, or signal)
+ */
+struct seccomp_kaddfd {
+	struct file *file;
+	int fd;
+	unsigned int flags;
+
+	/* To only be set on reply */
+	int ret;
+	struct completion completion;
+	struct list_head list;
+};
+
+/**
+ * struct notification - container for seccomp userspace notifications. Since
+ * most seccomp filters will not have notification listeners attached and this
+ * structure is fairly large, we store the notification-specific stuff in a
+ * separate structure.
+ *
+ * @request: A semaphore that users of this notification can wait on for
+ *           changes. Actual reads and writes are still controlled with
+ *           filter->notify_lock.
+ * @next_id: The id of the next request.
+ * @notifications: A list of struct seccomp_knotif elements.
+ */
+struct notification {
+	struct semaphore request;
+	u64 next_id;
+	struct list_head notifications;
+};
+
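
struct seccomp_knotif and struct notification are kernel-internal bookkeeping; what actually crosses the listener fd are fixed-layout UAPI structures. For reference, their counterparts from include/uapi/linux/seccomp.h (the field comments are editorial, not from the header):

    struct seccomp_notif {
    	__u64 id;			/* seccomp_knotif.id, the reply cookie */
    	__u32 pid;			/* pid of the blocked task */
    	__u32 flags;			/* currently unused, always zero */
    	struct seccomp_data data;	/* nr, arch, instruction_pointer, args[6] */
    };

    struct seccomp_notif_resp {
    	__u64 id;			/* cookie being replied to */
    	__s64 val;			/* syscall return value */
    	__s32 error;			/* 0, or a negative errno */
    	__u32 flags;			/* e.g. SECCOMP_USER_NOTIF_FLAG_CONTINUE */
    };
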
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ *		  filter will always allow the syscall, for the
+ *		  native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ *		  filter will always allow the syscall, for the
+ *		  compat architecture.
+ */
+struct action_cache {
+	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+					     const struct seccomp_data *sd)
+{
+	return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
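
The cache holds one bit per (architecture, syscall number) pair: a set bit means every filter attached so far provably allows that syscall regardless of its arguments, so the BPF programs can be skipped altogether. A standalone model of the lookup, with hypothetical names (the real seccomp_cache_check_allow_bitmap() below additionally clamps the index with array_index_nospec() so the bounds check is safe under speculation):

    #include <stdbool.h>
    #include <stdint.h>

    #define MODEL_NR_SYSCALLS 512	/* stand-in for SECCOMP_ARCH_*_NR */

    struct model_cache {
    	uint64_t allow[MODEL_NR_SYSCALLS / 64];
    };

    static bool model_check_allow(const struct model_cache *c, int nr)
    {
    	/* Out-of-range numbers miss the cache and fall back to the filters. */
    	if (nr < 0 || nr >= MODEL_NR_SYSCALLS)
    		return false;
    	return c->allow[nr / 64] & (1ULL << (nr % 64));
    }
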
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
  *
- * @usage: reference count to manage the object lifetime.
- *         get/put helpers should be used when accessing an instance
- *         outside of a lifetime-guarded section. In general, this
- *         is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ *        A filter's reference count is incremented for each directly
+ *        attached task, once for the dependent filter, and if
+ *        requested for the user notifier. When @refs reaches zero,
+ *        the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ *         attached task (filter installation, fork(), thread_sync),
+ *         and once for the dependent filter (tracked in filter->prev).
+ *         When it reaches zero it indicates that no direct or indirect
+ *         users of that filter exist. No new tasks can get associated with
+ *         this filter after reaching 0. The @users count is always less
+ *         than or equal to @refs. Hence, reaching 0 for @users does not
+ *         mean the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
+ * @notif: the struct that holds all notification related information
+ * @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
  *
  * seccomp_filter objects are organized in a tree linked via the @prev
  * pointer. For any task, it appears to be a singly-linked list starting
@@ -62,13 +207,18 @@
  * how namespaces work.
  *
  * seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
  */
 struct seccomp_filter {
-	refcount_t usage;
+	refcount_t refs;
+	refcount_t users;
 	bool log;
+	struct action_cache cache;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
+	struct notification *notif;
+	struct mutex notify_lock;
+	wait_queue_head_t wqh;
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
@@ -80,13 +230,17 @@
  */
 static void populate_seccomp_data(struct seccomp_data *sd)
 {
+	/*
+	 * Instead of using current_pt_regs(), we're already doing the work
+	 * to safely fetch "current", so just use "task" everywhere below.
+	 */
 	struct task_struct *task = current;
 	struct pt_regs *regs = task_pt_regs(task);
 	unsigned long args[6];
 
 	sd->nr = syscall_get_nr(task, regs);
-	sd->arch = syscall_get_arch();
-	syscall_get_arguments(task, regs, 0, 6, args);
+	sd->arch = syscall_get_arch(task);
+	syscall_get_arguments(task, regs, args);
 	sd->args[0] = args[0];
 	sd->args[1] = args[1];
 	sd->args[2] = args[2];
@@ -178,6 +332,52 @@
 	return 0;
 }
 
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+						    size_t bitmap_size,
+						    int syscall_nr)
+{
+	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+		return false;
+	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+	return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+					     const struct seccomp_data *sd)
+{
+	int syscall_nr = sd->nr;
+	const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+	/* A native-only architecture doesn't need to check sd->arch. */
+	return seccomp_cache_check_allow_bitmap(cache->allow_native,
+						SECCOMP_ARCH_NATIVE_NR,
+						syscall_nr);
+#else
+	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+		return seccomp_cache_check_allow_bitmap(cache->allow_native,
+							SECCOMP_ARCH_NATIVE_NR,
+							syscall_nr);
+	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+		return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+							SECCOMP_ARCH_COMPAT_NR,
+							syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+	WARN_ON_ONCE(true);
+	return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * seccomp_run_filters - evaluates all seccomp filters against @sd
  * @sd: optional seccomp data to be passed to filters
@@ -191,27 +391,24 @@
 static u32 seccomp_run_filters(const struct seccomp_data *sd,
 			       struct seccomp_filter **match)
 {
-	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	struct seccomp_filter *f =
 			READ_ONCE(current->seccomp.filter);
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
-	if (unlikely(WARN_ON(f == NULL)))
+	if (WARN_ON(f == NULL))
 		return SECCOMP_RET_KILL_PROCESS;
 
-	if (!sd) {
-		populate_seccomp_data(&sd_local);
-		sd = &sd_local;
-	}
+	if (seccomp_cache_check_allow(f, sd))
+		return SECCOMP_RET_ALLOW;
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
+		u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
 
 		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
 			ret = cur_ret;
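
The "lowest value takes priority" rule compares only the action bits, reinterpreted as a signed 32-bit integer: ACTION_ONLY() in this file is (s32)(ret & SECCOMP_RET_ACTION_FULL). The signed cast is what lets SECCOMP_RET_KILL_PROCESS (0x80000000, negative as an s32) outrank SECCOMP_RET_KILL_THREAD (0x00000000) even though it is the larger unsigned value. The UAPI action values, for reference:

    /* From include/uapi/linux/seccomp.h; the strictest action wins. */
    #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the whole process */
    #define SECCOMP_RET_KILL_THREAD  0x00000000U /* kill just the thread */
    #define SECCOMP_RET_TRAP         0x00030000U /* deliver SIGSYS */
    #define SECCOMP_RET_ERRNO        0x00050000U /* return an errno */
    #define SECCOMP_RET_USER_NOTIF   0x7fc00000U /* notify the supervisor */
    #define SECCOMP_RET_TRACE        0x7ff00000U /* pass to a ptrace tracer */
    #define SECCOMP_RET_LOG          0x7ffc0000U /* allow after logging */
    #define SECCOMP_RET_ALLOW        0x7fff0000U /* allow */

    /* Masks for the return value sections. */
    #define SECCOMP_RET_ACTION_FULL  0xffff0000U
    #define SECCOMP_RET_DATA         0x0000ffffU
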
@@ -272,7 +469,7 @@
  * Expects sighand and cred_guard_mutex locks to be held.
  *
  * Returns 0 on success, -ve on error, or the pid of a thread which was
- * either not in the correct seccomp mode or it did not have an ancestral
+ * either not in the correct seccomp mode or did not have an ancestral
  * seccomp filter.
  */
 static inline pid_t seccomp_can_sync_threads(void)
@@ -300,12 +497,65 @@
 		/* Return the first thread that cannot be synchronized. */
 		failed = task_pid_vnr(thread);
 		/* If the pid cannot be resolved, then return -ESRCH */
-		if (unlikely(WARN_ON(failed == 0)))
+		if (WARN_ON(failed == 0))
 			failed = -ESRCH;
 		return failed;
 	}
 
 	return 0;
+}
+
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+	if (filter) {
+		bpf_prog_destroy(filter->prog);
+		kfree(filter);
+	}
+}
+
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+	while (orig && refcount_dec_and_test(&orig->users)) {
+		if (waitqueue_active(&orig->wqh))
+			wake_up_poll(&orig->wqh, EPOLLHUP);
+		orig = orig->prev;
+	}
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+	/* Clean up single-reference branches iteratively. */
+	while (orig && refcount_dec_and_test(&orig->refs)) {
+		struct seccomp_filter *freeme = orig;
+		orig = orig->prev;
+		seccomp_filter_free(freeme);
+	}
+}
+
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+	/* Notify about any unused filters in the task's former filter tree. */
+	__seccomp_filter_orphan(orig);
+	/* Finally drop all references to the task's former tree. */
+	__put_seccomp_filter(orig);
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree,
+ *			    drop its reference count, and notify
+ *			    about unused filters
+ *
+ * This function should only be called when the task is exiting as
+ * it detaches it from its filter tree. As such, READ_ONCE() and
+ * barriers are not needed here, as would normally be needed.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+	struct seccomp_filter *orig = tsk->seccomp.filter;
+
+	/* Detach task from its filter tree. */
+	tsk->seccomp.filter = NULL;
+	__seccomp_filter_release(orig);
 }
 
 /**
@@ -332,14 +582,19 @@
 
 	/* Get a task reference for the new leaf node. */
 	get_seccomp_filter(caller);
+
 	/*
 	 * Drop the task reference to the shared ancestor since
 	 * current's path will hold a reference. (This also
	 * allows a put before the assignment.)
 	 */
-	put_seccomp_filter(thread);
+	__seccomp_filter_release(thread->seccomp.filter);
+
+	/* Make our new filter tree visible. */
 	smp_store_release(&thread->seccomp.filter,
 			  caller->seccomp.filter);
+	atomic_set(&thread->seccomp.filter_count,
+		   atomic_read(&caller->seccomp.filter_count));
 
 	/*
 	 * Don't let an unprivileged task work around
@@ -372,7 +627,12 @@
 {
 	struct seccomp_filter *sfilter;
 	int ret;
-	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+	const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+		true;
+#else
+		false;
+#endif
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
@@ -394,6 +654,7 @@
 	if (!sfilter)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&sfilter->notify_lock);
 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
 					seccomp_check_filter, save_orig);
 	if (ret < 0) {
@@ -401,7 +662,9 @@
 		return ERR_PTR(ret);
 	}
 
-	refcount_set(&sfilter->usage, 1);
+	refcount_set(&sfilter->refs, 1);
+	refcount_set(&sfilter->users, 1);
+	init_waitqueue_head(&sfilter->wqh);
 
 	return sfilter;
 }
@@ -434,6 +697,148 @@
 	return filter;
 }
 
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF program
+ * @sd: The seccomp data to check against, only syscall number and arch
+ *      number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+				   struct seccomp_data *sd)
+{
+	unsigned int reg_value = 0;
+	unsigned int pc;
+	bool op_res;
+
+	if (WARN_ON_ONCE(!fprog))
+		return false;
+
+	for (pc = 0; pc < fprog->len; pc++) {
+		struct sock_filter *insn = &fprog->filter[pc];
+		u16 code = insn->code;
+		u32 k = insn->k;
+
+		switch (code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			switch (k) {
+			case offsetof(struct seccomp_data, nr):
+				reg_value = sd->nr;
+				break;
+			case offsetof(struct seccomp_data, arch):
+				reg_value = sd->arch;
+				break;
+			default:
+				/* can't optimize (non-constant value load) */
+				return false;
+			}
+			break;
+		case BPF_RET | BPF_K:
+			/* reached return with constant values only, check allow */
+			return k == SECCOMP_RET_ALLOW;
+		case BPF_JMP | BPF_JA:
+			pc += insn->k;
+			break;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JSET | BPF_K:
+			switch (BPF_OP(code)) {
+			case BPF_JEQ:
+				op_res = reg_value == k;
+				break;
+			case BPF_JGE:
+				op_res = reg_value >= k;
+				break;
+			case BPF_JGT:
+				op_res = reg_value > k;
+				break;
+			case BPF_JSET:
+				op_res = !!(reg_value & k);
+				break;
+			default:
+				/* can't optimize (unknown jump) */
+				return false;
+			}
+
+			pc += op_res ? insn->jt : insn->jf;
+			break;
+		case BPF_ALU | BPF_AND | BPF_K:
+			reg_value &= k;
+			break;
+		default:
+			/* can't optimize (unknown insn) */
+			return false;
+		}
+	}
+
+	/* ran off the end of the filter?! */
+	WARN_ON(1);
+	return false;
+}
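
seccomp_is_const_allow() is a tiny, side-effect-free cBPF interpreter that treats sd->nr and sd->arch as the only inputs; any instruction outside this constant subset (argument loads, scratch memory, unrecognized ALU or jump ops) aborts the emulation and the syscall stays uncached. A conventional allowlist is fully covered. A hedged userspace sketch of such a filter, using the standard BPF_STMT/BPF_JUMP macros from <linux/filter.h> (__NR_getpid is just an example; a production filter would also check sd->arch):

    #include <stddef.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>
    #include <sys/syscall.h>

    struct sock_filter allowlist[] = {
    	/* reg_value = sd->nr (a constant during emulation) */
    	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
    		 offsetof(struct seccomp_data, nr)),
    	/* if (nr == __NR_getpid) goto allow; else goto kill; */
    	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
    	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
    };

Emulated with sd.nr == __NR_getpid, this reaches BPF_RET with SECCOMP_RET_ALLOW using constants only, so the getpid bit survives; every other nr reaches the kill return and has its bit cleared by seccomp_cache_prepare_bitmap() below.
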
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+					 void *bitmap, const void *bitmap_prev,
+					 size_t bitmap_size, int arch)
+{
+	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+	struct seccomp_data sd;
+	int nr;
+
+	if (bitmap_prev) {
+		/* The new filter must be as restrictive as the last. */
+		bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+	} else {
+		/* Before any filters, all syscalls are always allowed. */
+		bitmap_fill(bitmap, bitmap_size);
+	}
+
+	for (nr = 0; nr < bitmap_size; nr++) {
+		/* No bitmap change: not a cacheable action. */
+		if (!test_bit(nr, bitmap))
+			continue;
+
+		sd.nr = nr;
+		sd.arch = arch;
+
+		/* No bitmap change: continue to always allow. */
+		if (seccomp_is_const_allow(fprog, &sd))
+			continue;
+
+		/*
+		 * Not a cacheable action: always run filters.
+		 * atomic clear_bit() not needed, filter not visible yet.
+		 */
+		__clear_bit(nr, bitmap);
+	}
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
+ * @sfilter: The seccomp filter
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+	struct action_cache *cache = &sfilter->cache;
+	const struct action_cache *cache_prev =
+		sfilter->prev ? &sfilter->prev->cache : NULL;
+
+	seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+				     cache_prev ? cache_prev->allow_native : NULL,
+				     SECCOMP_ARCH_NATIVE_NR,
+				     SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+	seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+				     cache_prev ? cache_prev->allow_compat : NULL,
+				     SECCOMP_ARCH_COMPAT_NR,
+				     SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * seccomp_attach_filter: validate and attach filter
  * @flags:  flags to change filter behavior
@@ -441,7 +846,10 @@
  *
  * Caller must be holding current->sighand->siglock lock.
  *
- * Returns 0 on success, -ve on error.
+ * Returns 0 on success, -ve on error, or
+ *   - in TSYNC mode: the pid of a thread which was either not in the correct
+ *     seccomp mode or did not have an ancestral seccomp filter
+ *   - in NEW_LISTENER mode: the fd of the new listener
  */
 static long seccomp_attach_filter(unsigned int flags,
 				  struct seccomp_filter *filter)
@@ -463,8 +871,12 @@
 		int ret;
 
 		ret = seccomp_can_sync_threads();
-		if (ret)
-			return ret;
+		if (ret) {
+			if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
+				return -ESRCH;
+			else
+				return ret;
+		}
 	}
 
 	/* Set log flag, if present. */
@@ -476,7 +888,9 @@
 	 * task reference.
 	 */
 	filter->prev = current->seccomp.filter;
+	seccomp_cache_prepare(filter);
 	current->seccomp.filter = filter;
+	atomic_inc(&current->seccomp.filter_count);
 
 	/* Now that the new filter is in place, synchronize to all threads. */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -487,8 +901,7 @@
 
 static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
-	/* Reference count is bounded by the number of total processes. */
-	refcount_inc(&filter->usage);
+	refcount_inc(&filter->refs);
 }
 
 /* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -498,40 +911,17 @@
 	if (!orig)
 		return;
 	__get_seccomp_filter(orig);
+	refcount_inc(&orig->users);
 }
 
-static inline void seccomp_filter_free(struct seccomp_filter *filter)
-{
-	if (filter) {
-		bpf_prog_destroy(filter->prog);
-		kfree(filter);
-	}
-}
-
-static void __put_seccomp_filter(struct seccomp_filter *orig)
-{
-	/* Clean up single-reference branches iteratively. */
-	while (orig && refcount_dec_and_test(&orig->usage)) {
-		struct seccomp_filter *freeme = orig;
-		orig = orig->prev;
-		seccomp_filter_free(freeme);
-	}
-}
-
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
-{
-	__put_seccomp_filter(tsk->seccomp.filter);
-}
-
-static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
+static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
 {
 	clear_siginfo(info);
 	info->si_signo = SIGSYS;
 	info->si_code = SYS_SECCOMP;
 	info->si_call_addr = (void __user *)KSTK_EIP(current);
 	info->si_errno = reason;
-	info->si_arch = syscall_get_arch();
+	info->si_arch = syscall_get_arch(current);
 	info->si_syscall = syscall;
 }
 
@@ -544,9 +934,9 @@
  */
 static void seccomp_send_sigsys(int syscall, int reason)
 {
-	struct siginfo info;
+	struct kernel_siginfo info;
 	seccomp_init_siginfo(&info, syscall, reason);
-	force_sig_info(SIGSYS, &info, current);
+	force_sig_info(&info);
 }
 #endif	/* CONFIG_SECCOMP_FILTER */
 
@@ -558,11 +948,13 @@
 #define SECCOMP_LOG_TRACE		(1 << 4)
 #define SECCOMP_LOG_LOG			(1 << 5)
 #define SECCOMP_LOG_ALLOW		(1 << 6)
+#define SECCOMP_LOG_USER_NOTIF		(1 << 7)
 
 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
 				    SECCOMP_LOG_KILL_THREAD  |
 				    SECCOMP_LOG_TRAP  |
 				    SECCOMP_LOG_ERRNO |
+				    SECCOMP_LOG_USER_NOTIF |
 				    SECCOMP_LOG_TRACE |
 				    SECCOMP_LOG_LOG;
 
@@ -582,6 +974,9 @@
 		break;
 	case SECCOMP_RET_TRACE:
 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
+		break;
+	case SECCOMP_RET_USER_NOTIF:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
 		break;
 	case SECCOMP_RET_LOG:
 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
@@ -613,25 +1008,24 @@
  */
 static const int mode1_syscalls[] = {
 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
-	0, /* null terminated */
+	-1, /* negative terminated */
 };
 
 static void __secure_computing_strict(int this_syscall)
 {
-	const int *syscall_whitelist = mode1_syscalls;
+	const int *allowed_syscalls = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
-		syscall_whitelist = get_compat_mode1_syscalls();
+		allowed_syscalls = get_compat_mode1_syscalls();
 #endif
 	do {
-		if (*syscall_whitelist == this_syscall)
+		if (*allowed_syscalls == this_syscall)
 			return;
-	} while (*++syscall_whitelist);
+	} while (*++allowed_syscalls != -1);
 
 #ifdef SECCOMP_DEBUG
 	dump_stack();
 #endif
-	current->seccomp.mode = SECCOMP_MODE_DEAD;
 	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
 	do_exit(SIGKILL);
 }
@@ -655,18 +1049,126 @@
 #else
 
 #ifdef CONFIG_SECCOMP_FILTER
+static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
+{
+	/*
+	 * Note: overflow is ok here, the id just needs to be unique per
+	 * filter.
+	 */
+	lockdep_assert_held(&filter->notify_lock);
+	return filter->notif->next_id++;
+}
+
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+{
+	/*
+	 * Remove the notification, and reset the list pointers, indicating
+	 * that it has been handled.
+	 */
+	list_del_init(&addfd->list);
+	addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+	complete(&addfd->completion);
+}
+
+static int seccomp_do_user_notification(int this_syscall,
+					struct seccomp_filter *match,
+					const struct seccomp_data *sd)
+{
+	int err;
+	u32 flags = 0;
+	long ret = 0;
+	struct seccomp_knotif n = {};
+	struct seccomp_kaddfd *addfd, *tmp;
+
+	mutex_lock(&match->notify_lock);
+	err = -ENOSYS;
+	if (!match->notif)
+		goto out;
+
+	n.task = current;
+	n.state = SECCOMP_NOTIFY_INIT;
+	n.data = sd;
+	n.id = seccomp_next_notify_id(match);
+	init_completion(&n.ready);
+	list_add(&n.list, &match->notif->notifications);
+	INIT_LIST_HEAD(&n.addfd);
+
+	up(&match->notif->request);
+	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+
+	/*
+	 * This is where we wait for a reply from userspace.
+	 */
+	do {
+		mutex_unlock(&match->notify_lock);
+		err = wait_for_completion_interruptible(&n.ready);
+		mutex_lock(&match->notify_lock);
+		if (err != 0)
+			goto interrupted;
+
+		addfd = list_first_entry_or_null(&n.addfd,
+						 struct seccomp_kaddfd, list);
+		/* Check if we were woken up by an addfd message */
+		if (addfd)
+			seccomp_handle_addfd(addfd);
+
+	} while (n.state != SECCOMP_NOTIFY_REPLIED);
+
+	ret = n.val;
+	err = n.error;
+	flags = n.flags;
+
+interrupted:
+	/* If there were any pending addfd calls, clear them out */
+	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+		/* The process went away before we got a chance to handle it */
+		addfd->ret = -ESRCH;
+		list_del_init(&addfd->list);
+		complete(&addfd->completion);
+	}
+
+	/*
+	 * Note that it's possible the listener died in between the time when
+	 * we were notified of a response (or a signal) and when we were able to
+	 * re-acquire the lock, so only delete from the list if the
+	 * notification actually exists.
+	 *
+	 * Also note that this test is only valid because there's no way to
+	 * *reattach* to a notifier right now. If one is added, we'll need to
+	 * keep track of the notif itself and make sure they match here.
+	 */
+	if (match->notif)
+		list_del(&n.list);
+out:
+	mutex_unlock(&match->notify_lock);
+
+	/* Userspace requests to continue the syscall. */
+	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+		return 0;
+
+	syscall_set_return_value(current, current_pt_regs(),
				 err, ret);
+	return -1;
+}
+
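
The blocked task sits in the wait loop above until the listener answers. On the other end of the fd, a supervisor follows a recv/validate/send cycle. A minimal sketch (error handling trimmed; the ioctls and structs are the real UAPI interface, the function itself is illustrative, and robust buffer sizing via SECCOMP_GET_NOTIF_SIZES is shown further down):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/seccomp.h>

    /* One round of supervision on a listener fd obtained with
     * SECCOMP_FILTER_FLAG_NEW_LISTENER. Returns 0 on success. */
    static int supervise_once(int listener)
    {
    	struct seccomp_notif req;
    	struct seccomp_notif_resp resp;

    	memset(&req, 0, sizeof(req));	/* RECV demands a zeroed buffer */
    	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
    		return -1;

    	/*
    	 * After inspecting the target (e.g. reading its memory via
    	 * /proc/<req.pid>/mem), re-check the cookie so a died-and-reused
    	 * notification is not acted upon.
    	 */
    	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id) < 0)
    		return -1;

    	memset(&resp, 0, sizeof(resp));
    	resp.id = req.id;
    	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;	/* run the syscall */
    	/* seccomp_notify_send() enforces error == 0 && val == 0 here. */
    	return ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
    }
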
 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 			    const bool recheck_after_trace)
 {
 	u32 filter_ret, action;
 	struct seccomp_filter *match = NULL;
 	int data;
+	struct seccomp_data sd_local;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
 	 * been seen after TIF_SECCOMP was seen.
 	 */
 	rmb();
+
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
 
 	filter_ret = seccomp_run_filters(sd, &match);
 	data = filter_ret & SECCOMP_RET_DATA;
@@ -677,13 +1179,13 @@
 		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
 		if (data > MAX_ERRNO)
 			data = MAX_ERRNO;
-		syscall_set_return_value(current, task_pt_regs(current),
+		syscall_set_return_value(current, current_pt_regs(),
 					 -data, 0);
 		goto skip;
 
 	case SECCOMP_RET_TRAP:
 		/* Show the handler the original registers. */
-		syscall_rollback(current, task_pt_regs(current));
+		syscall_rollback(current, current_pt_regs());
 		/* Let the filter pass back 16 bits of data. */
 		seccomp_send_sigsys(this_syscall, data);
 		goto skip;
@@ -696,7 +1198,7 @@
 		/* ENOSYS these calls if there is no tracer attached. */
 		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
 			syscall_set_return_value(current,
-						 task_pt_regs(current),
+						 current_pt_regs(),
 						 -ENOSYS, 0);
 			goto skip;
 		}
@@ -716,7 +1218,7 @@
 		if (fatal_signal_pending(current))
 			goto skip;
 		/* Check if the tracer forced the syscall to be skipped. */
-		this_syscall = syscall_get_nr(current, task_pt_regs(current));
+		this_syscall = syscall_get_nr(current, current_pt_regs());
 		if (this_syscall < 0)
 			goto skip;
 
@@ -728,6 +1230,12 @@
 		 */
 		if (__seccomp_filter(this_syscall, NULL, true))
 			return -1;
+
+		return 0;
+
+	case SECCOMP_RET_USER_NOTIF:
+		if (seccomp_do_user_notification(this_syscall, match, sd))
+			goto skip;
 
 		return 0;
 
@@ -746,23 +1254,22 @@
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_KILL_PROCESS:
 	default:
-		current->seccomp.mode = SECCOMP_MODE_DEAD;
 		seccomp_log(this_syscall, SIGSYS, action, true);
 		/* Dump core only if this is the last remaining thread. */
-		if (action == SECCOMP_RET_KILL_PROCESS ||
+		if (action != SECCOMP_RET_KILL_THREAD ||
 		    get_nr_threads(current) == 1) {
-			siginfo_t info;
+			kernel_siginfo_t info;
 
 			/* Show the original registers in the dump. */
-			syscall_rollback(current, task_pt_regs(current));
+			syscall_rollback(current, current_pt_regs());
 			/* Trigger a manual coredump since do_exit skips it. */
 			seccomp_init_siginfo(&info, this_syscall, data);
 			do_coredump(&info);
 		}
-		if (action == SECCOMP_RET_KILL_PROCESS)
-			do_group_exit(SIGSYS);
-		else
+		if (action == SECCOMP_RET_KILL_THREAD)
 			do_exit(SIGSYS);
+		else
+			do_group_exit(SIGSYS);
 	}
 
 	unreachable();
@@ -791,7 +1298,7 @@
 		return 0;
 
 	this_syscall = sd ? sd->nr :
-		syscall_get_nr(current, task_pt_regs(current));
+		syscall_get_nr(current, current_pt_regs());
 
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
@@ -799,11 +1306,6 @@
 		return 0;
 	case SECCOMP_MODE_FILTER:
 		return __seccomp_filter(this_syscall, sd, false);
-	/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
-	case SECCOMP_MODE_DEAD:
-		WARN_ON_ONCE(1);
-		do_exit(SIGKILL);
-		return -1;
 	default:
 		BUG();
 	}
@@ -845,6 +1347,420 @@
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
+static void seccomp_notify_free(struct seccomp_filter *filter)
+{
+	kfree(filter->notif);
+	filter->notif = NULL;
+}
+
+static void seccomp_notify_detach(struct seccomp_filter *filter)
+{
+	struct seccomp_knotif *knotif;
+
+	if (!filter)
+		return;
+
+	mutex_lock(&filter->notify_lock);
+
+	/*
+	 * If this file is being closed because e.g. the task that owned it
+	 * died, let's wake everyone up who was waiting on us.
+	 */
+	list_for_each_entry(knotif, &filter->notif->notifications, list) {
+		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
+			continue;
+
+		knotif->state = SECCOMP_NOTIFY_REPLIED;
+		knotif->error = -ENOSYS;
+		knotif->val = 0;
+
+		/*
+		 * We do not need to wake up any pending addfd messages, as
+		 * the notifier will do that for us, as this just looks
+		 * like a standard reply.
+		 */
+		complete(&knotif->ready);
+	}
+
+	seccomp_notify_free(filter);
+	mutex_unlock(&filter->notify_lock);
+}
+
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+	struct seccomp_filter *filter = file->private_data;
+
+	seccomp_notify_detach(filter);
+	__put_seccomp_filter(filter);
+	return 0;
+}
+
+/* must be called with notif_lock held */
+static inline struct seccomp_knotif *
+find_notification(struct seccomp_filter *filter, u64 id)
+{
+	struct seccomp_knotif *cur;
+
+	lockdep_assert_held(&filter->notify_lock);
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == id)
+			return cur;
+	}
+
+	return NULL;
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_knotif *knotif = NULL, *cur;
+	struct seccomp_notif unotif;
+	ssize_t ret;
+
+	/* Verify that we're not given garbage to keep struct extensible. */
+	ret = check_zeroed_user(buf, sizeof(unotif));
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return -EINVAL;
+
+	memset(&unotif, 0, sizeof(unotif));
+
+	ret = down_interruptible(&filter->notif->request);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&filter->notify_lock);
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->state == SECCOMP_NOTIFY_INIT) {
+			knotif = cur;
+			break;
+		}
+	}
+
+	/*
+	 * If we didn't find a notification, it could be that the task was
+	 * interrupted by a fatal signal between the time we were woken and
+	 * when we were able to acquire the lock.
+	 */
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	unotif.id = knotif->id;
+	unotif.pid = task_pid_vnr(knotif->task);
+	unotif.data = *(knotif->data);
+
+	knotif->state = SECCOMP_NOTIFY_SENT;
+	wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
+	ret = 0;
+out:
+	mutex_unlock(&filter->notify_lock);
+
+	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
+		ret = -EFAULT;
+
+		/*
+		 * Userspace screwed up. To make sure that we keep this
+		 * notification alive, let's reset it back to INIT. It
+		 * may have died when we released the lock, so we need to make
+		 * sure it's still around.
+		 */
+		mutex_lock(&filter->notify_lock);
+		knotif = find_notification(filter, unotif.id);
+		if (knotif) {
+			knotif->state = SECCOMP_NOTIFY_INIT;
+			up(&filter->notif->request);
+		}
+		mutex_unlock(&filter->notify_lock);
+	}
+
+	return ret;
+}
+
+static long seccomp_notify_send(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_knotif *knotif;
+	long ret;
+
+	if (copy_from_user(&resp, buf, sizeof(resp)))
+		return -EFAULT;
+
+	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+		return -EINVAL;
+
+	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
+	    (resp.error || resp.val))
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	knotif = find_notification(filter, resp.id);
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/* Allow exactly one reply. */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out;
+	}
+
+	ret = 0;
+	knotif->state = SECCOMP_NOTIFY_REPLIED;
+	knotif->error = resp.error;
+	knotif->val = resp.val;
+	knotif->flags = resp.flags;
+	complete(&knotif->ready);
+out:
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_id_valid(struct seccomp_filter *filter,
+				    void __user *buf)
+{
+	struct seccomp_knotif *knotif;
+	u64 id;
+	long ret;
+
+	if (copy_from_user(&id, buf, sizeof(id)))
+		return -EFAULT;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	knotif = find_notification(filter, id);
+	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
+		ret = 0;
+	else
+		ret = -ENOENT;
+
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+				 struct seccomp_notif_addfd __user *uaddfd,
+				 unsigned int size)
+{
+	struct seccomp_notif_addfd addfd;
+	struct seccomp_knotif *knotif;
+	struct seccomp_kaddfd kaddfd;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
+
+	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
+		return -EINVAL;
+
+	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+	if (ret)
+		return ret;
+
+	if (addfd.newfd_flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+		return -EINVAL;
+
+	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+		return -EINVAL;
+
+	kaddfd.file = fget(addfd.srcfd);
+	if (!kaddfd.file)
+		return -EBADF;
+
+	kaddfd.flags = addfd.newfd_flags;
+	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
+		    addfd.newfd : -1;
+	init_completion(&kaddfd.completion);
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		goto out;
+
+	knotif = find_notification(filter, addfd.id);
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/*
+	 * We do not want to allow for FD injection to occur before the
+	 * notification has been picked up by a userspace handler, or after
+	 * the notification has been replied to.
+	 */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out_unlock;
+	}
+
+	list_add(&kaddfd.list, &knotif->addfd);
+	complete(&knotif->ready);
+	mutex_unlock(&filter->notify_lock);
+
+	/* Now we wait for it to be processed or be interrupted */
+	ret = wait_for_completion_interruptible(&kaddfd.completion);
+	if (ret == 0) {
+		/*
+		 * We had a successful completion. The other side has already
+		 * removed us from the addfd queue, and
+		 * wait_for_completion_interruptible has a memory barrier upon
+		 * success that lets us read this value directly without
+		 * locking.
+		 */
+		ret = kaddfd.ret;
+		goto out;
+	}
+
+	mutex_lock(&filter->notify_lock);
+	/*
+	 * Even though we were woken up by a signal and not a successful
+	 * completion, a completion may have happened in the meantime.
+	 *
+	 * We need to check again if the addfd request has been handled,
+	 * and if not, we will remove it from the queue.
+	 */
+	if (list_empty(&kaddfd.list))
+		ret = kaddfd.ret;
+	else
+		list_del(&kaddfd.list);
+
+out_unlock:
+	mutex_unlock(&filter->notify_lock);
+out:
+	fput(kaddfd.file);
+
+	return ret;
+}
+
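
Seen from the supervisor, ADDFD installs one of its own file descriptors into the blocked task while the notification is in the SENT state. A hedged sketch (inject_fd is illustrative; `id` is the cookie from a received seccomp_notif and `srcfd` any supervisor-side descriptor):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/seccomp.h>

    static int inject_fd(int listener, __u64 id, int srcfd)
    {
    	struct seccomp_notif_addfd addfd;

    	memset(&addfd, 0, sizeof(addfd));
    	addfd.id = id;			/* cookie of the blocked syscall */
    	addfd.srcfd = srcfd;		/* supervisor-side fd to copy over */
    	addfd.newfd = 0;		/* without SETFD: lowest free fd */
    	addfd.flags = 0;		/* or SECCOMP_ADDFD_FLAG_SETFD */
    	addfd.newfd_flags = O_CLOEXEC;	/* only O_CLOEXEC is accepted */

    	/* On success, returns the fd number as seen inside the target. */
    	return ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
    }
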
+static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct seccomp_filter *filter = file->private_data;
+	void __user *buf = (void __user *)arg;
+
+	/* Fixed-size ioctls */
+	switch (cmd) {
+	case SECCOMP_IOCTL_NOTIF_RECV:
+		return seccomp_notify_recv(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_SEND:
+		return seccomp_notify_send(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
+	case SECCOMP_IOCTL_NOTIF_ID_VALID:
+		return seccomp_notify_id_valid(filter, buf);
+	}
+
+	/* Extensible Argument ioctls */
+#define EA_IOCTL(cmd)	((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
+	switch (EA_IOCTL(cmd)) {
+	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
+		return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
+	default:
+		return -EINVAL;
+	}
+}
+
+static __poll_t seccomp_notify_poll(struct file *file,
1676
+ struct poll_table_struct *poll_tab)
1677
+{
1678
+ struct seccomp_filter *filter = file->private_data;
1679
+ __poll_t ret = 0;
1680
+ struct seccomp_knotif *cur;
1681
+
1682
+ poll_wait(file, &filter->wqh, poll_tab);
1683
+
1684
+ if (mutex_lock_interruptible(&filter->notify_lock) < 0)
1685
+ return EPOLLERR;
1686
+
1687
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
1688
+ if (cur->state == SECCOMP_NOTIFY_INIT)
1689
+ ret |= EPOLLIN | EPOLLRDNORM;
1690
+ if (cur->state == SECCOMP_NOTIFY_SENT)
1691
+ ret |= EPOLLOUT | EPOLLWRNORM;
1692
+ if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1693
+ break;
1694
+ }
1695
+
1696
+ mutex_unlock(&filter->notify_lock);
1697
+
1698
+ if (refcount_read(&filter->users) == 0)
1699
+ ret |= EPOLLHUP;
1700
+
1701
+ return ret;
1702
+}
1703
+
1704
+static const struct file_operations seccomp_notify_ops = {
1705
+ .poll = seccomp_notify_poll,
1706
+ .release = seccomp_notify_release,
1707
+ .unlocked_ioctl = seccomp_notify_ioctl,
1708
+ .compat_ioctl = seccomp_notify_ioctl,
1709
+};
1710
+
1711
+static struct file *init_listener(struct seccomp_filter *filter)
1712
+{
1713
+ struct file *ret;
1714
+
1715
+ ret = ERR_PTR(-ENOMEM);
1716
+ filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
1717
+ if (!filter->notif)
1718
+ goto out;
1719
+
1720
+ sema_init(&filter->notif->request, 0);
1721
+ filter->notif->next_id = get_random_u64();
1722
+ INIT_LIST_HEAD(&filter->notif->notifications);
1723
+
1724
+ ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
1725
+ filter, O_RDWR);
1726
+ if (IS_ERR(ret))
1727
+ goto out_notif;
1728
+
1729
+ /* The file has a reference to it now */
1730
+ __get_seccomp_filter(filter);
1731
+
1732
+out_notif:
1733
+ if (IS_ERR(ret))
1734
+ seccomp_notify_free(filter);
1735
+out:
1736
+ return ret;
1737
+}
1738
+
1739
+/*
1740
+ * Does @new_child have a listener while an ancestor also has a listener?
1741
+ * If so, we'll want to reject this filter.
1742
+ * This only has to be tested for the current process, even in the TSYNC case,
1743
+ * because TSYNC installs @child with the same parent on all threads.
1744
+ * Note that @new_child is not hooked up to its parent at this point yet, so
1745
+ * we use current->seccomp.filter.
1746
+ */
1747
+static bool has_duplicate_listener(struct seccomp_filter *new_child)
1748
+{
1749
+ struct seccomp_filter *cur;
1750
+
1751
+ /* must be protected against concurrent TSYNC */
1752
+ lockdep_assert_held(&current->sighand->siglock);
1753
+
1754
+ if (!new_child->notif)
1755
+ return false;
1756
+ for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1757
+ if (cur->notif)
1758
+ return true;
1759
+ }
1760
+
1761
+ return false;
1762
+}
1763
+
 /**
  * seccomp_set_mode_filter: internal function for setting seccomp filter
  * @flags:  flags to change filter behavior
@@ -864,9 +1780,23 @@
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
 	struct seccomp_filter *prepared = NULL;
 	long ret = -EINVAL;
+	int listener = -1;
+	struct file *listener_f = NULL;
 
 	/* Validate flags. */
 	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
+		return -EINVAL;
+
+	/*
+	 * In the successful case, NEW_LISTENER returns the new listener fd.
+	 * But in the failure case, TSYNC returns the thread that died. If you
+	 * combine these two flags, there's no way to tell whether something
+	 * succeeded or failed. So, let's disallow this combination if the user
+	 * has not explicitly requested no errors from TSYNC.
+	 */
+	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
+	    (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
+	    ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
 		return -EINVAL;
 
 	/* Prepare the new filter before holding any locks. */
@@ -874,18 +1804,38 @@
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		listener = get_unused_fd_flags(O_CLOEXEC);
+		if (listener < 0) {
+			ret = listener;
+			goto out_free;
+		}
+
+		listener_f = init_listener(prepared);
+		if (IS_ERR(listener_f)) {
+			put_unused_fd(listener);
+			ret = PTR_ERR(listener_f);
+			goto out_free;
+		}
+	}
+
 	/*
 	 * Make sure we cannot change seccomp or nnp state via TSYNC
 	 * while another thread is in the middle of calling exec.
 	 */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
 	    mutex_lock_killable(&current->signal->cred_guard_mutex))
-		goto out_free;
+		goto out_put_fd;
 
 	spin_lock_irq(&current->sighand->siglock);
 
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
+
+	if (has_duplicate_listener(prepared)) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	ret = seccomp_attach_filter(flags, prepared);
 	if (ret)
@@ -898,6 +1848,18 @@
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
 		mutex_unlock(&current->signal->cred_guard_mutex);
+out_put_fd:
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		if (ret) {
+			listener_f->private_data = NULL;
+			fput(listener_f);
+			put_unused_fd(listener);
+			seccomp_notify_detach(prepared);
+		} else {
+			fd_install(listener, listener_f);
+			ret = listener;
+		}
+	}
 out_free:
 	seccomp_filter_free(prepared);
 	return ret;
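
From userspace, the pieces connect in a single seccomp() call: set no_new_privs (or hold CAP_SYS_ADMIN), then install a filter whose return value, on success, is the listener fd itself. A minimal sketch (glibc has no seccomp() wrapper, hence syscall(); the filter is assumed to return SECCOMP_RET_USER_NOTIF for the syscalls to be forwarded):

    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    static int install_with_listener(struct sock_filter *insns,
    				 unsigned short len)
    {
    	struct sock_fprog prog = { .len = len, .filter = insns };

    	/* Required so an unprivileged task cannot use the filter to
    	 * confuse a more-privileged program it execs later. */
    	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
    		return -1;

    	/* On success, the return value *is* the listener fd (O_CLOEXEC). */
    	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
    		       SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
    }
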
@@ -922,6 +1884,7 @@
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
+	case SECCOMP_RET_USER_NOTIF:
 	case SECCOMP_RET_TRACE:
 	case SECCOMP_RET_LOG:
 	case SECCOMP_RET_ALLOW:
@@ -933,9 +1896,23 @@
 	return 0;
 }
 
+static long seccomp_get_notif_sizes(void __user *usizes)
+{
+	struct seccomp_notif_sizes sizes = {
+		.seccomp_notif = sizeof(struct seccomp_notif),
+		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
+		.seccomp_data = sizeof(struct seccomp_data),
+	};
+
+	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
+		return -EFAULT;
+
+	return 0;
+}
+
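
Because the notification structures are allowed to grow, a robust listener asks the running kernel for their sizes instead of trusting its compile-time headers, and allocates accordingly. A hedged sketch (sized_buffers is illustrative):

    #include <stdlib.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/seccomp.h>

    static int sized_buffers(void **req, void **resp)
    {
    	struct seccomp_notif_sizes sizes;

    	if (syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes))
    		return -1;

    	/* May exceed sizeof(struct seccomp_notif) in this program's
    	 * headers if the kernel is newer; calloc also satisfies RECV's
    	 * zeroed-buffer requirement. */
    	*req = calloc(1, sizes.seccomp_notif);
    	*resp = calloc(1, sizes.seccomp_notif_resp);
    	return (*req && *resp) ? 0 : -1;
    }
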
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
-		       const char __user *uargs)
+		       void __user *uargs)
 {
 	switch (op) {
 	case SECCOMP_SET_MODE_STRICT:
@@ -949,13 +1926,18 @@
 			return -EINVAL;
 
 		return seccomp_get_action_avail(uargs);
+	case SECCOMP_GET_NOTIF_SIZES:
+		if (flags != 0)
+			return -EINVAL;
+
+		return seccomp_get_notif_sizes(uargs);
 	default:
 		return -EINVAL;
 	}
 }
 
 SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
-			 const char __user *, uargs)
+			 void __user *, uargs)
 {
 	return do_seccomp(op, flags, uargs);
 }
@@ -967,10 +1949,10 @@
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
 {
 	unsigned int op;
-	char __user *uargs;
+	void __user *uargs;
 
 	switch (seccomp_mode) {
 	case SECCOMP_MODE_STRICT:
@@ -1122,6 +2104,7 @@
 #define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
+#define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
 #define SECCOMP_RET_TRACE_NAME		"trace"
 #define SECCOMP_RET_LOG_NAME		"log"
 #define SECCOMP_RET_ALLOW_NAME		"allow"
@@ -1131,6 +2114,7 @@
 				SECCOMP_RET_KILL_THREAD_NAME	" "
 				SECCOMP_RET_TRAP_NAME		" "
 				SECCOMP_RET_ERRNO_NAME		" "
+				SECCOMP_RET_USER_NOTIF_NAME	" "
 				SECCOMP_RET_TRACE_NAME		" "
 				SECCOMP_RET_LOG_NAME		" "
 				SECCOMP_RET_ALLOW_NAME;
@@ -1145,6 +2129,7 @@
 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
 	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
@@ -1217,7 +2202,7 @@
 	return true;
 }
 
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
 			       size_t *lenp, loff_t *ppos)
 {
 	char names[sizeof(seccomp_actions_avail)];
@@ -1235,7 +2220,7 @@
 	return proc_dostring(&table, 0, buffer, lenp, ppos);
 }
 
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
 				size_t *lenp, loff_t *ppos, u32 *actions_logged)
 {
 	char names[sizeof(seccomp_actions_avail)];
@@ -1297,7 +2282,7 @@
 }
 
 static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
-					  void __user *buffer, size_t *lenp,
+					  void *buffer, size_t *lenp,
 					  loff_t *ppos)
 {
 	int ret;
@@ -1343,7 +2328,7 @@
 
 	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
 	if (!hdr)
-		pr_warn("seccomp: sysctl registration failed\n");
+		pr_warn("sysctl registration failed\n");
 	else
 		kmemleak_not_leak(hdr);