2024-01-31 f70575805708cabdedea7498aaa3f710fde4d920
kernel/kernel/events/core.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Performance events core code:
34 *
....@@ -5,8 +6,6 @@
56 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
67 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
78 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8
- *
9
- * For licensing details see kernel-base/COPYING
109 */
1110
1211 #include <linux/fs.h>
....@@ -29,6 +28,7 @@
2928 #include <linux/export.h>
3029 #include <linux/vmalloc.h>
3130 #include <linux/hardirq.h>
31
+#include <linux/hugetlb.h>
3232 #include <linux/rculist.h>
3333 #include <linux/uaccess.h>
3434 #include <linux/syscalls.h>
....@@ -50,6 +50,7 @@
5050 #include <linux/sched/mm.h>
5151 #include <linux/proc_ns.h>
5252 #include <linux/mount.h>
53
+#include <linux/min_heap.h>
5354
5455 #include "internal.h"
5556
....@@ -265,7 +266,7 @@
265266 if (!event->parent) {
266267 /*
267268 * If this is a !child event, we must hold ctx::mutex to
268
- * stabilize the the event->ctx relation. See
269
+ * stabilize the event->ctx relation. See
269270 * perf_event_ctx_lock().
270271 */
271272 lockdep_assert_held(&ctx->mutex);
....@@ -391,6 +392,10 @@
391392 static atomic_t nr_task_events __read_mostly;
392393 static atomic_t nr_freq_events __read_mostly;
393394 static atomic_t nr_switch_events __read_mostly;
395
+static atomic_t nr_ksymbol_events __read_mostly;
396
+static atomic_t nr_bpf_events __read_mostly;
397
+static atomic_t nr_cgroup_events __read_mostly;
398
+static atomic_t nr_text_poke_events __read_mostly;
394399
395400 static LIST_HEAD(pmus);
396401 static DEFINE_MUTEX(pmus_lock);
....@@ -403,13 +408,8 @@
403408 * 0 - disallow raw tracepoint access for unpriv
404409 * 1 - disallow cpu events for unpriv
405410 * 2 - disallow kernel profiling for unpriv
406
- * 3 - disallow all unpriv perf event use
407411 */
408
-#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
409
-int sysctl_perf_event_paranoid __read_mostly = 3;
410
-#else
411412 int sysctl_perf_event_paranoid __read_mostly = 2;
412
-#endif
413413
414414 /* Minimum for 512 kiB + 1 user control page */
415415 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
....@@ -444,8 +444,7 @@
444444 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
445445
446446 int perf_proc_update_handler(struct ctl_table *table, int write,
447
- void __user *buffer, size_t *lenp,
448
- loff_t *ppos)
447
+ void *buffer, size_t *lenp, loff_t *ppos)
449448 {
450449 int ret;
451450 int perf_cpu = sysctl_perf_cpu_time_max_percent;
....@@ -469,8 +468,7 @@
469468 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
470469
471470 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
472
- void __user *buffer, size_t *lenp,
473
- loff_t *ppos)
471
+ void *buffer, size_t *lenp, loff_t *ppos)
474472 {
475473 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
476474
....@@ -761,7 +759,7 @@
761759 /*
762760 * Do not update time when cgroup is not active
763761 */
764
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
762
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
765763 __update_cgrp_time(event->cgrp);
766764 }
767765
....@@ -901,6 +899,47 @@
901899 rcu_read_unlock();
902900 }
903901
902
+static int perf_cgroup_ensure_storage(struct perf_event *event,
903
+ struct cgroup_subsys_state *css)
904
+{
905
+ struct perf_cpu_context *cpuctx;
906
+ struct perf_event **storage;
907
+ int cpu, heap_size, ret = 0;
908
+
909
+ /*
910
+ * Allow storage to have sufficent space for an iterator for each
911
+ * possibly nested cgroup plus an iterator for events with no cgroup.
912
+ */
913
+ for (heap_size = 1; css; css = css->parent)
914
+ heap_size++;
915
+
916
+ for_each_possible_cpu(cpu) {
917
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
918
+ if (heap_size <= cpuctx->heap_size)
919
+ continue;
920
+
921
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
922
+ GFP_KERNEL, cpu_to_node(cpu));
923
+ if (!storage) {
924
+ ret = -ENOMEM;
925
+ break;
926
+ }
927
+
928
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
929
+ if (cpuctx->heap_size < heap_size) {
930
+ swap(cpuctx->heap, storage);
931
+ if (storage == cpuctx->heap_default)
932
+ storage = NULL;
933
+ cpuctx->heap_size = heap_size;
934
+ }
935
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
936
+
937
+ kfree(storage);
938
+ }
939
+
940
+ return ret;
941
+}
942
+
904943 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
905944 struct perf_event_attr *attr,
906945 struct perf_event *group_leader)
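
A note on the pattern used by perf_cgroup_ensure_storage() above: the iterator array is sized to the cgroup nesting depth plus one slot for events with no cgroup, and it is grown by allocating outside the lock, swapping pointers under the context lock, and freeing whichever buffer is left over after unlocking (the kernel version additionally avoids freeing the static default buffer). Below is a minimal userspace sketch of that grow-and-swap idiom in plain C with pthreads; the struct and function names are illustrative, not the kernel's.

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct ctx {
	pthread_mutex_t lock;
	void **heap;        /* current iterator storage */
	int heap_size;      /* capacity in elements */
};

/* Grow c->heap to at least want_size elements. The allocation happens
 * outside the lock, the swap happens under it, and the leftover buffer
 * (either the old one or our unused new one) is freed after unlocking. */
static int ensure_storage(struct ctx *c, int want_size)
{
	void **storage;

	if (want_size <= c->heap_size)
		return 0;

	storage = calloc(want_size, sizeof(*storage));
	if (!storage)
		return -1;

	pthread_mutex_lock(&c->lock);
	if (c->heap_size < want_size) {
		void **tmp = c->heap;

		c->heap = storage;
		c->heap_size = want_size;
		storage = tmp;          /* old buffer, freed below */
	}
	pthread_mutex_unlock(&c->lock);

	free(storage);                  /* old buffer, or unused new one */
	return 0;
}

int main(void)
{
	struct ctx c = { .lock = PTHREAD_MUTEX_INITIALIZER };

	ensure_storage(&c, 4);
	ensure_storage(&c, 2);          /* no-op: already big enough */
	printf("heap_size = %d\n", c.heap_size);
	free(c.heap);
	return 0;
}
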
....@@ -919,6 +958,10 @@
919958 ret = PTR_ERR(css);
920959 goto out;
921960 }
961
+
962
+ ret = perf_cgroup_ensure_storage(event, css);
963
+ if (ret)
964
+ goto out;
922965
923966 cgrp = container_of(css, struct perf_cgroup, css);
924967 event->cgrp = cgrp;
....@@ -945,25 +988,19 @@
945988 event->shadow_ctx_time = now - t->timestamp;
946989 }
947990
948
-/*
949
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
950
- * cleared when last cgroup event is removed.
951
- */
952991 static inline void
953
-list_update_cgroup_event(struct perf_event *event,
954
- struct perf_event_context *ctx, bool add)
992
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
955993 {
956994 struct perf_cpu_context *cpuctx;
957
- struct list_head *cpuctx_entry;
958995
959996 if (!is_cgroup_event(event))
960997 return;
961998
962999 /*
9631000 * Because cgroup events are always per-cpu events,
964
- * this will always be called from the right CPU.
1001
+ * @ctx == &cpuctx->ctx.
9651002 */
966
- cpuctx = __get_cpu_context(ctx);
1003
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
9671004
9681005 /*
9691006 * Since setting cpuctx->cgrp is conditional on the current @cgrp
....@@ -971,27 +1008,41 @@
9711008 * because if the first would mismatch, the second would not try again
9721009 * and we would leave cpuctx->cgrp unset.
9731010 */
974
- if (add && !cpuctx->cgrp) {
1011
+ if (ctx->is_active && !cpuctx->cgrp) {
9751012 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
9761013
9771014 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
9781015 cpuctx->cgrp = cgrp;
9791016 }
9801017
981
- if (add && ctx->nr_cgroups++)
982
- return;
983
- else if (!add && --ctx->nr_cgroups)
1018
+ if (ctx->nr_cgroups++)
9841019 return;
9851020
986
- /* no cgroup running */
987
- if (!add)
1021
+ list_add(&cpuctx->cgrp_cpuctx_entry,
1022
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1023
+}
1024
+
1025
+static inline void
1026
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1027
+{
1028
+ struct perf_cpu_context *cpuctx;
1029
+
1030
+ if (!is_cgroup_event(event))
1031
+ return;
1032
+
1033
+ /*
1034
+ * Because cgroup events are always per-cpu events,
1035
+ * @ctx == &cpuctx->ctx.
1036
+ */
1037
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1038
+
1039
+ if (--ctx->nr_cgroups)
1040
+ return;
1041
+
1042
+ if (ctx->is_active && cpuctx->cgrp)
9881043 cpuctx->cgrp = NULL;
9891044
990
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
991
- if (add)
992
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
993
- else
994
- list_del(cpuctx_entry);
1045
+ list_del(&cpuctx->cgrp_cpuctx_entry);
9951046 }
9961047
9971048 #else /* !CONFIG_CGROUP_PERF */
....@@ -1041,7 +1092,7 @@
10411092 {
10421093 }
10431094
1044
-void
1095
+static inline void
10451096 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
10461097 {
10471098 }
....@@ -1057,11 +1108,14 @@
10571108 }
10581109
10591110 static inline void
1060
-list_update_cgroup_event(struct perf_event *event,
1061
- struct perf_event_context *ctx, bool add)
1111
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
10621112 {
10631113 }
10641114
1115
+static inline void
1116
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1117
+{
1118
+}
10651119 #endif
10661120
10671121 /*
....@@ -1131,11 +1185,16 @@
11311185 if (!cpuctx->hrtimer_active) {
11321186 cpuctx->hrtimer_active = 1;
11331187 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1134
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1188
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
11351189 }
11361190 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
11371191
11381192 return 0;
1193
+}
1194
+
1195
+static int perf_mux_hrtimer_restart_ipi(void *arg)
1196
+{
1197
+ return perf_mux_hrtimer_restart(arg);
11391198 }
11401199
11411200 void perf_pmu_disable(struct pmu *pmu)
....@@ -1182,7 +1241,21 @@
11821241
11831242 static void get_ctx(struct perf_event_context *ctx)
11841243 {
1185
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1244
+ refcount_inc(&ctx->refcount);
1245
+}
1246
+
1247
+static void *alloc_task_ctx_data(struct pmu *pmu)
1248
+{
1249
+ if (pmu->task_ctx_cache)
1250
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1251
+
1252
+ return NULL;
1253
+}
1254
+
1255
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1256
+{
1257
+ if (pmu->task_ctx_cache && task_ctx_data)
1258
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
11861259 }
11871260
11881261 static void free_ctx(struct rcu_head *head)
....@@ -1190,13 +1263,13 @@
11901263 struct perf_event_context *ctx;
11911264
11921265 ctx = container_of(head, struct perf_event_context, rcu_head);
1193
- kfree(ctx->task_ctx_data);
1266
+ free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
11941267 kfree(ctx);
11951268 }
11961269
11971270 static void put_ctx(struct perf_event_context *ctx)
11981271 {
1199
- if (atomic_dec_and_test(&ctx->refcount)) {
1272
+ if (refcount_dec_and_test(&ctx->refcount)) {
12001273 if (ctx->parent_ctx)
12011274 put_ctx(ctx->parent_ctx);
12021275 if (ctx->task && ctx->task != TASK_TOMBSTONE)
....@@ -1232,7 +1305,7 @@
12321305 * life-time rules separate them. That is an exiting task cannot fork, and a
12331306 * spawning task cannot (yet) exit.
12341307 *
1235
- * But remember that that these are parent<->child context relations, and
1308
+ * But remember that these are parent<->child context relations, and
12361309 * migration does not affect children, therefore these two orderings should not
12371310 * interact.
12381311 *
....@@ -1258,13 +1331,13 @@
12581331 * function.
12591332 *
12601333 * Lock order:
1261
- * cred_guard_mutex
1334
+ * exec_update_lock
12621335 * task_struct::perf_event_mutex
12631336 * perf_event_context::mutex
12641337 * perf_event::child_mutex;
12651338 * perf_event_context::lock
12661339 * perf_event::mmap_mutex
1267
- * mmap_sem
1340
+ * mmap_lock
12681341 * perf_addr_filters_head::lock
12691342 *
12701343 * cpu_hotplug_lock
....@@ -1279,7 +1352,7 @@
12791352 again:
12801353 rcu_read_lock();
12811354 ctx = READ_ONCE(event->ctx);
1282
- if (!atomic_inc_not_zero(&ctx->refcount)) {
1355
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
12831356 rcu_read_unlock();
12841357 goto again;
12851358 }
....@@ -1371,7 +1444,7 @@
13711444 /*
13721445 * Get the perf_event_context for a task and lock it.
13731446 *
1374
- * This has to cope with with the fact that until it is locked,
1447
+ * This has to cope with the fact that until it is locked,
13751448 * the context could get moved to another task.
13761449 */
13771450 static struct perf_event_context *
....@@ -1412,7 +1485,7 @@
14121485 }
14131486
14141487 if (ctx->task == TASK_TOMBSTONE ||
1415
- !atomic_inc_not_zero(&ctx->refcount)) {
1488
+ !refcount_inc_not_zero(&ctx->refcount)) {
14161489 raw_spin_unlock(&ctx->lock);
14171490 ctx = NULL;
14181491 } else {
....@@ -1540,6 +1613,30 @@
15401613 if (left->cpu > right->cpu)
15411614 return false;
15421615
1616
+#ifdef CONFIG_CGROUP_PERF
1617
+ if (left->cgrp != right->cgrp) {
1618
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
1619
+ /*
1620
+ * Left has no cgroup but right does, no cgroups come
1621
+ * first.
1622
+ */
1623
+ return true;
1624
+ }
1625
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
1626
+ /*
1627
+ * Right has no cgroup but left does, no cgroups come
1628
+ * first.
1629
+ */
1630
+ return false;
1631
+ }
1632
+ /* Two dissimilar cgroups, order by id. */
1633
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1634
+ return true;
1635
+
1636
+ return false;
1637
+ }
1638
+#endif
1639
+
15431640 if (left->group_index < right->group_index)
15441641 return true;
15451642 if (left->group_index > right->group_index)
....@@ -1619,25 +1716,48 @@
16191716 }
16201717
16211718 /*
1622
- * Get the leftmost event in the @cpu subtree.
1719
+ * Get the leftmost event in the cpu/cgroup subtree.
16231720 */
16241721 static struct perf_event *
1625
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1722
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1723
+ struct cgroup *cgrp)
16261724 {
16271725 struct perf_event *node_event = NULL, *match = NULL;
16281726 struct rb_node *node = groups->tree.rb_node;
1727
+#ifdef CONFIG_CGROUP_PERF
1728
+ u64 node_cgrp_id, cgrp_id = 0;
1729
+
1730
+ if (cgrp)
1731
+ cgrp_id = cgrp->kn->id;
1732
+#endif
16291733
16301734 while (node) {
16311735 node_event = container_of(node, struct perf_event, group_node);
16321736
16331737 if (cpu < node_event->cpu) {
16341738 node = node->rb_left;
1635
- } else if (cpu > node_event->cpu) {
1636
- node = node->rb_right;
1637
- } else {
1638
- match = node_event;
1639
- node = node->rb_left;
1739
+ continue;
16401740 }
1741
+ if (cpu > node_event->cpu) {
1742
+ node = node->rb_right;
1743
+ continue;
1744
+ }
1745
+#ifdef CONFIG_CGROUP_PERF
1746
+ node_cgrp_id = 0;
1747
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
1748
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1749
+
1750
+ if (cgrp_id < node_cgrp_id) {
1751
+ node = node->rb_left;
1752
+ continue;
1753
+ }
1754
+ if (cgrp_id > node_cgrp_id) {
1755
+ node = node->rb_right;
1756
+ continue;
1757
+ }
1758
+#endif
1759
+ match = node_event;
1760
+ node = node->rb_left;
16411761 }
16421762
16431763 return match;
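
The reworked perf_event_groups_first() above searches the red-black tree by a composite key, (cpu, cgroup id), and keeps walking left after each match so it lands on the leftmost event with that key. Here is a simplified userspace sketch of that leftmost-match search over a hand-built binary search tree; the types are illustrative, not the kernel rbtree API, and cgroup id 0 stands in for "no cgroup".

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct node {
	int cpu;
	uint64_t cgrp_id;      /* 0 means "no cgroup"; they sort first */
	struct node *left, *right;
};

/* Compare the search key (cpu, cgrp_id) against node n. */
static int key_cmp(int cpu, uint64_t cgrp_id, const struct node *n)
{
	if (cpu != n->cpu)
		return cpu < n->cpu ? -1 : 1;
	if (cgrp_id != n->cgrp_id)
		return cgrp_id < n->cgrp_id ? -1 : 1;
	return 0;
}

/* Return the leftmost node matching (cpu, cgrp_id), or NULL. */
static struct node *first_match(struct node *root, int cpu, uint64_t cgrp_id)
{
	struct node *match = NULL;

	while (root) {
		int cmp = key_cmp(cpu, cgrp_id, root);

		if (cmp < 0) {
			root = root->left;
		} else if (cmp > 0) {
			root = root->right;
		} else {
			match = root;          /* remember it, keep going left */
			root = root->left;
		}
	}
	return match;
}

int main(void)
{
	struct node dup  = { .cpu = 1, .cgrp_id = 7 };
	struct node low  = { .cpu = 0, .cgrp_id = 0, .right = &dup };
	struct node high = { .cpu = 1, .cgrp_id = 9 };
	struct node root = { .cpu = 1, .cgrp_id = 7, .left = &low, .right = &high };

	struct node *n = first_match(&root, 1, 7);

	printf("leftmost (1,7) is %s\n", n == &dup ? "the duplicate" : "the root");
	return 0;
}
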
....@@ -1650,12 +1770,26 @@
16501770 perf_event_groups_next(struct perf_event *event)
16511771 {
16521772 struct perf_event *next;
1773
+#ifdef CONFIG_CGROUP_PERF
1774
+ u64 curr_cgrp_id = 0;
1775
+ u64 next_cgrp_id = 0;
1776
+#endif
16531777
16541778 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1655
- if (next && next->cpu == event->cpu)
1656
- return next;
1779
+ if (next == NULL || next->cpu != event->cpu)
1780
+ return NULL;
16571781
1658
- return NULL;
1782
+#ifdef CONFIG_CGROUP_PERF
1783
+ if (event->cgrp && event->cgrp->css.cgroup)
1784
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1785
+
1786
+ if (next->cgrp && next->cgrp->css.cgroup)
1787
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1788
+
1789
+ if (curr_cgrp_id != next_cgrp_id)
1790
+ return NULL;
1791
+#endif
1792
+ return next;
16591793 }
16601794
16611795 /*
....@@ -1691,12 +1825,13 @@
16911825 add_event_to_groups(event, ctx);
16921826 }
16931827
1694
- list_update_cgroup_event(event, ctx, true);
1695
-
16961828 list_add_rcu(&event->event_entry, &ctx->event_list);
16971829 ctx->nr_events++;
16981830 if (event->attr.inherit_stat)
16991831 ctx->nr_stat++;
1832
+
1833
+ if (event->state > PERF_EVENT_STATE_OFF)
1834
+ perf_cgroup_event_enable(event, ctx);
17001835
17011836 ctx->generation++;
17021837 }
....@@ -1762,6 +1897,9 @@
17621897
17631898 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
17641899 size += sizeof(data->phys_addr);
1900
+
1901
+ if (sample_type & PERF_SAMPLE_CGROUP)
1902
+ size += sizeof(data->cgroup);
17651903
17661904 event->header_size = size;
17671905 }
....@@ -1873,8 +2011,6 @@
18732011
18742012 event->attach_state &= ~PERF_ATTACH_CONTEXT;
18752013
1876
- list_update_cgroup_event(event, ctx, false);
1877
-
18782014 ctx->nr_events--;
18792015 if (event->attr.inherit_stat)
18802016 ctx->nr_stat--;
....@@ -1891,14 +2027,136 @@
18912027 * of error state is by explicit re-enabling
18922028 * of the event
18932029 */
1894
- if (event->state > PERF_EVENT_STATE_OFF)
2030
+ if (event->state > PERF_EVENT_STATE_OFF) {
2031
+ perf_cgroup_event_disable(event, ctx);
18952032 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2033
+ }
18962034
18972035 ctx->generation++;
18982036 }
18992037
2038
+static int
2039
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2040
+{
2041
+ if (!has_aux(aux_event))
2042
+ return 0;
2043
+
2044
+ if (!event->pmu->aux_output_match)
2045
+ return 0;
2046
+
2047
+ return event->pmu->aux_output_match(aux_event);
2048
+}
2049
+
2050
+static void put_event(struct perf_event *event);
2051
+static void event_sched_out(struct perf_event *event,
2052
+ struct perf_cpu_context *cpuctx,
2053
+ struct perf_event_context *ctx);
2054
+
2055
+static void perf_put_aux_event(struct perf_event *event)
2056
+{
2057
+ struct perf_event_context *ctx = event->ctx;
2058
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2059
+ struct perf_event *iter;
2060
+
2061
+ /*
2062
+ * If event uses aux_event tear down the link
2063
+ */
2064
+ if (event->aux_event) {
2065
+ iter = event->aux_event;
2066
+ event->aux_event = NULL;
2067
+ put_event(iter);
2068
+ return;
2069
+ }
2070
+
2071
+ /*
2072
+ * If the event is an aux_event, tear down all links to
2073
+ * it from other events.
2074
+ */
2075
+ for_each_sibling_event(iter, event->group_leader) {
2076
+ if (iter->aux_event != event)
2077
+ continue;
2078
+
2079
+ iter->aux_event = NULL;
2080
+ put_event(event);
2081
+
2082
+ /*
2083
+ * If it's ACTIVE, schedule it out and put it into ERROR
2084
+ * state so that we don't try to schedule it again. Note
2085
+ * that perf_event_enable() will clear the ERROR status.
2086
+ */
2087
+ event_sched_out(iter, cpuctx, ctx);
2088
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2089
+ }
2090
+}
2091
+
2092
+static bool perf_need_aux_event(struct perf_event *event)
2093
+{
2094
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2095
+}
2096
+
2097
+static int perf_get_aux_event(struct perf_event *event,
2098
+ struct perf_event *group_leader)
2099
+{
2100
+ /*
2101
+ * Our group leader must be an aux event if we want to be
2102
+ * an aux_output. This way, the aux event will precede its
2103
+ * aux_output events in the group, and therefore will always
2104
+ * schedule first.
2105
+ */
2106
+ if (!group_leader)
2107
+ return 0;
2108
+
2109
+ /*
2110
+ * aux_output and aux_sample_size are mutually exclusive.
2111
+ */
2112
+ if (event->attr.aux_output && event->attr.aux_sample_size)
2113
+ return 0;
2114
+
2115
+ if (event->attr.aux_output &&
2116
+ !perf_aux_output_match(event, group_leader))
2117
+ return 0;
2118
+
2119
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2120
+ return 0;
2121
+
2122
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
2123
+ return 0;
2124
+
2125
+ /*
2126
+ * Link aux_outputs to their aux event; this is undone in
2127
+ * perf_group_detach() by perf_put_aux_event(). When the
2128
+ * group in torn down, the aux_output events loose their
2129
+ * link to the aux_event and can't schedule any more.
2130
+ */
2131
+ event->aux_event = group_leader;
2132
+
2133
+ return 1;
2134
+}
2135
+
2136
+static inline struct list_head *get_event_list(struct perf_event *event)
2137
+{
2138
+ struct perf_event_context *ctx = event->ctx;
2139
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2140
+}
2141
+
2142
+/*
2143
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2144
+ * cannot exist on their own, schedule them out and move them into the ERROR
2145
+ * state. Also see _perf_event_enable(), it will not be able to recover
2146
+ * this ERROR state.
2147
+ */
2148
+static inline void perf_remove_sibling_event(struct perf_event *event)
2149
+{
2150
+ struct perf_event_context *ctx = event->ctx;
2151
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2152
+
2153
+ event_sched_out(event, cpuctx, ctx);
2154
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2155
+}
2156
+
19002157 static void perf_group_detach(struct perf_event *event)
19012158 {
2159
+ struct perf_event *leader = event->group_leader;
19022160 struct perf_event *sibling, *tmp;
19032161 struct perf_event_context *ctx = event->ctx;
19042162
....@@ -1912,10 +2170,12 @@
19122170
19132171 event->attach_state &= ~PERF_ATTACH_GROUP;
19142172
2173
+ perf_put_aux_event(event);
2174
+
19152175 /*
19162176 * If this is a sibling, remove it from its group.
19172177 */
1918
- if (event->group_leader != event) {
2178
+ if (leader != event) {
19192179 list_del_init(&event->sibling_list);
19202180 event->group_leader->nr_siblings--;
19212181 goto out;
....@@ -1928,6 +2188,9 @@
19282188 */
19292189 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
19302190
2191
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2192
+ perf_remove_sibling_event(sibling);
2193
+
19312194 sibling->group_leader = sibling;
19322195 list_del_init(&sibling->sibling_list);
19332196
....@@ -1937,22 +2200,18 @@
19372200 if (!RB_EMPTY_NODE(&event->group_node)) {
19382201 add_event_to_groups(sibling, event->ctx);
19392202
1940
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1941
- struct list_head *list = sibling->attr.pinned ?
1942
- &ctx->pinned_active : &ctx->flexible_active;
1943
-
1944
- list_add_tail(&sibling->active_list, list);
1945
- }
2203
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2204
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
19462205 }
19472206
19482207 WARN_ON_ONCE(sibling->ctx != event->ctx);
19492208 }
19502209
19512210 out:
1952
- perf_event__header_size(event->group_leader);
1953
-
1954
- for_each_sibling_event(tmp, event->group_leader)
2211
+ for_each_sibling_event(tmp, leader)
19552212 perf_event__header_size(tmp);
2213
+
2214
+ perf_event__header_size(leader);
19562215 }
19572216
19582217 static bool is_orphaned_event(struct perf_event *event)
....@@ -2021,6 +2280,7 @@
20212280
20222281 if (READ_ONCE(event->pending_disable) >= 0) {
20232282 WRITE_ONCE(event->pending_disable, -1);
2283
+ perf_cgroup_event_disable(event, ctx);
20242284 state = PERF_EVENT_STATE_OFF;
20252285 }
20262286 perf_event_set_state(event, state);
....@@ -2058,9 +2318,6 @@
20582318 event_sched_out(event, cpuctx, ctx);
20592319
20602320 perf_pmu_enable(ctx->pmu);
2061
-
2062
- if (group_event->attr.exclusive)
2063
- cpuctx->exclusive = 0;
20642321 }
20652322
20662323 #define DETACH_GROUP 0x01UL
....@@ -2091,6 +2348,7 @@
20912348
20922349 if (!ctx->nr_events && ctx->is_active) {
20932350 ctx->is_active = 0;
2351
+ ctx->rotate_necessary = 0;
20942352 if (ctx->task) {
20952353 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
20962354 cpuctx->task_ctx = NULL;
....@@ -2157,6 +2415,7 @@
21572415 event_sched_out(event, cpuctx, ctx);
21582416
21592417 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2418
+ perf_cgroup_event_disable(event, ctx);
21602419 }
21612420
21622421 /*
....@@ -2164,7 +2423,7 @@
21642423 *
21652424 * If event->ctx is a cloned context, callers must make sure that
21662425 * every task struct that event->ctx->task could possibly point to
2167
- * remains valid. This condition is satisifed when called through
2426
+ * remains valid. This condition is satisfied when called through
21682427 * perf_event_for_each_child or perf_event_for_each because they
21692428 * hold the top-level event's child_mutex, so any descendant that
21702429 * goes to exit will block in perf_event_exit_event().
....@@ -2238,7 +2497,7 @@
22382497 * But this is a bit hairy.
22392498 *
22402499 * So instead, we have an explicit cgroup call to remain
2241
- * within the time time source all along. We believe it
2500
+ * within the time source all along. We believe it
22422501 * is cleaner and simpler to understand.
22432502 */
22442503 if (is_cgroup_event(event))
....@@ -2258,6 +2517,8 @@
22582517 struct perf_event_context *ctx)
22592518 {
22602519 int ret = 0;
2520
+
2521
+ WARN_ON_ONCE(event->ctx != ctx);
22612522
22622523 lockdep_assert_held(&ctx->lock);
22632524
....@@ -2325,11 +2586,8 @@
23252586
23262587 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
23272588
2328
- if (event_sched_in(group_event, cpuctx, ctx)) {
2329
- pmu->cancel_txn(pmu);
2330
- perf_mux_hrtimer_restart(cpuctx);
2331
- return -EAGAIN;
2332
- }
2589
+ if (event_sched_in(group_event, cpuctx, ctx))
2590
+ goto error;
23332591
23342592 /*
23352593 * Schedule in siblings as one group (if any):
....@@ -2358,10 +2616,8 @@
23582616 }
23592617 event_sched_out(group_event, cpuctx, ctx);
23602618
2619
+error:
23612620 pmu->cancel_txn(pmu);
2362
-
2363
- perf_mux_hrtimer_restart(cpuctx);
2364
-
23652621 return -EAGAIN;
23662622 }
23672623
....@@ -2387,7 +2643,7 @@
23872643 * If this group is exclusive and there are already
23882644 * events on the CPU, it can't go on.
23892645 */
2390
- if (event->attr.exclusive && cpuctx->active_oncpu)
2646
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
23912647 return 0;
23922648 /*
23932649 * Otherwise, try to add it if all previous groups were able
....@@ -2488,6 +2744,16 @@
24882744 perf_pmu_enable(cpuctx->ctx.pmu);
24892745 }
24902746
2747
+void perf_pmu_resched(struct pmu *pmu)
2748
+{
2749
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2750
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
2751
+
2752
+ perf_ctx_lock(cpuctx, task_ctx);
2753
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2754
+ perf_ctx_unlock(cpuctx, task_ctx);
2755
+}
2756
+
24912757 /*
24922758 * Cross CPU call to install and enable a performance event
24932759 *
....@@ -2528,7 +2794,7 @@
25282794 }
25292795
25302796 #ifdef CONFIG_CGROUP_PERF
2531
- if (is_cgroup_event(event)) {
2797
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
25322798 /*
25332799 * If the current cgroup doesn't match the event's
25342800 * cgroup, we should not try to schedule it.
....@@ -2580,6 +2846,25 @@
25802846 * will be 'complete'. See perf_iterate_sb_cpu().
25812847 */
25822848 smp_store_release(&event->ctx, ctx);
2849
+
2850
+ /*
2851
+ * perf_event_attr::disabled events will not run and can be initialized
2852
+ * without IPI. Except when this is the first event for the context, in
2853
+ * that case we need the magic of the IPI to set ctx->is_active.
2854
+ *
2855
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
2856
+ * event will issue the IPI and reprogram the hardware.
2857
+ */
2858
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2859
+ raw_spin_lock_irq(&ctx->lock);
2860
+ if (ctx->task == TASK_TOMBSTONE) {
2861
+ raw_spin_unlock_irq(&ctx->lock);
2862
+ return;
2863
+ }
2864
+ add_event_to_ctx(event, ctx);
2865
+ raw_spin_unlock_irq(&ctx->lock);
2866
+ return;
2867
+ }
25832868
25842869 if (!task) {
25852870 cpu_function_call(cpu, __perf_install_in_context, event);
....@@ -2669,6 +2954,7 @@
26692954 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
26702955
26712956 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2957
+ perf_cgroup_event_enable(event, ctx);
26722958
26732959 if (!ctx->is_active)
26742960 return;
....@@ -2710,6 +2996,7 @@
27102996 raw_spin_lock_irq(&ctx->lock);
27112997 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
27122998 event->state < PERF_EVENT_STATE_ERROR) {
2999
+out:
27133000 raw_spin_unlock_irq(&ctx->lock);
27143001 return;
27153002 }
....@@ -2721,8 +3008,16 @@
27213008 * has gone back into error state, as distinct from the task having
27223009 * been scheduled away before the cross-call arrived.
27233010 */
2724
- if (event->state == PERF_EVENT_STATE_ERROR)
3011
+ if (event->state == PERF_EVENT_STATE_ERROR) {
3012
+ /*
3013
+ * Detached SIBLING events cannot leave ERROR state.
3014
+ */
3015
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
3016
+ event->group_leader == event)
3017
+ goto out;
3018
+
27253019 event->state = PERF_EVENT_STATE_OFF;
3020
+ }
27263021 raw_spin_unlock_irq(&ctx->lock);
27273022
27283023 event_function_call(event, __perf_event_enable, NULL);
....@@ -2826,7 +3121,7 @@
28263121 * pre-existing mappings, called once when new filters arrive via SET_FILTER
28273122 * ioctl;
28283123 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2829
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
3124
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
28303125 * for reading;
28313126 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
28323127 * of exec.
....@@ -2966,6 +3261,13 @@
29663261 if (is_active & EVENT_FLEXIBLE) {
29673262 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
29683263 group_sched_out(event, cpuctx, ctx);
3264
+
3265
+ /*
3266
+ * Since we cleared EVENT_FLEXIBLE, also clear
3267
+ * rotate_necessary; it will be reset by
3268
+ * ctx_flexible_sched_in() when needed.
3269
+ */
3270
+ ctx->rotate_necessary = 0;
29693271 }
29703272 perf_pmu_enable(ctx->pmu);
29713273 }
....@@ -3080,10 +3382,12 @@
30803382 struct perf_event_context *parent, *next_parent;
30813383 struct perf_cpu_context *cpuctx;
30823384 int do_switch = 1;
3385
+ struct pmu *pmu;
30833386
30843387 if (likely(!ctx))
30853388 return;
30863389
3390
+ pmu = ctx->pmu;
30873391 cpuctx = __get_cpu_context(ctx);
30883392 if (!cpuctx->task_ctx)
30893393 return;
....@@ -3113,10 +3417,27 @@
31133417 raw_spin_lock(&ctx->lock);
31143418 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
31153419 if (context_equiv(ctx, next_ctx)) {
3420
+
31163421 WRITE_ONCE(ctx->task, next);
31173422 WRITE_ONCE(next_ctx->task, task);
31183423
3119
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3424
+ perf_pmu_disable(pmu);
3425
+
3426
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3427
+ pmu->sched_task(ctx, false);
3428
+
3429
+ /*
3430
+ * PMU specific parts of task perf context can require
3431
+ * additional synchronization. As an example of such
3432
+ * synchronization see implementation details of Intel
3433
+ * LBR call stack data profiling;
3434
+ */
3435
+ if (pmu->swap_task_ctx)
3436
+ pmu->swap_task_ctx(ctx, next_ctx);
3437
+ else
3438
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3439
+
3440
+ perf_pmu_enable(pmu);
31203441
31213442 /*
31223443 * RCU_INIT_POINTER here is safe because we've not
....@@ -3140,7 +3461,13 @@
31403461
31413462 if (do_switch) {
31423463 raw_spin_lock(&ctx->lock);
3464
+ perf_pmu_disable(pmu);
3465
+
3466
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3467
+ pmu->sched_task(ctx, false);
31433468 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3469
+
3470
+ perf_pmu_enable(pmu);
31443471 raw_spin_unlock(&ctx->lock);
31453472 }
31463473 }
....@@ -3176,29 +3503,39 @@
31763503 * PEBS requires this to provide PID/TID information. This requires we flush
31773504 * all queued PEBS records before we context switch to a new task.
31783505 */
3506
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3507
+{
3508
+ struct pmu *pmu;
3509
+
3510
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3511
+
3512
+ if (WARN_ON_ONCE(!pmu->sched_task))
3513
+ return;
3514
+
3515
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3516
+ perf_pmu_disable(pmu);
3517
+
3518
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
3519
+
3520
+ perf_pmu_enable(pmu);
3521
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3522
+}
3523
+
31793524 static void perf_pmu_sched_task(struct task_struct *prev,
31803525 struct task_struct *next,
31813526 bool sched_in)
31823527 {
31833528 struct perf_cpu_context *cpuctx;
3184
- struct pmu *pmu;
31853529
31863530 if (prev == next)
31873531 return;
31883532
31893533 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3190
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3191
-
3192
- if (WARN_ON_ONCE(!pmu->sched_task))
3534
+ /* will be handled in perf_event_context_sched_in/out */
3535
+ if (cpuctx->task_ctx)
31933536 continue;
31943537
3195
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3196
- perf_pmu_disable(pmu);
3197
-
3198
- pmu->sched_task(cpuctx->task_ctx, sched_in);
3199
-
3200
- perf_pmu_enable(pmu);
3201
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3538
+ __perf_pmu_sched_task(cpuctx, sched_in);
32023539 }
32033540 }
32043541
....@@ -3251,83 +3588,149 @@
32513588 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
32523589 }
32533590
3254
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3255
- int (*func)(struct perf_event *, void *), void *data)
3591
+static bool perf_less_group_idx(const void *l, const void *r)
32563592 {
3257
- struct perf_event **evt, *evt1, *evt2;
3593
+ const struct perf_event *le = *(const struct perf_event **)l;
3594
+ const struct perf_event *re = *(const struct perf_event **)r;
3595
+
3596
+ return le->group_index < re->group_index;
3597
+}
3598
+
3599
+static void swap_ptr(void *l, void *r)
3600
+{
3601
+ void **lp = l, **rp = r;
3602
+
3603
+ swap(*lp, *rp);
3604
+}
3605
+
3606
+static const struct min_heap_callbacks perf_min_heap = {
3607
+ .elem_size = sizeof(struct perf_event *),
3608
+ .less = perf_less_group_idx,
3609
+ .swp = swap_ptr,
3610
+};
3611
+
3612
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
3613
+{
3614
+ struct perf_event **itrs = heap->data;
3615
+
3616
+ if (event) {
3617
+ itrs[heap->nr] = event;
3618
+ heap->nr++;
3619
+ }
3620
+}
3621
+
3622
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3623
+ struct perf_event_groups *groups, int cpu,
3624
+ int (*func)(struct perf_event *, void *),
3625
+ void *data)
3626
+{
3627
+#ifdef CONFIG_CGROUP_PERF
3628
+ struct cgroup_subsys_state *css = NULL;
3629
+#endif
3630
+ /* Space for per CPU and/or any CPU event iterators. */
3631
+ struct perf_event *itrs[2];
3632
+ struct min_heap event_heap;
3633
+ struct perf_event **evt;
32583634 int ret;
32593635
3260
- evt1 = perf_event_groups_first(groups, -1);
3261
- evt2 = perf_event_groups_first(groups, cpu);
3636
+ if (cpuctx) {
3637
+ event_heap = (struct min_heap){
3638
+ .data = cpuctx->heap,
3639
+ .nr = 0,
3640
+ .size = cpuctx->heap_size,
3641
+ };
32623642
3263
- while (evt1 || evt2) {
3264
- if (evt1 && evt2) {
3265
- if (evt1->group_index < evt2->group_index)
3266
- evt = &evt1;
3267
- else
3268
- evt = &evt2;
3269
- } else if (evt1) {
3270
- evt = &evt1;
3271
- } else {
3272
- evt = &evt2;
3273
- }
3643
+ lockdep_assert_held(&cpuctx->ctx.lock);
32743644
3645
+#ifdef CONFIG_CGROUP_PERF
3646
+ if (cpuctx->cgrp)
3647
+ css = &cpuctx->cgrp->css;
3648
+#endif
3649
+ } else {
3650
+ event_heap = (struct min_heap){
3651
+ .data = itrs,
3652
+ .nr = 0,
3653
+ .size = ARRAY_SIZE(itrs),
3654
+ };
3655
+ /* Events not within a CPU context may be on any CPU. */
3656
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3657
+ }
3658
+ evt = event_heap.data;
3659
+
3660
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3661
+
3662
+#ifdef CONFIG_CGROUP_PERF
3663
+ for (; css; css = css->parent)
3664
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3665
+#endif
3666
+
3667
+ min_heapify_all(&event_heap, &perf_min_heap);
3668
+
3669
+ while (event_heap.nr) {
32753670 ret = func(*evt, data);
32763671 if (ret)
32773672 return ret;
32783673
32793674 *evt = perf_event_groups_next(*evt);
3280
- }
3281
-
3282
- return 0;
3283
-}
3284
-
3285
-struct sched_in_data {
3286
- struct perf_event_context *ctx;
3287
- struct perf_cpu_context *cpuctx;
3288
- int can_add_hw;
3289
-};
3290
-
3291
-static int pinned_sched_in(struct perf_event *event, void *data)
3292
-{
3293
- struct sched_in_data *sid = data;
3294
-
3295
- if (event->state <= PERF_EVENT_STATE_OFF)
3296
- return 0;
3297
-
3298
- if (!event_filter_match(event))
3299
- return 0;
3300
-
3301
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3302
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3303
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3304
- }
3305
-
3306
- /*
3307
- * If this pinned group hasn't been scheduled,
3308
- * put it in error state.
3309
- */
3310
- if (event->state == PERF_EVENT_STATE_INACTIVE)
3311
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3312
-
3313
- return 0;
3314
-}
3315
-
3316
-static int flexible_sched_in(struct perf_event *event, void *data)
3317
-{
3318
- struct sched_in_data *sid = data;
3319
-
3320
- if (event->state <= PERF_EVENT_STATE_OFF)
3321
- return 0;
3322
-
3323
- if (!event_filter_match(event))
3324
- return 0;
3325
-
3326
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3327
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3328
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3675
+ if (*evt)
3676
+ min_heapify(&event_heap, 0, &perf_min_heap);
33293677 else
3330
- sid->can_add_hw = 0;
3678
+ min_heap_pop(&event_heap, &perf_min_heap);
3679
+ }
3680
+
3681
+ return 0;
3682
+}
3683
+
3684
+static inline bool event_update_userpage(struct perf_event *event)
3685
+{
3686
+ if (likely(!atomic_read(&event->mmap_count)))
3687
+ return false;
3688
+
3689
+ perf_event_update_time(event);
3690
+ perf_set_shadow_time(event, event->ctx);
3691
+ perf_event_update_userpage(event);
3692
+
3693
+ return true;
3694
+}
3695
+
3696
+static inline void group_update_userpage(struct perf_event *group_event)
3697
+{
3698
+ struct perf_event *event;
3699
+
3700
+ if (!event_update_userpage(group_event))
3701
+ return;
3702
+
3703
+ for_each_sibling_event(event, group_event)
3704
+ event_update_userpage(event);
3705
+}
3706
+
3707
+static int merge_sched_in(struct perf_event *event, void *data)
3708
+{
3709
+ struct perf_event_context *ctx = event->ctx;
3710
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3711
+ int *can_add_hw = data;
3712
+
3713
+ if (event->state <= PERF_EVENT_STATE_OFF)
3714
+ return 0;
3715
+
3716
+ if (!event_filter_match(event))
3717
+ return 0;
3718
+
3719
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3720
+ if (!group_sched_in(event, cpuctx, ctx))
3721
+ list_add_tail(&event->active_list, get_event_list(event));
3722
+ }
3723
+
3724
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
3725
+ *can_add_hw = 0;
3726
+ if (event->attr.pinned) {
3727
+ perf_cgroup_event_disable(event, ctx);
3728
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3729
+ } else {
3730
+ ctx->rotate_necessary = 1;
3731
+ perf_mux_hrtimer_restart(cpuctx);
3732
+ group_update_userpage(event);
3733
+ }
33313734 }
33323735
33333736 return 0;
....@@ -3337,30 +3740,28 @@
33373740 ctx_pinned_sched_in(struct perf_event_context *ctx,
33383741 struct perf_cpu_context *cpuctx)
33393742 {
3340
- struct sched_in_data sid = {
3341
- .ctx = ctx,
3342
- .cpuctx = cpuctx,
3343
- .can_add_hw = 1,
3344
- };
3743
+ int can_add_hw = 1;
33453744
3346
- visit_groups_merge(&ctx->pinned_groups,
3745
+ if (ctx != &cpuctx->ctx)
3746
+ cpuctx = NULL;
3747
+
3748
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
33473749 smp_processor_id(),
3348
- pinned_sched_in, &sid);
3750
+ merge_sched_in, &can_add_hw);
33493751 }
33503752
33513753 static void
33523754 ctx_flexible_sched_in(struct perf_event_context *ctx,
33533755 struct perf_cpu_context *cpuctx)
33543756 {
3355
- struct sched_in_data sid = {
3356
- .ctx = ctx,
3357
- .cpuctx = cpuctx,
3358
- .can_add_hw = 1,
3359
- };
3757
+ int can_add_hw = 1;
33603758
3361
- visit_groups_merge(&ctx->flexible_groups,
3759
+ if (ctx != &cpuctx->ctx)
3760
+ cpuctx = NULL;
3761
+
3762
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
33623763 smp_processor_id(),
3363
- flexible_sched_in, &sid);
3764
+ merge_sched_in, &can_add_hw);
33643765 }
33653766
33663767 static void
....@@ -3419,10 +3820,14 @@
34193820 struct task_struct *task)
34203821 {
34213822 struct perf_cpu_context *cpuctx;
3823
+ struct pmu *pmu = ctx->pmu;
34223824
34233825 cpuctx = __get_cpu_context(ctx);
3424
- if (cpuctx->task_ctx == ctx)
3826
+ if (cpuctx->task_ctx == ctx) {
3827
+ if (cpuctx->sched_cb_usage)
3828
+ __perf_pmu_sched_task(cpuctx, true);
34253829 return;
3830
+ }
34263831
34273832 perf_ctx_lock(cpuctx, ctx);
34283833 /*
....@@ -3432,7 +3837,7 @@
34323837 if (!ctx->nr_events)
34333838 goto unlock;
34343839
3435
- perf_pmu_disable(ctx->pmu);
3840
+ perf_pmu_disable(pmu);
34363841 /*
34373842 * We want to keep the following priority order:
34383843 * cpu pinned (that don't need to move), task pinned,
....@@ -3444,7 +3849,11 @@
34443849 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
34453850 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
34463851 perf_event_sched_in(cpuctx, ctx, task);
3447
- perf_pmu_enable(ctx->pmu);
3852
+
3853
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3854
+ pmu->sched_task(cpuctx->task_ctx, true);
3855
+
3856
+ perf_pmu_enable(pmu);
34483857
34493858 unlock:
34503859 perf_ctx_unlock(cpuctx, ctx);
....@@ -3685,34 +4094,45 @@
36854094 perf_event_groups_insert(&ctx->flexible_groups, event);
36864095 }
36874096
4097
+/* pick an event from the flexible_groups to rotate */
36884098 static inline struct perf_event *
3689
-ctx_first_active(struct perf_event_context *ctx)
4099
+ctx_event_to_rotate(struct perf_event_context *ctx)
36904100 {
3691
- return list_first_entry_or_null(&ctx->flexible_active,
3692
- struct perf_event, active_list);
4101
+ struct perf_event *event;
4102
+
4103
+ /* pick the first active flexible event */
4104
+ event = list_first_entry_or_null(&ctx->flexible_active,
4105
+ struct perf_event, active_list);
4106
+
4107
+ /* if no active flexible event, pick the first event */
4108
+ if (!event) {
4109
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4110
+ typeof(*event), group_node);
4111
+ }
4112
+
4113
+ /*
4114
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4115
+ * finds there are unschedulable events, it will set it again.
4116
+ */
4117
+ ctx->rotate_necessary = 0;
4118
+
4119
+ return event;
36934120 }
36944121
36954122 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
36964123 {
36974124 struct perf_event *cpu_event = NULL, *task_event = NULL;
3698
- bool cpu_rotate = false, task_rotate = false;
3699
- struct perf_event_context *ctx = NULL;
4125
+ struct perf_event_context *task_ctx = NULL;
4126
+ int cpu_rotate, task_rotate;
37004127
37014128 /*
37024129 * Since we run this from IRQ context, nobody can install new
37034130 * events, thus the event count values are stable.
37044131 */
37054132
3706
- if (cpuctx->ctx.nr_events) {
3707
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3708
- cpu_rotate = true;
3709
- }
3710
-
3711
- ctx = cpuctx->task_ctx;
3712
- if (ctx && ctx->nr_events) {
3713
- if (ctx->nr_events != ctx->nr_active)
3714
- task_rotate = true;
3715
- }
4133
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
4134
+ task_ctx = cpuctx->task_ctx;
4135
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
37164136
37174137 if (!(cpu_rotate || task_rotate))
37184138 return false;
....@@ -3721,25 +4141,25 @@
37214141 perf_pmu_disable(cpuctx->ctx.pmu);
37224142
37234143 if (task_rotate)
3724
- task_event = ctx_first_active(ctx);
4144
+ task_event = ctx_event_to_rotate(task_ctx);
37254145 if (cpu_rotate)
3726
- cpu_event = ctx_first_active(&cpuctx->ctx);
4146
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
37274147
37284148 /*
37294149 * As per the order given at ctx_resched() first 'pop' task flexible
37304150 * and then, if needed CPU flexible.
37314151 */
3732
- if (task_event || (ctx && cpu_event))
3733
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
4152
+ if (task_event || (task_ctx && cpu_event))
4153
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
37344154 if (cpu_event)
37354155 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
37364156
37374157 if (task_event)
3738
- rotate_ctx(ctx, task_event);
4158
+ rotate_ctx(task_ctx, task_event);
37394159 if (cpu_event)
37404160 rotate_ctx(&cpuctx->ctx, cpu_event);
37414161
3742
- perf_event_sched_in(cpuctx, ctx, current);
4162
+ perf_event_sched_in(cpuctx, task_ctx, current);
37434163
37444164 perf_pmu_enable(cpuctx->ctx.pmu);
37454165 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
....@@ -3983,6 +4403,7 @@
39834403
39844404 return ret;
39854405 }
4406
+EXPORT_SYMBOL_GPL(perf_event_read_local);
39864407
39874408 static int perf_event_read(struct perf_event *event, bool group)
39884409 {
....@@ -4074,7 +4495,7 @@
40744495 INIT_LIST_HEAD(&ctx->event_list);
40754496 INIT_LIST_HEAD(&ctx->pinned_active);
40764497 INIT_LIST_HEAD(&ctx->flexible_active);
4077
- atomic_set(&ctx->refcount, 1);
4498
+ refcount_set(&ctx->refcount, 1);
40784499 }
40794500
40804501 static struct perf_event_context *
....@@ -4087,10 +4508,8 @@
40874508 return NULL;
40884509
40894510 __perf_event_init_context(ctx);
4090
- if (task) {
4091
- ctx->task = task;
4092
- get_task_struct(task);
4093
- }
4511
+ if (task)
4512
+ ctx->task = get_task_struct(task);
40944513 ctx->pmu = pmu;
40954514
40964515 return ctx;
....@@ -4152,7 +4571,7 @@
41524571 goto errout;
41534572
41544573 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4155
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4574
+ task_ctx_data = alloc_task_ctx_data(pmu);
41564575 if (!task_ctx_data) {
41574576 err = -ENOMEM;
41584577 goto errout;
....@@ -4210,11 +4629,11 @@
42104629 }
42114630 }
42124631
4213
- kfree(task_ctx_data);
4632
+ free_task_ctx_data(pmu, task_ctx_data);
42144633 return ctx;
42154634
42164635 errout:
4217
- kfree(task_ctx_data);
4636
+ free_task_ctx_data(pmu, task_ctx_data);
42184637 return ERR_PTR(err);
42194638 }
42204639
....@@ -4233,7 +4652,7 @@
42334652 }
42344653
42354654 static void ring_buffer_attach(struct perf_event *event,
4236
- struct ring_buffer *rb);
4655
+ struct perf_buffer *rb);
42374656
42384657 static void detach_sb_event(struct perf_event *event)
42394658 {
....@@ -4256,8 +4675,9 @@
42564675
42574676 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
42584677 attr->comm || attr->comm_exec ||
4259
- attr->task ||
4260
- attr->context_switch)
4678
+ attr->task || attr->ksymbol ||
4679
+ attr->context_switch || attr->text_poke ||
4680
+ attr->bpf_event)
42614681 return true;
42624682 return false;
42634683 }
....@@ -4306,7 +4726,7 @@
43064726 if (event->parent)
43074727 return;
43084728
4309
- if (event->attach_state & PERF_ATTACH_TASK)
4729
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
43104730 dec = true;
43114731 if (event->attr.mmap || event->attr.mmap_data)
43124732 atomic_dec(&nr_mmap_events);
....@@ -4314,6 +4734,8 @@
43144734 atomic_dec(&nr_comm_events);
43154735 if (event->attr.namespaces)
43164736 atomic_dec(&nr_namespaces_events);
4737
+ if (event->attr.cgroup)
4738
+ atomic_dec(&nr_cgroup_events);
43174739 if (event->attr.task)
43184740 atomic_dec(&nr_task_events);
43194741 if (event->attr.freq)
....@@ -4326,6 +4748,12 @@
43264748 dec = true;
43274749 if (has_branch_stack(event))
43284750 dec = true;
4751
+ if (event->attr.ksymbol)
4752
+ atomic_dec(&nr_ksymbol_events);
4753
+ if (event->attr.bpf_event)
4754
+ atomic_dec(&nr_bpf_events);
4755
+ if (event->attr.text_poke)
4756
+ atomic_dec(&nr_text_poke_events);
43294757
43304758 if (dec) {
43314759 if (!atomic_add_unless(&perf_sched_count, -1, 1))
....@@ -4909,7 +5337,7 @@
49095337 static __poll_t perf_poll(struct file *file, poll_table *wait)
49105338 {
49115339 struct perf_event *event = file->private_data;
4912
- struct ring_buffer *rb;
5340
+ struct perf_buffer *rb;
49135341 __poll_t events = EPOLLHUP;
49145342
49155343 poll_wait(file, &event->waitq, wait);
....@@ -4935,6 +5363,24 @@
49355363 local64_set(&event->count, 0);
49365364 perf_event_update_userpage(event);
49375365 }
5366
+
5367
+/* Assume it's not an event with inherit set. */
5368
+u64 perf_event_pause(struct perf_event *event, bool reset)
5369
+{
5370
+ struct perf_event_context *ctx;
5371
+ u64 count;
5372
+
5373
+ ctx = perf_event_ctx_lock(event);
5374
+ WARN_ON_ONCE(event->attr.inherit);
5375
+ _perf_event_disable(event);
5376
+ count = local64_read(&event->count);
5377
+ if (reset)
5378
+ local64_set(&event->count, 0);
5379
+ perf_event_ctx_unlock(event, ctx);
5380
+
5381
+ return count;
5382
+}
5383
+EXPORT_SYMBOL_GPL(perf_event_pause);
49385384
49395385 /*
49405386 * Holding the top-level event's child_mutex means that any
....@@ -5013,15 +5459,10 @@
50135459 return event->pmu->check_period(event, value);
50145460 }
50155461
5016
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
5462
+static int _perf_event_period(struct perf_event *event, u64 value)
50175463 {
5018
- u64 value;
5019
-
50205464 if (!is_sampling_event(event))
50215465 return -EINVAL;
5022
-
5023
- if (copy_from_user(&value, arg, sizeof(value)))
5024
- return -EFAULT;
50255466
50265467 if (!value)
50275468 return -EINVAL;
....@@ -5039,6 +5480,19 @@
50395480
50405481 return 0;
50415482 }
5483
+
5484
+int perf_event_period(struct perf_event *event, u64 value)
5485
+{
5486
+ struct perf_event_context *ctx;
5487
+ int ret;
5488
+
5489
+ ctx = perf_event_ctx_lock(event);
5490
+ ret = _perf_event_period(event, value);
5491
+ perf_event_ctx_unlock(event, ctx);
5492
+
5493
+ return ret;
5494
+}
5495
+EXPORT_SYMBOL_GPL(perf_event_period);
50425496
50435497 static const struct file_operations perf_fops;
50445498
....@@ -5083,8 +5537,14 @@
50835537 return _perf_event_refresh(event, arg);
50845538
50855539 case PERF_EVENT_IOC_PERIOD:
5086
- return perf_event_period(event, (u64 __user *)arg);
5540
+ {
5541
+ u64 value;
50875542
5543
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5544
+ return -EFAULT;
5545
+
5546
+ return _perf_event_period(event, value);
5547
+ }
50885548 case PERF_EVENT_IOC_ID:
50895549 {
50905550 u64 id = primary_event_id(event);
....@@ -5119,7 +5579,7 @@
51195579 return perf_event_set_bpf_prog(event, arg);
51205580
51215581 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5122
- struct ring_buffer *rb;
5582
+ struct perf_buffer *rb;
51235583
51245584 rcu_read_lock();
51255585 rb = rcu_dereference(event->rb);
....@@ -5255,7 +5715,7 @@
52555715 static void perf_event_init_userpage(struct perf_event *event)
52565716 {
52575717 struct perf_event_mmap_page *userpg;
5258
- struct ring_buffer *rb;
5718
+ struct perf_buffer *rb;
52595719
52605720 rcu_read_lock();
52615721 rb = rcu_dereference(event->rb);
....@@ -5287,7 +5747,7 @@
52875747 void perf_event_update_userpage(struct perf_event *event)
52885748 {
52895749 struct perf_event_mmap_page *userpg;
5290
- struct ring_buffer *rb;
5750
+ struct perf_buffer *rb;
52915751 u64 enabled, running, now;
52925752
52935753 rcu_read_lock();
....@@ -5338,7 +5798,7 @@
53385798 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
53395799 {
53405800 struct perf_event *event = vmf->vma->vm_file->private_data;
5341
- struct ring_buffer *rb;
5801
+ struct perf_buffer *rb;
53425802 vm_fault_t ret = VM_FAULT_SIGBUS;
53435803
53445804 if (vmf->flags & FAULT_FLAG_MKWRITE) {
....@@ -5371,10 +5831,12 @@
53715831 }
53725832
53735833 static void ring_buffer_attach(struct perf_event *event,
5374
- struct ring_buffer *rb)
5834
+ struct perf_buffer *rb)
53755835 {
5376
- struct ring_buffer *old_rb = NULL;
5836
+ struct perf_buffer *old_rb = NULL;
53775837 unsigned long flags;
5838
+
5839
+ WARN_ON_ONCE(event->parent);
53785840
53795841 if (event->rb) {
53805842 /*
....@@ -5431,7 +5893,10 @@
54315893
54325894 static void ring_buffer_wakeup(struct perf_event *event)
54335895 {
5434
- struct ring_buffer *rb;
5896
+ struct perf_buffer *rb;
5897
+
5898
+ if (event->parent)
5899
+ event = event->parent;
54355900
54365901 rcu_read_lock();
54375902 rb = rcu_dereference(event->rb);
....@@ -5442,14 +5907,17 @@
54425907 rcu_read_unlock();
54435908 }
54445909
5445
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
5910
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
54465911 {
5447
- struct ring_buffer *rb;
5912
+ struct perf_buffer *rb;
5913
+
5914
+ if (event->parent)
5915
+ event = event->parent;
54485916
54495917 rcu_read_lock();
54505918 rb = rcu_dereference(event->rb);
54515919 if (rb) {
5452
- if (!atomic_inc_not_zero(&rb->refcount))
5920
+ if (!refcount_inc_not_zero(&rb->refcount))
54535921 rb = NULL;
54545922 }
54555923 rcu_read_unlock();
....@@ -5457,9 +5925,9 @@
54575925 return rb;
54585926 }
54595927
5460
-void ring_buffer_put(struct ring_buffer *rb)
5928
+void ring_buffer_put(struct perf_buffer *rb)
54615929 {
5462
- if (!atomic_dec_and_test(&rb->refcount))
5930
+ if (!refcount_dec_and_test(&rb->refcount))
54635931 return;
54645932
54655933 WARN_ON_ONCE(!list_empty(&rb->event_list));
....@@ -5494,7 +5962,7 @@
54945962 static void perf_mmap_close(struct vm_area_struct *vma)
54955963 {
54965964 struct perf_event *event = vma->vm_file->private_data;
5497
- struct ring_buffer *rb = ring_buffer_get(event);
5965
+ struct perf_buffer *rb = ring_buffer_get(event);
54985966 struct user_struct *mmap_user = rb->mmap_user;
54995967 int mmap_locked = rb->mmap_locked;
55005968 unsigned long size = perf_data_size(rb);
....@@ -5519,12 +5987,12 @@
55195987 perf_pmu_output_stop(event);
55205988
55215989 /* now it's safe to free the pages */
5522
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5523
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5990
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5991
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
55245992
55255993 /* this has to be the last one */
55265994 rb_free_aux(rb);
5527
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5995
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
55285996
55295997 mutex_unlock(&event->mmap_mutex);
55305998 }
....@@ -5593,8 +6061,9 @@
55936061 * undo the VM accounting.
55946062 */
55956063
5596
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5597
- vma->vm_mm->pinned_vm -= mmap_locked;
6064
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6065
+ &mmap_user->locked_vm);
6066
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
55986067 free_uid(mmap_user);
55996068
56006069 out_put:
....@@ -5603,7 +6072,7 @@
56036072
56046073 static const struct vm_operations_struct perf_mmap_vmops = {
56056074 .open = perf_mmap_open,
5606
- .close = perf_mmap_close, /* non mergable */
6075
+ .close = perf_mmap_close, /* non mergeable */
56076076 .fault = perf_mmap_fault,
56086077 .page_mkwrite = perf_mmap_fault,
56096078 };
....@@ -5613,8 +6082,8 @@
56136082 struct perf_event *event = file->private_data;
56146083 unsigned long user_locked, user_lock_limit;
56156084 struct user_struct *user = current_user();
6085
+ struct perf_buffer *rb = NULL;
56166086 unsigned long locked, lock_limit;
5617
- struct ring_buffer *rb = NULL;
56186087 unsigned long vma_size;
56196088 unsigned long nr_pages;
56206089 long user_extra = 0, extra = 0;
....@@ -5711,17 +6180,17 @@
57116180 again:
57126181 mutex_lock(&event->mmap_mutex);
57136182 if (event->rb) {
5714
- if (event->rb->nr_pages != nr_pages) {
6183
+ if (data_page_nr(event->rb) != nr_pages) {
57156184 ret = -EINVAL;
57166185 goto unlock;
57176186 }
57186187
57196188 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
57206189 /*
5721
- * Raced against perf_mmap_close() through
5722
- * perf_event_set_output(). Try again, hope for better
5723
- * luck.
6190
+ * Raced against perf_mmap_close(); remove the
6191
+ * event and try again.
57246192 */
6193
+ ring_buffer_attach(event, NULL);
57256194 mutex_unlock(&event->mmap_mutex);
57266195 goto again;
57276196 }
....@@ -5749,12 +6218,18 @@
57496218 user_locked = user_lock_limit;
57506219 user_locked += user_extra;
57516220
5752
- if (user_locked > user_lock_limit)
6221
+ if (user_locked > user_lock_limit) {
6222
+ /*
6223
+ * charge locked_vm until it hits user_lock_limit;
6224
+ * charge the rest from pinned_vm
6225
+ */
57536226 extra = user_locked - user_lock_limit;
6227
+ user_extra -= extra;
6228
+ }
57546229
57556230 lock_limit = rlimit(RLIMIT_MEMLOCK);
57566231 lock_limit >>= PAGE_SHIFT;
5757
- locked = vma->vm_mm->pinned_vm + extra;
6232
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
57586233
57596234 if ((locked > lock_limit) && perf_is_paranoid() &&
57606235 !capable(CAP_IPC_LOCK)) {
....@@ -5783,6 +6258,8 @@
57836258
57846259 ring_buffer_attach(event, rb);
57856260
6261
+ perf_event_update_time(event);
6262
+ perf_set_shadow_time(event, event->ctx);
57866263 perf_event_init_userpage(event);
57876264 perf_event_update_userpage(event);
57886265 } else {
....@@ -5795,7 +6272,7 @@
57956272 unlock:
57966273 if (!ret) {
57976274 atomic_long_add(user_extra, &user->locked_vm);
5798
- vma->vm_mm->pinned_vm += extra;
6275
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
57996276
58006277 atomic_inc(&event->mmap_count);
58016278 } else if (rb) {
....@@ -5932,18 +6409,25 @@
59326409 * Later on, we might change it to a list if there is
59336410 * another virtualization implementation supporting the callbacks.
59346411 */
5935
-struct perf_guest_info_callbacks *perf_guest_cbs;
6412
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
59366413
59376414 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59386415 {
5939
- perf_guest_cbs = cbs;
6416
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6417
+ return -EBUSY;
6418
+
6419
+ rcu_assign_pointer(perf_guest_cbs, cbs);
59406420 return 0;
59416421 }
59426422 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
59436423
59446424 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59456425 {
5946
- perf_guest_cbs = NULL;
6426
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6427
+ return -EINVAL;
6428
+
6429
+ rcu_assign_pointer(perf_guest_cbs, NULL);
6430
+ synchronize_rcu();
59476431 return 0;
59486432 }
59496433 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
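
The rework above turns perf_guest_cbs into an RCU-published pointer with strict single-owner semantics: a second registration fails with -EBUSY, only the currently installed callbacks may be unregistered, and readers are flushed with synchronize_rcu() afterwards. The sketch below reproduces only the ownership contract using C11 atomics; the grace-period wait has no userspace stand-in here, and register_guest_cbs/unregister_guest_cbs are illustrative names.

    /* Userspace sketch of the single-slot callback registry contract. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <errno.h>

    struct guest_cbs { int (*is_in_guest)(void); };

    static _Atomic(struct guest_cbs *) guest_cbs;

    static int register_guest_cbs(struct guest_cbs *cbs)
    {
        struct guest_cbs *expected = NULL;

        /* fails if somebody already registered */
        if (!atomic_compare_exchange_strong(&guest_cbs, &expected, cbs))
            return -EBUSY;
        return 0;
    }

    static int unregister_guest_cbs(struct guest_cbs *cbs)
    {
        struct guest_cbs *expected = cbs;

        /* only the currently registered callbacks may be torn down */
        if (!atomic_compare_exchange_strong(&guest_cbs, &expected, NULL))
            return -EINVAL;
        /* the kernel additionally waits for readers here (synchronize_rcu) */
        return 0;
    }

    int main(void)
    {
        struct guest_cbs a = { 0 }, b = { 0 };

        printf("%d\n", register_guest_cbs(&a));   /*  0       */
        printf("%d\n", register_guest_cbs(&b));   /* -EBUSY   */
        printf("%d\n", unregister_guest_cbs(&b)); /* -EINVAL  */
        printf("%d\n", unregister_guest_cbs(&a)); /*  0       */
        return 0;
    }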
....@@ -5965,14 +6449,13 @@
59656449 }
59666450
59676451 static void perf_sample_regs_user(struct perf_regs *regs_user,
5968
- struct pt_regs *regs,
5969
- struct pt_regs *regs_user_copy)
6452
+ struct pt_regs *regs)
59706453 {
59716454 if (user_mode(regs)) {
59726455 regs_user->abi = perf_reg_abi(current);
59736456 regs_user->regs = regs;
59746457 } else if (!(current->flags & PF_KTHREAD)) {
5975
- perf_get_regs_user(regs_user, regs, regs_user_copy);
6458
+ perf_get_regs_user(regs_user, regs);
59766459 } else {
59776460 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
59786461 regs_user->regs = NULL;
....@@ -5991,7 +6474,7 @@
59916474 * Get remaining task size from user stack pointer.
59926475 *
59936476 * It'd be better to take stack vma map and limit this more
5994
- * precisly, but there's no way to get it safely under interrupt,
6477
+ * precisely, but there's no way to get it safely under interrupt,
59956478 * so using TASK_SIZE as limit.
59966479 */
59976480 static u64 perf_ustack_task_size(struct pt_regs *regs)
....@@ -6073,10 +6556,9 @@
60736556
60746557 /* Data. */
60756558 sp = perf_user_stack_pointer(regs);
6076
- fs = get_fs();
6077
- set_fs(USER_DS);
6559
+ fs = force_uaccess_begin();
60786560 rem = __output_copy_user(handle, (void *) sp, dump_size);
6079
- set_fs(fs);
6561
+ force_uaccess_end(fs);
60806562 dyn_size = dump_size - rem;
60816563
60826564 perf_output_skip(handle, rem);
....@@ -6084,6 +6566,122 @@
60846566 /* Dynamic size. */
60856567 perf_output_put(handle, dyn_size);
60866568 }
6569
+}
6570
+
6571
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6572
+ struct perf_sample_data *data,
6573
+ size_t size)
6574
+{
6575
+ struct perf_event *sampler = event->aux_event;
6576
+ struct perf_buffer *rb;
6577
+
6578
+ data->aux_size = 0;
6579
+
6580
+ if (!sampler)
6581
+ goto out;
6582
+
6583
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6584
+ goto out;
6585
+
6586
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6587
+ goto out;
6588
+
6589
+ rb = ring_buffer_get(sampler);
6590
+ if (!rb)
6591
+ goto out;
6592
+
6593
+ /*
6594
+ * If this is an NMI hit inside sampling code, don't take
6595
+ * the sample. See also perf_aux_sample_output().
6596
+ */
6597
+ if (READ_ONCE(rb->aux_in_sampling)) {
6598
+ data->aux_size = 0;
6599
+ } else {
6600
+ size = min_t(size_t, size, perf_aux_size(rb));
6601
+ data->aux_size = ALIGN(size, sizeof(u64));
6602
+ }
6603
+ ring_buffer_put(rb);
6604
+
6605
+out:
6606
+ return data->aux_size;
6607
+}
6608
+
6609
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6610
+ struct perf_event *event,
6611
+ struct perf_output_handle *handle,
6612
+ unsigned long size)
6613
+{
6614
+ unsigned long flags;
6615
+ long ret;
6616
+
6617
+ /*
6618
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6619
+ * paths. If we start calling them in NMI context, they may race with
6620
+ * the IRQ ones, that is, for example, re-starting an event that's just
6621
+ * been stopped, which is why we're using a separate callback that
6622
+ * doesn't change the event state.
6623
+ *
6624
+ * IRQs need to be disabled to prevent IPIs from racing with us.
6625
+ */
6626
+ local_irq_save(flags);
6627
+ /*
6628
+ * Guard against NMI hits inside the critical section;
6629
+ * see also perf_prepare_sample_aux().
6630
+ */
6631
+ WRITE_ONCE(rb->aux_in_sampling, 1);
6632
+ barrier();
6633
+
6634
+ ret = event->pmu->snapshot_aux(event, handle, size);
6635
+
6636
+ barrier();
6637
+ WRITE_ONCE(rb->aux_in_sampling, 0);
6638
+ local_irq_restore(flags);
6639
+
6640
+ return ret;
6641
+}
6642
+
6643
+static void perf_aux_sample_output(struct perf_event *event,
6644
+ struct perf_output_handle *handle,
6645
+ struct perf_sample_data *data)
6646
+{
6647
+ struct perf_event *sampler = event->aux_event;
6648
+ struct perf_buffer *rb;
6649
+ unsigned long pad;
6650
+ long size;
6651
+
6652
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
6653
+ return;
6654
+
6655
+ rb = ring_buffer_get(sampler);
6656
+ if (!rb)
6657
+ return;
6658
+
6659
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6660
+
6661
+ /*
6662
+ * An error here means that perf_output_copy() failed (returned a
6663
+ * non-zero surplus that it didn't copy), which in its current
6664
+ * enlightened implementation is not possible. If that changes, we'd
6665
+ * like to know.
6666
+ */
6667
+ if (WARN_ON_ONCE(size < 0))
6668
+ goto out_put;
6669
+
6670
+ /*
6671
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
6672
+ * perf_prepare_sample_aux(), so should not be more than that.
6673
+ */
6674
+ pad = data->aux_size - size;
6675
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
6676
+ pad = 8;
6677
+
6678
+ if (pad) {
6679
+ u64 zero = 0;
6680
+ perf_output_copy(handle, &zero, pad);
6681
+ }
6682
+
6683
+out_put:
6684
+ ring_buffer_put(rb);
60876685 }
60886686
60896687 static void __perf_event_header__init_id(struct perf_event_header *header,
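
The perf_pmu_snapshot_aux()/perf_prepare_sample_aux() pair added above guards the AUX snapshot with rb->aux_in_sampling: the flag is raised with IRQs off around the PMU callback, and a sample taken from NMI context while it is set gets an AUX payload of size 0 instead of re-entering the snapshot path. A toy illustration of that flag pattern, with a nested call standing in for the NMI and purely illustrative names:

    #include <stdio.h>

    struct aux_buffer {
        volatile int aux_in_sampling;  /* WRITE_ONCE/READ_ONCE in the kernel */
        unsigned long aux_size;
    };

    static unsigned long prepare_aux_sample(struct aux_buffer *rb,
                                            unsigned long want)
    {
        /* "NMI" landed while a snapshot was in flight: skip the AUX payload */
        if (rb->aux_in_sampling)
            return 0;
        return want < rb->aux_size ? want : rb->aux_size;
    }

    static void snapshot_aux(struct aux_buffer *rb)
    {
        rb->aux_in_sampling = 1;
        /* ... PMU copies AUX data; a sample arriving now sees the flag ... */
        printf("nested sample gets %lu bytes\n", prepare_aux_sample(rb, 512));
        rb->aux_in_sampling = 0;
    }

    int main(void)
    {
        struct aux_buffer rb = { .aux_in_sampling = 0, .aux_size = 4096 };

        printf("normal sample gets %lu bytes\n", prepare_aux_sample(&rb, 512));
        snapshot_aux(&rb);
        return 0;
    }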
....@@ -6255,6 +6853,11 @@
62556853 perf_output_read_one(handle, event, enabled, running);
62566854 }
62576855
6856
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
6857
+{
6858
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6859
+}
6860
+
62586861 void perf_output_sample(struct perf_output_handle *handle,
62596862 struct perf_event_header *header,
62606863 struct perf_sample_data *data,
....@@ -6343,6 +6946,8 @@
63436946 * sizeof(struct perf_branch_entry);
63446947
63456948 perf_output_put(handle, data->br_stack->nr);
6949
+ if (perf_sample_save_hw_index(event))
6950
+ perf_output_put(handle, data->br_stack->hw_idx);
63466951 perf_output_copy(handle, data->br_stack->entries, size);
63476952 } else {
63486953 /*
....@@ -6405,11 +7010,21 @@
64057010 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
64067011 perf_output_put(handle, data->phys_addr);
64077012
7013
+ if (sample_type & PERF_SAMPLE_CGROUP)
7014
+ perf_output_put(handle, data->cgroup);
7015
+
7016
+ if (sample_type & PERF_SAMPLE_AUX) {
7017
+ perf_output_put(handle, data->aux_size);
7018
+
7019
+ if (data->aux_size)
7020
+ perf_aux_sample_output(event, handle, data);
7021
+ }
7022
+
64087023 if (!event->attr.watermark) {
64097024 int wakeup_events = event->attr.wakeup_events;
64107025
64117026 if (wakeup_events) {
6412
- struct ring_buffer *rb = handle->rb;
7027
+ struct perf_buffer *rb = handle->rb;
64137028 int events = local_inc_return(&rb->events);
64147029
64157030 if (events >= wakeup_events) {
....@@ -6437,14 +7052,14 @@
64377052 * Walking the pages tables for user address.
64387053 * Interrupts are disabled, so it prevents any tear down
64397054 * of the page tables.
6440
- * Try IRQ-safe __get_user_pages_fast first.
7055
+ * Try IRQ-safe get_user_page_fast_only first.
64417056 * If failed, leave phys_addr as 0.
64427057 */
64437058 if (current->mm != NULL) {
64447059 struct page *p;
64457060
64467061 pagefault_disable();
6447
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
7062
+ if (get_user_page_fast_only(virt, 0, &p)) {
64487063 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
64497064 put_page(p);
64507065 }
....@@ -6532,6 +7147,9 @@
65327147 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
65337148 int size = sizeof(u64); /* nr */
65347149 if (data->br_stack) {
7150
+ if (perf_sample_save_hw_index(event))
7151
+ size += sizeof(u64);
7152
+
65357153 size += data->br_stack->nr
65367154 * sizeof(struct perf_branch_entry);
65377155 }
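
With PERF_SAMPLE_BRANCH_HW_INDEX the branch-stack payload grows by one u64, which is why both the output hunk and the sizing hunk above add the optional hw_idx word. The sizing logic in plain C; branch_stack_size is an illustrative helper and the 24-byte entry size in main() is an assumption about sizeof(struct perf_branch_entry):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static size_t branch_stack_size(uint64_t nr, bool want_hw_index,
                                    size_t branch_entry_size)
    {
        size_t size = sizeof(uint64_t);        /* nr */

        if (want_hw_index)
            size += sizeof(uint64_t);          /* hw_idx */

        return (size_t)(size + nr * branch_entry_size);
    }

    int main(void)
    {
        /* 16 branches, hw_idx requested, assuming 24-byte entries */
        printf("%zu bytes\n", branch_stack_size(16, true, 24));  /* 400 */
        return 0;
    }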
....@@ -6539,8 +7157,7 @@
65397157 }
65407158
65417159 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6542
- perf_sample_regs_user(&data->regs_user, regs,
6543
- &data->regs_user_copy);
7160
+ perf_sample_regs_user(&data->regs_user, regs);
65447161
65457162 if (sample_type & PERF_SAMPLE_REGS_USER) {
65467163 /* regs dump ABI info */
....@@ -6556,7 +7173,7 @@
65567173
65577174 if (sample_type & PERF_SAMPLE_STACK_USER) {
65587175 /*
6559
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
7176
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
65607177 * processed as the last one or have additional check added
65617178 * in case new sample type is added, because we could eat
65627179 * up the rest of the sample size.
....@@ -6596,25 +7213,67 @@
65967213
65977214 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
65987215 data->phys_addr = perf_virt_to_phys(data->addr);
7216
+
7217
+#ifdef CONFIG_CGROUP_PERF
7218
+ if (sample_type & PERF_SAMPLE_CGROUP) {
7219
+ struct cgroup *cgrp;
7220
+
7221
+ /* protected by RCU */
7222
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7223
+ data->cgroup = cgroup_id(cgrp);
7224
+ }
7225
+#endif
7226
+
7227
+ if (sample_type & PERF_SAMPLE_AUX) {
7228
+ u64 size;
7229
+
7230
+ header->size += sizeof(u64); /* size */
7231
+
7232
+ /*
7233
+ * Given the 16bit nature of header::size, an AUX sample can
7234
+ * easily overflow it, what with all the preceding sample bits.
7235
+ * Make sure this doesn't happen by using up to U16_MAX bytes
7236
+ * per sample in total (rounded down to 8 byte boundary).
7237
+ */
7238
+ size = min_t(size_t, U16_MAX - header->size,
7239
+ event->attr.aux_sample_size);
7240
+ size = rounddown(size, 8);
7241
+ size = perf_prepare_sample_aux(event, data, size);
7242
+
7243
+ WARN_ON_ONCE(size + header->size > U16_MAX);
7244
+ header->size += size;
7245
+ }
7246
+ /*
7247
+ * If you're adding more sample types here, you likely need to do
7248
+ * something about the overflowing header::size, like repurpose the
7249
+ * lowest 3 bits of size, which should be always zero at the moment.
7250
+ * This raises a more important question, do we really need 512k sized
7251
+ * samples and why, so good argumentation is in order for whatever you
7252
+ * do here next.
7253
+ */
7254
+ WARN_ON_ONCE(header->size & 7);
65997255 }
66007256
6601
-static __always_inline void
7257
+static __always_inline int
66027258 __perf_event_output(struct perf_event *event,
66037259 struct perf_sample_data *data,
66047260 struct pt_regs *regs,
66057261 int (*output_begin)(struct perf_output_handle *,
7262
+ struct perf_sample_data *,
66067263 struct perf_event *,
66077264 unsigned int))
66087265 {
66097266 struct perf_output_handle handle;
66107267 struct perf_event_header header;
7268
+ int err;
66117269
66127270 /* protect the callchain buffers */
66137271 rcu_read_lock();
66147272
66157273 perf_prepare_sample(&header, data, event, regs);
66167274
6617
- if (output_begin(&handle, event, header.size))
7275
+ err = output_begin(&handle, data, event, header.size);
7276
+ if (err)
66187277 goto exit;
66197278
66207279 perf_output_sample(&handle, &header, data, event);
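
The PERF_SAMPLE_AUX handling added in the hunk above must respect the 16-bit perf_event_header::size field: the requested aux_sample_size is clamped to whatever still fits under U16_MAX and rounded down to an 8-byte multiple, before perf_prepare_sample_aux() clamps it again against the actual AUX buffer. A sketch of just that clamping arithmetic; clamp_aux_size and the numbers in main() are illustrative:

    #include <stdio.h>
    #include <stdint.h>

    #define U16_MAX 0xffffU

    static uint64_t clamp_aux_size(uint16_t header_size,
                                   uint64_t aux_sample_size)
    {
        uint64_t size;

        header_size += sizeof(uint64_t);   /* the 'size' field itself */

        size = U16_MAX - header_size;      /* room left in the record */
        if (aux_sample_size < size)
            size = aux_sample_size;

        return size & ~7ULL;               /* rounddown(size, 8) */
    }

    int main(void)
    {
        /* a 300-byte header with a requested 64 KiB AUX snapshot */
        printf("%llu\n",
               (unsigned long long)clamp_aux_size(300, 65536)); /* 65224 */
        return 0;
    }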
....@@ -6623,6 +7282,7 @@
66237282
66247283 exit:
66257284 rcu_read_unlock();
7285
+ return err;
66267286 }
66277287
66287288 void
....@@ -6641,12 +7301,12 @@
66417301 __perf_event_output(event, data, regs, perf_output_begin_backward);
66427302 }
66437303
6644
-void
7304
+int
66457305 perf_event_output(struct perf_event *event,
66467306 struct perf_sample_data *data,
66477307 struct pt_regs *regs)
66487308 {
6649
- __perf_event_output(event, data, regs, perf_output_begin);
7309
+ return __perf_event_output(event, data, regs, perf_output_begin);
66507310 }
66517311
66527312 /*
....@@ -6678,7 +7338,7 @@
66787338 int ret;
66797339
66807340 perf_event_header__init_id(&read_event.header, &sample, event);
6681
- ret = perf_output_begin(&handle, event, read_event.header.size);
7341
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
66827342 if (ret)
66837343 return;
66847344
....@@ -6823,7 +7483,7 @@
68237483 }
68247484
68257485 struct remote_output {
6826
- struct ring_buffer *rb;
7486
+ struct perf_buffer *rb;
68277487 int err;
68287488 };
68297489
....@@ -6831,7 +7491,7 @@
68317491 {
68327492 struct perf_event *parent = event->parent;
68337493 struct remote_output *ro = data;
6834
- struct ring_buffer *rb = ro->rb;
7494
+ struct perf_buffer *rb = ro->rb;
68357495 struct stop_event_data sd = {
68367496 .event = event,
68377497 };
....@@ -6947,7 +7607,7 @@
69477607
69487608 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
69497609
6950
- ret = perf_output_begin(&handle, event,
7610
+ ret = perf_output_begin(&handle, &sample, event,
69517611 task_event->event_id.header.size);
69527612 if (ret)
69537613 goto out;
....@@ -7050,7 +7710,7 @@
70507710 return;
70517711
70527712 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7053
- ret = perf_output_begin(&handle, event,
7713
+ ret = perf_output_begin(&handle, &sample, event,
70547714 comm_event->event_id.header.size);
70557715
70567716 if (ret)
....@@ -7150,7 +7810,7 @@
71507810
71517811 perf_event_header__init_id(&namespaces_event->event_id.header,
71527812 &sample, event);
7153
- ret = perf_output_begin(&handle, event,
7813
+ ret = perf_output_begin(&handle, &sample, event,
71547814 namespaces_event->event_id.header.size);
71557815 if (ret)
71567816 goto out;
....@@ -7175,7 +7835,7 @@
71757835 {
71767836 struct path ns_path;
71777837 struct inode *ns_inode;
7178
- void *error;
7838
+ int error;
71797839
71807840 error = ns_get_path(&ns_path, task, ns_ops);
71817841 if (!error) {
....@@ -7245,6 +7905,105 @@
72457905 }
72467906
72477907 /*
7908
+ * cgroup tracking
7909
+ */
7910
+#ifdef CONFIG_CGROUP_PERF
7911
+
7912
+struct perf_cgroup_event {
7913
+ char *path;
7914
+ int path_size;
7915
+ struct {
7916
+ struct perf_event_header header;
7917
+ u64 id;
7918
+ char path[];
7919
+ } event_id;
7920
+};
7921
+
7922
+static int perf_event_cgroup_match(struct perf_event *event)
7923
+{
7924
+ return event->attr.cgroup;
7925
+}
7926
+
7927
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
7928
+{
7929
+ struct perf_cgroup_event *cgroup_event = data;
7930
+ struct perf_output_handle handle;
7931
+ struct perf_sample_data sample;
7932
+ u16 header_size = cgroup_event->event_id.header.size;
7933
+ int ret;
7934
+
7935
+ if (!perf_event_cgroup_match(event))
7936
+ return;
7937
+
7938
+ perf_event_header__init_id(&cgroup_event->event_id.header,
7939
+ &sample, event);
7940
+ ret = perf_output_begin(&handle, &sample, event,
7941
+ cgroup_event->event_id.header.size);
7942
+ if (ret)
7943
+ goto out;
7944
+
7945
+ perf_output_put(&handle, cgroup_event->event_id);
7946
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
7947
+
7948
+ perf_event__output_id_sample(event, &handle, &sample);
7949
+
7950
+ perf_output_end(&handle);
7951
+out:
7952
+ cgroup_event->event_id.header.size = header_size;
7953
+}
7954
+
7955
+static void perf_event_cgroup(struct cgroup *cgrp)
7956
+{
7957
+ struct perf_cgroup_event cgroup_event;
7958
+ char path_enomem[16] = "//enomem";
7959
+ char *pathname;
7960
+ size_t size;
7961
+
7962
+ if (!atomic_read(&nr_cgroup_events))
7963
+ return;
7964
+
7965
+ cgroup_event = (struct perf_cgroup_event){
7966
+ .event_id = {
7967
+ .header = {
7968
+ .type = PERF_RECORD_CGROUP,
7969
+ .misc = 0,
7970
+ .size = sizeof(cgroup_event.event_id),
7971
+ },
7972
+ .id = cgroup_id(cgrp),
7973
+ },
7974
+ };
7975
+
7976
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
7977
+ if (pathname == NULL) {
7978
+ cgroup_event.path = path_enomem;
7979
+ } else {
7980
+ /* just to be sure to have enough space for alignment */
7981
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
7982
+ cgroup_event.path = pathname;
7983
+ }
7984
+
7985
+ /*
7986
+ * Since our buffer works in 8 byte units we need to align our string
7987
+ * size to a multiple of 8. However, we must guarantee the tail end is
7988
+ * zero'd out to avoid leaking random bits to userspace.
7989
+ */
7990
+ size = strlen(cgroup_event.path) + 1;
7991
+ while (!IS_ALIGNED(size, sizeof(u64)))
7992
+ cgroup_event.path[size++] = '\0';
7993
+
7994
+ cgroup_event.event_id.header.size += size;
7995
+ cgroup_event.path_size = size;
7996
+
7997
+ perf_iterate_sb(perf_event_cgroup_output,
7998
+ &cgroup_event,
7999
+ NULL);
8000
+
8001
+ kfree(pathname);
8002
+}
8003
+
8004
+#endif
8005
+
8006
+/*
72488007 * mmap tracking
72498008 */
72508009
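
The PERF_RECORD_CGROUP emission above pads the cgroup path up to a multiple of 8 bytes and explicitly zeroes the tail so no uninitialized bytes reach userspace. The same idea as a self-contained snippet; pad_path is an illustrative helper, not a kernel function:

    #include <stdio.h>
    #include <string.h>

    static size_t pad_path(char *path)
    {
        size_t size = strlen(path) + 1;    /* include the terminating NUL */

        while (size % 8)                   /* !IS_ALIGNED(size, sizeof(u64)) */
            path[size++] = '\0';           /* zero the tail explicitly */

        return size;
    }

    int main(void)
    {
        char buf[32] = "/system.slice";    /* 13 chars + NUL = 14 -> 16 */

        printf("emitted size: %zu\n", pad_path(buf));
        return 0;
    }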
....@@ -7304,7 +8063,7 @@
73048063 }
73058064
73068065 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7307
- ret = perf_output_begin(&handle, event,
8066
+ ret = perf_output_begin(&handle, &sample, event,
73088067 mmap_event->event_id.header.size);
73098068 if (ret)
73108069 goto out;
....@@ -7364,7 +8123,7 @@
73648123 flags |= MAP_EXECUTABLE;
73658124 if (vma->vm_flags & VM_LOCKED)
73668125 flags |= MAP_LOCKED;
7367
- if (vma->vm_flags & VM_HUGETLB)
8126
+ if (is_vm_hugetlb_page(vma))
73688127 flags |= MAP_HUGETLB;
73698128
73708129 if (file) {
....@@ -7614,7 +8373,7 @@
76148373 int ret;
76158374
76168375 perf_event_header__init_id(&rec.header, &sample, event);
7617
- ret = perf_output_begin(&handle, event, rec.header.size);
8376
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
76188377
76198378 if (ret)
76208379 return;
....@@ -7648,7 +8407,7 @@
76488407
76498408 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
76508409
7651
- ret = perf_output_begin(&handle, event,
8410
+ ret = perf_output_begin(&handle, &sample, event,
76528411 lost_samples_event.header.size);
76538412 if (ret)
76548413 return;
....@@ -7703,7 +8462,7 @@
77038462
77048463 perf_event_header__init_id(&se->event_id.header, &sample, event);
77058464
7706
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
8465
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
77078466 if (ret)
77088467 return;
77098468
....@@ -7778,7 +8537,7 @@
77788537
77798538 perf_event_header__init_id(&throttle_event.header, &sample, event);
77808539
7781
- ret = perf_output_begin(&handle, event,
8540
+ ret = perf_output_begin(&handle, &sample, event,
77828541 throttle_event.header.size);
77838542 if (ret)
77848543 return;
....@@ -7786,6 +8545,290 @@
77868545 perf_output_put(&handle, throttle_event);
77878546 perf_event__output_id_sample(event, &handle, &sample);
77888547 perf_output_end(&handle);
8548
+}
8549
+
8550
+/*
8551
+ * ksymbol register/unregister tracking
8552
+ */
8553
+
8554
+struct perf_ksymbol_event {
8555
+ const char *name;
8556
+ int name_len;
8557
+ struct {
8558
+ struct perf_event_header header;
8559
+ u64 addr;
8560
+ u32 len;
8561
+ u16 ksym_type;
8562
+ u16 flags;
8563
+ } event_id;
8564
+};
8565
+
8566
+static int perf_event_ksymbol_match(struct perf_event *event)
8567
+{
8568
+ return event->attr.ksymbol;
8569
+}
8570
+
8571
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8572
+{
8573
+ struct perf_ksymbol_event *ksymbol_event = data;
8574
+ struct perf_output_handle handle;
8575
+ struct perf_sample_data sample;
8576
+ int ret;
8577
+
8578
+ if (!perf_event_ksymbol_match(event))
8579
+ return;
8580
+
8581
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
8582
+ &sample, event);
8583
+ ret = perf_output_begin(&handle, &sample, event,
8584
+ ksymbol_event->event_id.header.size);
8585
+ if (ret)
8586
+ return;
8587
+
8588
+ perf_output_put(&handle, ksymbol_event->event_id);
8589
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8590
+ perf_event__output_id_sample(event, &handle, &sample);
8591
+
8592
+ perf_output_end(&handle);
8593
+}
8594
+
8595
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8596
+ const char *sym)
8597
+{
8598
+ struct perf_ksymbol_event ksymbol_event;
8599
+ char name[KSYM_NAME_LEN];
8600
+ u16 flags = 0;
8601
+ int name_len;
8602
+
8603
+ if (!atomic_read(&nr_ksymbol_events))
8604
+ return;
8605
+
8606
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8607
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8608
+ goto err;
8609
+
8610
+ strlcpy(name, sym, KSYM_NAME_LEN);
8611
+ name_len = strlen(name) + 1;
8612
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
8613
+ name[name_len++] = '\0';
8614
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8615
+
8616
+ if (unregister)
8617
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8618
+
8619
+ ksymbol_event = (struct perf_ksymbol_event){
8620
+ .name = name,
8621
+ .name_len = name_len,
8622
+ .event_id = {
8623
+ .header = {
8624
+ .type = PERF_RECORD_KSYMBOL,
8625
+ .size = sizeof(ksymbol_event.event_id) +
8626
+ name_len,
8627
+ },
8628
+ .addr = addr,
8629
+ .len = len,
8630
+ .ksym_type = ksym_type,
8631
+ .flags = flags,
8632
+ },
8633
+ };
8634
+
8635
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8636
+ return;
8637
+err:
8638
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8639
+}
8640
+
8641
+/*
8642
+ * bpf program load/unload tracking
8643
+ */
8644
+
8645
+struct perf_bpf_event {
8646
+ struct bpf_prog *prog;
8647
+ struct {
8648
+ struct perf_event_header header;
8649
+ u16 type;
8650
+ u16 flags;
8651
+ u32 id;
8652
+ u8 tag[BPF_TAG_SIZE];
8653
+ } event_id;
8654
+};
8655
+
8656
+static int perf_event_bpf_match(struct perf_event *event)
8657
+{
8658
+ return event->attr.bpf_event;
8659
+}
8660
+
8661
+static void perf_event_bpf_output(struct perf_event *event, void *data)
8662
+{
8663
+ struct perf_bpf_event *bpf_event = data;
8664
+ struct perf_output_handle handle;
8665
+ struct perf_sample_data sample;
8666
+ int ret;
8667
+
8668
+ if (!perf_event_bpf_match(event))
8669
+ return;
8670
+
8671
+ perf_event_header__init_id(&bpf_event->event_id.header,
8672
+ &sample, event);
8673
+ ret = perf_output_begin(&handle, &sample, event,
8674
+ bpf_event->event_id.header.size);
8675
+ if (ret)
8676
+ return;
8677
+
8678
+ perf_output_put(&handle, bpf_event->event_id);
8679
+ perf_event__output_id_sample(event, &handle, &sample);
8680
+
8681
+ perf_output_end(&handle);
8682
+}
8683
+
8684
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8685
+ enum perf_bpf_event_type type)
8686
+{
8687
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8688
+ int i;
8689
+
8690
+ if (prog->aux->func_cnt == 0) {
8691
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8692
+ (u64)(unsigned long)prog->bpf_func,
8693
+ prog->jited_len, unregister,
8694
+ prog->aux->ksym.name);
8695
+ } else {
8696
+ for (i = 0; i < prog->aux->func_cnt; i++) {
8697
+ struct bpf_prog *subprog = prog->aux->func[i];
8698
+
8699
+ perf_event_ksymbol(
8700
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
8701
+ (u64)(unsigned long)subprog->bpf_func,
8702
+ subprog->jited_len, unregister,
8703
+ subprog->aux->ksym.name);
8704
+ }
8705
+ }
8706
+}
8707
+
8708
+void perf_event_bpf_event(struct bpf_prog *prog,
8709
+ enum perf_bpf_event_type type,
8710
+ u16 flags)
8711
+{
8712
+ struct perf_bpf_event bpf_event;
8713
+
8714
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
8715
+ type >= PERF_BPF_EVENT_MAX)
8716
+ return;
8717
+
8718
+ switch (type) {
8719
+ case PERF_BPF_EVENT_PROG_LOAD:
8720
+ case PERF_BPF_EVENT_PROG_UNLOAD:
8721
+ if (atomic_read(&nr_ksymbol_events))
8722
+ perf_event_bpf_emit_ksymbols(prog, type);
8723
+ break;
8724
+ default:
8725
+ break;
8726
+ }
8727
+
8728
+ if (!atomic_read(&nr_bpf_events))
8729
+ return;
8730
+
8731
+ bpf_event = (struct perf_bpf_event){
8732
+ .prog = prog,
8733
+ .event_id = {
8734
+ .header = {
8735
+ .type = PERF_RECORD_BPF_EVENT,
8736
+ .size = sizeof(bpf_event.event_id),
8737
+ },
8738
+ .type = type,
8739
+ .flags = flags,
8740
+ .id = prog->aux->id,
8741
+ },
8742
+ };
8743
+
8744
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8745
+
8746
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8747
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8748
+}
8749
+
8750
+struct perf_text_poke_event {
8751
+ const void *old_bytes;
8752
+ const void *new_bytes;
8753
+ size_t pad;
8754
+ u16 old_len;
8755
+ u16 new_len;
8756
+
8757
+ struct {
8758
+ struct perf_event_header header;
8759
+
8760
+ u64 addr;
8761
+ } event_id;
8762
+};
8763
+
8764
+static int perf_event_text_poke_match(struct perf_event *event)
8765
+{
8766
+ return event->attr.text_poke;
8767
+}
8768
+
8769
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
8770
+{
8771
+ struct perf_text_poke_event *text_poke_event = data;
8772
+ struct perf_output_handle handle;
8773
+ struct perf_sample_data sample;
8774
+ u64 padding = 0;
8775
+ int ret;
8776
+
8777
+ if (!perf_event_text_poke_match(event))
8778
+ return;
8779
+
8780
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8781
+
8782
+ ret = perf_output_begin(&handle, &sample, event,
8783
+ text_poke_event->event_id.header.size);
8784
+ if (ret)
8785
+ return;
8786
+
8787
+ perf_output_put(&handle, text_poke_event->event_id);
8788
+ perf_output_put(&handle, text_poke_event->old_len);
8789
+ perf_output_put(&handle, text_poke_event->new_len);
8790
+
8791
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8792
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8793
+
8794
+ if (text_poke_event->pad)
8795
+ __output_copy(&handle, &padding, text_poke_event->pad);
8796
+
8797
+ perf_event__output_id_sample(event, &handle, &sample);
8798
+
8799
+ perf_output_end(&handle);
8800
+}
8801
+
8802
+void perf_event_text_poke(const void *addr, const void *old_bytes,
8803
+ size_t old_len, const void *new_bytes, size_t new_len)
8804
+{
8805
+ struct perf_text_poke_event text_poke_event;
8806
+ size_t tot, pad;
8807
+
8808
+ if (!atomic_read(&nr_text_poke_events))
8809
+ return;
8810
+
8811
+ tot = sizeof(text_poke_event.old_len) + old_len;
8812
+ tot += sizeof(text_poke_event.new_len) + new_len;
8813
+ pad = ALIGN(tot, sizeof(u64)) - tot;
8814
+
8815
+ text_poke_event = (struct perf_text_poke_event){
8816
+ .old_bytes = old_bytes,
8817
+ .new_bytes = new_bytes,
8818
+ .pad = pad,
8819
+ .old_len = old_len,
8820
+ .new_len = new_len,
8821
+ .event_id = {
8822
+ .header = {
8823
+ .type = PERF_RECORD_TEXT_POKE,
8824
+ .misc = PERF_RECORD_MISC_KERNEL,
8825
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
8826
+ },
8827
+ .addr = (unsigned long)addr,
8828
+ },
8829
+ };
8830
+
8831
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
77898832 }
77908833
77918834 void perf_event_itrace_started(struct perf_event *event)
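
Of the records added in the hunk above, PERF_RECORD_TEXT_POKE is the one with a variable-sized tail: old_len, the old bytes, new_len, the new bytes, then zero padding up to the next u64 boundary, computed as ALIGN(tot, 8) - tot. A small check of that computation; text_poke_pad and the example lengths are illustrative:

    #include <stdio.h>
    #include <stdint.h>

    static size_t text_poke_pad(uint16_t old_len, uint16_t new_len)
    {
        size_t tot = sizeof(old_len) + old_len + sizeof(new_len) + new_len;

        /* pad = ALIGN(tot, sizeof(u64)) - tot */
        return ((tot + 7) & ~(size_t)7) - tot;
    }

    int main(void)
    {
        /* e.g. a 5-byte jump patched over a 1-byte breakpoint */
        uint16_t old_len = 1, new_len = 5;

        printf("payload %d+%d bytes -> %zu bytes of padding\n",
               old_len, new_len, text_poke_pad(old_len, new_len)); /* 6 */
        return 0;
    }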
....@@ -7818,7 +8861,7 @@
78188861 rec.tid = perf_event_tid(event, current);
78198862
78208863 perf_event_header__init_id(&rec.header, &sample, event);
7821
- ret = perf_output_begin(&handle, event, rec.header.size);
8864
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
78228865
78238866 if (ret)
78248867 return;
....@@ -7842,8 +8885,8 @@
78428885 hwc->interrupts = 1;
78438886 } else {
78448887 hwc->interrupts++;
7845
- if (unlikely(throttle
7846
- && hwc->interrupts >= max_samples_per_tick)) {
8888
+ if (unlikely(throttle &&
8889
+ hwc->interrupts > max_samples_per_tick)) {
78478890 __this_cpu_inc(perf_throttled_count);
78488891 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
78498892 hwc->interrupts = MAX_INTERRUPTS;
....@@ -8386,9 +9429,9 @@
83869429 if (event->hw.state & PERF_HES_STOPPED)
83879430 return 0;
83889431 /*
8389
- * All tracepoints are from kernel-space.
9432
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
83909433 */
8391
- if (event->attr.exclude_kernel)
9434
+ if (event->attr.exclude_kernel && !user_mode(regs))
83929435 return 0;
83939436
83949437 if (!perf_tp_filter_match(event, data))
....@@ -8514,30 +9557,39 @@
85149557 *
85159558 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
85169559 * if not set, create kprobe/uprobe
9560
+ *
9561
+ * The following values specify a reference counter (or semaphore in the
9562
+ * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
9563
+ * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
9564
+ *
9565
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
9566
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
85179567 */
85189568 enum perf_probe_config {
85199569 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
9570
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9571
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
85209572 };
85219573
85229574 PMU_FORMAT_ATTR(retprobe, "config:0");
9575
+#endif
85239576
8524
-static struct attribute *probe_attrs[] = {
9577
+#ifdef CONFIG_KPROBE_EVENTS
9578
+static struct attribute *kprobe_attrs[] = {
85259579 &format_attr_retprobe.attr,
85269580 NULL,
85279581 };
85289582
8529
-static struct attribute_group probe_format_group = {
9583
+static struct attribute_group kprobe_format_group = {
85309584 .name = "format",
8531
- .attrs = probe_attrs,
9585
+ .attrs = kprobe_attrs,
85329586 };
85339587
8534
-static const struct attribute_group *probe_attr_groups[] = {
8535
- &probe_format_group,
9588
+static const struct attribute_group *kprobe_attr_groups[] = {
9589
+ &kprobe_format_group,
85369590 NULL,
85379591 };
8538
-#endif
85399592
8540
-#ifdef CONFIG_KPROBE_EVENTS
85419593 static int perf_kprobe_event_init(struct perf_event *event);
85429594 static struct pmu perf_kprobe = {
85439595 .task_ctx_nr = perf_sw_context,
....@@ -8547,7 +9599,7 @@
85479599 .start = perf_swevent_start,
85489600 .stop = perf_swevent_stop,
85499601 .read = perf_swevent_read,
8550
- .attr_groups = probe_attr_groups,
9602
+ .attr_groups = kprobe_attr_groups,
85519603 };
85529604
85539605 static int perf_kprobe_event_init(struct perf_event *event)
....@@ -8558,7 +9610,7 @@
85589610 if (event->attr.type != perf_kprobe.type)
85599611 return -ENOENT;
85609612
8561
- if (!capable(CAP_SYS_ADMIN))
9613
+ if (!perfmon_capable())
85629614 return -EACCES;
85639615
85649616 /*
....@@ -8579,6 +9631,24 @@
85799631 #endif /* CONFIG_KPROBE_EVENTS */
85809632
85819633 #ifdef CONFIG_UPROBE_EVENTS
9634
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9635
+
9636
+static struct attribute *uprobe_attrs[] = {
9637
+ &format_attr_retprobe.attr,
9638
+ &format_attr_ref_ctr_offset.attr,
9639
+ NULL,
9640
+};
9641
+
9642
+static struct attribute_group uprobe_format_group = {
9643
+ .name = "format",
9644
+ .attrs = uprobe_attrs,
9645
+};
9646
+
9647
+static const struct attribute_group *uprobe_attr_groups[] = {
9648
+ &uprobe_format_group,
9649
+ NULL,
9650
+};
9651
+
85829652 static int perf_uprobe_event_init(struct perf_event *event);
85839653 static struct pmu perf_uprobe = {
85849654 .task_ctx_nr = perf_sw_context,
....@@ -8588,18 +9658,19 @@
85889658 .start = perf_swevent_start,
85899659 .stop = perf_swevent_stop,
85909660 .read = perf_swevent_read,
8591
- .attr_groups = probe_attr_groups,
9661
+ .attr_groups = uprobe_attr_groups,
85929662 };
85939663
85949664 static int perf_uprobe_event_init(struct perf_event *event)
85959665 {
85969666 int err;
9667
+ unsigned long ref_ctr_offset;
85979668 bool is_retprobe;
85989669
85999670 if (event->attr.type != perf_uprobe.type)
86009671 return -ENOENT;
86019672
8602
- if (!capable(CAP_SYS_ADMIN))
9673
+ if (!perfmon_capable())
86039674 return -EACCES;
86049675
86059676 /*
....@@ -8609,7 +9680,8 @@
86099680 return -EOPNOTSUPP;
86109681
86119682 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8612
- err = perf_uprobe_init(event, is_retprobe);
9683
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9684
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
86139685 if (err)
86149686 return err;
86159687
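
Taken together, the format attribute and the init path above define the perf_uprobe config layout: bit 0 selects a uretprobe and config:32-63 carries the USDT reference-counter offset, recovered in perf_uprobe_event_init() by shifting right by PERF_UPROBE_REF_CTR_OFFSET_SHIFT. A userspace sketch of packing and unpacking that layout; the macro and function names below are local stand-ins, not uapi definitions:

    #include <stdio.h>
    #include <stdint.h>

    #define PROBE_CONFIG_IS_RETPROBE    (1ULL << 0)
    #define UPROBE_REF_CTR_OFFSET_SHIFT 32

    static uint64_t make_uprobe_config(int is_retprobe, uint64_t ref_ctr_offset)
    {
        uint64_t config = ref_ctr_offset << UPROBE_REF_CTR_OFFSET_SHIFT;

        if (is_retprobe)
            config |= PROBE_CONFIG_IS_RETPROBE;
        return config;
    }

    int main(void)
    {
        uint64_t config = make_uprobe_config(1, 0x1234);

        /* what the kernel side recovers from attr.config */
        printf("retprobe=%llu ref_ctr_offset=%#llx\n",
               (unsigned long long)(config & PROBE_CONFIG_IS_RETPROBE),
               (unsigned long long)(config >> UPROBE_REF_CTR_OFFSET_SHIFT));
        return 0;
    }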
....@@ -8647,7 +9719,6 @@
86479719 int ret = 0;
86489720
86499721 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8650
- preempt_disable();
86519722 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
86529723 goto out;
86539724 rcu_read_lock();
....@@ -8655,7 +9726,6 @@
86559726 rcu_read_unlock();
86569727 out:
86579728 __this_cpu_dec(bpf_prog_active);
8658
- preempt_enable();
86599729 if (!ret)
86609730 return;
86619731
....@@ -8676,6 +9746,24 @@
86769746 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
86779747 if (IS_ERR(prog))
86789748 return PTR_ERR(prog);
9749
+
9750
+ if (event->attr.precise_ip &&
9751
+ prog->call_get_stack &&
9752
+ (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9753
+ event->attr.exclude_callchain_kernel ||
9754
+ event->attr.exclude_callchain_user)) {
9755
+ /*
9756
+ * On perf_event with precise_ip, calling bpf_get_stack()
9757
+ * may trigger unwinder warnings and occasional crashes.
9758
+ * bpf_get_[stack|stackid] works around this issue by using
9759
+ * callchain attached to perf_sample_data. If the
9760
+ * perf_event does not have a full (kernel and user) callchain
9761
+ * attached to perf_sample_data, do not allow attaching BPF
9762
+ * program that calls bpf_get_[stack|stackid].
9763
+ */
9764
+ bpf_prog_put(prog);
9765
+ return -EPROTO;
9766
+ }
86799767
86809768 event->prog = prog;
86819769 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
....@@ -8875,7 +9963,7 @@
88759963 /*
88769964 * Scan through mm's vmas and see if one of them matches the
88779965 * @filter; if so, adjust filter's address range.
8878
- * Called with mm::mmap_sem down for reading.
9966
+ * Called with mm::mmap_lock down for reading.
88799967 */
88809968 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
88819969 struct mm_struct *mm,
....@@ -8917,7 +10005,7 @@
891710005 if (!mm)
891810006 goto restart;
891910007
8920
- down_read(&mm->mmap_sem);
10008
+ mmap_read_lock(mm);
892110009 }
892210010
892310011 raw_spin_lock_irqsave(&ifh->lock, flags);
....@@ -8943,7 +10031,7 @@
894310031 raw_spin_unlock_irqrestore(&ifh->lock, flags);
894410032
894510033 if (ifh->nr_file_filters) {
8946
- up_read(&mm->mmap_sem);
10034
+ mmap_read_unlock(mm);
894710035
894810036 mmput(mm);
894910037 }
....@@ -9050,6 +10138,7 @@
905010138 case IF_SRC_KERNELADDR:
905110139 case IF_SRC_KERNEL:
905210140 kernel = 1;
10141
+ fallthrough;
905310142
905410143 case IF_SRC_FILEADDR:
905510144 case IF_SRC_FILE:
....@@ -9136,8 +10225,11 @@
913610225 }
913710226
913810227 /* ready to consume more filters */
10228
+ kfree(filename);
10229
+ filename = NULL;
913910230 state = IF_STATE_ACTION;
914010231 filter = NULL;
10232
+ kernel = 0;
914110233 }
914210234 }
914310235
....@@ -9285,7 +10377,7 @@
928510377 period = max_t(u64, 10000, hwc->sample_period);
928610378 }
928710379 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9288
- HRTIMER_MODE_REL_PINNED);
10380
+ HRTIMER_MODE_REL_PINNED_HARD);
928910381 }
929010382
929110383 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
....@@ -9640,8 +10732,7 @@
964010732 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
964110733 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
964210734
9643
- cpu_function_call(cpu,
9644
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10735
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
964510736 }
964610737 cpus_read_unlock();
964710738 mutex_unlock(&mux_interval_mutex);
....@@ -9678,13 +10769,15 @@
967810769
967910770 pmu->dev->groups = pmu->attr_groups;
968010771 device_initialize(pmu->dev);
9681
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
9682
- if (ret)
9683
- goto free_dev;
968410772
968510773 dev_set_drvdata(pmu->dev, pmu);
968610774 pmu->dev->bus = &pmu_bus;
968710775 pmu->dev->release = pmu_dev_release;
10776
+
10777
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
10778
+ if (ret)
10779
+ goto free_dev;
10780
+
968810781 ret = device_add(pmu->dev);
968910782 if (ret)
969010783 goto free_dev;
....@@ -9692,6 +10785,12 @@
969210785 /* For PMUs with address filters, throw in an extra attribute: */
969310786 if (pmu->nr_addr_filters)
969410787 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
10788
+
10789
+ if (ret)
10790
+ goto del_dev;
10791
+
10792
+ if (pmu->attr_update)
10793
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
969510794
969610795 if (ret)
969710796 goto del_dev;
....@@ -9712,7 +10811,7 @@
971210811
971310812 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
971410813 {
9715
- int cpu, ret;
10814
+ int cpu, ret, max = PERF_TYPE_MAX;
971610815
971710816 mutex_lock(&pmus_lock);
971810817 ret = -ENOMEM;
....@@ -9725,12 +10824,17 @@
972510824 goto skip_type;
972610825 pmu->name = name;
972710826
9728
- if (type < 0) {
9729
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9730
- if (type < 0) {
9731
- ret = type;
10827
+ if (type != PERF_TYPE_SOFTWARE) {
10828
+ if (type >= 0)
10829
+ max = type;
10830
+
10831
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10832
+ if (ret < 0)
973210833 goto free_pdc;
9733
- }
10834
+
10835
+ WARN_ON(type >= 0 && ret != type);
10836
+
10837
+ type = ret;
973410838 }
973510839 pmu->type = type;
973610840
....@@ -9776,6 +10880,9 @@
977610880 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
977710881
977810882 __perf_mux_hrtimer_init(cpuctx, cpu);
10883
+
10884
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10885
+ cpuctx->heap = cpuctx->heap_default;
977910886 }
978010887
978110888 got_cpu_context:
....@@ -9807,7 +10914,16 @@
980710914 if (!pmu->event_idx)
980810915 pmu->event_idx = perf_event_idx_default;
980910916
9810
- list_add_rcu(&pmu->entry, &pmus);
10917
+ /*
10918
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
10919
+ * since these cannot be in the IDR. This way the linear search
10920
+ * is fast, provided a valid software event is provided.
10921
+ */
10922
+ if (type == PERF_TYPE_SOFTWARE || !name)
10923
+ list_add_rcu(&pmu->entry, &pmus);
10924
+ else
10925
+ list_add_tail_rcu(&pmu->entry, &pmus);
10926
+
981110927 atomic_set(&pmu->exclusive_cnt, 0);
981210928 ret = 0;
981310929 unlock:
....@@ -9820,7 +10936,7 @@
982010936 put_device(pmu->dev);
982110937
982210938 free_idr:
9823
- if (pmu->type >= PERF_TYPE_MAX)
10939
+ if (pmu->type != PERF_TYPE_SOFTWARE)
982410940 idr_remove(&pmu_idr, pmu->type);
982510941
982610942 free_pdc:
....@@ -9842,7 +10958,7 @@
984210958 synchronize_rcu();
984310959
984410960 free_percpu(pmu->pmu_disable_count);
9845
- if (pmu->type >= PERF_TYPE_MAX)
10961
+ if (pmu->type != PERF_TYPE_SOFTWARE)
984610962 idr_remove(&pmu_idr, pmu->type);
984710963 if (pmu_bus_running) {
984810964 if (pmu->nr_addr_filters)
....@@ -9854,6 +10970,12 @@
985410970 mutex_unlock(&pmus_lock);
985510971 }
985610972 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10973
+
10974
+static inline bool has_extended_regs(struct perf_event *event)
10975
+{
10976
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10977
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10978
+}
985710979
985810980 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
985910981 {
....@@ -9885,6 +11007,19 @@
988511007 if (ctx)
988611008 perf_event_ctx_unlock(event->group_leader, ctx);
988711009
11010
+ if (!ret) {
11011
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11012
+ has_extended_regs(event))
11013
+ ret = -EOPNOTSUPP;
11014
+
11015
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11016
+ event_has_any_exclude_flag(event))
11017
+ ret = -EINVAL;
11018
+
11019
+ if (ret && event->destroy)
11020
+ event->destroy(event);
11021
+ }
11022
+
988811023 if (ret)
988911024 module_put(pmu->module);
989011025
....@@ -9893,9 +11028,8 @@
989311028
989411029 static struct pmu *perf_init_event(struct perf_event *event)
989511030 {
11031
+ int idx, type, ret;
989611032 struct pmu *pmu;
9897
- int idx;
9898
- int ret;
989911033
990011034 idx = srcu_read_lock(&pmus_srcu);
990111035
....@@ -9907,17 +11041,32 @@
990711041 goto unlock;
990811042 }
990911043
11044
+ /*
11045
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11046
+ * are often aliases for PERF_TYPE_RAW.
11047
+ */
11048
+ type = event->attr.type;
11049
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
11050
+ type = PERF_TYPE_RAW;
11051
+
11052
+again:
991011053 rcu_read_lock();
9911
- pmu = idr_find(&pmu_idr, event->attr.type);
11054
+ pmu = idr_find(&pmu_idr, type);
991211055 rcu_read_unlock();
991311056 if (pmu) {
991411057 ret = perf_try_init_event(pmu, event);
11058
+ if (ret == -ENOENT && event->attr.type != type) {
11059
+ type = event->attr.type;
11060
+ goto again;
11061
+ }
11062
+
991511063 if (ret)
991611064 pmu = ERR_PTR(ret);
11065
+
991711066 goto unlock;
991811067 }
991911068
9920
- list_for_each_entry_rcu(pmu, &pmus, entry) {
11069
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
992111070 ret = perf_try_init_event(pmu, event);
992211071 if (!ret)
992311072 goto unlock;
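
The perf_init_event() change above lets one PMU claim PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE as aliases of PERF_TYPE_RAW: the aliased type is looked up first, and only if that PMU rejects the event with -ENOENT is the lookup retried with the original type. A toy model of that fallback; the enum values and try_init() behaviour are invented for illustration:

    #include <stdio.h>
    #include <errno.h>

    enum { TYPE_HARDWARE = 0, TYPE_HW_CACHE = 3, TYPE_RAW = 4 };

    /* toy stand-in for idr_find() + perf_try_init_event() */
    static int try_init(int pmu_type, int event_type)
    {
        /* pretend the raw PMU only accepts plain hardware events */
        if (pmu_type == TYPE_RAW && event_type == TYPE_HW_CACHE)
            return -ENOENT;
        return 0;
    }

    static int init_event(int event_type)
    {
        int type = event_type;
        int ret;

        if (type == TYPE_HARDWARE || type == TYPE_HW_CACHE)
            type = TYPE_RAW;
    again:
        ret = try_init(type, event_type);
        if (ret == -ENOENT && event_type != type) {
            type = event_type;   /* fall back to the original type */
            goto again;
        }
        return ret;
    }

    int main(void)
    {
        printf("HARDWARE -> %d\n", init_event(TYPE_HARDWARE)); /* served by RAW */
        printf("HW_CACHE -> %d\n", init_event(TYPE_HW_CACHE)); /* falls back    */
        return 0;
    }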
....@@ -9993,7 +11142,7 @@
999311142 if (event->parent)
999411143 return;
999511144
9996
- if (event->attach_state & PERF_ATTACH_TASK)
11145
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
999711146 inc = true;
999811147 if (event->attr.mmap || event->attr.mmap_data)
999911148 atomic_inc(&nr_mmap_events);
....@@ -10001,6 +11150,8 @@
1000111150 atomic_inc(&nr_comm_events);
1000211151 if (event->attr.namespaces)
1000311152 atomic_inc(&nr_namespaces_events);
11153
+ if (event->attr.cgroup)
11154
+ atomic_inc(&nr_cgroup_events);
1000411155 if (event->attr.task)
1000511156 atomic_inc(&nr_task_events);
1000611157 if (event->attr.freq)
....@@ -10013,6 +11164,12 @@
1001311164 inc = true;
1001411165 if (is_cgroup_event(event))
1001511166 inc = true;
11167
+ if (event->attr.ksymbol)
11168
+ atomic_inc(&nr_ksymbol_events);
11169
+ if (event->attr.bpf_event)
11170
+ atomic_inc(&nr_bpf_events);
11171
+ if (event->attr.text_poke)
11172
+ atomic_inc(&nr_text_poke_events);
1001611173
1001711174 if (inc) {
1001811175 /*
....@@ -10031,7 +11188,7 @@
1003111188 * call the perf scheduling hooks before proceeding to
1003211189 * install events that need them.
1003311190 */
10034
- synchronize_sched();
11191
+ synchronize_rcu();
1003511192 }
1003611193 /*
1003711194 * Now that we have waited for the sync_sched(), allow further
....@@ -10120,8 +11277,7 @@
1012011277 * and we cannot use the ctx information because we need the
1012111278 * pmu before we get a ctx.
1012211279 */
10123
- get_task_struct(task);
10124
- event->hw.target = task;
11280
+ event->hw.target = get_task_struct(task);
1012511281 }
1012611282
1012711283 event->clock = &local_clock;
....@@ -10133,12 +11289,9 @@
1013311289 context = parent_event->overflow_handler_context;
1013411290 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
1013511291 if (overflow_handler == bpf_overflow_handler) {
10136
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
11292
+ struct bpf_prog *prog = parent_event->prog;
1013711293
10138
- if (IS_ERR(prog)) {
10139
- err = PTR_ERR(prog);
10140
- goto err_ns;
10141
- }
11294
+ bpf_prog_inc(prog);
1014211295 event->prog = prog;
1014311296 event->orig_overflow_handler =
1014411297 parent_event->orig_overflow_handler;
....@@ -10179,16 +11332,31 @@
1017911332 if (!has_branch_stack(event))
1018011333 event->attr.branch_sample_type = 0;
1018111334
10182
- if (cgroup_fd != -1) {
10183
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10184
- if (err)
10185
- goto err_ns;
10186
- }
10187
-
1018811335 pmu = perf_init_event(event);
1018911336 if (IS_ERR(pmu)) {
1019011337 err = PTR_ERR(pmu);
1019111338 goto err_ns;
11339
+ }
11340
+
11341
+ /*
11342
+ * Disallow uncore-cgroup events, they don't make sense as the cgroup will
11343
+ * be different on other CPUs in the uncore mask.
11344
+ */
11345
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11346
+ err = -EINVAL;
11347
+ goto err_pmu;
11348
+ }
11349
+
11350
+ if (event->attr.aux_output &&
11351
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11352
+ err = -EOPNOTSUPP;
11353
+ goto err_pmu;
11354
+ }
11355
+
11356
+ if (cgroup_fd != -1) {
11357
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11358
+ if (err)
11359
+ goto err_pmu;
1019211360 }
1019311361
1019411362 err = exclusive_event_init(event);
....@@ -10251,12 +11419,12 @@
1025111419 exclusive_event_destroy(event);
1025211420
1025311421 err_pmu:
11422
+ if (is_cgroup_event(event))
11423
+ perf_detach_cgroup(event);
1025411424 if (event->destroy)
1025511425 event->destroy(event);
1025611426 module_put(pmu->module);
1025711427 err_ns:
10258
- if (is_cgroup_event(event))
10259
- perf_detach_cgroup(event);
1026011428 if (event->ns)
1026111429 put_pid_ns(event->ns);
1026211430 if (event->hw.target)
....@@ -10272,58 +11440,29 @@
1027211440 u32 size;
1027311441 int ret;
1027411442
10275
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
10276
- return -EFAULT;
10277
-
10278
- /*
10279
- * zero the full structure, so that a short copy will be nice.
10280
- */
11443
+ /* Zero the full structure, so that a short copy will be nice. */
1028111444 memset(attr, 0, sizeof(*attr));
1028211445
1028311446 ret = get_user(size, &uattr->size);
1028411447 if (ret)
1028511448 return ret;
1028611449
10287
- if (size > PAGE_SIZE) /* silly large */
10288
- goto err_size;
10289
-
10290
- if (!size) /* abi compat */
11450
+ /* ABI compatibility quirk: */
11451
+ if (!size)
1029111452 size = PERF_ATTR_SIZE_VER0;
10292
-
10293
- if (size < PERF_ATTR_SIZE_VER0)
11453
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
1029411454 goto err_size;
1029511455
10296
- /*
10297
- * If we're handed a bigger struct than we know of,
10298
- * ensure all the unknown bits are 0 - i.e. new
10299
- * user-space does not rely on any kernel feature
10300
- * extensions we dont know about yet.
10301
- */
10302
- if (size > sizeof(*attr)) {
10303
- unsigned char __user *addr;
10304
- unsigned char __user *end;
10305
- unsigned char val;
10306
-
10307
- addr = (void __user *)uattr + sizeof(*attr);
10308
- end = (void __user *)uattr + size;
10309
-
10310
- for (; addr < end; addr++) {
10311
- ret = get_user(val, addr);
10312
- if (ret)
10313
- return ret;
10314
- if (val)
10315
- goto err_size;
10316
- }
10317
- size = sizeof(*attr);
11456
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11457
+ if (ret) {
11458
+ if (ret == -E2BIG)
11459
+ goto err_size;
11460
+ return ret;
1031811461 }
10319
-
10320
- ret = copy_from_user(attr, uattr, size);
10321
- if (ret)
10322
- return -EFAULT;
1032311462
1032411463 attr->size = size;
1032511464
10326
- if (attr->__reserved_1)
11465
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
1032711466 return -EINVAL;
1032811467
1032911468 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
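
The rewrite of perf_copy_attr() above delegates the forward-compatibility dance to copy_struct_from_user(): copy the part both sides know about, and fail with -E2BIG if newer userspace passed a bigger struct whose unknown tail is not all zeroes. A userspace approximation of that contract; copy_struct is an illustrative stand-in that works on plain memory rather than user pointers:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <stddef.h>

    static int copy_struct(void *dst, size_t ksize, const void *src, size_t usize)
    {
        size_t size = ksize < usize ? ksize : usize;

        /* newer userspace must not rely on fields this kernel doesn't know */
        if (usize > ksize) {
            const unsigned char *tail = (const unsigned char *)src + ksize;
            size_t i;

            for (i = 0; i < usize - ksize; i++)
                if (tail[i])
                    return -E2BIG;
        }

        memset(dst, 0, ksize);    /* zero-fill the kernel-side tail */
        memcpy(dst, src, size);
        return 0;
    }

    int main(void)
    {
        struct { int a, b; } kattr;
        unsigned char uattr[12] = { 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 };

        printf("%d\n", copy_struct(&kattr, sizeof(kattr), uattr, sizeof(uattr)));
        uattr[11] = 0xff;         /* unknown trailing bit set -> rejected */
        printf("%d\n", copy_struct(&kattr, sizeof(kattr), uattr, sizeof(uattr)));
        return 0;
    }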
....@@ -10394,6 +11533,12 @@
1039411533
1039511534 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
1039611535 ret = perf_reg_validate(attr->sample_regs_intr);
11536
+
11537
+#ifndef CONFIG_CGROUP_PERF
11538
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
11539
+ return -EINVAL;
11540
+#endif
11541
+
1039711542 out:
1039811543 return ret;
1039911544
....@@ -10403,14 +11548,25 @@
1040311548 goto out;
1040411549 }
1040511550
11551
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
11552
+{
11553
+ if (b < a)
11554
+ swap(a, b);
11555
+
11556
+ mutex_lock(a);
11557
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11558
+}
11559
+
1040611560 static int
1040711561 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
1040811562 {
10409
- struct ring_buffer *rb = NULL;
11563
+ struct perf_buffer *rb = NULL;
1041011564 int ret = -EINVAL;
1041111565
10412
- if (!output_event)
11566
+ if (!output_event) {
11567
+ mutex_lock(&event->mmap_mutex);
1041311568 goto set;
11569
+ }
1041411570
1041511571 /* don't allow circular references */
1041611572 if (event == output_event)
....@@ -10425,7 +11581,7 @@
1042511581 /*
1042611582 * If its not a per-cpu rb, it must be the same task.
1042711583 */
10428
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11584
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
1042911585 goto out;
1043011586
1043111587 /*
....@@ -10448,8 +11604,15 @@
1044811604 event->pmu != output_event->pmu)
1044911605 goto out;
1045011606
11607
+ /*
11608
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
11609
+ * output_event is already on rb->event_list, and the list iteration
11610
+ * restarts after every removal, it is guaranteed this new event is
11611
+ * observed *OR* if output_event is already removed, it's guaranteed we
11612
+ * observe !rb->mmap_count.
11613
+ */
11614
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
1045111615 set:
10452
- mutex_lock(&event->mmap_mutex);
1045311616 /* Can't redirect output if we've got an active mmap() */
1045411617 if (atomic_read(&event->mmap_count))
1045511618 goto unlock;
....@@ -10459,6 +11622,12 @@
1045911622 rb = ring_buffer_get(output_event);
1046011623 if (!rb)
1046111624 goto unlock;
11625
+
11626
+ /* did we race against perf_mmap_close() */
11627
+ if (!atomic_read(&rb->mmap_count)) {
11628
+ ring_buffer_put(rb);
11629
+ goto unlock;
11630
+ }
1046211631 }
1046311632
1046411633 ring_buffer_attach(event, rb);
....@@ -10466,18 +11635,11 @@
1046611635 ret = 0;
1046711636 unlock:
1046811637 mutex_unlock(&event->mmap_mutex);
11638
+ if (output_event)
11639
+ mutex_unlock(&output_event->mmap_mutex);
1046911640
1047011641 out:
1047111642 return ret;
10472
-}
10473
-
10474
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
10475
-{
10476
- if (b < a)
10477
- swap(a, b);
10478
-
10479
- mutex_lock(a);
10480
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
1048111643 }
1048211644
1048311645 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
....@@ -10500,11 +11662,11 @@
1050011662 break;
1050111663
1050211664 case CLOCK_BOOTTIME:
10503
- event->clock = &ktime_get_boot_ns;
11665
+ event->clock = &ktime_get_boottime_ns;
1050411666 break;
1050511667
1050611668 case CLOCK_TAI:
10507
- event->clock = &ktime_get_tai_ns;
11669
+ event->clock = &ktime_get_clocktai_ns;
1050811670 break;
1050911671
1051011672 default:
....@@ -10530,7 +11692,7 @@
1053011692 again:
1053111693 rcu_read_lock();
1053211694 gctx = READ_ONCE(group_leader->ctx);
10533
- if (!atomic_inc_not_zero(&gctx->refcount)) {
11695
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
1053411696 rcu_read_unlock();
1053511697 goto again;
1053611698 }
....@@ -10563,7 +11725,7 @@
1056311725 struct perf_event *group_leader = NULL, *output_event = NULL;
1056411726 struct perf_event *event, *sibling;
1056511727 struct perf_event_attr attr;
10566
- struct perf_event_context *ctx, *uninitialized_var(gctx);
11728
+ struct perf_event_context *ctx, *gctx;
1056711729 struct file *event_file = NULL;
1056811730 struct fd group = {NULL, 0};
1056911731 struct task_struct *task = NULL;
....@@ -10578,15 +11740,12 @@
1057811740 if (flags & ~PERF_FLAG_ALL)
1057911741 return -EINVAL;
1058011742
10581
- if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
10582
- return -EACCES;
10583
-
10584
- /* Do we allow access to perf_event_open(2) ? */
10585
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
11743
+ err = perf_copy_attr(attr_uptr, &attr);
1058611744 if (err)
1058711745 return err;
1058811746
10589
- err = perf_copy_attr(attr_uptr, &attr);
11747
+ /* Do we allow access to perf_event_open(2) ? */
11748
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
1059011749 if (err)
1059111750 return err;
1059211751
....@@ -10597,7 +11756,7 @@
1059711756 }
1059811757
1059911758 if (attr.namespaces) {
10600
- if (!capable(CAP_SYS_ADMIN))
11759
+ if (!perfmon_capable())
1060111760 return -EACCES;
1060211761 }
1060311762
....@@ -10612,6 +11771,13 @@
1061211771 /* Only privileged users can get physical addresses */
1061311772 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
1061411773 err = perf_allow_kernel(&attr);
11774
+ if (err)
11775
+ return err;
11776
+ }
11777
+
11778
+ /* REGS_INTR can leak data, lockdown must prevent this */
11779
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
11780
+ err = security_locked_down(LOCKDOWN_PERF);
1061511781 if (err)
1061611782 return err;
1061711783 }
....@@ -10657,24 +11823,6 @@
1065711823 goto err_task;
1065811824 }
1065911825
10660
- if (task) {
10661
- err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10662
- if (err)
10663
- goto err_task;
10664
-
10665
- /*
10666
- * Reuse ptrace permission checks for now.
10667
- *
10668
- * We must hold cred_guard_mutex across this and any potential
10669
- * perf_install_in_context() call for this new event to
10670
- * serialize against exec() altering our credentials (and the
10671
- * perf_event_exit_task() that could imply).
10672
- */
10673
- err = -EACCES;
10674
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10675
- goto err_cred;
10676
- }
10677
-
1067811826 if (flags & PERF_FLAG_PID_CGROUP)
1067911827 cgroup_fd = pid;
1068011828
....@@ -10682,7 +11830,7 @@
1068211830 NULL, NULL, cgroup_fd);
1068311831 if (IS_ERR(event)) {
1068411832 err = PTR_ERR(event);
10685
- goto err_cred;
11833
+ goto err_task;
1068611834 }
1068711835
1068811836 if (is_sampling_event(event)) {
....@@ -10776,6 +11924,9 @@
1077611924 * Do not allow to attach to a group in a different task
1077711925 * or CPU context. If we're moving SW events, we'll fix
1077811926 * this up later, so allow that.
11927
+ *
11928
+ * Racy, not holding group_leader->ctx->mutex, see comment with
11929
+ * perf_event_ctx_lock().
1077911930 */
1078011931 if (!move_group && group_leader->ctx != ctx)
1078111932 goto err_context;
....@@ -10799,6 +11950,24 @@
1079911950 err = PTR_ERR(event_file);
1080011951 event_file = NULL;
1080111952 goto err_context;
11953
+ }
11954
+
11955
+ if (task) {
11956
+ err = down_read_interruptible(&task->signal->exec_update_lock);
11957
+ if (err)
11958
+ goto err_file;
11959
+
11960
+ /*
11961
+ * Preserve ptrace permission check for backwards compatibility.
11962
+ *
11963
+ * We must hold exec_update_lock across this and any potential
11964
+ * perf_install_in_context() call for this new event to
11965
+ * serialize against exec() altering our credentials (and the
11966
+ * perf_event_exit_task() that could imply).
11967
+ */
11968
+ err = -EACCES;
11969
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
11970
+ goto err_cred;
1080211971 }
1080311972
1080411973 if (move_group) {
....@@ -10825,6 +11994,7 @@
1082511994 } else {
1082611995 perf_event_ctx_unlock(group_leader, gctx);
1082711996 move_group = 0;
11997
+ goto not_move_group;
1082811998 }
1082911999 }
1083012000
....@@ -10841,7 +12011,17 @@
1084112011 }
1084212012 } else {
1084312013 mutex_lock(&ctx->mutex);
12014
+
12015
+ /*
12016
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
12017
+ * see the group_leader && !move_group test earlier.
12018
+ */
12019
+ if (group_leader && group_leader->ctx != ctx) {
12020
+ err = -EINVAL;
12021
+ goto err_locked;
12022
+ }
1084412023 }
12024
+not_move_group:
1084512025
1084612026 if (ctx->task == TASK_TOMBSTONE) {
1084712027 err = -ESRCH;
....@@ -10869,6 +12049,10 @@
1086912049 }
1087012050 }
1087112051
12052
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12053
+ err = -EINVAL;
12054
+ goto err_locked;
12055
+ }
1087212056
1087312057 /*
1087412058 * Must be under the same ctx::mutex as perf_install_in_context(),
....@@ -10950,7 +12134,7 @@
1095012134 mutex_unlock(&ctx->mutex);
1095112135
1095212136 if (task) {
10953
- mutex_unlock(&task->signal->cred_guard_mutex);
12137
+ up_read(&task->signal->exec_update_lock);
1095412138 put_task_struct(task);
1095512139 }
1095612140
....@@ -10972,7 +12156,10 @@
1097212156 if (move_group)
1097312157 perf_event_ctx_unlock(group_leader, gctx);
1097412158 mutex_unlock(&ctx->mutex);
10975
-/* err_file: */
12159
+err_cred:
12160
+ if (task)
12161
+ up_read(&task->signal->exec_update_lock);
12162
+err_file:
1097612163 fput(event_file);
1097712164 err_context:
1097812165 perf_unpin_context(ctx);
....@@ -10984,9 +12171,6 @@
1098412171 */
1098512172 if (!event_file)
1098612173 free_event(event);
10987
-err_cred:
10988
- if (task)
10989
- mutex_unlock(&task->signal->cred_guard_mutex);
1099012174 err_task:
1099112175 if (task)
1099212176 put_task_struct(task);
....@@ -11015,8 +12199,11 @@
1101512199 int err;
1101612200
1101712201 /*
11018
- * Get the target context (task or percpu):
12202
+ * Grouping is not supported for kernel events, neither is 'AUX',
12203
+ * make sure the caller's intentions are adjusted.
1101912204 */
12205
+ if (attr->aux_output)
12206
+ return ERR_PTR(-EINVAL);
1102012207
1102112208 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
1102212209 overflow_handler, context, -1);
....@@ -11028,6 +12215,9 @@
1102812215 /* Mark owner so we could distinguish it from user events. */
1102912216 event->owner = TASK_TOMBSTONE;
1103012217
12218
+ /*
12219
+ * Get the target context (task or percpu):
12220
+ */
1103112221 ctx = find_get_context(event->pmu, task, event);
1103212222 if (IS_ERR(ctx)) {
1103312223 err = PTR_ERR(ctx);
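
The new check rejects attr->aux_output up front for kernel-side counters, since in-kernel users cannot form the AUX groups that flag requires, and the "Get the target context" comment simply moves below the owner assignment it describes. For orientation, a minimal in-kernel usage sketch of this API, not taken from any real module:

/* Sketch: a trivial module that counts CPU cycles on CPU 0 via
 * perf_event_create_kernel_counter(). Illustrative only. */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *cycles_event;

static int __init cycles_init(void)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size   = sizeof(attr),
                .pinned = 1,
                /* .aux_output must stay 0: rejected by the check above. */
        };

        /* CPU-bound counter: cpu = 0, task = NULL, no overflow handler. */
        cycles_event = perf_event_create_kernel_counter(&attr, 0, NULL,
                                                        NULL, NULL);
        if (IS_ERR(cycles_event))
                return PTR_ERR(cycles_event);
        return 0;
}

static void __exit cycles_exit(void)
{
        u64 enabled, running;

        pr_info("cycles on CPU0: %llu\n",
                (unsigned long long)perf_event_read_value(cycles_event,
                                                          &enabled, &running));
        perf_event_release_kernel(cycles_event);
}

module_init(cycles_init);
module_exit(cycles_exit);
MODULE_LICENSE("GPL");
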
....@@ -11285,8 +12475,8 @@
1128512475 /*
1128612476 * When a child task exits, feed back event values to parent events.
1128712477 *
11288
- * Can be called with cred_guard_mutex held when called from
11289
- * install_exec_creds().
12478
+ * Can be called with exec_update_lock held when called from
12479
+ * setup_new_exec().
1129012480 */
1129112481 void perf_event_exit_task(struct task_struct *child)
1129212482 {
....@@ -11390,7 +12580,7 @@
1139012580 *
1139112581 * Wait for all events to drop their context reference.
1139212582 */
11393
- wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
12583
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
1139412584 put_ctx(ctx); /* must be last */
1139512585 }
1139612586 }
....@@ -11405,9 +12595,7 @@
1140512595
1140612596 struct file *perf_event_get(unsigned int fd)
1140712597 {
11408
- struct file *file;
11409
-
11410
- file = fget_raw(fd);
12598
+ struct file *file = fget(fd);
1141112599 if (!file)
1141212600 return ERR_PTR(-EBADF);
1141312601
....@@ -11477,8 +12665,7 @@
1147712665 !child_ctx->task_ctx_data) {
1147812666 struct pmu *pmu = child_event->pmu;
1147912667
11480
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11481
- GFP_KERNEL);
12668
+ child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
1148212669 if (!child_ctx->task_ctx_data) {
1148312670 free_event(child_event);
1148412671 return ERR_PTR(-ENOMEM);
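
The open-coded kzalloc(pmu->task_ctx_size, GFP_KERNEL) is replaced by the alloc_task_ctx_data() helper, which in this series allocates PMU-specific task context data from a per-PMU kmem_cache instead of sizing the allocation by hand. The helper pair is defined earlier in this file; roughly, and only as a sketch of its shape, which may differ in detail:

/* Approximate shape of the helpers this hunk switches to. */
#include <linux/perf_event.h>
#include <linux/slab.h>

static void *alloc_task_ctx_data(struct pmu *pmu)
{
        if (pmu->task_ctx_cache)
                return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);

        return NULL;
}

static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
        kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}
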
....@@ -11583,6 +12770,10 @@
1158312770 child, leader, child_ctx);
1158412771 if (IS_ERR(child_ctr))
1158512772 return PTR_ERR(child_ctr);
12773
+
12774
+ if (sub->aux_event == parent_event && child_ctr &&
12775
+ !perf_get_aux_event(child_ctr, leader))
12776
+ return -EINVAL;
1158612777 }
1158712778 return 0;
1158812779 }
....@@ -11778,7 +12969,7 @@
1177812969 }
1177912970 }
1178012971
11781
-void perf_swevent_init_cpu(unsigned int cpu)
12972
+static void perf_swevent_init_cpu(unsigned int cpu)
1178212973 {
1178312974 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
1178412975
....@@ -11975,6 +13166,12 @@
1197513166 kfree(jc);
1197613167 }
1197713168
13169
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13170
+{
13171
+ perf_event_cgroup(css->cgroup);
13172
+ return 0;
13173
+}
13174
+
1197813175 static int __perf_cgroup_move(void *info)
1197913176 {
1198013177 struct task_struct *task = info;
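
The new css_online callback hands each freshly created cgroup to perf_event_cgroup(), which emits a side-band record so tools can map cgroup IDs seen in samples back to paths. Consumers opt in through the cgroup bits in perf_event_attr; a hedged userspace fragment showing just the attribute setup, assuming a kernel and headers that provide PERF_SAMPLE_CGROUP and attr.cgroup:

/* Fragment: request cgroup side-band records and per-sample cgroup IDs. */
#include <linux/perf_event.h>
#include <string.h>

static void init_cgroup_sampling_attr(struct perf_event_attr *attr)
{
        memset(attr, 0, sizeof(*attr));
        attr->size = sizeof(*attr);
        attr->type = PERF_TYPE_SOFTWARE;
        attr->config = PERF_COUNT_SW_CPU_CLOCK;
        attr->sample_period = 100000;
        attr->sample_type = PERF_SAMPLE_CGROUP; /* cgroup id in each sample */
        attr->cgroup = 1;                       /* cgroup side-band records */
}
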
....@@ -11996,6 +13193,7 @@
1199613193 struct cgroup_subsys perf_event_cgrp_subsys = {
1199713194 .css_alloc = perf_cgroup_css_alloc,
1199813195 .css_free = perf_cgroup_css_free,
13196
+ .css_online = perf_cgroup_css_online,
1199913197 .attach = perf_cgroup_attach,
1200013198 /*
1200113199 * Implicitly enable on dfl hierarchy so that perf events can