2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
--- kernel/kernel/events/core.c
+++ kernel/kernel/events/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
 * Performance events core code:
 *
@@ -5,8 +6,6 @@
 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
 */
 
 #include <linux/fs.h>
@@ -29,6 +28,7 @@
 #include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
@@ -50,6 +50,7 @@
 #include <linux/sched/mm.h>
 #include <linux/proc_ns.h>
 #include <linux/mount.h>
+#include <linux/min_heap.h>
 
 #include "internal.h"
 
@@ -265,7 +266,7 @@
 if (!event->parent) {
 /*
 * If this is a !child event, we must hold ctx::mutex to
- * stabilize the the event->ctx relation. See
+ * stabilize the event->ctx relation. See
 * perf_event_ctx_lock().
 */
 lockdep_assert_held(&ctx->mutex);
@@ -391,6 +392,10 @@
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -403,13 +408,8 @@
 * 0 - disallow raw tracepoint access for unpriv
 * 1 - disallow cpu events for unpriv
 * 2 - disallow kernel profiling for unpriv
- * 3 - disallow all unpriv perf event use
 */
-#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
-int sysctl_perf_event_paranoid __read_mostly = 3;
-#else
 int sysctl_perf_event_paranoid __read_mostly = 2;
-#endif
 
 /* Minimum for 512 kiB + 1 user control page */
 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -444,8 +444,7 @@
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret;
 int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -469,8 +468,7 @@
 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -761,7 +759,7 @@
 /*
 * Do not update time when cgroup is not active
 */
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 __update_cgrp_time(event->cgrp);
 }
 
@@ -901,6 +899,47 @@
 rcu_read_unlock();
 }
 
+static int perf_cgroup_ensure_storage(struct perf_event *event,
+ struct cgroup_subsys_state *css)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event **storage;
+ int cpu, heap_size, ret = 0;
+
+ /*
+ * Allow storage to have sufficent space for an iterator for each
+ * possibly nested cgroup plus an iterator for events with no cgroup.
+ */
+ for (heap_size = 1; css; css = css->parent)
+ heap_size++;
+
+ for_each_possible_cpu(cpu) {
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ if (heap_size <= cpuctx->heap_size)
+ continue;
+
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!storage) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
+ if (cpuctx->heap_size < heap_size) {
+ swap(cpuctx->heap, storage);
+ if (storage == cpuctx->heap_default)
+ storage = NULL;
+ cpuctx->heap_size = heap_size;
+ }
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
+
+ kfree(storage);
+ }
+
+ return ret;
+}
+
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 struct perf_event_attr *attr,
 struct perf_event *group_leader)
@@ -919,6 +958,10 @@
 ret = PTR_ERR(css);
 goto out;
 }
+
+ ret = perf_cgroup_ensure_storage(event, css);
+ if (ret)
+ goto out;
 
 cgrp = container_of(css, struct perf_cgroup, css);
 event->cgrp = cgrp;
....@@ -945,25 +988,19 @@
945988 event->shadow_ctx_time = now - t->timestamp;
946989 }
947990
948
-/*
949
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
950
- * cleared when last cgroup event is removed.
951
- */
952991 static inline void
953
-list_update_cgroup_event(struct perf_event *event,
954
- struct perf_event_context *ctx, bool add)
992
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
955993 {
956994 struct perf_cpu_context *cpuctx;
957
- struct list_head *cpuctx_entry;
958995
959996 if (!is_cgroup_event(event))
960997 return;
961998
962999 /*
9631000 * Because cgroup events are always per-cpu events,
964
- * this will always be called from the right CPU.
1001
+ * @ctx == &cpuctx->ctx.
9651002 */
966
- cpuctx = __get_cpu_context(ctx);
1003
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
9671004
9681005 /*
9691006 * Since setting cpuctx->cgrp is conditional on the current @cgrp
....@@ -971,27 +1008,41 @@
9711008 * because if the first would mismatch, the second would not try again
9721009 * and we would leave cpuctx->cgrp unset.
9731010 */
974
- if (add && !cpuctx->cgrp) {
1011
+ if (ctx->is_active && !cpuctx->cgrp) {
9751012 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
9761013
9771014 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
9781015 cpuctx->cgrp = cgrp;
9791016 }
9801017
981
- if (add && ctx->nr_cgroups++)
982
- return;
983
- else if (!add && --ctx->nr_cgroups)
1018
+ if (ctx->nr_cgroups++)
9841019 return;
9851020
986
- /* no cgroup running */
987
- if (!add)
1021
+ list_add(&cpuctx->cgrp_cpuctx_entry,
1022
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1023
+}
1024
+
1025
+static inline void
1026
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1027
+{
1028
+ struct perf_cpu_context *cpuctx;
1029
+
1030
+ if (!is_cgroup_event(event))
1031
+ return;
1032
+
1033
+ /*
1034
+ * Because cgroup events are always per-cpu events,
1035
+ * @ctx == &cpuctx->ctx.
1036
+ */
1037
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1038
+
1039
+ if (--ctx->nr_cgroups)
1040
+ return;
1041
+
1042
+ if (ctx->is_active && cpuctx->cgrp)
9881043 cpuctx->cgrp = NULL;
9891044
990
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
991
- if (add)
992
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
993
- else
994
- list_del(cpuctx_entry);
1045
+ list_del(&cpuctx->cgrp_cpuctx_entry);
9951046 }
9961047
9971048 #else /* !CONFIG_CGROUP_PERF */
....@@ -1041,7 +1092,7 @@
10411092 {
10421093 }
10431094
1044
-void
1095
+static inline void
10451096 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
10461097 {
10471098 }
....@@ -1057,11 +1108,14 @@
10571108 }
10581109
10591110 static inline void
1060
-list_update_cgroup_event(struct perf_event *event,
1061
- struct perf_event_context *ctx, bool add)
1111
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
10621112 {
10631113 }
10641114
1115
+static inline void
1116
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1117
+{
1118
+}
10651119 #endif
10661120
10671121 /*
....@@ -1113,7 +1167,7 @@
11131167 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
11141168
11151169 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1116
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1170
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
11171171 timer->function = perf_mux_hrtimer_handler;
11181172 }
11191173
....@@ -1131,7 +1185,7 @@
11311185 if (!cpuctx->hrtimer_active) {
11321186 cpuctx->hrtimer_active = 1;
11331187 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1134
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1188
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
11351189 }
11361190 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
11371191
....@@ -1182,7 +1236,21 @@
11821236
11831237 static void get_ctx(struct perf_event_context *ctx)
11841238 {
1185
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1239
+ refcount_inc(&ctx->refcount);
1240
+}
1241
+
1242
+static void *alloc_task_ctx_data(struct pmu *pmu)
1243
+{
1244
+ if (pmu->task_ctx_cache)
1245
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1246
+
1247
+ return NULL;
1248
+}
1249
+
1250
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1251
+{
1252
+ if (pmu->task_ctx_cache && task_ctx_data)
1253
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
11861254 }
11871255
11881256 static void free_ctx(struct rcu_head *head)
....@@ -1190,13 +1258,13 @@
11901258 struct perf_event_context *ctx;
11911259
11921260 ctx = container_of(head, struct perf_event_context, rcu_head);
1193
- kfree(ctx->task_ctx_data);
1261
+ free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
11941262 kfree(ctx);
11951263 }
11961264
11971265 static void put_ctx(struct perf_event_context *ctx)
11981266 {
1199
- if (atomic_dec_and_test(&ctx->refcount)) {
1267
+ if (refcount_dec_and_test(&ctx->refcount)) {
12001268 if (ctx->parent_ctx)
12011269 put_ctx(ctx->parent_ctx);
12021270 if (ctx->task && ctx->task != TASK_TOMBSTONE)
....@@ -1232,7 +1300,7 @@
12321300 * life-time rules separate them. That is an exiting task cannot fork, and a
12331301 * spawning task cannot (yet) exit.
12341302 *
1235
- * But remember that that these are parent<->child context relations, and
1303
+ * But remember that these are parent<->child context relations, and
12361304 * migration does not affect children, therefore these two orderings should not
12371305 * interact.
12381306 *
....@@ -1258,13 +1326,13 @@
12581326 * function.
12591327 *
12601328 * Lock order:
1261
- * cred_guard_mutex
1329
+ * exec_update_lock
12621330 * task_struct::perf_event_mutex
12631331 * perf_event_context::mutex
12641332 * perf_event::child_mutex;
12651333 * perf_event_context::lock
12661334 * perf_event::mmap_mutex
1267
- * mmap_sem
1335
+ * mmap_lock
12681336 * perf_addr_filters_head::lock
12691337 *
12701338 * cpu_hotplug_lock
....@@ -1279,7 +1347,7 @@
12791347 again:
12801348 rcu_read_lock();
12811349 ctx = READ_ONCE(event->ctx);
1282
- if (!atomic_inc_not_zero(&ctx->refcount)) {
1350
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
12831351 rcu_read_unlock();
12841352 goto again;
12851353 }
....@@ -1371,7 +1439,7 @@
13711439 /*
13721440 * Get the perf_event_context for a task and lock it.
13731441 *
1374
- * This has to cope with with the fact that until it is locked,
1442
+ * This has to cope with the fact that until it is locked,
13751443 * the context could get moved to another task.
13761444 */
13771445 static struct perf_event_context *
....@@ -1412,7 +1480,7 @@
14121480 }
14131481
14141482 if (ctx->task == TASK_TOMBSTONE ||
1415
- !atomic_inc_not_zero(&ctx->refcount)) {
1483
+ !refcount_inc_not_zero(&ctx->refcount)) {
14161484 raw_spin_unlock(&ctx->lock);
14171485 ctx = NULL;
14181486 } else {
....@@ -1540,6 +1608,30 @@
15401608 if (left->cpu > right->cpu)
15411609 return false;
15421610
1611
+#ifdef CONFIG_CGROUP_PERF
1612
+ if (left->cgrp != right->cgrp) {
1613
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
1614
+ /*
1615
+ * Left has no cgroup but right does, no cgroups come
1616
+ * first.
1617
+ */
1618
+ return true;
1619
+ }
1620
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
1621
+ /*
1622
+ * Right has no cgroup but left does, no cgroups come
1623
+ * first.
1624
+ */
1625
+ return false;
1626
+ }
1627
+ /* Two dissimilar cgroups, order by id. */
1628
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1629
+ return true;
1630
+
1631
+ return false;
1632
+ }
1633
+#endif
1634
+
15431635 if (left->group_index < right->group_index)
15441636 return true;
15451637 if (left->group_index > right->group_index)
....@@ -1619,25 +1711,48 @@
16191711 }
16201712
16211713 /*
1622
- * Get the leftmost event in the @cpu subtree.
1714
+ * Get the leftmost event in the cpu/cgroup subtree.
16231715 */
16241716 static struct perf_event *
1625
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1717
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1718
+ struct cgroup *cgrp)
16261719 {
16271720 struct perf_event *node_event = NULL, *match = NULL;
16281721 struct rb_node *node = groups->tree.rb_node;
1722
+#ifdef CONFIG_CGROUP_PERF
1723
+ u64 node_cgrp_id, cgrp_id = 0;
1724
+
1725
+ if (cgrp)
1726
+ cgrp_id = cgrp->kn->id;
1727
+#endif
16291728
16301729 while (node) {
16311730 node_event = container_of(node, struct perf_event, group_node);
16321731
16331732 if (cpu < node_event->cpu) {
16341733 node = node->rb_left;
1635
- } else if (cpu > node_event->cpu) {
1636
- node = node->rb_right;
1637
- } else {
1638
- match = node_event;
1639
- node = node->rb_left;
1734
+ continue;
16401735 }
1736
+ if (cpu > node_event->cpu) {
1737
+ node = node->rb_right;
1738
+ continue;
1739
+ }
1740
+#ifdef CONFIG_CGROUP_PERF
1741
+ node_cgrp_id = 0;
1742
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
1743
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1744
+
1745
+ if (cgrp_id < node_cgrp_id) {
1746
+ node = node->rb_left;
1747
+ continue;
1748
+ }
1749
+ if (cgrp_id > node_cgrp_id) {
1750
+ node = node->rb_right;
1751
+ continue;
1752
+ }
1753
+#endif
1754
+ match = node_event;
1755
+ node = node->rb_left;
16411756 }
16421757
16431758 return match;
....@@ -1650,12 +1765,26 @@
16501765 perf_event_groups_next(struct perf_event *event)
16511766 {
16521767 struct perf_event *next;
1768
+#ifdef CONFIG_CGROUP_PERF
1769
+ u64 curr_cgrp_id = 0;
1770
+ u64 next_cgrp_id = 0;
1771
+#endif
16531772
16541773 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1655
- if (next && next->cpu == event->cpu)
1656
- return next;
1774
+ if (next == NULL || next->cpu != event->cpu)
1775
+ return NULL;
16571776
1658
- return NULL;
1777
+#ifdef CONFIG_CGROUP_PERF
1778
+ if (event->cgrp && event->cgrp->css.cgroup)
1779
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1780
+
1781
+ if (next->cgrp && next->cgrp->css.cgroup)
1782
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1783
+
1784
+ if (curr_cgrp_id != next_cgrp_id)
1785
+ return NULL;
1786
+#endif
1787
+ return next;
16591788 }
16601789
16611790 /*
....@@ -1691,12 +1820,13 @@
16911820 add_event_to_groups(event, ctx);
16921821 }
16931822
1694
- list_update_cgroup_event(event, ctx, true);
1695
-
16961823 list_add_rcu(&event->event_entry, &ctx->event_list);
16971824 ctx->nr_events++;
16981825 if (event->attr.inherit_stat)
16991826 ctx->nr_stat++;
1827
+
1828
+ if (event->state > PERF_EVENT_STATE_OFF)
1829
+ perf_cgroup_event_enable(event, ctx);
17001830
17011831 ctx->generation++;
17021832 }
....@@ -1762,6 +1892,9 @@
17621892
17631893 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
17641894 size += sizeof(data->phys_addr);
1895
+
1896
+ if (sample_type & PERF_SAMPLE_CGROUP)
1897
+ size += sizeof(data->cgroup);
17651898
17661899 event->header_size = size;
17671900 }
....@@ -1873,8 +2006,6 @@
18732006
18742007 event->attach_state &= ~PERF_ATTACH_CONTEXT;
18752008
1876
- list_update_cgroup_event(event, ctx, false);
1877
-
18782009 ctx->nr_events--;
18792010 if (event->attr.inherit_stat)
18802011 ctx->nr_stat--;
....@@ -1891,14 +2022,136 @@
18912022 * of error state is by explicit re-enabling
18922023 * of the event
18932024 */
1894
- if (event->state > PERF_EVENT_STATE_OFF)
2025
+ if (event->state > PERF_EVENT_STATE_OFF) {
2026
+ perf_cgroup_event_disable(event, ctx);
18952027 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2028
+ }
18962029
18972030 ctx->generation++;
18982031 }
18992032
2033
+static int
2034
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2035
+{
2036
+ if (!has_aux(aux_event))
2037
+ return 0;
2038
+
2039
+ if (!event->pmu->aux_output_match)
2040
+ return 0;
2041
+
2042
+ return event->pmu->aux_output_match(aux_event);
2043
+}
2044
+
2045
+static void put_event(struct perf_event *event);
2046
+static void event_sched_out(struct perf_event *event,
2047
+ struct perf_cpu_context *cpuctx,
2048
+ struct perf_event_context *ctx);
2049
+
2050
+static void perf_put_aux_event(struct perf_event *event)
2051
+{
2052
+ struct perf_event_context *ctx = event->ctx;
2053
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2054
+ struct perf_event *iter;
2055
+
2056
+ /*
2057
+ * If event uses aux_event tear down the link
2058
+ */
2059
+ if (event->aux_event) {
2060
+ iter = event->aux_event;
2061
+ event->aux_event = NULL;
2062
+ put_event(iter);
2063
+ return;
2064
+ }
2065
+
2066
+ /*
2067
+ * If the event is an aux_event, tear down all links to
2068
+ * it from other events.
2069
+ */
2070
+ for_each_sibling_event(iter, event->group_leader) {
2071
+ if (iter->aux_event != event)
2072
+ continue;
2073
+
2074
+ iter->aux_event = NULL;
2075
+ put_event(event);
2076
+
2077
+ /*
2078
+ * If it's ACTIVE, schedule it out and put it into ERROR
2079
+ * state so that we don't try to schedule it again. Note
2080
+ * that perf_event_enable() will clear the ERROR status.
2081
+ */
2082
+ event_sched_out(iter, cpuctx, ctx);
2083
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2084
+ }
2085
+}
2086
+
2087
+static bool perf_need_aux_event(struct perf_event *event)
2088
+{
2089
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2090
+}
2091
+
2092
+static int perf_get_aux_event(struct perf_event *event,
2093
+ struct perf_event *group_leader)
2094
+{
2095
+ /*
2096
+ * Our group leader must be an aux event if we want to be
2097
+ * an aux_output. This way, the aux event will precede its
2098
+ * aux_output events in the group, and therefore will always
2099
+ * schedule first.
2100
+ */
2101
+ if (!group_leader)
2102
+ return 0;
2103
+
2104
+ /*
2105
+ * aux_output and aux_sample_size are mutually exclusive.
2106
+ */
2107
+ if (event->attr.aux_output && event->attr.aux_sample_size)
2108
+ return 0;
2109
+
2110
+ if (event->attr.aux_output &&
2111
+ !perf_aux_output_match(event, group_leader))
2112
+ return 0;
2113
+
2114
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2115
+ return 0;
2116
+
2117
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
2118
+ return 0;
2119
+
2120
+ /*
2121
+ * Link aux_outputs to their aux event; this is undone in
2122
+ * perf_group_detach() by perf_put_aux_event(). When the
2123
+ * group in torn down, the aux_output events loose their
2124
+ * link to the aux_event and can't schedule any more.
2125
+ */
2126
+ event->aux_event = group_leader;
2127
+
2128
+ return 1;
2129
+}
2130
+
2131
+static inline struct list_head *get_event_list(struct perf_event *event)
2132
+{
2133
+ struct perf_event_context *ctx = event->ctx;
2134
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2135
+}
2136
+
2137
+/*
2138
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2139
+ * cannot exist on their own, schedule them out and move them into the ERROR
2140
+ * state. Also see _perf_event_enable(), it will not be able to recover
2141
+ * this ERROR state.
2142
+ */
2143
+static inline void perf_remove_sibling_event(struct perf_event *event)
2144
+{
2145
+ struct perf_event_context *ctx = event->ctx;
2146
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2147
+
2148
+ event_sched_out(event, cpuctx, ctx);
2149
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2150
+}
2151
+
19002152 static void perf_group_detach(struct perf_event *event)
19012153 {
2154
+ struct perf_event *leader = event->group_leader;
19022155 struct perf_event *sibling, *tmp;
19032156 struct perf_event_context *ctx = event->ctx;
19042157
....@@ -1912,10 +2165,12 @@
19122165
19132166 event->attach_state &= ~PERF_ATTACH_GROUP;
19142167
2168
+ perf_put_aux_event(event);
2169
+
19152170 /*
19162171 * If this is a sibling, remove it from its group.
19172172 */
1918
- if (event->group_leader != event) {
2173
+ if (leader != event) {
19192174 list_del_init(&event->sibling_list);
19202175 event->group_leader->nr_siblings--;
19212176 goto out;
....@@ -1928,6 +2183,9 @@
19282183 */
19292184 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
19302185
2186
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2187
+ perf_remove_sibling_event(sibling);
2188
+
19312189 sibling->group_leader = sibling;
19322190 list_del_init(&sibling->sibling_list);
19332191
....@@ -1937,22 +2195,18 @@
19372195 if (!RB_EMPTY_NODE(&event->group_node)) {
19382196 add_event_to_groups(sibling, event->ctx);
19392197
1940
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1941
- struct list_head *list = sibling->attr.pinned ?
1942
- &ctx->pinned_active : &ctx->flexible_active;
1943
-
1944
- list_add_tail(&sibling->active_list, list);
1945
- }
2198
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2199
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
19462200 }
19472201
19482202 WARN_ON_ONCE(sibling->ctx != event->ctx);
19492203 }
19502204
19512205 out:
1952
- perf_event__header_size(event->group_leader);
1953
-
1954
- for_each_sibling_event(tmp, event->group_leader)
2206
+ for_each_sibling_event(tmp, leader)
19552207 perf_event__header_size(tmp);
2208
+
2209
+ perf_event__header_size(leader);
19562210 }
19572211
19582212 static bool is_orphaned_event(struct perf_event *event)
....@@ -2021,6 +2275,7 @@
20212275
20222276 if (READ_ONCE(event->pending_disable) >= 0) {
20232277 WRITE_ONCE(event->pending_disable, -1);
2278
+ perf_cgroup_event_disable(event, ctx);
20242279 state = PERF_EVENT_STATE_OFF;
20252280 }
20262281 perf_event_set_state(event, state);
....@@ -2058,9 +2313,6 @@
20582313 event_sched_out(event, cpuctx, ctx);
20592314
20602315 perf_pmu_enable(ctx->pmu);
2061
-
2062
- if (group_event->attr.exclusive)
2063
- cpuctx->exclusive = 0;
20642316 }
20652317
20662318 #define DETACH_GROUP 0x01UL
....@@ -2091,6 +2343,7 @@
20912343
20922344 if (!ctx->nr_events && ctx->is_active) {
20932345 ctx->is_active = 0;
2346
+ ctx->rotate_necessary = 0;
20942347 if (ctx->task) {
20952348 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
20962349 cpuctx->task_ctx = NULL;
....@@ -2157,6 +2410,7 @@
21572410 event_sched_out(event, cpuctx, ctx);
21582411
21592412 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2413
+ perf_cgroup_event_disable(event, ctx);
21602414 }
21612415
21622416 /*
....@@ -2164,7 +2418,7 @@
21642418 *
21652419 * If event->ctx is a cloned context, callers must make sure that
21662420 * every task struct that event->ctx->task could possibly point to
2167
- * remains valid. This condition is satisifed when called through
2421
+ * remains valid. This condition is satisfied when called through
21682422 * perf_event_for_each_child or perf_event_for_each because they
21692423 * hold the top-level event's child_mutex, so any descendant that
21702424 * goes to exit will block in perf_event_exit_event().
....@@ -2238,7 +2492,7 @@
22382492 * But this is a bit hairy.
22392493 *
22402494 * So instead, we have an explicit cgroup call to remain
2241
- * within the time time source all along. We believe it
2495
+ * within the time source all along. We believe it
22422496 * is cleaner and simpler to understand.
22432497 */
22442498 if (is_cgroup_event(event))
....@@ -2258,6 +2512,8 @@
22582512 struct perf_event_context *ctx)
22592513 {
22602514 int ret = 0;
2515
+
2516
+ WARN_ON_ONCE(event->ctx != ctx);
22612517
22622518 lockdep_assert_held(&ctx->lock);
22632519
....@@ -2325,11 +2581,8 @@
23252581
23262582 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
23272583
2328
- if (event_sched_in(group_event, cpuctx, ctx)) {
2329
- pmu->cancel_txn(pmu);
2330
- perf_mux_hrtimer_restart(cpuctx);
2331
- return -EAGAIN;
2332
- }
2584
+ if (event_sched_in(group_event, cpuctx, ctx))
2585
+ goto error;
23332586
23342587 /*
23352588 * Schedule in siblings as one group (if any):
....@@ -2358,10 +2611,8 @@
23582611 }
23592612 event_sched_out(group_event, cpuctx, ctx);
23602613
2614
+error:
23612615 pmu->cancel_txn(pmu);
2362
-
2363
- perf_mux_hrtimer_restart(cpuctx);
2364
-
23652616 return -EAGAIN;
23662617 }
23672618
....@@ -2387,7 +2638,7 @@
23872638 * If this group is exclusive and there are already
23882639 * events on the CPU, it can't go on.
23892640 */
2390
- if (event->attr.exclusive && cpuctx->active_oncpu)
2641
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
23912642 return 0;
23922643 /*
23932644 * Otherwise, try to add it if all previous groups were able
....@@ -2488,6 +2739,16 @@
24882739 perf_pmu_enable(cpuctx->ctx.pmu);
24892740 }
24902741
2742
+void perf_pmu_resched(struct pmu *pmu)
2743
+{
2744
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2745
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
2746
+
2747
+ perf_ctx_lock(cpuctx, task_ctx);
2748
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2749
+ perf_ctx_unlock(cpuctx, task_ctx);
2750
+}
2751
+
24912752 /*
24922753 * Cross CPU call to install and enable a performance event
24932754 *
....@@ -2528,7 +2789,7 @@
25282789 }
25292790
25302791 #ifdef CONFIG_CGROUP_PERF
2531
- if (is_cgroup_event(event)) {
2792
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
25322793 /*
25332794 * If the current cgroup doesn't match the event's
25342795 * cgroup, we should not try to schedule it.
....@@ -2580,6 +2841,25 @@
25802841 * will be 'complete'. See perf_iterate_sb_cpu().
25812842 */
25822843 smp_store_release(&event->ctx, ctx);
2844
+
2845
+ /*
2846
+ * perf_event_attr::disabled events will not run and can be initialized
2847
+ * without IPI. Except when this is the first event for the context, in
2848
+ * that case we need the magic of the IPI to set ctx->is_active.
2849
+ *
2850
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
2851
+ * event will issue the IPI and reprogram the hardware.
2852
+ */
2853
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2854
+ raw_spin_lock_irq(&ctx->lock);
2855
+ if (ctx->task == TASK_TOMBSTONE) {
2856
+ raw_spin_unlock_irq(&ctx->lock);
2857
+ return;
2858
+ }
2859
+ add_event_to_ctx(event, ctx);
2860
+ raw_spin_unlock_irq(&ctx->lock);
2861
+ return;
2862
+ }
25832863
25842864 if (!task) {
25852865 cpu_function_call(cpu, __perf_install_in_context, event);
....@@ -2669,6 +2949,7 @@
26692949 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
26702950
26712951 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2952
+ perf_cgroup_event_enable(event, ctx);
26722953
26732954 if (!ctx->is_active)
26742955 return;
....@@ -2710,6 +2991,7 @@
27102991 raw_spin_lock_irq(&ctx->lock);
27112992 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
27122993 event->state < PERF_EVENT_STATE_ERROR) {
2994
+out:
27132995 raw_spin_unlock_irq(&ctx->lock);
27142996 return;
27152997 }
....@@ -2721,8 +3003,16 @@
27213003 * has gone back into error state, as distinct from the task having
27223004 * been scheduled away before the cross-call arrived.
27233005 */
2724
- if (event->state == PERF_EVENT_STATE_ERROR)
3006
+ if (event->state == PERF_EVENT_STATE_ERROR) {
3007
+ /*
3008
+ * Detached SIBLING events cannot leave ERROR state.
3009
+ */
3010
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
3011
+ event->group_leader == event)
3012
+ goto out;
3013
+
27253014 event->state = PERF_EVENT_STATE_OFF;
3015
+ }
27263016 raw_spin_unlock_irq(&ctx->lock);
27273017
27283018 event_function_call(event, __perf_event_enable, NULL);
....@@ -2826,7 +3116,7 @@
28263116 * pre-existing mappings, called once when new filters arrive via SET_FILTER
28273117 * ioctl;
28283118 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2829
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
3119
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
28303120 * for reading;
28313121 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
28323122 * of exec.
....@@ -2966,6 +3256,13 @@
29663256 if (is_active & EVENT_FLEXIBLE) {
29673257 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
29683258 group_sched_out(event, cpuctx, ctx);
3259
+
3260
+ /*
3261
+ * Since we cleared EVENT_FLEXIBLE, also clear
3262
+ * rotate_necessary, is will be reset by
3263
+ * ctx_flexible_sched_in() when needed.
3264
+ */
3265
+ ctx->rotate_necessary = 0;
29693266 }
29703267 perf_pmu_enable(ctx->pmu);
29713268 }
....@@ -3080,10 +3377,12 @@
30803377 struct perf_event_context *parent, *next_parent;
30813378 struct perf_cpu_context *cpuctx;
30823379 int do_switch = 1;
3380
+ struct pmu *pmu;
30833381
30843382 if (likely(!ctx))
30853383 return;
30863384
3385
+ pmu = ctx->pmu;
30873386 cpuctx = __get_cpu_context(ctx);
30883387 if (!cpuctx->task_ctx)
30893388 return;
....@@ -3113,10 +3412,27 @@
31133412 raw_spin_lock(&ctx->lock);
31143413 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
31153414 if (context_equiv(ctx, next_ctx)) {
3415
+
31163416 WRITE_ONCE(ctx->task, next);
31173417 WRITE_ONCE(next_ctx->task, task);
31183418
3119
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3419
+ perf_pmu_disable(pmu);
3420
+
3421
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3422
+ pmu->sched_task(ctx, false);
3423
+
3424
+ /*
3425
+ * PMU specific parts of task perf context can require
3426
+ * additional synchronization. As an example of such
3427
+ * synchronization see implementation details of Intel
3428
+ * LBR call stack data profiling;
3429
+ */
3430
+ if (pmu->swap_task_ctx)
3431
+ pmu->swap_task_ctx(ctx, next_ctx);
3432
+ else
3433
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3434
+
3435
+ perf_pmu_enable(pmu);
31203436
31213437 /*
31223438 * RCU_INIT_POINTER here is safe because we've not
....@@ -3140,7 +3456,13 @@
31403456
31413457 if (do_switch) {
31423458 raw_spin_lock(&ctx->lock);
3459
+ perf_pmu_disable(pmu);
3460
+
3461
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3462
+ pmu->sched_task(ctx, false);
31433463 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3464
+
3465
+ perf_pmu_enable(pmu);
31443466 raw_spin_unlock(&ctx->lock);
31453467 }
31463468 }
....@@ -3176,29 +3498,39 @@
31763498 * PEBS requires this to provide PID/TID information. This requires we flush
31773499 * all queued PEBS records before we context switch to a new task.
31783500 */
3501
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3502
+{
3503
+ struct pmu *pmu;
3504
+
3505
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3506
+
3507
+ if (WARN_ON_ONCE(!pmu->sched_task))
3508
+ return;
3509
+
3510
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3511
+ perf_pmu_disable(pmu);
3512
+
3513
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
3514
+
3515
+ perf_pmu_enable(pmu);
3516
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3517
+}
3518
+
31793519 static void perf_pmu_sched_task(struct task_struct *prev,
31803520 struct task_struct *next,
31813521 bool sched_in)
31823522 {
31833523 struct perf_cpu_context *cpuctx;
3184
- struct pmu *pmu;
31853524
31863525 if (prev == next)
31873526 return;
31883527
31893528 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3190
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3191
-
3192
- if (WARN_ON_ONCE(!pmu->sched_task))
3529
+ /* will be handled in perf_event_context_sched_in/out */
3530
+ if (cpuctx->task_ctx)
31933531 continue;
31943532
3195
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3196
- perf_pmu_disable(pmu);
3197
-
3198
- pmu->sched_task(cpuctx->task_ctx, sched_in);
3199
-
3200
- perf_pmu_enable(pmu);
3201
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3533
+ __perf_pmu_sched_task(cpuctx, sched_in);
32023534 }
32033535 }
32043536
....@@ -3251,83 +3583,149 @@
32513583 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
32523584 }
32533585
3254
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3255
- int (*func)(struct perf_event *, void *), void *data)
3586
+static bool perf_less_group_idx(const void *l, const void *r)
32563587 {
3257
- struct perf_event **evt, *evt1, *evt2;
3588
+ const struct perf_event *le = *(const struct perf_event **)l;
3589
+ const struct perf_event *re = *(const struct perf_event **)r;
3590
+
3591
+ return le->group_index < re->group_index;
3592
+}
3593
+
3594
+static void swap_ptr(void *l, void *r)
3595
+{
3596
+ void **lp = l, **rp = r;
3597
+
3598
+ swap(*lp, *rp);
3599
+}
3600
+
3601
+static const struct min_heap_callbacks perf_min_heap = {
3602
+ .elem_size = sizeof(struct perf_event *),
3603
+ .less = perf_less_group_idx,
3604
+ .swp = swap_ptr,
3605
+};
3606
+
3607
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
3608
+{
3609
+ struct perf_event **itrs = heap->data;
3610
+
3611
+ if (event) {
3612
+ itrs[heap->nr] = event;
3613
+ heap->nr++;
3614
+ }
3615
+}
3616
+
3617
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3618
+ struct perf_event_groups *groups, int cpu,
3619
+ int (*func)(struct perf_event *, void *),
3620
+ void *data)
3621
+{
3622
+#ifdef CONFIG_CGROUP_PERF
3623
+ struct cgroup_subsys_state *css = NULL;
3624
+#endif
3625
+ /* Space for per CPU and/or any CPU event iterators. */
3626
+ struct perf_event *itrs[2];
3627
+ struct min_heap event_heap;
3628
+ struct perf_event **evt;
32583629 int ret;
32593630
3260
- evt1 = perf_event_groups_first(groups, -1);
3261
- evt2 = perf_event_groups_first(groups, cpu);
3631
+ if (cpuctx) {
3632
+ event_heap = (struct min_heap){
3633
+ .data = cpuctx->heap,
3634
+ .nr = 0,
3635
+ .size = cpuctx->heap_size,
3636
+ };
32623637
3263
- while (evt1 || evt2) {
3264
- if (evt1 && evt2) {
3265
- if (evt1->group_index < evt2->group_index)
3266
- evt = &evt1;
3267
- else
3268
- evt = &evt2;
3269
- } else if (evt1) {
3270
- evt = &evt1;
3271
- } else {
3272
- evt = &evt2;
3273
- }
3638
+ lockdep_assert_held(&cpuctx->ctx.lock);
32743639
3640
+#ifdef CONFIG_CGROUP_PERF
3641
+ if (cpuctx->cgrp)
3642
+ css = &cpuctx->cgrp->css;
3643
+#endif
3644
+ } else {
3645
+ event_heap = (struct min_heap){
3646
+ .data = itrs,
3647
+ .nr = 0,
3648
+ .size = ARRAY_SIZE(itrs),
3649
+ };
3650
+ /* Events not within a CPU context may be on any CPU. */
3651
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3652
+ }
3653
+ evt = event_heap.data;
3654
+
3655
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3656
+
3657
+#ifdef CONFIG_CGROUP_PERF
3658
+ for (; css; css = css->parent)
3659
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3660
+#endif
3661
+
3662
+ min_heapify_all(&event_heap, &perf_min_heap);
3663
+
3664
+ while (event_heap.nr) {
32753665 ret = func(*evt, data);
32763666 if (ret)
32773667 return ret;
32783668
32793669 *evt = perf_event_groups_next(*evt);
3280
- }
3281
-
3282
- return 0;
3283
-}
3284
-
3285
-struct sched_in_data {
3286
- struct perf_event_context *ctx;
3287
- struct perf_cpu_context *cpuctx;
3288
- int can_add_hw;
3289
-};
3290
-
3291
-static int pinned_sched_in(struct perf_event *event, void *data)
3292
-{
3293
- struct sched_in_data *sid = data;
3294
-
3295
- if (event->state <= PERF_EVENT_STATE_OFF)
3296
- return 0;
3297
-
3298
- if (!event_filter_match(event))
3299
- return 0;
3300
-
3301
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3302
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3303
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3304
- }
3305
-
3306
- /*
3307
- * If this pinned group hasn't been scheduled,
3308
- * put it in error state.
3309
- */
3310
- if (event->state == PERF_EVENT_STATE_INACTIVE)
3311
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3312
-
3313
- return 0;
3314
-}
3315
-
3316
-static int flexible_sched_in(struct perf_event *event, void *data)
3317
-{
3318
- struct sched_in_data *sid = data;
3319
-
3320
- if (event->state <= PERF_EVENT_STATE_OFF)
3321
- return 0;
3322
-
3323
- if (!event_filter_match(event))
3324
- return 0;
3325
-
3326
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3327
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3328
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3670
+ if (*evt)
3671
+ min_heapify(&event_heap, 0, &perf_min_heap);
33293672 else
3330
- sid->can_add_hw = 0;
3673
+ min_heap_pop(&event_heap, &perf_min_heap);
3674
+ }
3675
+
3676
+ return 0;
3677
+}
3678
+
3679
+static inline bool event_update_userpage(struct perf_event *event)
3680
+{
3681
+ if (likely(!atomic_read(&event->mmap_count)))
3682
+ return false;
3683
+
3684
+ perf_event_update_time(event);
3685
+ perf_set_shadow_time(event, event->ctx);
3686
+ perf_event_update_userpage(event);
3687
+
3688
+ return true;
3689
+}
3690
+
3691
+static inline void group_update_userpage(struct perf_event *group_event)
3692
+{
3693
+ struct perf_event *event;
3694
+
3695
+ if (!event_update_userpage(group_event))
3696
+ return;
3697
+
3698
+ for_each_sibling_event(event, group_event)
3699
+ event_update_userpage(event);
3700
+}
3701
+
3702
+static int merge_sched_in(struct perf_event *event, void *data)
3703
+{
3704
+ struct perf_event_context *ctx = event->ctx;
3705
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3706
+ int *can_add_hw = data;
3707
+
3708
+ if (event->state <= PERF_EVENT_STATE_OFF)
3709
+ return 0;
3710
+
3711
+ if (!event_filter_match(event))
3712
+ return 0;
3713
+
3714
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3715
+ if (!group_sched_in(event, cpuctx, ctx))
3716
+ list_add_tail(&event->active_list, get_event_list(event));
3717
+ }
3718
+
3719
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
3720
+ *can_add_hw = 0;
3721
+ if (event->attr.pinned) {
3722
+ perf_cgroup_event_disable(event, ctx);
3723
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3724
+ } else {
3725
+ ctx->rotate_necessary = 1;
3726
+ perf_mux_hrtimer_restart(cpuctx);
3727
+ group_update_userpage(event);
3728
+ }
33313729 }
33323730
33333731 return 0;
....@@ -3337,30 +3735,28 @@
33373735 ctx_pinned_sched_in(struct perf_event_context *ctx,
33383736 struct perf_cpu_context *cpuctx)
33393737 {
3340
- struct sched_in_data sid = {
3341
- .ctx = ctx,
3342
- .cpuctx = cpuctx,
3343
- .can_add_hw = 1,
3344
- };
3738
+ int can_add_hw = 1;
33453739
3346
- visit_groups_merge(&ctx->pinned_groups,
3740
+ if (ctx != &cpuctx->ctx)
3741
+ cpuctx = NULL;
3742
+
3743
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
33473744 smp_processor_id(),
3348
- pinned_sched_in, &sid);
3745
+ merge_sched_in, &can_add_hw);
33493746 }
33503747
33513748 static void
33523749 ctx_flexible_sched_in(struct perf_event_context *ctx,
33533750 struct perf_cpu_context *cpuctx)
33543751 {
3355
- struct sched_in_data sid = {
3356
- .ctx = ctx,
3357
- .cpuctx = cpuctx,
3358
- .can_add_hw = 1,
3359
- };
3752
+ int can_add_hw = 1;
33603753
3361
- visit_groups_merge(&ctx->flexible_groups,
3754
+ if (ctx != &cpuctx->ctx)
3755
+ cpuctx = NULL;
3756
+
3757
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
33623758 smp_processor_id(),
3363
- flexible_sched_in, &sid);
3759
+ merge_sched_in, &can_add_hw);
33643760 }
33653761
33663762 static void
....@@ -3419,10 +3815,14 @@
34193815 struct task_struct *task)
34203816 {
34213817 struct perf_cpu_context *cpuctx;
3818
+ struct pmu *pmu = ctx->pmu;
34223819
34233820 cpuctx = __get_cpu_context(ctx);
3424
- if (cpuctx->task_ctx == ctx)
3821
+ if (cpuctx->task_ctx == ctx) {
3822
+ if (cpuctx->sched_cb_usage)
3823
+ __perf_pmu_sched_task(cpuctx, true);
34253824 return;
3825
+ }
34263826
34273827 perf_ctx_lock(cpuctx, ctx);
34283828 /*
....@@ -3432,7 +3832,7 @@
34323832 if (!ctx->nr_events)
34333833 goto unlock;
34343834
3435
- perf_pmu_disable(ctx->pmu);
3835
+ perf_pmu_disable(pmu);
34363836 /*
34373837 * We want to keep the following priority order:
34383838 * cpu pinned (that don't need to move), task pinned,
....@@ -3444,7 +3844,11 @@
34443844 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
34453845 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
34463846 perf_event_sched_in(cpuctx, ctx, task);
3447
- perf_pmu_enable(ctx->pmu);
3847
+
3848
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3849
+ pmu->sched_task(cpuctx->task_ctx, true);
3850
+
3851
+ perf_pmu_enable(pmu);
34483852
34493853 unlock:
34503854 perf_ctx_unlock(cpuctx, ctx);
....@@ -3685,34 +4089,45 @@
36854089 perf_event_groups_insert(&ctx->flexible_groups, event);
36864090 }
36874091
4092
+/* pick an event from the flexible_groups to rotate */
36884093 static inline struct perf_event *
3689
-ctx_first_active(struct perf_event_context *ctx)
4094
+ctx_event_to_rotate(struct perf_event_context *ctx)
36904095 {
3691
- return list_first_entry_or_null(&ctx->flexible_active,
3692
- struct perf_event, active_list);
4096
+ struct perf_event *event;
4097
+
4098
+ /* pick the first active flexible event */
4099
+ event = list_first_entry_or_null(&ctx->flexible_active,
4100
+ struct perf_event, active_list);
4101
+
4102
+ /* if no active flexible event, pick the first event */
4103
+ if (!event) {
4104
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4105
+ typeof(*event), group_node);
4106
+ }
4107
+
4108
+ /*
4109
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4110
+ * finds there are unschedulable events, it will set it again.
4111
+ */
4112
+ ctx->rotate_necessary = 0;
4113
+
4114
+ return event;
36934115 }
36944116
36954117 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
36964118 {
36974119 struct perf_event *cpu_event = NULL, *task_event = NULL;
3698
- bool cpu_rotate = false, task_rotate = false;
3699
- struct perf_event_context *ctx = NULL;
4120
+ struct perf_event_context *task_ctx = NULL;
4121
+ int cpu_rotate, task_rotate;
37004122
37014123 /*
37024124 * Since we run this from IRQ context, nobody can install new
37034125 * events, thus the event count values are stable.
37044126 */
37054127
3706
- if (cpuctx->ctx.nr_events) {
3707
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3708
- cpu_rotate = true;
3709
- }
3710
-
3711
- ctx = cpuctx->task_ctx;
3712
- if (ctx && ctx->nr_events) {
3713
- if (ctx->nr_events != ctx->nr_active)
3714
- task_rotate = true;
3715
- }
4128
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
4129
+ task_ctx = cpuctx->task_ctx;
4130
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
37164131
37174132 if (!(cpu_rotate || task_rotate))
37184133 return false;
....@@ -3721,25 +4136,25 @@
37214136 perf_pmu_disable(cpuctx->ctx.pmu);
37224137
37234138 if (task_rotate)
3724
- task_event = ctx_first_active(ctx);
4139
+ task_event = ctx_event_to_rotate(task_ctx);
37254140 if (cpu_rotate)
3726
- cpu_event = ctx_first_active(&cpuctx->ctx);
4141
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
37274142
37284143 /*
37294144 * As per the order given at ctx_resched() first 'pop' task flexible
37304145 * and then, if needed CPU flexible.
37314146 */
3732
- if (task_event || (ctx && cpu_event))
3733
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
4147
+ if (task_event || (task_ctx && cpu_event))
4148
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
37344149 if (cpu_event)
37354150 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
37364151
37374152 if (task_event)
3738
- rotate_ctx(ctx, task_event);
4153
+ rotate_ctx(task_ctx, task_event);
37394154 if (cpu_event)
37404155 rotate_ctx(&cpuctx->ctx, cpu_event);
37414156
3742
- perf_event_sched_in(cpuctx, ctx, current);
4157
+ perf_event_sched_in(cpuctx, task_ctx, current);
37434158
37444159 perf_pmu_enable(cpuctx->ctx.pmu);
37454160 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
....@@ -3983,6 +4398,7 @@
39834398
39844399 return ret;
39854400 }
4401
+EXPORT_SYMBOL_GPL(perf_event_read_local);
39864402
39874403 static int perf_event_read(struct perf_event *event, bool group)
39884404 {
....@@ -4074,7 +4490,7 @@
40744490 INIT_LIST_HEAD(&ctx->event_list);
40754491 INIT_LIST_HEAD(&ctx->pinned_active);
40764492 INIT_LIST_HEAD(&ctx->flexible_active);
4077
- atomic_set(&ctx->refcount, 1);
4493
+ refcount_set(&ctx->refcount, 1);
40784494 }
40794495
40804496 static struct perf_event_context *
....@@ -4087,10 +4503,8 @@
40874503 return NULL;
40884504
40894505 __perf_event_init_context(ctx);
4090
- if (task) {
4091
- ctx->task = task;
4092
- get_task_struct(task);
4093
- }
4506
+ if (task)
4507
+ ctx->task = get_task_struct(task);
40944508 ctx->pmu = pmu;
40954509
40964510 return ctx;
....@@ -4152,7 +4566,7 @@
41524566 goto errout;
41534567
41544568 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4155
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4569
+ task_ctx_data = alloc_task_ctx_data(pmu);
41564570 if (!task_ctx_data) {
41574571 err = -ENOMEM;
41584572 goto errout;
....@@ -4210,11 +4624,11 @@
42104624 }
42114625 }
42124626
4213
- kfree(task_ctx_data);
4627
+ free_task_ctx_data(pmu, task_ctx_data);
42144628 return ctx;
42154629
42164630 errout:
4217
- kfree(task_ctx_data);
4631
+ free_task_ctx_data(pmu, task_ctx_data);
42184632 return ERR_PTR(err);
42194633 }
42204634
....@@ -4233,7 +4647,7 @@
42334647 }
42344648
42354649 static void ring_buffer_attach(struct perf_event *event,
4236
- struct ring_buffer *rb);
4650
+ struct perf_buffer *rb);
42374651
42384652 static void detach_sb_event(struct perf_event *event)
42394653 {
....@@ -4256,8 +4670,9 @@
42564670
42574671 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
42584672 attr->comm || attr->comm_exec ||
4259
- attr->task ||
4260
- attr->context_switch)
4673
+ attr->task || attr->ksymbol ||
4674
+ attr->context_switch || attr->text_poke ||
4675
+ attr->bpf_event)
42614676 return true;
42624677 return false;
42634678 }
....@@ -4306,7 +4721,7 @@
43064721 if (event->parent)
43074722 return;
43084723
4309
- if (event->attach_state & PERF_ATTACH_TASK)
4724
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
43104725 dec = true;
43114726 if (event->attr.mmap || event->attr.mmap_data)
43124727 atomic_dec(&nr_mmap_events);
....@@ -4314,6 +4729,8 @@
43144729 atomic_dec(&nr_comm_events);
43154730 if (event->attr.namespaces)
43164731 atomic_dec(&nr_namespaces_events);
4732
+ if (event->attr.cgroup)
4733
+ atomic_dec(&nr_cgroup_events);
43174734 if (event->attr.task)
43184735 atomic_dec(&nr_task_events);
43194736 if (event->attr.freq)
....@@ -4326,6 +4743,12 @@
43264743 dec = true;
43274744 if (has_branch_stack(event))
43284745 dec = true;
4746
+ if (event->attr.ksymbol)
4747
+ atomic_dec(&nr_ksymbol_events);
4748
+ if (event->attr.bpf_event)
4749
+ atomic_dec(&nr_bpf_events);
4750
+ if (event->attr.text_poke)
4751
+ atomic_dec(&nr_text_poke_events);
43294752
43304753 if (dec) {
43314754 if (!atomic_add_unless(&perf_sched_count, -1, 1))
....@@ -4909,7 +5332,7 @@
49095332 static __poll_t perf_poll(struct file *file, poll_table *wait)
49105333 {
49115334 struct perf_event *event = file->private_data;
4912
- struct ring_buffer *rb;
5335
+ struct perf_buffer *rb;
49135336 __poll_t events = EPOLLHUP;
49145337
49155338 poll_wait(file, &event->waitq, wait);
....@@ -4935,6 +5358,24 @@
49355358 local64_set(&event->count, 0);
49365359 perf_event_update_userpage(event);
49375360 }
5361
+
5362
+/* Assume it's not an event with inherit set. */
5363
+u64 perf_event_pause(struct perf_event *event, bool reset)
5364
+{
5365
+ struct perf_event_context *ctx;
5366
+ u64 count;
5367
+
5368
+ ctx = perf_event_ctx_lock(event);
5369
+ WARN_ON_ONCE(event->attr.inherit);
5370
+ _perf_event_disable(event);
5371
+ count = local64_read(&event->count);
5372
+ if (reset)
5373
+ local64_set(&event->count, 0);
5374
+ perf_event_ctx_unlock(event, ctx);
5375
+
5376
+ return count;
5377
+}
5378
+EXPORT_SYMBOL_GPL(perf_event_pause);
49385379
49395380 /*
49405381 * Holding the top-level event's child_mutex means that any
....@@ -5013,15 +5454,10 @@
50135454 return event->pmu->check_period(event, value);
50145455 }
50155456
5016
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
5457
+static int _perf_event_period(struct perf_event *event, u64 value)
50175458 {
5018
- u64 value;
5019
-
50205459 if (!is_sampling_event(event))
50215460 return -EINVAL;
5022
-
5023
- if (copy_from_user(&value, arg, sizeof(value)))
5024
- return -EFAULT;
50255461
50265462 if (!value)
50275463 return -EINVAL;
....@@ -5039,6 +5475,19 @@
50395475
50405476 return 0;
50415477 }
5478
+
5479
+int perf_event_period(struct perf_event *event, u64 value)
5480
+{
5481
+ struct perf_event_context *ctx;
5482
+ int ret;
5483
+
5484
+ ctx = perf_event_ctx_lock(event);
5485
+ ret = _perf_event_period(event, value);
5486
+ perf_event_ctx_unlock(event, ctx);
5487
+
5488
+ return ret;
5489
+}
5490
+EXPORT_SYMBOL_GPL(perf_event_period);
50425491
50435492 static const struct file_operations perf_fops;
50445493
....@@ -5083,8 +5532,14 @@
50835532 return _perf_event_refresh(event, arg);
50845533
50855534 case PERF_EVENT_IOC_PERIOD:
5086
- return perf_event_period(event, (u64 __user *)arg);
5535
+ {
5536
+ u64 value;
50875537
5538
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5539
+ return -EFAULT;
5540
+
5541
+ return _perf_event_period(event, value);
5542
+ }
50885543 case PERF_EVENT_IOC_ID:
50895544 {
50905545 u64 id = primary_event_id(event);
....@@ -5119,7 +5574,7 @@
51195574 return perf_event_set_bpf_prog(event, arg);
51205575
51215576 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5122
- struct ring_buffer *rb;
5577
+ struct perf_buffer *rb;
51235578
51245579 rcu_read_lock();
51255580 rb = rcu_dereference(event->rb);
....@@ -5255,7 +5710,7 @@
52555710 static void perf_event_init_userpage(struct perf_event *event)
52565711 {
52575712 struct perf_event_mmap_page *userpg;
5258
- struct ring_buffer *rb;
5713
+ struct perf_buffer *rb;
52595714
52605715 rcu_read_lock();
52615716 rb = rcu_dereference(event->rb);
....@@ -5287,7 +5742,7 @@
52875742 void perf_event_update_userpage(struct perf_event *event)
52885743 {
52895744 struct perf_event_mmap_page *userpg;
5290
- struct ring_buffer *rb;
5745
+ struct perf_buffer *rb;
52915746 u64 enabled, running, now;
52925747
52935748 rcu_read_lock();
....@@ -5338,7 +5793,7 @@
53385793 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
53395794 {
53405795 struct perf_event *event = vmf->vma->vm_file->private_data;
5341
- struct ring_buffer *rb;
5796
+ struct perf_buffer *rb;
53425797 vm_fault_t ret = VM_FAULT_SIGBUS;
53435798
53445799 if (vmf->flags & FAULT_FLAG_MKWRITE) {
....@@ -5371,10 +5826,12 @@
53715826 }
53725827
53735828 static void ring_buffer_attach(struct perf_event *event,
5374
- struct ring_buffer *rb)
5829
+ struct perf_buffer *rb)
53755830 {
5376
- struct ring_buffer *old_rb = NULL;
5831
+ struct perf_buffer *old_rb = NULL;
53775832 unsigned long flags;
5833
+
5834
+ WARN_ON_ONCE(event->parent);
53785835
53795836 if (event->rb) {
53805837 /*
....@@ -5431,7 +5888,10 @@
54315888
54325889 static void ring_buffer_wakeup(struct perf_event *event)
54335890 {
5434
- struct ring_buffer *rb;
5891
+ struct perf_buffer *rb;
5892
+
5893
+ if (event->parent)
5894
+ event = event->parent;
54355895
54365896 rcu_read_lock();
54375897 rb = rcu_dereference(event->rb);
....@@ -5442,14 +5902,17 @@
54425902 rcu_read_unlock();
54435903 }
54445904
5445
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
5905
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
54465906 {
5447
- struct ring_buffer *rb;
5907
+ struct perf_buffer *rb;
5908
+
5909
+ if (event->parent)
5910
+ event = event->parent;
54485911
54495912 rcu_read_lock();
54505913 rb = rcu_dereference(event->rb);
54515914 if (rb) {
5452
- if (!atomic_inc_not_zero(&rb->refcount))
5915
+ if (!refcount_inc_not_zero(&rb->refcount))
54535916 rb = NULL;
54545917 }
54555918 rcu_read_unlock();
....@@ -5457,9 +5920,9 @@
54575920 return rb;
54585921 }
54595922
5460
-void ring_buffer_put(struct ring_buffer *rb)
5923
+void ring_buffer_put(struct perf_buffer *rb)
54615924 {
5462
- if (!atomic_dec_and_test(&rb->refcount))
5925
+ if (!refcount_dec_and_test(&rb->refcount))
54635926 return;
54645927
54655928 WARN_ON_ONCE(!list_empty(&rb->event_list));
....@@ -5494,7 +5957,7 @@
54945957 static void perf_mmap_close(struct vm_area_struct *vma)
54955958 {
54965959 struct perf_event *event = vma->vm_file->private_data;
5497
- struct ring_buffer *rb = ring_buffer_get(event);
5960
+ struct perf_buffer *rb = ring_buffer_get(event);
54985961 struct user_struct *mmap_user = rb->mmap_user;
54995962 int mmap_locked = rb->mmap_locked;
55005963 unsigned long size = perf_data_size(rb);
....@@ -5519,12 +5982,12 @@
55195982 perf_pmu_output_stop(event);
55205983
55215984 /* now it's safe to free the pages */
5522
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5523
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5985
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5986
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
55245987
55255988 /* this has to be the last one */
55265989 rb_free_aux(rb);
5527
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5990
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
55285991
55295992 mutex_unlock(&event->mmap_mutex);
55305993 }
....@@ -5593,8 +6056,9 @@
55936056 * undo the VM accounting.
55946057 */
55956058
5596
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5597
- vma->vm_mm->pinned_vm -= mmap_locked;
6059
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6060
+ &mmap_user->locked_vm);
6061
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
55986062 free_uid(mmap_user);
55996063
56006064 out_put:
....@@ -5603,7 +6067,7 @@
56036067
56046068 static const struct vm_operations_struct perf_mmap_vmops = {
56056069 .open = perf_mmap_open,
5606
- .close = perf_mmap_close, /* non mergable */
6070
+ .close = perf_mmap_close, /* non mergeable */
56076071 .fault = perf_mmap_fault,
56086072 .page_mkwrite = perf_mmap_fault,
56096073 };
....@@ -5613,8 +6077,8 @@
56136077 struct perf_event *event = file->private_data;
56146078 unsigned long user_locked, user_lock_limit;
56156079 struct user_struct *user = current_user();
6080
+ struct perf_buffer *rb = NULL;
56166081 unsigned long locked, lock_limit;
5617
- struct ring_buffer *rb = NULL;
56186082 unsigned long vma_size;
56196083 unsigned long nr_pages;
56206084 long user_extra = 0, extra = 0;
....@@ -5711,17 +6175,17 @@
57116175 again:
57126176 mutex_lock(&event->mmap_mutex);
57136177 if (event->rb) {
5714
- if (event->rb->nr_pages != nr_pages) {
6178
+ if (data_page_nr(event->rb) != nr_pages) {
57156179 ret = -EINVAL;
57166180 goto unlock;
57176181 }
57186182
57196183 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
57206184 /*
5721
- * Raced against perf_mmap_close() through
5722
- * perf_event_set_output(). Try again, hope for better
5723
- * luck.
6185
+ * Raced against perf_mmap_close(); remove the
6186
+ * event and try again.
57246187 */
6188
+ ring_buffer_attach(event, NULL);
57256189 mutex_unlock(&event->mmap_mutex);
57266190 goto again;
57276191 }
....@@ -5749,12 +6213,18 @@
57496213 user_locked = user_lock_limit;
57506214 user_locked += user_extra;
57516215
5752
- if (user_locked > user_lock_limit)
6216
+ if (user_locked > user_lock_limit) {
6217
+ /*
6218
+ * charge locked_vm until it hits user_lock_limit;
6219
+ * charge the rest from pinned_vm
6220
+ */
57536221 extra = user_locked - user_lock_limit;
6222
+ user_extra -= extra;
6223
+ }
57546224
57556225 lock_limit = rlimit(RLIMIT_MEMLOCK);
57566226 lock_limit >>= PAGE_SHIFT;
5757
- locked = vma->vm_mm->pinned_vm + extra;
6227
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
57586228
57596229 if ((locked > lock_limit) && perf_is_paranoid() &&
57606230 !capable(CAP_IPC_LOCK)) {
....@@ -5783,6 +6253,8 @@
57836253
57846254 ring_buffer_attach(event, rb);
57856255
6256
+ perf_event_update_time(event);
6257
+ perf_set_shadow_time(event, event->ctx);
57866258 perf_event_init_userpage(event);
57876259 perf_event_update_userpage(event);
57886260 } else {
....@@ -5795,7 +6267,7 @@
57956267 unlock:
57966268 if (!ret) {
57976269 atomic_long_add(user_extra, &user->locked_vm);
5798
- vma->vm_mm->pinned_vm += extra;
6270
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
57996271
58006272 atomic_inc(&event->mmap_count);
58016273 } else if (rb) {
....@@ -5932,18 +6404,25 @@
59326404 * Later on, we might change it to a list if there is
59336405 * another virtualization implementation supporting the callbacks.
59346406 */
5935
-struct perf_guest_info_callbacks *perf_guest_cbs;
6407
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
59366408
59376409 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59386410 {
5939
- perf_guest_cbs = cbs;
6411
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6412
+ return -EBUSY;
6413
+
6414
+ rcu_assign_pointer(perf_guest_cbs, cbs);
59406415 return 0;
59416416 }
59426417 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
59436418
59446419 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59456420 {
5946
- perf_guest_cbs = NULL;
6421
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6422
+ return -EINVAL;
6423
+
6424
+ rcu_assign_pointer(perf_guest_cbs, NULL);
6425
+ synchronize_rcu();
59476426 return 0;
59486427 }
59496428 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
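Since the callbacks pointer is now annotated __rcu, published with rcu_assign_pointer() and torn down behind a synchronize_rcu(), readers are expected to sample it under RCU instead of dereferencing the global directly. A minimal sketch of that reader pattern (hedged: the wrapper function is hypothetical and only the existing is_in_guest()/get_guest_ip() callback members are assumed):

/*
 * Hypothetical reader-side helper, not part of the patch above; it would
 * sit next to the registration code and relies only on the existing
 * perf_guest_info_callbacks members.
 */
static unsigned long guest_instruction_pointer(void)
{
	struct perf_guest_info_callbacks *cbs;
	unsigned long ip = 0;

	rcu_read_lock();
	cbs = rcu_dereference(perf_guest_cbs);
	if (cbs && cbs->is_in_guest())
		ip = cbs->get_guest_ip();
	rcu_read_unlock();

	return ip;
}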
....@@ -5965,14 +6444,13 @@
59656444 }
59666445
59676446 static void perf_sample_regs_user(struct perf_regs *regs_user,
5968
- struct pt_regs *regs,
5969
- struct pt_regs *regs_user_copy)
6447
+ struct pt_regs *regs)
59706448 {
59716449 if (user_mode(regs)) {
59726450 regs_user->abi = perf_reg_abi(current);
59736451 regs_user->regs = regs;
59746452 } else if (!(current->flags & PF_KTHREAD)) {
5975
- perf_get_regs_user(regs_user, regs, regs_user_copy);
6453
+ perf_get_regs_user(regs_user, regs);
59766454 } else {
59776455 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
59786456 regs_user->regs = NULL;
....@@ -5991,7 +6469,7 @@
59916469 * Get remaining task size from user stack pointer.
59926470 *
59936471 * It'd be better to take stack vma map and limit this more
5994
- * precisly, but there's no way to get it safely under interrupt,
6472
+ * precisely, but there's no way to get it safely under interrupt,
59956473 * so using TASK_SIZE as limit.
59966474 */
59976475 static u64 perf_ustack_task_size(struct pt_regs *regs)
....@@ -6073,10 +6551,9 @@
60736551
60746552 /* Data. */
60756553 sp = perf_user_stack_pointer(regs);
6076
- fs = get_fs();
6077
- set_fs(USER_DS);
6554
+ fs = force_uaccess_begin();
60786555 rem = __output_copy_user(handle, (void *) sp, dump_size);
6079
- set_fs(fs);
6556
+ force_uaccess_end(fs);
60806557 dyn_size = dump_size - rem;
60816558
60826559 perf_output_skip(handle, rem);
....@@ -6084,6 +6561,122 @@
60846561 /* Dynamic size. */
60856562 perf_output_put(handle, dyn_size);
60866563 }
6564
+}
6565
+
6566
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6567
+ struct perf_sample_data *data,
6568
+ size_t size)
6569
+{
6570
+ struct perf_event *sampler = event->aux_event;
6571
+ struct perf_buffer *rb;
6572
+
6573
+ data->aux_size = 0;
6574
+
6575
+ if (!sampler)
6576
+ goto out;
6577
+
6578
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6579
+ goto out;
6580
+
6581
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6582
+ goto out;
6583
+
6584
+ rb = ring_buffer_get(sampler);
6585
+ if (!rb)
6586
+ goto out;
6587
+
6588
+ /*
6589
+ * If this is an NMI hit inside sampling code, don't take
6590
+ * the sample. See also perf_aux_sample_output().
6591
+ */
6592
+ if (READ_ONCE(rb->aux_in_sampling)) {
6593
+ data->aux_size = 0;
6594
+ } else {
6595
+ size = min_t(size_t, size, perf_aux_size(rb));
6596
+ data->aux_size = ALIGN(size, sizeof(u64));
6597
+ }
6598
+ ring_buffer_put(rb);
6599
+
6600
+out:
6601
+ return data->aux_size;
6602
+}
6603
+
6604
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6605
+ struct perf_event *event,
6606
+ struct perf_output_handle *handle,
6607
+ unsigned long size)
6608
+{
6609
+ unsigned long flags;
6610
+ long ret;
6611
+
6612
+ /*
6613
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6614
+ * paths. If we start calling them in NMI context, they may race with
6615
+ * the IRQ ones: for example, re-starting an event that has just
6616
+ * been stopped. That is why we're using a separate callback that
6617
+ * doesn't change the event state.
6618
+ *
6619
+ * IRQs need to be disabled to prevent IPIs from racing with us.
6620
+ */
6621
+ local_irq_save(flags);
6622
+ /*
6623
+ * Guard against NMI hits inside the critical section;
6624
+ * see also perf_prepare_sample_aux().
6625
+ */
6626
+ WRITE_ONCE(rb->aux_in_sampling, 1);
6627
+ barrier();
6628
+
6629
+ ret = event->pmu->snapshot_aux(event, handle, size);
6630
+
6631
+ barrier();
6632
+ WRITE_ONCE(rb->aux_in_sampling, 0);
6633
+ local_irq_restore(flags);
6634
+
6635
+ return ret;
6636
+}
6637
+
6638
+static void perf_aux_sample_output(struct perf_event *event,
6639
+ struct perf_output_handle *handle,
6640
+ struct perf_sample_data *data)
6641
+{
6642
+ struct perf_event *sampler = event->aux_event;
6643
+ struct perf_buffer *rb;
6644
+ unsigned long pad;
6645
+ long size;
6646
+
6647
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
6648
+ return;
6649
+
6650
+ rb = ring_buffer_get(sampler);
6651
+ if (!rb)
6652
+ return;
6653
+
6654
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6655
+
6656
+ /*
6657
+ * An error here means that perf_output_copy() failed (returned a
6658
+ * non-zero surplus that it didn't copy), which in its current
6659
+ * enlightened implementation is not possible. If that changes, we'd
6660
+ * like to know.
6661
+ */
6662
+ if (WARN_ON_ONCE(size < 0))
6663
+ goto out_put;
6664
+
6665
+ /*
6666
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
6667
+ * perf_prepare_sample_aux(), so should not be more than that.
6668
+ */
6669
+ pad = data->aux_size - size;
6670
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
6671
+ pad = 8;
6672
+
6673
+ if (pad) {
6674
+ u64 zero = 0;
6675
+ perf_output_copy(handle, &zero, pad);
6676
+ }
6677
+
6678
+out_put:
6679
+ ring_buffer_put(rb);
60876680 }
60886681
60896682 static void __perf_event_header__init_id(struct perf_event_header *header,
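Taken together, perf_prepare_sample_aux(), perf_pmu_snapshot_aux() and perf_aux_sample_output() let a sampling event embed a snapshot of a grouped AUX event's buffer in its own records (PERF_SAMPLE_AUX), with rb->aux_in_sampling guarding against NMI re-entry. From userspace the feature is driven entirely by perf_event_attr; a hedged sketch (the AUX-capable PMU type is an assumption, e.g. read from sysfs, and error handling is elided):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: open an AUX-producing leader (a tracing PMU whose dynamic
 * type was read from sysfs) plus a cycles sampler grouped under it that
 * embeds up to 4 KiB of AUX data per sample.
 */
static int open_aux_sampler(int aux_pmu_type)
{
	struct perf_event_attr leader, sampler;
	int leader_fd;

	memset(&leader, 0, sizeof(leader));
	leader.size = sizeof(leader);
	leader.type = aux_pmu_type;		/* assumption: AUX-capable PMU */
	leader.exclude_kernel = 1;

	memset(&sampler, 0, sizeof(sampler));
	sampler.size = sizeof(sampler);
	sampler.type = PERF_TYPE_HARDWARE;
	sampler.config = PERF_COUNT_HW_CPU_CYCLES;
	sampler.sample_period = 100000;
	sampler.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_AUX;
	sampler.aux_sample_size = 4096;		/* clamped by the kernel side above */
	sampler.exclude_kernel = 1;

	leader_fd = syscall(__NR_perf_event_open, &leader, 0, -1, -1, 0);
	if (leader_fd < 0)
		return -1;
	/* grouping under the AUX event is what enables the aux_event pairing */
	return syscall(__NR_perf_event_open, &sampler, 0, -1, leader_fd, 0);
}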
....@@ -6255,6 +6848,11 @@
62556848 perf_output_read_one(handle, event, enabled, running);
62566849 }
62576850
6851
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
6852
+{
6853
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6854
+}
6855
+
62586856 void perf_output_sample(struct perf_output_handle *handle,
62596857 struct perf_event_header *header,
62606858 struct perf_sample_data *data,
....@@ -6343,6 +6941,8 @@
63436941 * sizeof(struct perf_branch_entry);
63446942
63456943 perf_output_put(handle, data->br_stack->nr);
6944
+ if (perf_sample_save_hw_index(event))
6945
+ perf_output_put(handle, data->br_stack->hw_idx);
63466946 perf_output_copy(handle, data->br_stack->entries, size);
63476947 } else {
63486948 /*
....@@ -6405,11 +7005,21 @@
64057005 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
64067006 perf_output_put(handle, data->phys_addr);
64077007
7008
+ if (sample_type & PERF_SAMPLE_CGROUP)
7009
+ perf_output_put(handle, data->cgroup);
7010
+
7011
+ if (sample_type & PERF_SAMPLE_AUX) {
7012
+ perf_output_put(handle, data->aux_size);
7013
+
7014
+ if (data->aux_size)
7015
+ perf_aux_sample_output(event, handle, data);
7016
+ }
7017
+
64087018 if (!event->attr.watermark) {
64097019 int wakeup_events = event->attr.wakeup_events;
64107020
64117021 if (wakeup_events) {
6412
- struct ring_buffer *rb = handle->rb;
7022
+ struct perf_buffer *rb = handle->rb;
64137023 int events = local_inc_return(&rb->events);
64147024
64157025 if (events >= wakeup_events) {
....@@ -6437,14 +7047,14 @@
64377047 * Walking the pages tables for user address.
64387048 * Interrupts are disabled, so it prevents any tear down
64397049 * of the page tables.
6440
- * Try IRQ-safe __get_user_pages_fast first.
7050
+ * Try IRQ-safe get_user_page_fast_only first.
64417051 * If failed, leave phys_addr as 0.
64427052 */
64437053 if (current->mm != NULL) {
64447054 struct page *p;
64457055
64467056 pagefault_disable();
6447
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
7057
+ if (get_user_page_fast_only(virt, 0, &p)) {
64487058 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
64497059 put_page(p);
64507060 }
....@@ -6532,6 +7142,9 @@
65327142 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
65337143 int size = sizeof(u64); /* nr */
65347144 if (data->br_stack) {
7145
+ if (perf_sample_save_hw_index(event))
7146
+ size += sizeof(u64);
7147
+
65357148 size += data->br_stack->nr
65367149 * sizeof(struct perf_branch_entry);
65377150 }
....@@ -6539,8 +7152,7 @@
65397152 }
65407153
65417154 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6542
- perf_sample_regs_user(&data->regs_user, regs,
6543
- &data->regs_user_copy);
7155
+ perf_sample_regs_user(&data->regs_user, regs);
65447156
65457157 if (sample_type & PERF_SAMPLE_REGS_USER) {
65467158 /* regs dump ABI info */
....@@ -6556,7 +7168,7 @@
65567168
65577169 if (sample_type & PERF_SAMPLE_STACK_USER) {
65587170 /*
6559
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
7171
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
65607172 * processed as the last one or have additional check added
65617173 * in case new sample type is added, because we could eat
65627174 * up the rest of the sample size.
....@@ -6596,25 +7208,67 @@
65967208
65977209 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
65987210 data->phys_addr = perf_virt_to_phys(data->addr);
7211
+
7212
+#ifdef CONFIG_CGROUP_PERF
7213
+ if (sample_type & PERF_SAMPLE_CGROUP) {
7214
+ struct cgroup *cgrp;
7215
+
7216
+ /* protected by RCU */
7217
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7218
+ data->cgroup = cgroup_id(cgrp);
7219
+ }
7220
+#endif
7221
+
7222
+ if (sample_type & PERF_SAMPLE_AUX) {
7223
+ u64 size;
7224
+
7225
+ header->size += sizeof(u64); /* size */
7226
+
7227
+ /*
7228
+ * Given the 16bit nature of header::size, an AUX sample can
7229
+ * easily overflow it, what with all the preceding sample bits.
7230
+ * Make sure this doesn't happen by using up to U16_MAX bytes
7231
+ * per sample in total (rounded down to 8 byte boundary).
7232
+ */
7233
+ size = min_t(size_t, U16_MAX - header->size,
7234
+ event->attr.aux_sample_size);
7235
+ size = rounddown(size, 8);
7236
+ size = perf_prepare_sample_aux(event, data, size);
7237
+
7238
+ WARN_ON_ONCE(size + header->size > U16_MAX);
7239
+ header->size += size;
7240
+ }
7241
+ /*
7242
+ * If you're adding more sample types here, you likely need to do
7243
+ * something about the overflowing header::size, like repurpose the
7244
+ * lowest 3 bits of size, which should always be zero at the moment.
7245
+ * This raises a more important question: do we really need 512k sized
7246
+ * samples, and why? Good argumentation is in order for whatever you
7247
+ * do here next.
7248
+ */
7249
+ WARN_ON_ONCE(header->size & 7);
65997250 }
66007251
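For a concrete feel of the PERF_SAMPLE_AUX clamp above: with, say, 64 bytes of sample data already accounted in header->size and an attr.aux_sample_size of 128 KiB, the AUX payload is limited to U16_MAX - 64 = 65471 bytes and then rounded down to 65464 so the record stays u64-aligned. A tiny standalone restatement of that arithmetic (numbers are made up for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int header_size = 64;			/* sample bytes so far (example) */
	unsigned long aux_sample_size = 128 * 1024;	/* from perf_event_attr */
	unsigned long size = 0xffff - header_size;	/* room left in a 16-bit size */

	if (size > aux_sample_size)
		size = aux_sample_size;
	size -= size % 8;				/* rounddown(size, 8) */
	printf("aux payload clamped to %lu bytes\n", size);	/* prints 65464 */
	return 0;
}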
6601
-static __always_inline void
7252
+static __always_inline int
66027253 __perf_event_output(struct perf_event *event,
66037254 struct perf_sample_data *data,
66047255 struct pt_regs *regs,
66057256 int (*output_begin)(struct perf_output_handle *,
7257
+ struct perf_sample_data *,
66067258 struct perf_event *,
66077259 unsigned int))
66087260 {
66097261 struct perf_output_handle handle;
66107262 struct perf_event_header header;
7263
+ int err;
66117264
66127265 /* protect the callchain buffers */
66137266 rcu_read_lock();
66147267
66157268 perf_prepare_sample(&header, data, event, regs);
66167269
6617
- if (output_begin(&handle, event, header.size))
7270
+ err = output_begin(&handle, data, event, header.size);
7271
+ if (err)
66187272 goto exit;
66197273
66207274 perf_output_sample(&handle, &header, data, event);
....@@ -6623,6 +7277,7 @@
66237277
66247278 exit:
66257279 rcu_read_unlock();
7280
+ return err;
66267281 }
66277282
66287283 void
....@@ -6641,12 +7296,12 @@
66417296 __perf_event_output(event, data, regs, perf_output_begin_backward);
66427297 }
66437298
6644
-void
7299
+int
66457300 perf_event_output(struct perf_event *event,
66467301 struct perf_sample_data *data,
66477302 struct pt_regs *regs)
66487303 {
6649
- __perf_event_output(event, data, regs, perf_output_begin);
7304
+ return __perf_event_output(event, data, regs, perf_output_begin);
66507305 }
66517306
66527307 /*
....@@ -6678,7 +7333,7 @@
66787333 int ret;
66797334
66807335 perf_event_header__init_id(&read_event.header, &sample, event);
6681
- ret = perf_output_begin(&handle, event, read_event.header.size);
7336
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
66827337 if (ret)
66837338 return;
66847339
....@@ -6823,7 +7478,7 @@
68237478 }
68247479
68257480 struct remote_output {
6826
- struct ring_buffer *rb;
7481
+ struct perf_buffer *rb;
68277482 int err;
68287483 };
68297484
....@@ -6831,7 +7486,7 @@
68317486 {
68327487 struct perf_event *parent = event->parent;
68337488 struct remote_output *ro = data;
6834
- struct ring_buffer *rb = ro->rb;
7489
+ struct perf_buffer *rb = ro->rb;
68357490 struct stop_event_data sd = {
68367491 .event = event,
68377492 };
....@@ -6947,7 +7602,7 @@
69477602
69487603 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
69497604
6950
- ret = perf_output_begin(&handle, event,
7605
+ ret = perf_output_begin(&handle, &sample, event,
69517606 task_event->event_id.header.size);
69527607 if (ret)
69537608 goto out;
....@@ -7050,7 +7705,7 @@
70507705 return;
70517706
70527707 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7053
- ret = perf_output_begin(&handle, event,
7708
+ ret = perf_output_begin(&handle, &sample, event,
70547709 comm_event->event_id.header.size);
70557710
70567711 if (ret)
....@@ -7150,7 +7805,7 @@
71507805
71517806 perf_event_header__init_id(&namespaces_event->event_id.header,
71527807 &sample, event);
7153
- ret = perf_output_begin(&handle, event,
7808
+ ret = perf_output_begin(&handle, &sample, event,
71547809 namespaces_event->event_id.header.size);
71557810 if (ret)
71567811 goto out;
....@@ -7175,7 +7830,7 @@
71757830 {
71767831 struct path ns_path;
71777832 struct inode *ns_inode;
7178
- void *error;
7833
+ int error;
71797834
71807835 error = ns_get_path(&ns_path, task, ns_ops);
71817836 if (!error) {
....@@ -7245,6 +7900,105 @@
72457900 }
72467901
72477902 /*
7903
+ * cgroup tracking
7904
+ */
7905
+#ifdef CONFIG_CGROUP_PERF
7906
+
7907
+struct perf_cgroup_event {
7908
+ char *path;
7909
+ int path_size;
7910
+ struct {
7911
+ struct perf_event_header header;
7912
+ u64 id;
7913
+ char path[];
7914
+ } event_id;
7915
+};
7916
+
7917
+static int perf_event_cgroup_match(struct perf_event *event)
7918
+{
7919
+ return event->attr.cgroup;
7920
+}
7921
+
7922
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
7923
+{
7924
+ struct perf_cgroup_event *cgroup_event = data;
7925
+ struct perf_output_handle handle;
7926
+ struct perf_sample_data sample;
7927
+ u16 header_size = cgroup_event->event_id.header.size;
7928
+ int ret;
7929
+
7930
+ if (!perf_event_cgroup_match(event))
7931
+ return;
7932
+
7933
+ perf_event_header__init_id(&cgroup_event->event_id.header,
7934
+ &sample, event);
7935
+ ret = perf_output_begin(&handle, &sample, event,
7936
+ cgroup_event->event_id.header.size);
7937
+ if (ret)
7938
+ goto out;
7939
+
7940
+ perf_output_put(&handle, cgroup_event->event_id);
7941
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
7942
+
7943
+ perf_event__output_id_sample(event, &handle, &sample);
7944
+
7945
+ perf_output_end(&handle);
7946
+out:
7947
+ cgroup_event->event_id.header.size = header_size;
7948
+}
7949
+
7950
+static void perf_event_cgroup(struct cgroup *cgrp)
7951
+{
7952
+ struct perf_cgroup_event cgroup_event;
7953
+ char path_enomem[16] = "//enomem";
7954
+ char *pathname;
7955
+ size_t size;
7956
+
7957
+ if (!atomic_read(&nr_cgroup_events))
7958
+ return;
7959
+
7960
+ cgroup_event = (struct perf_cgroup_event){
7961
+ .event_id = {
7962
+ .header = {
7963
+ .type = PERF_RECORD_CGROUP,
7964
+ .misc = 0,
7965
+ .size = sizeof(cgroup_event.event_id),
7966
+ },
7967
+ .id = cgroup_id(cgrp),
7968
+ },
7969
+ };
7970
+
7971
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
7972
+ if (pathname == NULL) {
7973
+ cgroup_event.path = path_enomem;
7974
+ } else {
7975
+ /* just to be sure to have enough space for alignment */
7976
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
7977
+ cgroup_event.path = pathname;
7978
+ }
7979
+
7980
+ /*
7981
+ * Since our buffer works in 8 byte units we need to align our string
7982
+ * size to a multiple of 8. However, we must guarantee the tail end is
7983
+ * zero'd out to avoid leaking random bits to userspace.
7984
+ */
7985
+ size = strlen(cgroup_event.path) + 1;
7986
+ while (!IS_ALIGNED(size, sizeof(u64)))
7987
+ cgroup_event.path[size++] = '\0';
7988
+
7989
+ cgroup_event.event_id.header.size += size;
7990
+ cgroup_event.path_size = size;
7991
+
7992
+ perf_iterate_sb(perf_event_cgroup_output,
7993
+ &cgroup_event,
7994
+ NULL);
7995
+
7996
+ kfree(pathname);
7997
+}
7998
+
7999
+#endif
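The PERF_RECORD_CGROUP side-band record generated above pairs with the PERF_SAMPLE_CGROUP sample field added earlier in perf_prepare_sample()/perf_output_sample(), so a profiler can map the u64 cgroup id in each sample back to a path. A hedged userspace sketch of enabling both (error handling elided; the pid/cpu choice is just one common configuration):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: request cgroup ids in samples plus PERF_RECORD_CGROUP
 * side-band records announcing id -> path mappings as cgroups appear.
 */
static int open_cgroup_aware_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CGROUP;
	attr.cgroup = 1;	/* emit PERF_RECORD_CGROUP side-band events */

	/* system-wide on CPU 0; requires the usual perf privileges */
	return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
}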
8000
+
8001
+/*
72488002 * mmap tracking
72498003 */
72508004
....@@ -7304,7 +8058,7 @@
73048058 }
73058059
73068060 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7307
- ret = perf_output_begin(&handle, event,
8061
+ ret = perf_output_begin(&handle, &sample, event,
73088062 mmap_event->event_id.header.size);
73098063 if (ret)
73108064 goto out;
....@@ -7364,7 +8118,7 @@
73648118 flags |= MAP_EXECUTABLE;
73658119 if (vma->vm_flags & VM_LOCKED)
73668120 flags |= MAP_LOCKED;
7367
- if (vma->vm_flags & VM_HUGETLB)
8121
+ if (is_vm_hugetlb_page(vma))
73688122 flags |= MAP_HUGETLB;
73698123
73708124 if (file) {
....@@ -7614,7 +8368,7 @@
76148368 int ret;
76158369
76168370 perf_event_header__init_id(&rec.header, &sample, event);
7617
- ret = perf_output_begin(&handle, event, rec.header.size);
8371
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
76188372
76198373 if (ret)
76208374 return;
....@@ -7648,7 +8402,7 @@
76488402
76498403 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
76508404
7651
- ret = perf_output_begin(&handle, event,
8405
+ ret = perf_output_begin(&handle, &sample, event,
76528406 lost_samples_event.header.size);
76538407 if (ret)
76548408 return;
....@@ -7703,7 +8457,7 @@
77038457
77048458 perf_event_header__init_id(&se->event_id.header, &sample, event);
77058459
7706
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
8460
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
77078461 if (ret)
77088462 return;
77098463
....@@ -7778,7 +8532,7 @@
77788532
77798533 perf_event_header__init_id(&throttle_event.header, &sample, event);
77808534
7781
- ret = perf_output_begin(&handle, event,
8535
+ ret = perf_output_begin(&handle, &sample, event,
77828536 throttle_event.header.size);
77838537 if (ret)
77848538 return;
....@@ -7786,6 +8540,290 @@
77868540 perf_output_put(&handle, throttle_event);
77878541 perf_event__output_id_sample(event, &handle, &sample);
77888542 perf_output_end(&handle);
8543
+}
8544
+
8545
+/*
8546
+ * ksymbol register/unregister tracking
8547
+ */
8548
+
8549
+struct perf_ksymbol_event {
8550
+ const char *name;
8551
+ int name_len;
8552
+ struct {
8553
+ struct perf_event_header header;
8554
+ u64 addr;
8555
+ u32 len;
8556
+ u16 ksym_type;
8557
+ u16 flags;
8558
+ } event_id;
8559
+};
8560
+
8561
+static int perf_event_ksymbol_match(struct perf_event *event)
8562
+{
8563
+ return event->attr.ksymbol;
8564
+}
8565
+
8566
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8567
+{
8568
+ struct perf_ksymbol_event *ksymbol_event = data;
8569
+ struct perf_output_handle handle;
8570
+ struct perf_sample_data sample;
8571
+ int ret;
8572
+
8573
+ if (!perf_event_ksymbol_match(event))
8574
+ return;
8575
+
8576
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
8577
+ &sample, event);
8578
+ ret = perf_output_begin(&handle, &sample, event,
8579
+ ksymbol_event->event_id.header.size);
8580
+ if (ret)
8581
+ return;
8582
+
8583
+ perf_output_put(&handle, ksymbol_event->event_id);
8584
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8585
+ perf_event__output_id_sample(event, &handle, &sample);
8586
+
8587
+ perf_output_end(&handle);
8588
+}
8589
+
8590
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8591
+ const char *sym)
8592
+{
8593
+ struct perf_ksymbol_event ksymbol_event;
8594
+ char name[KSYM_NAME_LEN];
8595
+ u16 flags = 0;
8596
+ int name_len;
8597
+
8598
+ if (!atomic_read(&nr_ksymbol_events))
8599
+ return;
8600
+
8601
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8602
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8603
+ goto err;
8604
+
8605
+ strlcpy(name, sym, KSYM_NAME_LEN);
8606
+ name_len = strlen(name) + 1;
8607
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
8608
+ name[name_len++] = '\0';
8609
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8610
+
8611
+ if (unregister)
8612
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8613
+
8614
+ ksymbol_event = (struct perf_ksymbol_event){
8615
+ .name = name,
8616
+ .name_len = name_len,
8617
+ .event_id = {
8618
+ .header = {
8619
+ .type = PERF_RECORD_KSYMBOL,
8620
+ .size = sizeof(ksymbol_event.event_id) +
8621
+ name_len,
8622
+ },
8623
+ .addr = addr,
8624
+ .len = len,
8625
+ .ksym_type = ksym_type,
8626
+ .flags = flags,
8627
+ },
8628
+ };
8629
+
8630
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8631
+ return;
8632
+err:
8633
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8634
+}
8635
+
8636
+/*
8637
+ * bpf program load/unload tracking
8638
+ */
8639
+
8640
+struct perf_bpf_event {
8641
+ struct bpf_prog *prog;
8642
+ struct {
8643
+ struct perf_event_header header;
8644
+ u16 type;
8645
+ u16 flags;
8646
+ u32 id;
8647
+ u8 tag[BPF_TAG_SIZE];
8648
+ } event_id;
8649
+};
8650
+
8651
+static int perf_event_bpf_match(struct perf_event *event)
8652
+{
8653
+ return event->attr.bpf_event;
8654
+}
8655
+
8656
+static void perf_event_bpf_output(struct perf_event *event, void *data)
8657
+{
8658
+ struct perf_bpf_event *bpf_event = data;
8659
+ struct perf_output_handle handle;
8660
+ struct perf_sample_data sample;
8661
+ int ret;
8662
+
8663
+ if (!perf_event_bpf_match(event))
8664
+ return;
8665
+
8666
+ perf_event_header__init_id(&bpf_event->event_id.header,
8667
+ &sample, event);
8668
+ ret = perf_output_begin(&handle, &sample, event,
8669
+ bpf_event->event_id.header.size);
8670
+ if (ret)
8671
+ return;
8672
+
8673
+ perf_output_put(&handle, bpf_event->event_id);
8674
+ perf_event__output_id_sample(event, &handle, &sample);
8675
+
8676
+ perf_output_end(&handle);
8677
+}
8678
+
8679
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8680
+ enum perf_bpf_event_type type)
8681
+{
8682
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8683
+ int i;
8684
+
8685
+ if (prog->aux->func_cnt == 0) {
8686
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8687
+ (u64)(unsigned long)prog->bpf_func,
8688
+ prog->jited_len, unregister,
8689
+ prog->aux->ksym.name);
8690
+ } else {
8691
+ for (i = 0; i < prog->aux->func_cnt; i++) {
8692
+ struct bpf_prog *subprog = prog->aux->func[i];
8693
+
8694
+ perf_event_ksymbol(
8695
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
8696
+ (u64)(unsigned long)subprog->bpf_func,
8697
+ subprog->jited_len, unregister,
8698
+ subprog->aux->ksym.name);
8699
+ }
8700
+ }
8701
+}
8702
+
8703
+void perf_event_bpf_event(struct bpf_prog *prog,
8704
+ enum perf_bpf_event_type type,
8705
+ u16 flags)
8706
+{
8707
+ struct perf_bpf_event bpf_event;
8708
+
8709
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
8710
+ type >= PERF_BPF_EVENT_MAX)
8711
+ return;
8712
+
8713
+ switch (type) {
8714
+ case PERF_BPF_EVENT_PROG_LOAD:
8715
+ case PERF_BPF_EVENT_PROG_UNLOAD:
8716
+ if (atomic_read(&nr_ksymbol_events))
8717
+ perf_event_bpf_emit_ksymbols(prog, type);
8718
+ break;
8719
+ default:
8720
+ break;
8721
+ }
8722
+
8723
+ if (!atomic_read(&nr_bpf_events))
8724
+ return;
8725
+
8726
+ bpf_event = (struct perf_bpf_event){
8727
+ .prog = prog,
8728
+ .event_id = {
8729
+ .header = {
8730
+ .type = PERF_RECORD_BPF_EVENT,
8731
+ .size = sizeof(bpf_event.event_id),
8732
+ },
8733
+ .type = type,
8734
+ .flags = flags,
8735
+ .id = prog->aux->id,
8736
+ },
8737
+ };
8738
+
8739
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8740
+
8741
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8742
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8743
+}
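Both the ksymbol and the BPF records above are opt-in via attribute bits; a profiler that wants to symbolize JITed BPF programs would typically set both, since PERF_RECORD_BPF_EVENT only carries the program id/tag while the matching PERF_RECORD_KSYMBOL records carry the address ranges and names. A hedged userspace sketch:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: subscribe to kernel-symbol registration and BPF program
 * load/unload side-band records so samples inside JITed BPF code can be
 * symbolized later. A dummy software event is enough to carry side-band.
 */
static int open_bpf_symbolizable_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;	/* side-band only, no counting */
	attr.ksymbol = 1;	/* PERF_RECORD_KSYMBOL for each (sub)program */
	attr.bpf_event = 1;	/* PERF_RECORD_BPF_EVENT with id + tag */
	attr.sample_id_all = 1;

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}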
8744
+
8745
+struct perf_text_poke_event {
8746
+ const void *old_bytes;
8747
+ const void *new_bytes;
8748
+ size_t pad;
8749
+ u16 old_len;
8750
+ u16 new_len;
8751
+
8752
+ struct {
8753
+ struct perf_event_header header;
8754
+
8755
+ u64 addr;
8756
+ } event_id;
8757
+};
8758
+
8759
+static int perf_event_text_poke_match(struct perf_event *event)
8760
+{
8761
+ return event->attr.text_poke;
8762
+}
8763
+
8764
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
8765
+{
8766
+ struct perf_text_poke_event *text_poke_event = data;
8767
+ struct perf_output_handle handle;
8768
+ struct perf_sample_data sample;
8769
+ u64 padding = 0;
8770
+ int ret;
8771
+
8772
+ if (!perf_event_text_poke_match(event))
8773
+ return;
8774
+
8775
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8776
+
8777
+ ret = perf_output_begin(&handle, &sample, event,
8778
+ text_poke_event->event_id.header.size);
8779
+ if (ret)
8780
+ return;
8781
+
8782
+ perf_output_put(&handle, text_poke_event->event_id);
8783
+ perf_output_put(&handle, text_poke_event->old_len);
8784
+ perf_output_put(&handle, text_poke_event->new_len);
8785
+
8786
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8787
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8788
+
8789
+ if (text_poke_event->pad)
8790
+ __output_copy(&handle, &padding, text_poke_event->pad);
8791
+
8792
+ perf_event__output_id_sample(event, &handle, &sample);
8793
+
8794
+ perf_output_end(&handle);
8795
+}
8796
+
8797
+void perf_event_text_poke(const void *addr, const void *old_bytes,
8798
+ size_t old_len, const void *new_bytes, size_t new_len)
8799
+{
8800
+ struct perf_text_poke_event text_poke_event;
8801
+ size_t tot, pad;
8802
+
8803
+ if (!atomic_read(&nr_text_poke_events))
8804
+ return;
8805
+
8806
+ tot = sizeof(text_poke_event.old_len) + old_len;
8807
+ tot += sizeof(text_poke_event.new_len) + new_len;
8808
+ pad = ALIGN(tot, sizeof(u64)) - tot;
8809
+
8810
+ text_poke_event = (struct perf_text_poke_event){
8811
+ .old_bytes = old_bytes,
8812
+ .new_bytes = new_bytes,
8813
+ .pad = pad,
8814
+ .old_len = old_len,
8815
+ .new_len = new_len,
8816
+ .event_id = {
8817
+ .header = {
8818
+ .type = PERF_RECORD_TEXT_POKE,
8819
+ .misc = PERF_RECORD_MISC_KERNEL,
8820
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
8821
+ },
8822
+ .addr = (unsigned long)addr,
8823
+ },
8824
+ };
8825
+
8826
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
77898827 }
77908828
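The TEXT_POKE record body emitted above is { u64 addr; u16 old_len; u16 new_len; old bytes; new bytes; pad }, with the pad computed so the variable part ends on a u64 boundary; patching a 5-byte call site, for instance, gives tot = (2 + 5) + (2 + 5) = 14 and pad = ALIGN(14, 8) - 14 = 2. A hedged consumer-side sketch of walking that layout (the parser is hypothetical; real records arrive through the mmap'd ring buffer, and any sample_id fields follow after the padding):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical parser for the payload that follows the perf_event_header
 * of a PERF_RECORD_TEXT_POKE record: addr, old_len, new_len, old bytes,
 * new bytes, then zero padding up to the next u64 boundary.
 */
static void parse_text_poke(const unsigned char *payload)
{
	uint64_t addr;
	uint16_t old_len, new_len;
	const unsigned char *bytes;

	memcpy(&addr, payload, sizeof(addr));
	memcpy(&old_len, payload + 8, sizeof(old_len));
	memcpy(&new_len, payload + 10, sizeof(new_len));
	bytes = payload + 12;	/* old bytes, then new bytes immediately after */

	printf("poke at %#llx: %u old bytes, %u new bytes\n",
	       (unsigned long long)addr, old_len, new_len);
	(void)bytes;
}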
77918829 void perf_event_itrace_started(struct perf_event *event)
....@@ -7818,7 +8856,7 @@
78188856 rec.tid = perf_event_tid(event, current);
78198857
78208858 perf_event_header__init_id(&rec.header, &sample, event);
7821
- ret = perf_output_begin(&handle, event, rec.header.size);
8859
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
78228860
78238861 if (ret)
78248862 return;
....@@ -8386,9 +9424,9 @@
83869424 if (event->hw.state & PERF_HES_STOPPED)
83879425 return 0;
83889426 /*
8389
- * All tracepoints are from kernel-space.
9427
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
83909428 */
8391
- if (event->attr.exclude_kernel)
9429
+ if (event->attr.exclude_kernel && !user_mode(regs))
83929430 return 0;
83939431
83949432 if (!perf_tp_filter_match(event, data))
....@@ -8514,30 +9552,39 @@
85149552 *
85159553 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
85169554 * if not set, create kprobe/uprobe
9555
+ *
9556
+ * The following values specify a reference counter (or semaphore in the
9557
+ * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
9558
+ * Defined Tracepoints (USDT). Currently, 32 bits of config carry the offset.
9559
+ *
9560
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset
9561
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
85179562 */
85189563 enum perf_probe_config {
85199564 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
9565
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9566
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
85209567 };
85219568
85229569 PMU_FORMAT_ATTR(retprobe, "config:0");
9570
+#endif
85239571
8524
-static struct attribute *probe_attrs[] = {
9572
+#ifdef CONFIG_KPROBE_EVENTS
9573
+static struct attribute *kprobe_attrs[] = {
85259574 &format_attr_retprobe.attr,
85269575 NULL,
85279576 };
85289577
8529
-static struct attribute_group probe_format_group = {
9578
+static struct attribute_group kprobe_format_group = {
85309579 .name = "format",
8531
- .attrs = probe_attrs,
9580
+ .attrs = kprobe_attrs,
85329581 };
85339582
8534
-static const struct attribute_group *probe_attr_groups[] = {
8535
- &probe_format_group,
9583
+static const struct attribute_group *kprobe_attr_groups[] = {
9584
+ &kprobe_format_group,
85369585 NULL,
85379586 };
8538
-#endif
85399587
8540
-#ifdef CONFIG_KPROBE_EVENTS
85419588 static int perf_kprobe_event_init(struct perf_event *event);
85429589 static struct pmu perf_kprobe = {
85439590 .task_ctx_nr = perf_sw_context,
....@@ -8547,7 +9594,7 @@
85479594 .start = perf_swevent_start,
85489595 .stop = perf_swevent_stop,
85499596 .read = perf_swevent_read,
8550
- .attr_groups = probe_attr_groups,
9597
+ .attr_groups = kprobe_attr_groups,
85519598 };
85529599
85539600 static int perf_kprobe_event_init(struct perf_event *event)
....@@ -8558,7 +9605,7 @@
85589605 if (event->attr.type != perf_kprobe.type)
85599606 return -ENOENT;
85609607
8561
- if (!capable(CAP_SYS_ADMIN))
9608
+ if (!perfmon_capable())
85629609 return -EACCES;
85639610
85649611 /*
....@@ -8579,6 +9626,24 @@
85799626 #endif /* CONFIG_KPROBE_EVENTS */
85809627
85819628 #ifdef CONFIG_UPROBE_EVENTS
9629
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9630
+
9631
+static struct attribute *uprobe_attrs[] = {
9632
+ &format_attr_retprobe.attr,
9633
+ &format_attr_ref_ctr_offset.attr,
9634
+ NULL,
9635
+};
9636
+
9637
+static struct attribute_group uprobe_format_group = {
9638
+ .name = "format",
9639
+ .attrs = uprobe_attrs,
9640
+};
9641
+
9642
+static const struct attribute_group *uprobe_attr_groups[] = {
9643
+ &uprobe_format_group,
9644
+ NULL,
9645
+};
9646
+
85829647 static int perf_uprobe_event_init(struct perf_event *event);
85839648 static struct pmu perf_uprobe = {
85849649 .task_ctx_nr = perf_sw_context,
....@@ -8588,18 +9653,19 @@
85889653 .start = perf_swevent_start,
85899654 .stop = perf_swevent_stop,
85909655 .read = perf_swevent_read,
8591
- .attr_groups = probe_attr_groups,
9656
+ .attr_groups = uprobe_attr_groups,
85929657 };
85939658
85949659 static int perf_uprobe_event_init(struct perf_event *event)
85959660 {
85969661 int err;
9662
+ unsigned long ref_ctr_offset;
85979663 bool is_retprobe;
85989664
85999665 if (event->attr.type != perf_uprobe.type)
86009666 return -ENOENT;
86019667
8602
- if (!capable(CAP_SYS_ADMIN))
9668
+ if (!perfmon_capable())
86039669 return -EACCES;
86049670
86059671 /*
....@@ -8609,7 +9675,8 @@
86099675 return -EOPNOTSUPP;
86109676
86119677 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8612
- err = perf_uprobe_init(event, is_retprobe);
9678
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9679
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
86139680 if (err)
86149681 return err;
86159682
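The decode above is the mirror image of what userspace encodes: bit 0 of attr.config selects a uretprobe, while bits 32-63 (per the ref_ctr_offset format attribute) carry the USDT reference-counter offset. A hedged sketch of building such an event against the dynamic "uprobe" PMU (the sysfs type lookup is assumed and not shown):

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: pmu_type would normally be read from
 * /sys/bus/event_source/devices/uprobe/type (helper not shown).
 */
static int open_usdt_uprobe(int pmu_type, const char *binary_path,
			    uint64_t probe_offset, uint64_t ref_ctr_offset,
			    int is_retprobe)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;
	/* config:0 = retprobe, config:32-63 = reference counter offset */
	attr.config = (ref_ctr_offset << 32) | (is_retprobe ? 1 : 0);
	attr.uprobe_path = (uint64_t)(unsigned long)binary_path;	/* config1 */
	attr.probe_offset = probe_offset;				/* config2 */

	return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
}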
....@@ -8647,7 +9714,6 @@
86479714 int ret = 0;
86489715
86499716 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8650
- preempt_disable();
86519717 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
86529718 goto out;
86539719 rcu_read_lock();
....@@ -8655,7 +9721,6 @@
86559721 rcu_read_unlock();
86569722 out:
86579723 __this_cpu_dec(bpf_prog_active);
8658
- preempt_enable();
86599724 if (!ret)
86609725 return;
86619726
....@@ -8676,6 +9741,24 @@
86769741 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
86779742 if (IS_ERR(prog))
86789743 return PTR_ERR(prog);
9744
+
9745
+ if (event->attr.precise_ip &&
9746
+ prog->call_get_stack &&
9747
+ (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9748
+ event->attr.exclude_callchain_kernel ||
9749
+ event->attr.exclude_callchain_user)) {
9750
+ /*
9751
+ * On perf_event with precise_ip, calling bpf_get_stack()
9752
+ * may trigger unwinder warnings and occasional crashes.
9753
+ * bpf_get_[stack|stackid] works around this issue by using
9754
+ * callchain attached to perf_sample_data. If the
9755
+ * perf_event does not have a full (kernel and user) callchain
9756
+ * attached to perf_sample_data, do not allow attaching BPF
9757
+ * program that calls bpf_get_[stack|stackid].
9758
+ */
9759
+ bpf_prog_put(prog);
9760
+ return -EPROTO;
9761
+ }
86799762
86809763 event->prog = prog;
86819764 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
....@@ -8875,7 +9958,7 @@
88759958 /*
88769959 * Scan through mm's vmas and see if one of them matches the
88779960 * @filter; if so, adjust filter's address range.
8878
- * Called with mm::mmap_sem down for reading.
9961
+ * Called with mm::mmap_lock down for reading.
88799962 */
88809963 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
88819964 struct mm_struct *mm,
....@@ -8917,7 +10000,7 @@
891710000 if (!mm)
891810001 goto restart;
891910002
8920
- down_read(&mm->mmap_sem);
10003
+ mmap_read_lock(mm);
892110004 }
892210005
892310006 raw_spin_lock_irqsave(&ifh->lock, flags);
....@@ -8943,7 +10026,7 @@
894310026 raw_spin_unlock_irqrestore(&ifh->lock, flags);
894410027
894510028 if (ifh->nr_file_filters) {
8946
- up_read(&mm->mmap_sem);
10029
+ mmap_read_unlock(mm);
894710030
894810031 mmput(mm);
894910032 }
....@@ -9050,6 +10133,7 @@
905010133 case IF_SRC_KERNELADDR:
905110134 case IF_SRC_KERNEL:
905210135 kernel = 1;
10136
+ fallthrough;
905310137
905410138 case IF_SRC_FILEADDR:
905510139 case IF_SRC_FILE:
....@@ -9136,8 +10220,11 @@
913610220 }
913710221
913810222 /* ready to consume more filters */
10223
+ kfree(filename);
10224
+ filename = NULL;
913910225 state = IF_STATE_ACTION;
914010226 filter = NULL;
10227
+ kernel = 0;
914110228 }
914210229 }
914310230
....@@ -9285,7 +10372,7 @@
928510372 period = max_t(u64, 10000, hwc->sample_period);
928610373 }
928710374 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9288
- HRTIMER_MODE_REL_PINNED);
10375
+ HRTIMER_MODE_REL_PINNED_HARD);
928910376 }
929010377
929110378 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
....@@ -9307,7 +10394,7 @@
930710394 if (!is_sampling_event(event))
930810395 return;
930910396
9310
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10397
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
931110398 hwc->hrtimer.function = perf_swevent_hrtimer;
931210399
931310400 /*
....@@ -9696,6 +10783,12 @@
969610783 if (ret)
969710784 goto del_dev;
969810785
10786
+ if (pmu->attr_update)
10787
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
10788
+
10789
+ if (ret)
10790
+ goto del_dev;
10791
+
969910792 out:
970010793 return ret;
970110794
....@@ -9712,7 +10805,7 @@
971210805
971310806 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
971410807 {
9715
- int cpu, ret;
10808
+ int cpu, ret, max = PERF_TYPE_MAX;
971610809
971710810 mutex_lock(&pmus_lock);
971810811 ret = -ENOMEM;
....@@ -9725,12 +10818,17 @@
972510818 goto skip_type;
972610819 pmu->name = name;
972710820
9728
- if (type < 0) {
9729
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9730
- if (type < 0) {
9731
- ret = type;
10821
+ if (type != PERF_TYPE_SOFTWARE) {
10822
+ if (type >= 0)
10823
+ max = type;
10824
+
10825
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10826
+ if (ret < 0)
973210827 goto free_pdc;
9733
- }
10828
+
10829
+ WARN_ON(type >= 0 && ret != type);
10830
+
10831
+ type = ret;
973410832 }
973510833 pmu->type = type;
973610834
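With the idr_alloc() rework above, a driver can still pass a fixed type or a negative one to receive a dynamically allocated id; only PERF_TYPE_SOFTWARE stays out of the IDR. A hedged driver-side sketch of the registration call (the pmu callbacks are elided, so this is a shape illustration rather than a working PMU):

#include <linux/init.h>
#include <linux/perf_event.h>

/*
 * Hypothetical driver sketch: passing -1 asks perf_pmu_register() to
 * allocate a dynamic type id from the IDR; passing a fixed PERF_TYPE_*
 * value (other than SOFTWARE) reserves exactly that id.
 */
static struct pmu my_pmu = {
	.task_ctx_nr = perf_invalid_context,
	/* .event_init, .add, .del, .start, .stop, .read ... elided */
};

static int __init my_pmu_init(void)
{
	return perf_pmu_register(&my_pmu, "my_pmu", -1);
}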
....@@ -9776,6 +10874,9 @@
977610874 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
977710875
977810876 __perf_mux_hrtimer_init(cpuctx, cpu);
10877
+
10878
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10879
+ cpuctx->heap = cpuctx->heap_default;
977910880 }
978010881
978110882 got_cpu_context:
....@@ -9807,7 +10908,16 @@
980710908 if (!pmu->event_idx)
980810909 pmu->event_idx = perf_event_idx_default;
980910910
9810
- list_add_rcu(&pmu->entry, &pmus);
10911
+ /*
10912
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
10913
+ * since these cannot be in the IDR. This way the linear search
10914
+ * is fast when a valid software event is provided.
10915
+ */
10916
+ if (type == PERF_TYPE_SOFTWARE || !name)
10917
+ list_add_rcu(&pmu->entry, &pmus);
10918
+ else
10919
+ list_add_tail_rcu(&pmu->entry, &pmus);
10920
+
981110921 atomic_set(&pmu->exclusive_cnt, 0);
981210922 ret = 0;
981310923 unlock:
....@@ -9820,7 +10930,7 @@
982010930 put_device(pmu->dev);
982110931
982210932 free_idr:
9823
- if (pmu->type >= PERF_TYPE_MAX)
10933
+ if (pmu->type != PERF_TYPE_SOFTWARE)
982410934 idr_remove(&pmu_idr, pmu->type);
982510935
982610936 free_pdc:
....@@ -9842,7 +10952,7 @@
984210952 synchronize_rcu();
984310953
984410954 free_percpu(pmu->pmu_disable_count);
9845
- if (pmu->type >= PERF_TYPE_MAX)
10955
+ if (pmu->type != PERF_TYPE_SOFTWARE)
984610956 idr_remove(&pmu_idr, pmu->type);
984710957 if (pmu_bus_running) {
984810958 if (pmu->nr_addr_filters)
....@@ -9854,6 +10964,12 @@
985410964 mutex_unlock(&pmus_lock);
985510965 }
985610966 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10967
+
10968
+static inline bool has_extended_regs(struct perf_event *event)
10969
+{
10970
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10971
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10972
+}
985710973
985810974 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
985910975 {
....@@ -9885,6 +11001,19 @@
988511001 if (ctx)
988611002 perf_event_ctx_unlock(event->group_leader, ctx);
988711003
11004
+ if (!ret) {
11005
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11006
+ has_extended_regs(event))
11007
+ ret = -EOPNOTSUPP;
11008
+
11009
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11010
+ event_has_any_exclude_flag(event))
11011
+ ret = -EINVAL;
11012
+
11013
+ if (ret && event->destroy)
11014
+ event->destroy(event);
11015
+ }
11016
+
988811017 if (ret)
988911018 module_put(pmu->module);
989011019
....@@ -9893,9 +11022,8 @@
989311022
989411023 static struct pmu *perf_init_event(struct perf_event *event)
989511024 {
11025
+ int idx, type, ret;
989611026 struct pmu *pmu;
9897
- int idx;
9898
- int ret;
989911027
990011028 idx = srcu_read_lock(&pmus_srcu);
990111029
....@@ -9907,17 +11035,32 @@
990711035 goto unlock;
990811036 }
990911037
11038
+ /*
11039
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11040
+ * are often aliases for PERF_TYPE_RAW.
11041
+ */
11042
+ type = event->attr.type;
11043
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
11044
+ type = PERF_TYPE_RAW;
11045
+
11046
+again:
991011047 rcu_read_lock();
9911
- pmu = idr_find(&pmu_idr, event->attr.type);
11048
+ pmu = idr_find(&pmu_idr, type);
991211049 rcu_read_unlock();
991311050 if (pmu) {
991411051 ret = perf_try_init_event(pmu, event);
11052
+ if (ret == -ENOENT && event->attr.type != type) {
11053
+ type = event->attr.type;
11054
+ goto again;
11055
+ }
11056
+
991511057 if (ret)
991611058 pmu = ERR_PTR(ret);
11059
+
991711060 goto unlock;
991811061 }
991911062
9920
- list_for_each_entry_rcu(pmu, &pmus, entry) {
11063
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
992111064 ret = perf_try_init_event(pmu, event);
992211065 if (!ret)
992311066 goto unlock;
....@@ -9993,7 +11136,7 @@
999311136 if (event->parent)
999411137 return;
999511138
9996
- if (event->attach_state & PERF_ATTACH_TASK)
11139
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
999711140 inc = true;
999811141 if (event->attr.mmap || event->attr.mmap_data)
999911142 atomic_inc(&nr_mmap_events);
....@@ -10001,6 +11144,8 @@
1000111144 atomic_inc(&nr_comm_events);
1000211145 if (event->attr.namespaces)
1000311146 atomic_inc(&nr_namespaces_events);
11147
+ if (event->attr.cgroup)
11148
+ atomic_inc(&nr_cgroup_events);
1000411149 if (event->attr.task)
1000511150 atomic_inc(&nr_task_events);
1000611151 if (event->attr.freq)
....@@ -10013,6 +11158,12 @@
1001311158 inc = true;
1001411159 if (is_cgroup_event(event))
1001511160 inc = true;
11161
+ if (event->attr.ksymbol)
11162
+ atomic_inc(&nr_ksymbol_events);
11163
+ if (event->attr.bpf_event)
11164
+ atomic_inc(&nr_bpf_events);
11165
+ if (event->attr.text_poke)
11166
+ atomic_inc(&nr_text_poke_events);
1001611167
1001711168 if (inc) {
1001811169 /*
....@@ -10031,7 +11182,7 @@
1003111182 * call the perf scheduling hooks before proceeding to
1003211183 * install events that need them.
1003311184 */
10034
- synchronize_sched();
11185
+ synchronize_rcu();
1003511186 }
1003611187 /*
1003711188 * Now that we have waited for the sync_sched(), allow further
....@@ -10120,8 +11271,7 @@
1012011271 * and we cannot use the ctx information because we need the
1012111272 * pmu before we get a ctx.
1012211273 */
10123
- get_task_struct(task);
10124
- event->hw.target = task;
11274
+ event->hw.target = get_task_struct(task);
1012511275 }
1012611276
1012711277 event->clock = &local_clock;
....@@ -10133,12 +11283,9 @@
1013311283 context = parent_event->overflow_handler_context;
1013411284 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
1013511285 if (overflow_handler == bpf_overflow_handler) {
10136
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
11286
+ struct bpf_prog *prog = parent_event->prog;
1013711287
10138
- if (IS_ERR(prog)) {
10139
- err = PTR_ERR(prog);
10140
- goto err_ns;
10141
- }
11288
+ bpf_prog_inc(prog);
1014211289 event->prog = prog;
1014311290 event->orig_overflow_handler =
1014411291 parent_event->orig_overflow_handler;
....@@ -10179,16 +11326,31 @@
1017911326 if (!has_branch_stack(event))
1018011327 event->attr.branch_sample_type = 0;
1018111328
10182
- if (cgroup_fd != -1) {
10183
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10184
- if (err)
10185
- goto err_ns;
10186
- }
10187
-
1018811329 pmu = perf_init_event(event);
1018911330 if (IS_ERR(pmu)) {
1019011331 err = PTR_ERR(pmu);
1019111332 goto err_ns;
11333
+ }
11334
+
11335
+ /*
11336
+ * Disallow uncore-cgroup events; they don't make sense as the cgroup will
11337
+ * be different on other CPUs in the uncore mask.
11338
+ */
11339
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11340
+ err = -EINVAL;
11341
+ goto err_pmu;
11342
+ }
11343
+
11344
+ if (event->attr.aux_output &&
11345
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11346
+ err = -EOPNOTSUPP;
11347
+ goto err_pmu;
11348
+ }
11349
+
11350
+ if (cgroup_fd != -1) {
11351
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11352
+ if (err)
11353
+ goto err_pmu;
1019211354 }
1019311355
1019411356 err = exclusive_event_init(event);
....@@ -10251,12 +11413,12 @@
1025111413 exclusive_event_destroy(event);
1025211414
1025311415 err_pmu:
11416
+ if (is_cgroup_event(event))
11417
+ perf_detach_cgroup(event);
1025411418 if (event->destroy)
1025511419 event->destroy(event);
1025611420 module_put(pmu->module);
1025711421 err_ns:
10258
- if (is_cgroup_event(event))
10259
- perf_detach_cgroup(event);
1026011422 if (event->ns)
1026111423 put_pid_ns(event->ns);
1026211424 if (event->hw.target)
....@@ -10272,58 +11434,29 @@
1027211434 u32 size;
1027311435 int ret;
1027411436
10275
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
10276
- return -EFAULT;
10277
-
10278
- /*
10279
- * zero the full structure, so that a short copy will be nice.
10280
- */
11437
+ /* Zero the full structure, so that a short copy will be nice. */
1028111438 memset(attr, 0, sizeof(*attr));
1028211439
1028311440 ret = get_user(size, &uattr->size);
1028411441 if (ret)
1028511442 return ret;
1028611443
10287
- if (size > PAGE_SIZE) /* silly large */
10288
- goto err_size;
10289
-
10290
- if (!size) /* abi compat */
11444
+ /* ABI compatibility quirk: */
11445
+ if (!size)
1029111446 size = PERF_ATTR_SIZE_VER0;
10292
-
10293
- if (size < PERF_ATTR_SIZE_VER0)
11447
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
1029411448 goto err_size;
1029511449
10296
- /*
10297
- * If we're handed a bigger struct than we know of,
10298
- * ensure all the unknown bits are 0 - i.e. new
10299
- * user-space does not rely on any kernel feature
10300
- * extensions we dont know about yet.
10301
- */
10302
- if (size > sizeof(*attr)) {
10303
- unsigned char __user *addr;
10304
- unsigned char __user *end;
10305
- unsigned char val;
10306
-
10307
- addr = (void __user *)uattr + sizeof(*attr);
10308
- end = (void __user *)uattr + size;
10309
-
10310
- for (; addr < end; addr++) {
10311
- ret = get_user(val, addr);
10312
- if (ret)
10313
- return ret;
10314
- if (val)
10315
- goto err_size;
10316
- }
10317
- size = sizeof(*attr);
11450
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11451
+ if (ret) {
11452
+ if (ret == -E2BIG)
11453
+ goto err_size;
11454
+ return ret;
1031811455 }
10319
-
10320
- ret = copy_from_user(attr, uattr, size);
10321
- if (ret)
10322
- return -EFAULT;
1032311456
1032411457 attr->size = size;
1032511458
10326
- if (attr->__reserved_1)
11459
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
1032711460 return -EINVAL;
1032811461
1032911462 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
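The rewrite above replaces the open-coded "unknown trailing bytes must be zero" loop with copy_struct_from_user(), which implements exactly that convention for size-versioned uapi structs. A hedged sketch of the same pattern in a hypothetical ioctl-style handler (struct and function names are made up):

#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/*
 * Hypothetical extensible uapi struct: userspace passes its own sizeof()
 * so older and newer binaries interoperate, like perf_event_attr::size.
 * New fields are only ever appended.
 */
struct my_uapi_args {
	__u32 size;
	__u32 flags;
	__u64 addr;
};

static int copy_my_args(struct my_uapi_args *karg,
			const void __user *uarg, u32 usize)
{
	memset(karg, 0, sizeof(*karg));
	/*
	 * Succeeds if the user struct is shorter (missing fields stay zero)
	 * or longer with all unknown trailing bytes zero; returns -E2BIG
	 * when userspace set bits this kernel does not understand.
	 */
	return copy_struct_from_user(karg, sizeof(*karg), uarg, usize);
}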
....@@ -10394,6 +11527,12 @@
1039411527
1039511528 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
1039611529 ret = perf_reg_validate(attr->sample_regs_intr);
11530
+
11531
+#ifndef CONFIG_CGROUP_PERF
11532
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
11533
+ return -EINVAL;
11534
+#endif
11535
+
1039711536 out:
1039811537 return ret;
1039911538
....@@ -10403,14 +11542,25 @@
1040311542 goto out;
1040411543 }
1040511544
11545
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
11546
+{
11547
+ if (b < a)
11548
+ swap(a, b);
11549
+
11550
+ mutex_lock(a);
11551
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11552
+}
11553
+
1040611554 static int
1040711555 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
1040811556 {
10409
- struct ring_buffer *rb = NULL;
11557
+ struct perf_buffer *rb = NULL;
1041011558 int ret = -EINVAL;
1041111559
10412
- if (!output_event)
11560
+ if (!output_event) {
11561
+ mutex_lock(&event->mmap_mutex);
1041311562 goto set;
11563
+ }
1041411564
1041511565 /* don't allow circular references */
1041611566 if (event == output_event)
....@@ -10448,8 +11598,15 @@
1044811598 event->pmu != output_event->pmu)
1044911599 goto out;
1045011600
11601
+ /*
11602
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
11603
+ * output_event is already on rb->event_list, and the list iteration
11604
+ * restarts after every removal, it is guaranteed this new event is
11605
+ * observed *OR* if output_event is already removed, it's guaranteed we
11606
+ * observe !rb->mmap_count.
11607
+ */
11608
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
1045111609 set:
10452
- mutex_lock(&event->mmap_mutex);
1045311610 /* Can't redirect output if we've got an active mmap() */
1045411611 if (atomic_read(&event->mmap_count))
1045511612 goto unlock;
....@@ -10459,6 +11616,12 @@
1045911616 rb = ring_buffer_get(output_event);
1046011617 if (!rb)
1046111618 goto unlock;
11619
+
11620
+ /* did we race against perf_mmap_close() */
11621
+ if (!atomic_read(&rb->mmap_count)) {
11622
+ ring_buffer_put(rb);
11623
+ goto unlock;
11624
+ }
1046211625 }
1046311626
1046411627 ring_buffer_attach(event, rb);
....@@ -10466,18 +11629,11 @@
1046611629 ret = 0;
1046711630 unlock:
1046811631 mutex_unlock(&event->mmap_mutex);
11632
+ if (output_event)
11633
+ mutex_unlock(&output_event->mmap_mutex);
1046911634
1047011635 out:
1047111636 return ret;
10472
-}
10473
-
10474
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
10475
-{
10476
- if (b < a)
10477
- swap(a, b);
10478
-
10479
- mutex_lock(a);
10480
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
1048111637 }
1048211638
1048311639 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
....@@ -10500,11 +11656,11 @@
1050011656 break;
1050111657
1050211658 case CLOCK_BOOTTIME:
10503
- event->clock = &ktime_get_boot_ns;
11659
+ event->clock = &ktime_get_boottime_ns;
1050411660 break;
1050511661
1050611662 case CLOCK_TAI:
10507
- event->clock = &ktime_get_tai_ns;
11663
+ event->clock = &ktime_get_clocktai_ns;
1050811664 break;
1050911665
1051011666 default:
....@@ -10530,7 +11686,7 @@
1053011686 again:
1053111687 rcu_read_lock();
1053211688 gctx = READ_ONCE(group_leader->ctx);
10533
- if (!atomic_inc_not_zero(&gctx->refcount)) {
11689
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
1053411690 rcu_read_unlock();
1053511691 goto again;
1053611692 }
....@@ -10563,7 +11719,7 @@
1056311719 struct perf_event *group_leader = NULL, *output_event = NULL;
1056411720 struct perf_event *event, *sibling;
1056511721 struct perf_event_attr attr;
10566
- struct perf_event_context *ctx, *uninitialized_var(gctx);
11722
+ struct perf_event_context *ctx, *gctx;
1056711723 struct file *event_file = NULL;
1056811724 struct fd group = {NULL, 0};
1056911725 struct task_struct *task = NULL;
....@@ -10577,9 +11733,6 @@
1057711733 /* for future expandability... */
1057811734 if (flags & ~PERF_FLAG_ALL)
1057911735 return -EINVAL;
10580
-
10581
- if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
10582
- return -EACCES;
1058311736
1058411737 /* Do we allow access to perf_event_open(2) ? */
1058511738 err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
....@@ -10597,7 +11750,7 @@
1059711750 }
1059811751
1059911752 if (attr.namespaces) {
10600
- if (!capable(CAP_SYS_ADMIN))
11753
+ if (!perfmon_capable())
1060111754 return -EACCES;
1060211755 }
1060311756
....@@ -10612,6 +11765,13 @@
1061211765 /* Only privileged users can get physical addresses */
1061311766 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
1061411767 err = perf_allow_kernel(&attr);
11768
+ if (err)
11769
+ return err;
11770
+ }
11771
+
11772
+ /* REGS_INTR can leak data, lockdown must prevent this */
11773
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
11774
+ err = security_locked_down(LOCKDOWN_PERF);
1061511775 if (err)
1061611776 return err;
1061711777 }
....@@ -10657,24 +11817,6 @@
1065711817 goto err_task;
1065811818 }
1065911819
10660
- if (task) {
10661
- err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10662
- if (err)
10663
- goto err_task;
10664
-
10665
- /*
10666
- * Reuse ptrace permission checks for now.
10667
- *
10668
- * We must hold cred_guard_mutex across this and any potential
10669
- * perf_install_in_context() call for this new event to
10670
- * serialize against exec() altering our credentials (and the
10671
- * perf_event_exit_task() that could imply).
10672
- */
10673
- err = -EACCES;
10674
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10675
- goto err_cred;
10676
- }
10677
-
1067811820 if (flags & PERF_FLAG_PID_CGROUP)
1067911821 cgroup_fd = pid;
1068011822
....@@ -10682,7 +11824,7 @@
1068211824 NULL, NULL, cgroup_fd);
1068311825 if (IS_ERR(event)) {
1068411826 err = PTR_ERR(event);
10685
- goto err_cred;
11827
+ goto err_task;
1068611828 }
1068711829
1068811830 if (is_sampling_event(event)) {
....@@ -10776,6 +11918,9 @@
1077611918 * Do not allow to attach to a group in a different task
1077711919 * or CPU context. If we're moving SW events, we'll fix
1077811920 * this up later, so allow that.
11921
+ *
11922
+ * Racy, not holding group_leader->ctx->mutex, see comment with
11923
+ * perf_event_ctx_lock().
1077911924 */
1078011925 if (!move_group && group_leader->ctx != ctx)
1078111926 goto err_context;
....@@ -10799,6 +11944,24 @@
1079911944 err = PTR_ERR(event_file);
1080011945 event_file = NULL;
1080111946 goto err_context;
11947
+ }
11948
+
11949
+ if (task) {
11950
+ err = down_read_interruptible(&task->signal->exec_update_lock);
11951
+ if (err)
11952
+ goto err_file;
11953
+
11954
+ /*
11955
+ * Preserve ptrace permission check for backwards compatibility.
11956
+ *
11957
+ * We must hold exec_update_lock across this and any potential
11958
+ * perf_install_in_context() call for this new event to
11959
+ * serialize against exec() altering our credentials (and the
11960
+ * perf_event_exit_task() that could imply).
11961
+ */
11962
+ err = -EACCES;
11963
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
11964
+ goto err_cred;
1080211965 }
1080311966
1080411967 if (move_group) {
....@@ -10825,6 +11988,7 @@
1082511988 } else {
1082611989 perf_event_ctx_unlock(group_leader, gctx);
1082711990 move_group = 0;
11991
+ goto not_move_group;
1082811992 }
1082911993 }
1083011994
....@@ -10841,7 +12005,17 @@
1084112005 }
1084212006 } else {
1084312007 mutex_lock(&ctx->mutex);
12008
+
12009
+ /*
12010
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
12011
+ * see the group_leader && !move_group test earlier.
12012
+ */
12013
+ if (group_leader && group_leader->ctx != ctx) {
12014
+ err = -EINVAL;
12015
+ goto err_locked;
12016
+ }
1084412017 }
12018
+not_move_group:
1084512019
1084612020 if (ctx->task == TASK_TOMBSTONE) {
1084712021 err = -ESRCH;
....@@ -10869,6 +12043,10 @@
1086912043 }
1087012044 }
1087112045
12046
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12047
+ err = -EINVAL;
12048
+ goto err_locked;
12049
+ }
1087212050
1087312051 /*
1087412052 * Must be under the same ctx::mutex as perf_install_in_context(),
....@@ -10950,7 +12128,7 @@
1095012128 mutex_unlock(&ctx->mutex);
1095112129
1095212130 if (task) {
10953
- mutex_unlock(&task->signal->cred_guard_mutex);
12131
+ up_read(&task->signal->exec_update_lock);
1095412132 put_task_struct(task);
1095512133 }
1095612134
....@@ -10972,7 +12150,10 @@
1097212150 if (move_group)
1097312151 perf_event_ctx_unlock(group_leader, gctx);
1097412152 mutex_unlock(&ctx->mutex);
10975
-/* err_file: */
12153
+err_cred:
12154
+ if (task)
12155
+ up_read(&task->signal->exec_update_lock);
12156
+err_file:
1097612157 fput(event_file);
1097712158 err_context:
1097812159 perf_unpin_context(ctx);
....@@ -10984,9 +12165,6 @@
1098412165 */
1098512166 if (!event_file)
1098612167 free_event(event);
10987
-err_cred:
10988
- if (task)
10989
- mutex_unlock(&task->signal->cred_guard_mutex);
1099012168 err_task:
1099112169 if (task)
1099212170 put_task_struct(task);
....@@ -11015,8 +12193,11 @@
1101512193 int err;
1101612194
1101712195 /*
11018
- * Get the target context (task or percpu):
12196
+ * Grouping is not supported for kernel events, and neither is 'AUX';
12197
+ * make sure the caller's intentions are adjusted.
1101912198 */
12199
+ if (attr->aux_output)
12200
+ return ERR_PTR(-EINVAL);
1102012201
1102112202 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
1102212203 overflow_handler, context, -1);
....@@ -11028,6 +12209,9 @@
1102812209 /* Mark owner so we could distinguish it from user events. */
1102912210 event->owner = TASK_TOMBSTONE;
1103012211
12212
+ /*
12213
+ * Get the target context (task or percpu):
12214
+ */
1103112215 ctx = find_get_context(event->pmu, task, event);
1103212216 if (IS_ERR(ctx)) {
1103312217 err = PTR_ERR(ctx);
....@@ -11285,8 +12469,8 @@
1128512469 /*
1128612470 * When a child task exits, feed back event values to parent events.
1128712471 *
11288
- * Can be called with cred_guard_mutex held when called from
11289
- * install_exec_creds().
12472
+ * Can be called with exec_update_lock held when called from
12473
+ * setup_new_exec().
1129012474 */
1129112475 void perf_event_exit_task(struct task_struct *child)
1129212476 {
....@@ -11390,7 +12574,7 @@
1139012574 *
1139112575 * Wait for all events to drop their context reference.
1139212576 */
11393
- wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
12577
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
1139412578 put_ctx(ctx); /* must be last */
1139512579 }
1139612580 }
....@@ -11405,9 +12589,7 @@
1140512589
1140612590 struct file *perf_event_get(unsigned int fd)
1140712591 {
11408
- struct file *file;
11409
-
11410
- file = fget_raw(fd);
12592
+ struct file *file = fget(fd);
1141112593 if (!file)
1141212594 return ERR_PTR(-EBADF);
1141312595
....@@ -11477,8 +12659,7 @@
1147712659 !child_ctx->task_ctx_data) {
1147812660 struct pmu *pmu = child_event->pmu;
1147912661
11480
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11481
- GFP_KERNEL);
12662
+ child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
1148212663 if (!child_ctx->task_ctx_data) {
1148312664 free_event(child_event);
1148412665 return ERR_PTR(-ENOMEM);
....@@ -11583,6 +12764,10 @@
1158312764 child, leader, child_ctx);
1158412765 if (IS_ERR(child_ctr))
1158512766 return PTR_ERR(child_ctr);
12767
+
12768
+ if (sub->aux_event == parent_event && child_ctr &&
12769
+ !perf_get_aux_event(child_ctr, leader))
12770
+ return -EINVAL;
1158612771 }
1158712772 return 0;
1158812773 }
....@@ -11778,7 +12963,7 @@
1177812963 }
1177912964 }
1178012965
11781
-void perf_swevent_init_cpu(unsigned int cpu)
12966
+static void perf_swevent_init_cpu(unsigned int cpu)
1178212967 {
1178312968 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
1178412969
....@@ -11975,6 +13160,12 @@
1197513160 kfree(jc);
1197613161 }
1197713162
13163
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13164
+{
13165
+ perf_event_cgroup(css->cgroup);
13166
+ return 0;
13167
+}
13168
+
1197813169 static int __perf_cgroup_move(void *info)
1197913170 {
1198013171 struct task_struct *task = info;
....@@ -11996,6 +13187,7 @@
1199613187 struct cgroup_subsys perf_event_cgrp_subsys = {
1199713188 .css_alloc = perf_cgroup_css_alloc,
1199813189 .css_free = perf_cgroup_css_free,
13190
+ .css_online = perf_cgroup_css_online,
1199913191 .attach = perf_cgroup_attach,
1200013192 /*
1200113193 * Implicitly enable on dfl hierarchy so that perf events can