.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
---|
1 | 2 | /* |
---|
2 | 3 | * Performance events core code: |
---|
3 | 4 | * |
---|
.. | .. |
---|
5 | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
---|
6 | 7 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra |
---|
7 | 8 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
---|
8 | | - * |
---|
9 | | - * For licensing details see kernel-base/COPYING |
---|
10 | 9 | */ |
---|
11 | 10 | |
---|
12 | 11 | #include <linux/fs.h> |
---|
.. | .. |
---|
29 | 28 | #include <linux/export.h> |
---|
30 | 29 | #include <linux/vmalloc.h> |
---|
31 | 30 | #include <linux/hardirq.h> |
---|
| 31 | +#include <linux/hugetlb.h> |
---|
32 | 32 | #include <linux/rculist.h> |
---|
33 | 33 | #include <linux/uaccess.h> |
---|
34 | 34 | #include <linux/syscalls.h> |
---|
.. | .. |
---|
50 | 50 | #include <linux/sched/mm.h> |
---|
51 | 51 | #include <linux/proc_ns.h> |
---|
52 | 52 | #include <linux/mount.h> |
---|
| 53 | +#include <linux/min_heap.h> |
---|
53 | 54 | |
---|
54 | 55 | #include "internal.h" |
---|
55 | 56 | |
---|
.. | .. |
---|
265 | 266 | if (!event->parent) { |
---|
266 | 267 | /* |
---|
267 | 268 | * If this is a !child event, we must hold ctx::mutex to |
---|
268 | | - * stabilize the the event->ctx relation. See |
---|
| 269 | + * stabilize the event->ctx relation. See |
---|
269 | 270 | * perf_event_ctx_lock(). |
---|
270 | 271 | */ |
---|
271 | 272 | lockdep_assert_held(&ctx->mutex); |
---|
.. | .. |
---|
391 | 392 | static atomic_t nr_task_events __read_mostly; |
---|
392 | 393 | static atomic_t nr_freq_events __read_mostly; |
---|
393 | 394 | static atomic_t nr_switch_events __read_mostly; |
---|
| 395 | +static atomic_t nr_ksymbol_events __read_mostly; |
---|
| 396 | +static atomic_t nr_bpf_events __read_mostly; |
---|
| 397 | +static atomic_t nr_cgroup_events __read_mostly; |
---|
| 398 | +static atomic_t nr_text_poke_events __read_mostly; |
---|
394 | 399 | |
---|
395 | 400 | static LIST_HEAD(pmus); |
---|
396 | 401 | static DEFINE_MUTEX(pmus_lock); |
---|
.. | .. |
---|
403 | 408 | * 0 - disallow raw tracepoint access for unpriv |
---|
404 | 409 | * 1 - disallow cpu events for unpriv |
---|
405 | 410 | * 2 - disallow kernel profiling for unpriv |
---|
406 | | - * 3 - disallow all unpriv perf event use |
---|
407 | 411 | */ |
---|
408 | | -#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT |
---|
409 | | -int sysctl_perf_event_paranoid __read_mostly = 3; |
---|
410 | | -#else |
---|
411 | 412 | int sysctl_perf_event_paranoid __read_mostly = 2; |
---|
412 | | -#endif |
---|
413 | 413 | |
---|
414 | 414 | /* Minimum for 512 kiB + 1 user control page */ |
---|
415 | 415 | int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ |
---|
.. | .. |
---|
444 | 444 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx); |
---|
445 | 445 | |
---|
446 | 446 | int perf_proc_update_handler(struct ctl_table *table, int write, |
---|
447 | | - void __user *buffer, size_t *lenp, |
---|
448 | | - loff_t *ppos) |
---|
| 447 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
449 | 448 | { |
---|
450 | 449 | int ret; |
---|
451 | 450 | int perf_cpu = sysctl_perf_cpu_time_max_percent; |
---|
.. | .. |
---|
469 | 468 | int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; |
---|
470 | 469 | |
---|
471 | 470 | int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, |
---|
472 | | - void __user *buffer, size_t *lenp, |
---|
473 | | - loff_t *ppos) |
---|
| 471 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
474 | 472 | { |
---|
475 | 473 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
---|
476 | 474 | |
---|
.. | .. |
---|
761 | 759 | /* |
---|
762 | 760 | * Do not update time when cgroup is not active |
---|
763 | 761 | */ |
---|
764 | | - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) |
---|
| 762 | + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) |
---|
765 | 763 | __update_cgrp_time(event->cgrp); |
---|
766 | 764 | } |
---|
767 | 765 | |
---|
.. | .. |
---|
901 | 899 | rcu_read_unlock(); |
---|
902 | 900 | } |
---|
903 | 901 | |
---|
| 902 | +static int perf_cgroup_ensure_storage(struct perf_event *event, |
---|
| 903 | + struct cgroup_subsys_state *css) |
---|
| 904 | +{ |
---|
| 905 | + struct perf_cpu_context *cpuctx; |
---|
| 906 | + struct perf_event **storage; |
---|
| 907 | + int cpu, heap_size, ret = 0; |
---|
| 908 | + |
---|
| 909 | + /* |
---|
| 910 | + * Allow storage to have sufficient space for an iterator for each |
---|
| 911 | + * possibly nested cgroup plus an iterator for events with no cgroup. |
---|
| 912 | + */ |
---|
| 913 | + for (heap_size = 1; css; css = css->parent) |
---|
| 914 | + heap_size++; |
---|
| 915 | + |
---|
| 916 | + for_each_possible_cpu(cpu) { |
---|
| 917 | + cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); |
---|
| 918 | + if (heap_size <= cpuctx->heap_size) |
---|
| 919 | + continue; |
---|
| 920 | + |
---|
| 921 | + storage = kmalloc_node(heap_size * sizeof(struct perf_event *), |
---|
| 922 | + GFP_KERNEL, cpu_to_node(cpu)); |
---|
| 923 | + if (!storage) { |
---|
| 924 | + ret = -ENOMEM; |
---|
| 925 | + break; |
---|
| 926 | + } |
---|
| 927 | + |
---|
| 928 | + raw_spin_lock_irq(&cpuctx->ctx.lock); |
---|
| 929 | + if (cpuctx->heap_size < heap_size) { |
---|
| 930 | + swap(cpuctx->heap, storage); |
---|
| 931 | + if (storage == cpuctx->heap_default) |
---|
| 932 | + storage = NULL; |
---|
| 933 | + cpuctx->heap_size = heap_size; |
---|
| 934 | + } |
---|
| 935 | + raw_spin_unlock_irq(&cpuctx->ctx.lock); |
---|
| 936 | + |
---|
| 937 | + kfree(storage); |
---|
| 938 | + } |
---|
| 939 | + |
---|
| 940 | + return ret; |
---|
| 941 | +} |
---|
| 942 | + |
---|
904 | 943 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, |
---|
905 | 944 | struct perf_event_attr *attr, |
---|
906 | 945 | struct perf_event *group_leader) |
---|
.. | .. |
---|
919 | 958 | ret = PTR_ERR(css); |
---|
920 | 959 | goto out; |
---|
921 | 960 | } |
---|
| 961 | + |
---|
| 962 | + ret = perf_cgroup_ensure_storage(event, css); |
---|
| 963 | + if (ret) |
---|
| 964 | + goto out; |
---|
922 | 965 | |
---|
923 | 966 | cgrp = container_of(css, struct perf_cgroup, css); |
---|
924 | 967 | event->cgrp = cgrp; |
---|
.. | .. |
---|
945 | 988 | event->shadow_ctx_time = now - t->timestamp; |
---|
946 | 989 | } |
---|
947 | 990 | |
---|
948 | | -/* |
---|
949 | | - * Update cpuctx->cgrp so that it is set when first cgroup event is added and |
---|
950 | | - * cleared when last cgroup event is removed. |
---|
951 | | - */ |
---|
952 | 991 | static inline void |
---|
953 | | -list_update_cgroup_event(struct perf_event *event, |
---|
954 | | - struct perf_event_context *ctx, bool add) |
---|
| 992 | +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) |
---|
955 | 993 | { |
---|
956 | 994 | struct perf_cpu_context *cpuctx; |
---|
957 | | - struct list_head *cpuctx_entry; |
---|
958 | 995 | |
---|
959 | 996 | if (!is_cgroup_event(event)) |
---|
960 | 997 | return; |
---|
961 | 998 | |
---|
962 | 999 | /* |
---|
963 | 1000 | * Because cgroup events are always per-cpu events, |
---|
964 | | - * this will always be called from the right CPU. |
---|
| 1001 | + * @ctx == &cpuctx->ctx. |
---|
965 | 1002 | */ |
---|
966 | | - cpuctx = __get_cpu_context(ctx); |
---|
| 1003 | + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); |
---|
967 | 1004 | |
---|
968 | 1005 | /* |
---|
969 | 1006 | * Since setting cpuctx->cgrp is conditional on the current @cgrp |
---|
.. | .. |
---|
971 | 1008 | * because if the first would mismatch, the second would not try again |
---|
972 | 1009 | * and we would leave cpuctx->cgrp unset. |
---|
973 | 1010 | */ |
---|
974 | | - if (add && !cpuctx->cgrp) { |
---|
| 1011 | + if (ctx->is_active && !cpuctx->cgrp) { |
---|
975 | 1012 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); |
---|
976 | 1013 | |
---|
977 | 1014 | if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) |
---|
978 | 1015 | cpuctx->cgrp = cgrp; |
---|
979 | 1016 | } |
---|
980 | 1017 | |
---|
981 | | - if (add && ctx->nr_cgroups++) |
---|
982 | | - return; |
---|
983 | | - else if (!add && --ctx->nr_cgroups) |
---|
| 1018 | + if (ctx->nr_cgroups++) |
---|
984 | 1019 | return; |
---|
985 | 1020 | |
---|
986 | | - /* no cgroup running */ |
---|
987 | | - if (!add) |
---|
| 1021 | + list_add(&cpuctx->cgrp_cpuctx_entry, |
---|
| 1022 | + per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); |
---|
| 1023 | +} |
---|
| 1024 | + |
---|
| 1025 | +static inline void |
---|
| 1026 | +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) |
---|
| 1027 | +{ |
---|
| 1028 | + struct perf_cpu_context *cpuctx; |
---|
| 1029 | + |
---|
| 1030 | + if (!is_cgroup_event(event)) |
---|
| 1031 | + return; |
---|
| 1032 | + |
---|
| 1033 | + /* |
---|
| 1034 | + * Because cgroup events are always per-cpu events, |
---|
| 1035 | + * @ctx == &cpuctx->ctx. |
---|
| 1036 | + */ |
---|
| 1037 | + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); |
---|
| 1038 | + |
---|
| 1039 | + if (--ctx->nr_cgroups) |
---|
| 1040 | + return; |
---|
| 1041 | + |
---|
| 1042 | + if (ctx->is_active && cpuctx->cgrp) |
---|
988 | 1043 | cpuctx->cgrp = NULL; |
---|
989 | 1044 | |
---|
990 | | - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; |
---|
991 | | - if (add) |
---|
992 | | - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); |
---|
993 | | - else |
---|
994 | | - list_del(cpuctx_entry); |
---|
| 1045 | + list_del(&cpuctx->cgrp_cpuctx_entry); |
---|
995 | 1046 | } |
---|
996 | 1047 | |
---|
997 | 1048 | #else /* !CONFIG_CGROUP_PERF */ |
---|
.. | .. |
---|
1041 | 1092 | { |
---|
1042 | 1093 | } |
---|
1043 | 1094 | |
---|
1044 | | -void |
---|
| 1095 | +static inline void |
---|
1045 | 1096 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) |
---|
1046 | 1097 | { |
---|
1047 | 1098 | } |
---|
.. | .. |
---|
1057 | 1108 | } |
---|
1058 | 1109 | |
---|
1059 | 1110 | static inline void |
---|
1060 | | -list_update_cgroup_event(struct perf_event *event, |
---|
1061 | | - struct perf_event_context *ctx, bool add) |
---|
| 1111 | +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) |
---|
1062 | 1112 | { |
---|
1063 | 1113 | } |
---|
1064 | 1114 | |
---|
| 1115 | +static inline void |
---|
| 1116 | +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) |
---|
| 1117 | +{ |
---|
| 1118 | +} |
---|
1065 | 1119 | #endif |
---|
1066 | 1120 | |
---|
1067 | 1121 | /* |
---|
.. | .. |
---|
1131 | 1185 | if (!cpuctx->hrtimer_active) { |
---|
1132 | 1186 | cpuctx->hrtimer_active = 1; |
---|
1133 | 1187 | hrtimer_forward_now(timer, cpuctx->hrtimer_interval); |
---|
1134 | | - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); |
---|
| 1188 | + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); |
---|
1135 | 1189 | } |
---|
1136 | 1190 | raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); |
---|
1137 | 1191 | |
---|
1138 | 1192 | return 0; |
---|
| 1193 | +} |
---|
| 1194 | + |
---|
| 1195 | +static int perf_mux_hrtimer_restart_ipi(void *arg) |
---|
| 1196 | +{ |
---|
| 1197 | + return perf_mux_hrtimer_restart(arg); |
---|
1139 | 1198 | } |
---|
1140 | 1199 | |
---|
1141 | 1200 | void perf_pmu_disable(struct pmu *pmu) |
---|
.. | .. |
---|
1182 | 1241 | |
---|
1183 | 1242 | static void get_ctx(struct perf_event_context *ctx) |
---|
1184 | 1243 | { |
---|
1185 | | - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); |
---|
| 1244 | + refcount_inc(&ctx->refcount); |
---|
| 1245 | +} |
---|
| 1246 | + |
---|
| 1247 | +static void *alloc_task_ctx_data(struct pmu *pmu) |
---|
| 1248 | +{ |
---|
| 1249 | + if (pmu->task_ctx_cache) |
---|
| 1250 | + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); |
---|
| 1251 | + |
---|
| 1252 | + return NULL; |
---|
| 1253 | +} |
---|
| 1254 | + |
---|
| 1255 | +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) |
---|
| 1256 | +{ |
---|
| 1257 | + if (pmu->task_ctx_cache && task_ctx_data) |
---|
| 1258 | + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); |
---|
1186 | 1259 | } |
---|
1187 | 1260 | |
---|
1188 | 1261 | static void free_ctx(struct rcu_head *head) |
---|
.. | .. |
---|
1190 | 1263 | struct perf_event_context *ctx; |
---|
1191 | 1264 | |
---|
1192 | 1265 | ctx = container_of(head, struct perf_event_context, rcu_head); |
---|
1193 | | - kfree(ctx->task_ctx_data); |
---|
| 1266 | + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); |
---|
1194 | 1267 | kfree(ctx); |
---|
1195 | 1268 | } |
---|
1196 | 1269 | |
---|
1197 | 1270 | static void put_ctx(struct perf_event_context *ctx) |
---|
1198 | 1271 | { |
---|
1199 | | - if (atomic_dec_and_test(&ctx->refcount)) { |
---|
| 1272 | + if (refcount_dec_and_test(&ctx->refcount)) { |
---|
1200 | 1273 | if (ctx->parent_ctx) |
---|
1201 | 1274 | put_ctx(ctx->parent_ctx); |
---|
1202 | 1275 | if (ctx->task && ctx->task != TASK_TOMBSTONE) |
---|
.. | .. |
---|
1232 | 1305 | * life-time rules separate them. That is an exiting task cannot fork, and a |
---|
1233 | 1306 | * spawning task cannot (yet) exit. |
---|
1234 | 1307 | * |
---|
1235 | | - * But remember that that these are parent<->child context relations, and |
---|
| 1308 | + * But remember that these are parent<->child context relations, and |
---|
1236 | 1309 | * migration does not affect children, therefore these two orderings should not |
---|
1237 | 1310 | * interact. |
---|
1238 | 1311 | * |
---|
.. | .. |
---|
1258 | 1331 | * function. |
---|
1259 | 1332 | * |
---|
1260 | 1333 | * Lock order: |
---|
1261 | | - * cred_guard_mutex |
---|
| 1334 | + * exec_update_lock |
---|
1262 | 1335 | * task_struct::perf_event_mutex |
---|
1263 | 1336 | * perf_event_context::mutex |
---|
1264 | 1337 | * perf_event::child_mutex; |
---|
1265 | 1338 | * perf_event_context::lock |
---|
1266 | 1339 | * perf_event::mmap_mutex |
---|
1267 | | - * mmap_sem |
---|
| 1340 | + * mmap_lock |
---|
1268 | 1341 | * perf_addr_filters_head::lock |
---|
1269 | 1342 | * |
---|
1270 | 1343 | * cpu_hotplug_lock |
---|
.. | .. |
---|
1279 | 1352 | again: |
---|
1280 | 1353 | rcu_read_lock(); |
---|
1281 | 1354 | ctx = READ_ONCE(event->ctx); |
---|
1282 | | - if (!atomic_inc_not_zero(&ctx->refcount)) { |
---|
| 1355 | + if (!refcount_inc_not_zero(&ctx->refcount)) { |
---|
1283 | 1356 | rcu_read_unlock(); |
---|
1284 | 1357 | goto again; |
---|
1285 | 1358 | } |
---|
.. | .. |
---|
1371 | 1444 | /* |
---|
1372 | 1445 | * Get the perf_event_context for a task and lock it. |
---|
1373 | 1446 | * |
---|
1374 | | - * This has to cope with with the fact that until it is locked, |
---|
| 1447 | + * This has to cope with the fact that until it is locked, |
---|
1375 | 1448 | * the context could get moved to another task. |
---|
1376 | 1449 | */ |
---|
1377 | 1450 | static struct perf_event_context * |
---|
.. | .. |
---|
1412 | 1485 | } |
---|
1413 | 1486 | |
---|
1414 | 1487 | if (ctx->task == TASK_TOMBSTONE || |
---|
1415 | | - !atomic_inc_not_zero(&ctx->refcount)) { |
---|
| 1488 | + !refcount_inc_not_zero(&ctx->refcount)) { |
---|
1416 | 1489 | raw_spin_unlock(&ctx->lock); |
---|
1417 | 1490 | ctx = NULL; |
---|
1418 | 1491 | } else { |
---|
.. | .. |
---|
1540 | 1613 | if (left->cpu > right->cpu) |
---|
1541 | 1614 | return false; |
---|
1542 | 1615 | |
---|
| 1616 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 1617 | + if (left->cgrp != right->cgrp) { |
---|
| 1618 | + if (!left->cgrp || !left->cgrp->css.cgroup) { |
---|
| 1619 | + /* |
---|
| 1620 | + * Left has no cgroup but right does, no cgroups come |
---|
| 1621 | + * first. |
---|
| 1622 | + */ |
---|
| 1623 | + return true; |
---|
| 1624 | + } |
---|
| 1625 | + if (!right->cgrp || !right->cgrp->css.cgroup) { |
---|
| 1626 | + /* |
---|
| 1627 | + * Right has no cgroup but left does, no cgroups come |
---|
| 1628 | + * first. |
---|
| 1629 | + */ |
---|
| 1630 | + return false; |
---|
| 1631 | + } |
---|
| 1632 | + /* Two dissimilar cgroups, order by id. */ |
---|
| 1633 | + if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) |
---|
| 1634 | + return true; |
---|
| 1635 | + |
---|
| 1636 | + return false; |
---|
| 1637 | + } |
---|
| 1638 | +#endif |
---|
| 1639 | + |
---|
1543 | 1640 | if (left->group_index < right->group_index) |
---|
1544 | 1641 | return true; |
---|
1545 | 1642 | if (left->group_index > right->group_index) |
---|
.. | .. |
---|
1619 | 1716 | } |
---|
1620 | 1717 | |
---|
1621 | 1718 | /* |
---|
1622 | | - * Get the leftmost event in the @cpu subtree. |
---|
| 1719 | + * Get the leftmost event in the cpu/cgroup subtree. |
---|
1623 | 1720 | */ |
---|
1624 | 1721 | static struct perf_event * |
---|
1625 | | -perf_event_groups_first(struct perf_event_groups *groups, int cpu) |
---|
| 1722 | +perf_event_groups_first(struct perf_event_groups *groups, int cpu, |
---|
| 1723 | + struct cgroup *cgrp) |
---|
1626 | 1724 | { |
---|
1627 | 1725 | struct perf_event *node_event = NULL, *match = NULL; |
---|
1628 | 1726 | struct rb_node *node = groups->tree.rb_node; |
---|
| 1727 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 1728 | + u64 node_cgrp_id, cgrp_id = 0; |
---|
| 1729 | + |
---|
| 1730 | + if (cgrp) |
---|
| 1731 | + cgrp_id = cgrp->kn->id; |
---|
| 1732 | +#endif |
---|
1629 | 1733 | |
---|
1630 | 1734 | while (node) { |
---|
1631 | 1735 | node_event = container_of(node, struct perf_event, group_node); |
---|
1632 | 1736 | |
---|
1633 | 1737 | if (cpu < node_event->cpu) { |
---|
1634 | 1738 | node = node->rb_left; |
---|
1635 | | - } else if (cpu > node_event->cpu) { |
---|
1636 | | - node = node->rb_right; |
---|
1637 | | - } else { |
---|
1638 | | - match = node_event; |
---|
1639 | | - node = node->rb_left; |
---|
| 1739 | + continue; |
---|
1640 | 1740 | } |
---|
| 1741 | + if (cpu > node_event->cpu) { |
---|
| 1742 | + node = node->rb_right; |
---|
| 1743 | + continue; |
---|
| 1744 | + } |
---|
| 1745 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 1746 | + node_cgrp_id = 0; |
---|
| 1747 | + if (node_event->cgrp && node_event->cgrp->css.cgroup) |
---|
| 1748 | + node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; |
---|
| 1749 | + |
---|
| 1750 | + if (cgrp_id < node_cgrp_id) { |
---|
| 1751 | + node = node->rb_left; |
---|
| 1752 | + continue; |
---|
| 1753 | + } |
---|
| 1754 | + if (cgrp_id > node_cgrp_id) { |
---|
| 1755 | + node = node->rb_right; |
---|
| 1756 | + continue; |
---|
| 1757 | + } |
---|
| 1758 | +#endif |
---|
| 1759 | + match = node_event; |
---|
| 1760 | + node = node->rb_left; |
---|
1641 | 1761 | } |
---|
1642 | 1762 | |
---|
1643 | 1763 | return match; |
---|
.. | .. |
---|
1650 | 1770 | perf_event_groups_next(struct perf_event *event) |
---|
1651 | 1771 | { |
---|
1652 | 1772 | struct perf_event *next; |
---|
| 1773 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 1774 | + u64 curr_cgrp_id = 0; |
---|
| 1775 | + u64 next_cgrp_id = 0; |
---|
| 1776 | +#endif |
---|
1653 | 1777 | |
---|
1654 | 1778 | next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); |
---|
1655 | | - if (next && next->cpu == event->cpu) |
---|
1656 | | - return next; |
---|
| 1779 | + if (next == NULL || next->cpu != event->cpu) |
---|
| 1780 | + return NULL; |
---|
1657 | 1781 | |
---|
1658 | | - return NULL; |
---|
| 1782 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 1783 | + if (event->cgrp && event->cgrp->css.cgroup) |
---|
| 1784 | + curr_cgrp_id = event->cgrp->css.cgroup->kn->id; |
---|
| 1785 | + |
---|
| 1786 | + if (next->cgrp && next->cgrp->css.cgroup) |
---|
| 1787 | + next_cgrp_id = next->cgrp->css.cgroup->kn->id; |
---|
| 1788 | + |
---|
| 1789 | + if (curr_cgrp_id != next_cgrp_id) |
---|
| 1790 | + return NULL; |
---|
| 1791 | +#endif |
---|
| 1792 | + return next; |
---|
1659 | 1793 | } |
---|
1660 | 1794 | |
---|
1661 | 1795 | /* |
---|
.. | .. |
---|
1691 | 1825 | add_event_to_groups(event, ctx); |
---|
1692 | 1826 | } |
---|
1693 | 1827 | |
---|
1694 | | - list_update_cgroup_event(event, ctx, true); |
---|
1695 | | - |
---|
1696 | 1828 | list_add_rcu(&event->event_entry, &ctx->event_list); |
---|
1697 | 1829 | ctx->nr_events++; |
---|
1698 | 1830 | if (event->attr.inherit_stat) |
---|
1699 | 1831 | ctx->nr_stat++; |
---|
| 1832 | + |
---|
| 1833 | + if (event->state > PERF_EVENT_STATE_OFF) |
---|
| 1834 | + perf_cgroup_event_enable(event, ctx); |
---|
1700 | 1835 | |
---|
1701 | 1836 | ctx->generation++; |
---|
1702 | 1837 | } |
---|
.. | .. |
---|
1762 | 1897 | |
---|
1763 | 1898 | if (sample_type & PERF_SAMPLE_PHYS_ADDR) |
---|
1764 | 1899 | size += sizeof(data->phys_addr); |
---|
| 1900 | + |
---|
| 1901 | + if (sample_type & PERF_SAMPLE_CGROUP) |
---|
| 1902 | + size += sizeof(data->cgroup); |
---|
1765 | 1903 | |
---|
1766 | 1904 | event->header_size = size; |
---|
1767 | 1905 | } |
---|
.. | .. |
---|
1873 | 2011 | |
---|
1874 | 2012 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
---|
1875 | 2013 | |
---|
1876 | | - list_update_cgroup_event(event, ctx, false); |
---|
1877 | | - |
---|
1878 | 2014 | ctx->nr_events--; |
---|
1879 | 2015 | if (event->attr.inherit_stat) |
---|
1880 | 2016 | ctx->nr_stat--; |
---|
.. | .. |
---|
1891 | 2027 | * of error state is by explicit re-enabling |
---|
1892 | 2028 | * of the event |
---|
1893 | 2029 | */ |
---|
1894 | | - if (event->state > PERF_EVENT_STATE_OFF) |
---|
| 2030 | + if (event->state > PERF_EVENT_STATE_OFF) { |
---|
| 2031 | + perf_cgroup_event_disable(event, ctx); |
---|
1895 | 2032 | perf_event_set_state(event, PERF_EVENT_STATE_OFF); |
---|
| 2033 | + } |
---|
1896 | 2034 | |
---|
1897 | 2035 | ctx->generation++; |
---|
1898 | 2036 | } |
---|
1899 | 2037 | |
---|
| 2038 | +static int |
---|
| 2039 | +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) |
---|
| 2040 | +{ |
---|
| 2041 | + if (!has_aux(aux_event)) |
---|
| 2042 | + return 0; |
---|
| 2043 | + |
---|
| 2044 | + if (!event->pmu->aux_output_match) |
---|
| 2045 | + return 0; |
---|
| 2046 | + |
---|
| 2047 | + return event->pmu->aux_output_match(aux_event); |
---|
| 2048 | +} |
---|
| 2049 | + |
---|
| 2050 | +static void put_event(struct perf_event *event); |
---|
| 2051 | +static void event_sched_out(struct perf_event *event, |
---|
| 2052 | + struct perf_cpu_context *cpuctx, |
---|
| 2053 | + struct perf_event_context *ctx); |
---|
| 2054 | + |
---|
| 2055 | +static void perf_put_aux_event(struct perf_event *event) |
---|
| 2056 | +{ |
---|
| 2057 | + struct perf_event_context *ctx = event->ctx; |
---|
| 2058 | + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
---|
| 2059 | + struct perf_event *iter; |
---|
| 2060 | + |
---|
| 2061 | + /* |
---|
| 2062 | + * If event uses aux_event, tear down the link |
---|
| 2063 | + */ |
---|
| 2064 | + if (event->aux_event) { |
---|
| 2065 | + iter = event->aux_event; |
---|
| 2066 | + event->aux_event = NULL; |
---|
| 2067 | + put_event(iter); |
---|
| 2068 | + return; |
---|
| 2069 | + } |
---|
| 2070 | + |
---|
| 2071 | + /* |
---|
| 2072 | + * If the event is an aux_event, tear down all links to |
---|
| 2073 | + * it from other events. |
---|
| 2074 | + */ |
---|
| 2075 | + for_each_sibling_event(iter, event->group_leader) { |
---|
| 2076 | + if (iter->aux_event != event) |
---|
| 2077 | + continue; |
---|
| 2078 | + |
---|
| 2079 | + iter->aux_event = NULL; |
---|
| 2080 | + put_event(event); |
---|
| 2081 | + |
---|
| 2082 | + /* |
---|
| 2083 | + * If it's ACTIVE, schedule it out and put it into ERROR |
---|
| 2084 | + * state so that we don't try to schedule it again. Note |
---|
| 2085 | + * that perf_event_enable() will clear the ERROR status. |
---|
| 2086 | + */ |
---|
| 2087 | + event_sched_out(iter, cpuctx, ctx); |
---|
| 2088 | + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); |
---|
| 2089 | + } |
---|
| 2090 | +} |
---|
| 2091 | + |
---|
| 2092 | +static bool perf_need_aux_event(struct perf_event *event) |
---|
| 2093 | +{ |
---|
| 2094 | + return !!event->attr.aux_output || !!event->attr.aux_sample_size; |
---|
| 2095 | +} |
---|
| 2096 | + |
---|
| 2097 | +static int perf_get_aux_event(struct perf_event *event, |
---|
| 2098 | + struct perf_event *group_leader) |
---|
| 2099 | +{ |
---|
| 2100 | + /* |
---|
| 2101 | + * Our group leader must be an aux event if we want to be |
---|
| 2102 | + * an aux_output. This way, the aux event will precede its |
---|
| 2103 | + * aux_output events in the group, and therefore will always |
---|
| 2104 | + * schedule first. |
---|
| 2105 | + */ |
---|
| 2106 | + if (!group_leader) |
---|
| 2107 | + return 0; |
---|
| 2108 | + |
---|
| 2109 | + /* |
---|
| 2110 | + * aux_output and aux_sample_size are mutually exclusive. |
---|
| 2111 | + */ |
---|
| 2112 | + if (event->attr.aux_output && event->attr.aux_sample_size) |
---|
| 2113 | + return 0; |
---|
| 2114 | + |
---|
| 2115 | + if (event->attr.aux_output && |
---|
| 2116 | + !perf_aux_output_match(event, group_leader)) |
---|
| 2117 | + return 0; |
---|
| 2118 | + |
---|
| 2119 | + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) |
---|
| 2120 | + return 0; |
---|
| 2121 | + |
---|
| 2122 | + if (!atomic_long_inc_not_zero(&group_leader->refcount)) |
---|
| 2123 | + return 0; |
---|
| 2124 | + |
---|
| 2125 | + /* |
---|
| 2126 | + * Link aux_outputs to their aux event; this is undone in |
---|
| 2127 | + * perf_group_detach() by perf_put_aux_event(). When the |
---|
| 2128 | + * group is torn down, the aux_output events lose their |
---|
| 2129 | + * link to the aux_event and can't schedule any more. |
---|
| 2130 | + */ |
---|
| 2131 | + event->aux_event = group_leader; |
---|
| 2132 | + |
---|
| 2133 | + return 1; |
---|
| 2134 | +} |
---|
| 2135 | + |
---|
| 2136 | +static inline struct list_head *get_event_list(struct perf_event *event) |
---|
| 2137 | +{ |
---|
| 2138 | + struct perf_event_context *ctx = event->ctx; |
---|
| 2139 | + return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; |
---|
| 2140 | +} |
---|
| 2141 | + |
---|
| 2142 | +/* |
---|
| 2143 | + * Events that have PERF_EV_CAP_SIBLING require being part of a group and |
---|
| 2144 | + * cannot exist on their own, schedule them out and move them into the ERROR |
---|
| 2145 | + * state. Also see _perf_event_enable(), it will not be able to recover |
---|
| 2146 | + * this ERROR state. |
---|
| 2147 | + */ |
---|
| 2148 | +static inline void perf_remove_sibling_event(struct perf_event *event) |
---|
| 2149 | +{ |
---|
| 2150 | + struct perf_event_context *ctx = event->ctx; |
---|
| 2151 | + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
---|
| 2152 | + |
---|
| 2153 | + event_sched_out(event, cpuctx, ctx); |
---|
| 2154 | + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); |
---|
| 2155 | +} |
---|
| 2156 | + |
---|
1900 | 2157 | static void perf_group_detach(struct perf_event *event) |
---|
1901 | 2158 | { |
---|
| 2159 | + struct perf_event *leader = event->group_leader; |
---|
1902 | 2160 | struct perf_event *sibling, *tmp; |
---|
1903 | 2161 | struct perf_event_context *ctx = event->ctx; |
---|
1904 | 2162 | |
---|
.. | .. |
---|
1912 | 2170 | |
---|
1913 | 2171 | event->attach_state &= ~PERF_ATTACH_GROUP; |
---|
1914 | 2172 | |
---|
| 2173 | + perf_put_aux_event(event); |
---|
| 2174 | + |
---|
1915 | 2175 | /* |
---|
1916 | 2176 | * If this is a sibling, remove it from its group. |
---|
1917 | 2177 | */ |
---|
1918 | | - if (event->group_leader != event) { |
---|
| 2178 | + if (leader != event) { |
---|
1919 | 2179 | list_del_init(&event->sibling_list); |
---|
1920 | 2180 | event->group_leader->nr_siblings--; |
---|
1921 | 2181 | goto out; |
---|
.. | .. |
---|
1928 | 2188 | */ |
---|
1929 | 2189 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { |
---|
1930 | 2190 | |
---|
| 2191 | + if (sibling->event_caps & PERF_EV_CAP_SIBLING) |
---|
| 2192 | + perf_remove_sibling_event(sibling); |
---|
| 2193 | + |
---|
1931 | 2194 | sibling->group_leader = sibling; |
---|
1932 | 2195 | list_del_init(&sibling->sibling_list); |
---|
1933 | 2196 | |
---|
.. | .. |
---|
1937 | 2200 | if (!RB_EMPTY_NODE(&event->group_node)) { |
---|
1938 | 2201 | add_event_to_groups(sibling, event->ctx); |
---|
1939 | 2202 | |
---|
1940 | | - if (sibling->state == PERF_EVENT_STATE_ACTIVE) { |
---|
1941 | | - struct list_head *list = sibling->attr.pinned ? |
---|
1942 | | - &ctx->pinned_active : &ctx->flexible_active; |
---|
1943 | | - |
---|
1944 | | - list_add_tail(&sibling->active_list, list); |
---|
1945 | | - } |
---|
| 2203 | + if (sibling->state == PERF_EVENT_STATE_ACTIVE) |
---|
| 2204 | + list_add_tail(&sibling->active_list, get_event_list(sibling)); |
---|
1946 | 2205 | } |
---|
1947 | 2206 | |
---|
1948 | 2207 | WARN_ON_ONCE(sibling->ctx != event->ctx); |
---|
1949 | 2208 | } |
---|
1950 | 2209 | |
---|
1951 | 2210 | out: |
---|
1952 | | - perf_event__header_size(event->group_leader); |
---|
1953 | | - |
---|
1954 | | - for_each_sibling_event(tmp, event->group_leader) |
---|
| 2211 | + for_each_sibling_event(tmp, leader) |
---|
1955 | 2212 | perf_event__header_size(tmp); |
---|
| 2213 | + |
---|
| 2214 | + perf_event__header_size(leader); |
---|
1956 | 2215 | } |
---|
1957 | 2216 | |
---|
1958 | 2217 | static bool is_orphaned_event(struct perf_event *event) |
---|
.. | .. |
---|
2021 | 2280 | |
---|
2022 | 2281 | if (READ_ONCE(event->pending_disable) >= 0) { |
---|
2023 | 2282 | WRITE_ONCE(event->pending_disable, -1); |
---|
| 2283 | + perf_cgroup_event_disable(event, ctx); |
---|
2024 | 2284 | state = PERF_EVENT_STATE_OFF; |
---|
2025 | 2285 | } |
---|
2026 | 2286 | perf_event_set_state(event, state); |
---|
.. | .. |
---|
2058 | 2318 | event_sched_out(event, cpuctx, ctx); |
---|
2059 | 2319 | |
---|
2060 | 2320 | perf_pmu_enable(ctx->pmu); |
---|
2061 | | - |
---|
2062 | | - if (group_event->attr.exclusive) |
---|
2063 | | - cpuctx->exclusive = 0; |
---|
2064 | 2321 | } |
---|
2065 | 2322 | |
---|
2066 | 2323 | #define DETACH_GROUP 0x01UL |
---|
.. | .. |
---|
2091 | 2348 | |
---|
2092 | 2349 | if (!ctx->nr_events && ctx->is_active) { |
---|
2093 | 2350 | ctx->is_active = 0; |
---|
| 2351 | + ctx->rotate_necessary = 0; |
---|
2094 | 2352 | if (ctx->task) { |
---|
2095 | 2353 | WARN_ON_ONCE(cpuctx->task_ctx != ctx); |
---|
2096 | 2354 | cpuctx->task_ctx = NULL; |
---|
.. | .. |
---|
2157 | 2415 | event_sched_out(event, cpuctx, ctx); |
---|
2158 | 2416 | |
---|
2159 | 2417 | perf_event_set_state(event, PERF_EVENT_STATE_OFF); |
---|
| 2418 | + perf_cgroup_event_disable(event, ctx); |
---|
2160 | 2419 | } |
---|
2161 | 2420 | |
---|
2162 | 2421 | /* |
---|
.. | .. |
---|
2164 | 2423 | * |
---|
2165 | 2424 | * If event->ctx is a cloned context, callers must make sure that |
---|
2166 | 2425 | * every task struct that event->ctx->task could possibly point to |
---|
2167 | | - * remains valid. This condition is satisifed when called through |
---|
| 2426 | + * remains valid. This condition is satisfied when called through |
---|
2168 | 2427 | * perf_event_for_each_child or perf_event_for_each because they |
---|
2169 | 2428 | * hold the top-level event's child_mutex, so any descendant that |
---|
2170 | 2429 | * goes to exit will block in perf_event_exit_event(). |
---|
.. | .. |
---|
2238 | 2497 | * But this is a bit hairy. |
---|
2239 | 2498 | * |
---|
2240 | 2499 | * So instead, we have an explicit cgroup call to remain |
---|
2241 | | - * within the time time source all along. We believe it |
---|
| 2500 | + * within the time source all along. We believe it |
---|
2242 | 2501 | * is cleaner and simpler to understand. |
---|
2243 | 2502 | */ |
---|
2244 | 2503 | if (is_cgroup_event(event)) |
---|
.. | .. |
---|
2258 | 2517 | struct perf_event_context *ctx) |
---|
2259 | 2518 | { |
---|
2260 | 2519 | int ret = 0; |
---|
| 2520 | + |
---|
| 2521 | + WARN_ON_ONCE(event->ctx != ctx); |
---|
2261 | 2522 | |
---|
2262 | 2523 | lockdep_assert_held(&ctx->lock); |
---|
2263 | 2524 | |
---|
.. | .. |
---|
2325 | 2586 | |
---|
2326 | 2587 | pmu->start_txn(pmu, PERF_PMU_TXN_ADD); |
---|
2327 | 2588 | |
---|
2328 | | - if (event_sched_in(group_event, cpuctx, ctx)) { |
---|
2329 | | - pmu->cancel_txn(pmu); |
---|
2330 | | - perf_mux_hrtimer_restart(cpuctx); |
---|
2331 | | - return -EAGAIN; |
---|
2332 | | - } |
---|
| 2589 | + if (event_sched_in(group_event, cpuctx, ctx)) |
---|
| 2590 | + goto error; |
---|
2333 | 2591 | |
---|
2334 | 2592 | /* |
---|
2335 | 2593 | * Schedule in siblings as one group (if any): |
---|
.. | .. |
---|
2358 | 2616 | } |
---|
2359 | 2617 | event_sched_out(group_event, cpuctx, ctx); |
---|
2360 | 2618 | |
---|
| 2619 | +error: |
---|
2361 | 2620 | pmu->cancel_txn(pmu); |
---|
2362 | | - |
---|
2363 | | - perf_mux_hrtimer_restart(cpuctx); |
---|
2364 | | - |
---|
2365 | 2621 | return -EAGAIN; |
---|
2366 | 2622 | } |
---|
2367 | 2623 | |
---|
.. | .. |
---|
2387 | 2643 | * If this group is exclusive and there are already |
---|
2388 | 2644 | * events on the CPU, it can't go on. |
---|
2389 | 2645 | */ |
---|
2390 | | - if (event->attr.exclusive && cpuctx->active_oncpu) |
---|
| 2646 | + if (event->attr.exclusive && !list_empty(get_event_list(event))) |
---|
2391 | 2647 | return 0; |
---|
2392 | 2648 | /* |
---|
2393 | 2649 | * Otherwise, try to add it if all previous groups were able |
---|
.. | .. |
---|
2488 | 2744 | perf_pmu_enable(cpuctx->ctx.pmu); |
---|
2489 | 2745 | } |
---|
2490 | 2746 | |
---|
| 2747 | +void perf_pmu_resched(struct pmu *pmu) |
---|
| 2748 | +{ |
---|
| 2749 | + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
---|
| 2750 | + struct perf_event_context *task_ctx = cpuctx->task_ctx; |
---|
| 2751 | + |
---|
| 2752 | + perf_ctx_lock(cpuctx, task_ctx); |
---|
| 2753 | + ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); |
---|
| 2754 | + perf_ctx_unlock(cpuctx, task_ctx); |
---|
| 2755 | +} |
---|
| 2756 | + |
---|
2491 | 2757 | /* |
---|
2492 | 2758 | * Cross CPU call to install and enable a performance event |
---|
2493 | 2759 | * |
---|
.. | .. |
---|
2528 | 2794 | } |
---|
2529 | 2795 | |
---|
2530 | 2796 | #ifdef CONFIG_CGROUP_PERF |
---|
2531 | | - if (is_cgroup_event(event)) { |
---|
| 2797 | + if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { |
---|
2532 | 2798 | /* |
---|
2533 | 2799 | * If the current cgroup doesn't match the event's |
---|
2534 | 2800 | * cgroup, we should not try to schedule it. |
---|
.. | .. |
---|
2580 | 2846 | * will be 'complete'. See perf_iterate_sb_cpu(). |
---|
2581 | 2847 | */ |
---|
2582 | 2848 | smp_store_release(&event->ctx, ctx); |
---|
| 2849 | + |
---|
| 2850 | + /* |
---|
| 2851 | + * perf_event_attr::disabled events will not run and can be initialized |
---|
| 2852 | + * without IPI. Except when this is the first event for the context, in |
---|
| 2853 | + * that case we need the magic of the IPI to set ctx->is_active. |
---|
| 2854 | + * |
---|
| 2855 | + * The IOC_ENABLE that is sure to follow the creation of a disabled |
---|
| 2856 | + * event will issue the IPI and reprogram the hardware. |
---|
| 2857 | + */ |
---|
| 2858 | + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { |
---|
| 2859 | + raw_spin_lock_irq(&ctx->lock); |
---|
| 2860 | + if (ctx->task == TASK_TOMBSTONE) { |
---|
| 2861 | + raw_spin_unlock_irq(&ctx->lock); |
---|
| 2862 | + return; |
---|
| 2863 | + } |
---|
| 2864 | + add_event_to_ctx(event, ctx); |
---|
| 2865 | + raw_spin_unlock_irq(&ctx->lock); |
---|
| 2866 | + return; |
---|
| 2867 | + } |
---|
2583 | 2868 | |
---|
2584 | 2869 | if (!task) { |
---|
2585 | 2870 | cpu_function_call(cpu, __perf_install_in_context, event); |
---|
.. | .. |
---|
2669 | 2954 | ctx_sched_out(ctx, cpuctx, EVENT_TIME); |
---|
2670 | 2955 | |
---|
2671 | 2956 | perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); |
---|
| 2957 | + perf_cgroup_event_enable(event, ctx); |
---|
2672 | 2958 | |
---|
2673 | 2959 | if (!ctx->is_active) |
---|
2674 | 2960 | return; |
---|
.. | .. |
---|
2710 | 2996 | raw_spin_lock_irq(&ctx->lock); |
---|
2711 | 2997 | if (event->state >= PERF_EVENT_STATE_INACTIVE || |
---|
2712 | 2998 | event->state < PERF_EVENT_STATE_ERROR) { |
---|
| 2999 | +out: |
---|
2713 | 3000 | raw_spin_unlock_irq(&ctx->lock); |
---|
2714 | 3001 | return; |
---|
2715 | 3002 | } |
---|
.. | .. |
---|
2721 | 3008 | * has gone back into error state, as distinct from the task having |
---|
2722 | 3009 | * been scheduled away before the cross-call arrived. |
---|
2723 | 3010 | */ |
---|
2724 | | - if (event->state == PERF_EVENT_STATE_ERROR) |
---|
| 3011 | + if (event->state == PERF_EVENT_STATE_ERROR) { |
---|
| 3012 | + /* |
---|
| 3013 | + * Detached SIBLING events cannot leave ERROR state. |
---|
| 3014 | + */ |
---|
| 3015 | + if (event->event_caps & PERF_EV_CAP_SIBLING && |
---|
| 3016 | + event->group_leader == event) |
---|
| 3017 | + goto out; |
---|
| 3018 | + |
---|
2725 | 3019 | event->state = PERF_EVENT_STATE_OFF; |
---|
| 3020 | + } |
---|
2726 | 3021 | raw_spin_unlock_irq(&ctx->lock); |
---|
2727 | 3022 | |
---|
2728 | 3023 | event_function_call(event, __perf_event_enable, NULL); |
---|
.. | .. |
---|
2826 | 3121 | * pre-existing mappings, called once when new filters arrive via SET_FILTER |
---|
2827 | 3122 | * ioctl; |
---|
2828 | 3123 | * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly |
---|
2829 | | - * registered mapping, called for every new mmap(), with mm::mmap_sem down |
---|
| 3124 | + * registered mapping, called for every new mmap(), with mm::mmap_lock down |
---|
2830 | 3125 | * for reading; |
---|
2831 | 3126 | * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process |
---|
2832 | 3127 | * of exec. |
---|
.. | .. |
---|
2966 | 3261 | if (is_active & EVENT_FLEXIBLE) { |
---|
2967 | 3262 | list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) |
---|
2968 | 3263 | group_sched_out(event, cpuctx, ctx); |
---|
| 3264 | + |
---|
| 3265 | + /* |
---|
| 3266 | + * Since we cleared EVENT_FLEXIBLE, also clear |
---|
| 3267 | + * rotate_necessary; it will be reset by |
---|
| 3268 | + * ctx_flexible_sched_in() when needed. |
---|
| 3269 | + */ |
---|
| 3270 | + ctx->rotate_necessary = 0; |
---|
2969 | 3271 | } |
---|
2970 | 3272 | perf_pmu_enable(ctx->pmu); |
---|
2971 | 3273 | } |
---|
.. | .. |
---|
3080 | 3382 | struct perf_event_context *parent, *next_parent; |
---|
3081 | 3383 | struct perf_cpu_context *cpuctx; |
---|
3082 | 3384 | int do_switch = 1; |
---|
| 3385 | + struct pmu *pmu; |
---|
3083 | 3386 | |
---|
3084 | 3387 | if (likely(!ctx)) |
---|
3085 | 3388 | return; |
---|
3086 | 3389 | |
---|
| 3390 | + pmu = ctx->pmu; |
---|
3087 | 3391 | cpuctx = __get_cpu_context(ctx); |
---|
3088 | 3392 | if (!cpuctx->task_ctx) |
---|
3089 | 3393 | return; |
---|
.. | .. |
---|
3113 | 3417 | raw_spin_lock(&ctx->lock); |
---|
3114 | 3418 | raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); |
---|
3115 | 3419 | if (context_equiv(ctx, next_ctx)) { |
---|
| 3420 | + |
---|
3116 | 3421 | WRITE_ONCE(ctx->task, next); |
---|
3117 | 3422 | WRITE_ONCE(next_ctx->task, task); |
---|
3118 | 3423 | |
---|
3119 | | - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); |
---|
| 3424 | + perf_pmu_disable(pmu); |
---|
| 3425 | + |
---|
| 3426 | + if (cpuctx->sched_cb_usage && pmu->sched_task) |
---|
| 3427 | + pmu->sched_task(ctx, false); |
---|
| 3428 | + |
---|
| 3429 | + /* |
---|
| 3430 | + * PMU specific parts of task perf context can require |
---|
| 3431 | + * additional synchronization. As an example of such |
---|
| 3432 | + * synchronization see implementation details of Intel |
---|
| 3433 | + * LBR call stack data profiling; |
---|
| 3434 | + */ |
---|
| 3435 | + if (pmu->swap_task_ctx) |
---|
| 3436 | + pmu->swap_task_ctx(ctx, next_ctx); |
---|
| 3437 | + else |
---|
| 3438 | + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); |
---|
| 3439 | + |
---|
| 3440 | + perf_pmu_enable(pmu); |
---|
3120 | 3441 | |
---|
3121 | 3442 | /* |
---|
3122 | 3443 | * RCU_INIT_POINTER here is safe because we've not |
---|
.. | .. |
---|
3140 | 3461 | |
---|
3141 | 3462 | if (do_switch) { |
---|
3142 | 3463 | raw_spin_lock(&ctx->lock); |
---|
| 3464 | + perf_pmu_disable(pmu); |
---|
| 3465 | + |
---|
| 3466 | + if (cpuctx->sched_cb_usage && pmu->sched_task) |
---|
| 3467 | + pmu->sched_task(ctx, false); |
---|
3143 | 3468 | task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); |
---|
| 3469 | + |
---|
| 3470 | + perf_pmu_enable(pmu); |
---|
3144 | 3471 | raw_spin_unlock(&ctx->lock); |
---|
3145 | 3472 | } |
---|
3146 | 3473 | } |
---|
.. | .. |
---|
3176 | 3503 | * PEBS requires this to provide PID/TID information. This requires we flush |
---|
3177 | 3504 | * all queued PEBS records before we context switch to a new task. |
---|
3178 | 3505 | */ |
---|
| 3506 | +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) |
---|
| 3507 | +{ |
---|
| 3508 | + struct pmu *pmu; |
---|
| 3509 | + |
---|
| 3510 | + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ |
---|
| 3511 | + |
---|
| 3512 | + if (WARN_ON_ONCE(!pmu->sched_task)) |
---|
| 3513 | + return; |
---|
| 3514 | + |
---|
| 3515 | + perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
---|
| 3516 | + perf_pmu_disable(pmu); |
---|
| 3517 | + |
---|
| 3518 | + pmu->sched_task(cpuctx->task_ctx, sched_in); |
---|
| 3519 | + |
---|
| 3520 | + perf_pmu_enable(pmu); |
---|
| 3521 | + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
---|
| 3522 | +} |
---|
| 3523 | + |
---|
3179 | 3524 | static void perf_pmu_sched_task(struct task_struct *prev, |
---|
3180 | 3525 | struct task_struct *next, |
---|
3181 | 3526 | bool sched_in) |
---|
3182 | 3527 | { |
---|
3183 | 3528 | struct perf_cpu_context *cpuctx; |
---|
3184 | | - struct pmu *pmu; |
---|
3185 | 3529 | |
---|
3186 | 3530 | if (prev == next) |
---|
3187 | 3531 | return; |
---|
3188 | 3532 | |
---|
3189 | 3533 | list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { |
---|
3190 | | - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ |
---|
3191 | | - |
---|
3192 | | - if (WARN_ON_ONCE(!pmu->sched_task)) |
---|
| 3534 | + /* will be handled in perf_event_context_sched_in/out */ |
---|
| 3535 | + if (cpuctx->task_ctx) |
---|
3193 | 3536 | continue; |
---|
3194 | 3537 | |
---|
3195 | | - perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
---|
3196 | | - perf_pmu_disable(pmu); |
---|
3197 | | - |
---|
3198 | | - pmu->sched_task(cpuctx->task_ctx, sched_in); |
---|
3199 | | - |
---|
3200 | | - perf_pmu_enable(pmu); |
---|
3201 | | - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
---|
| 3538 | + __perf_pmu_sched_task(cpuctx, sched_in); |
---|
3202 | 3539 | } |
---|
3203 | 3540 | } |
---|
3204 | 3541 | |
---|
.. | .. |
---|
3251 | 3588 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); |
---|
3252 | 3589 | } |
---|
3253 | 3590 | |
---|
3254 | | -static int visit_groups_merge(struct perf_event_groups *groups, int cpu, |
---|
3255 | | - int (*func)(struct perf_event *, void *), void *data) |
---|
| 3591 | +static bool perf_less_group_idx(const void *l, const void *r) |
---|
3256 | 3592 | { |
---|
3257 | | - struct perf_event **evt, *evt1, *evt2; |
---|
| 3593 | + const struct perf_event *le = *(const struct perf_event **)l; |
---|
| 3594 | + const struct perf_event *re = *(const struct perf_event **)r; |
---|
| 3595 | + |
---|
| 3596 | + return le->group_index < re->group_index; |
---|
| 3597 | +} |
---|
| 3598 | + |
---|
| 3599 | +static void swap_ptr(void *l, void *r) |
---|
| 3600 | +{ |
---|
| 3601 | + void **lp = l, **rp = r; |
---|
| 3602 | + |
---|
| 3603 | + swap(*lp, *rp); |
---|
| 3604 | +} |
---|
| 3605 | + |
---|
| 3606 | +static const struct min_heap_callbacks perf_min_heap = { |
---|
| 3607 | + .elem_size = sizeof(struct perf_event *), |
---|
| 3608 | + .less = perf_less_group_idx, |
---|
| 3609 | + .swp = swap_ptr, |
---|
| 3610 | +}; |
---|
| 3611 | + |
---|
| 3612 | +static void __heap_add(struct min_heap *heap, struct perf_event *event) |
---|
| 3613 | +{ |
---|
| 3614 | + struct perf_event **itrs = heap->data; |
---|
| 3615 | + |
---|
| 3616 | + if (event) { |
---|
| 3617 | + itrs[heap->nr] = event; |
---|
| 3618 | + heap->nr++; |
---|
| 3619 | + } |
---|
| 3620 | +} |
---|
| 3621 | + |
---|
| 3622 | +static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, |
---|
| 3623 | + struct perf_event_groups *groups, int cpu, |
---|
| 3624 | + int (*func)(struct perf_event *, void *), |
---|
| 3625 | + void *data) |
---|
| 3626 | +{ |
---|
| 3627 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 3628 | + struct cgroup_subsys_state *css = NULL; |
---|
| 3629 | +#endif |
---|
| 3630 | + /* Space for per CPU and/or any CPU event iterators. */ |
---|
| 3631 | + struct perf_event *itrs[2]; |
---|
| 3632 | + struct min_heap event_heap; |
---|
| 3633 | + struct perf_event **evt; |
---|
3258 | 3634 | int ret; |
---|
3259 | 3635 | |
---|
3260 | | - evt1 = perf_event_groups_first(groups, -1); |
---|
3261 | | - evt2 = perf_event_groups_first(groups, cpu); |
---|
| 3636 | + if (cpuctx) { |
---|
| 3637 | + event_heap = (struct min_heap){ |
---|
| 3638 | + .data = cpuctx->heap, |
---|
| 3639 | + .nr = 0, |
---|
| 3640 | + .size = cpuctx->heap_size, |
---|
| 3641 | + }; |
---|
3262 | 3642 | |
---|
3263 | | - while (evt1 || evt2) { |
---|
3264 | | - if (evt1 && evt2) { |
---|
3265 | | - if (evt1->group_index < evt2->group_index) |
---|
3266 | | - evt = &evt1; |
---|
3267 | | - else |
---|
3268 | | - evt = &evt2; |
---|
3269 | | - } else if (evt1) { |
---|
3270 | | - evt = &evt1; |
---|
3271 | | - } else { |
---|
3272 | | - evt = &evt2; |
---|
3273 | | - } |
---|
| 3643 | + lockdep_assert_held(&cpuctx->ctx.lock); |
---|
3274 | 3644 | |
---|
| 3645 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 3646 | + if (cpuctx->cgrp) |
---|
| 3647 | + css = &cpuctx->cgrp->css; |
---|
| 3648 | +#endif |
---|
| 3649 | + } else { |
---|
| 3650 | + event_heap = (struct min_heap){ |
---|
| 3651 | + .data = itrs, |
---|
| 3652 | + .nr = 0, |
---|
| 3653 | + .size = ARRAY_SIZE(itrs), |
---|
| 3654 | + }; |
---|
| 3655 | + /* Events not within a CPU context may be on any CPU. */ |
---|
| 3656 | + __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); |
---|
| 3657 | + } |
---|
| 3658 | + evt = event_heap.data; |
---|
| 3659 | + |
---|
| 3660 | + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); |
---|
| 3661 | + |
---|
| 3662 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 3663 | + for (; css; css = css->parent) |
---|
| 3664 | + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); |
---|
| 3665 | +#endif |
---|
| 3666 | + |
---|
| 3667 | + min_heapify_all(&event_heap, &perf_min_heap); |
---|
| 3668 | + |
---|
| 3669 | + while (event_heap.nr) { |
---|
3275 | 3670 | ret = func(*evt, data); |
---|
3276 | 3671 | if (ret) |
---|
3277 | 3672 | return ret; |
---|
3278 | 3673 | |
---|
3279 | 3674 | *evt = perf_event_groups_next(*evt); |
---|
3280 | | - } |
---|
3281 | | - |
---|
3282 | | - return 0; |
---|
3283 | | -} |
---|
3284 | | - |
---|
3285 | | -struct sched_in_data { |
---|
3286 | | - struct perf_event_context *ctx; |
---|
3287 | | - struct perf_cpu_context *cpuctx; |
---|
3288 | | - int can_add_hw; |
---|
3289 | | -}; |
---|
3290 | | - |
---|
3291 | | -static int pinned_sched_in(struct perf_event *event, void *data) |
---|
3292 | | -{ |
---|
3293 | | - struct sched_in_data *sid = data; |
---|
3294 | | - |
---|
3295 | | - if (event->state <= PERF_EVENT_STATE_OFF) |
---|
3296 | | - return 0; |
---|
3297 | | - |
---|
3298 | | - if (!event_filter_match(event)) |
---|
3299 | | - return 0; |
---|
3300 | | - |
---|
3301 | | - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { |
---|
3302 | | - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) |
---|
3303 | | - list_add_tail(&event->active_list, &sid->ctx->pinned_active); |
---|
3304 | | - } |
---|
3305 | | - |
---|
3306 | | - /* |
---|
3307 | | - * If this pinned group hasn't been scheduled, |
---|
3308 | | - * put it in error state. |
---|
3309 | | - */ |
---|
3310 | | - if (event->state == PERF_EVENT_STATE_INACTIVE) |
---|
3311 | | - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); |
---|
3312 | | - |
---|
3313 | | - return 0; |
---|
3314 | | -} |
---|
3315 | | - |
---|
3316 | | -static int flexible_sched_in(struct perf_event *event, void *data) |
---|
3317 | | -{ |
---|
3318 | | - struct sched_in_data *sid = data; |
---|
3319 | | - |
---|
3320 | | - if (event->state <= PERF_EVENT_STATE_OFF) |
---|
3321 | | - return 0; |
---|
3322 | | - |
---|
3323 | | - if (!event_filter_match(event)) |
---|
3324 | | - return 0; |
---|
3325 | | - |
---|
3326 | | - if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { |
---|
3327 | | - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) |
---|
3328 | | - list_add_tail(&event->active_list, &sid->ctx->flexible_active); |
---|
| 3675 | + if (*evt) |
---|
| 3676 | + min_heapify(&event_heap, 0, &perf_min_heap); |
---|
3329 | 3677 | else |
---|
3330 | | - sid->can_add_hw = 0; |
---|
| 3678 | + min_heap_pop(&event_heap, &perf_min_heap); |
---|
| 3679 | + } |
---|
| 3680 | + |
---|
| 3681 | + return 0; |
---|
| 3682 | +} |
---|
| 3683 | + |
---|
| 3684 | +static inline bool event_update_userpage(struct perf_event *event) |
---|
| 3685 | +{ |
---|
| 3686 | + if (likely(!atomic_read(&event->mmap_count))) |
---|
| 3687 | + return false; |
---|
| 3688 | + |
---|
| 3689 | + perf_event_update_time(event); |
---|
| 3690 | + perf_set_shadow_time(event, event->ctx); |
---|
| 3691 | + perf_event_update_userpage(event); |
---|
| 3692 | + |
---|
| 3693 | + return true; |
---|
| 3694 | +} |
---|
| 3695 | + |
---|
| 3696 | +static inline void group_update_userpage(struct perf_event *group_event) |
---|
| 3697 | +{ |
---|
| 3698 | + struct perf_event *event; |
---|
| 3699 | + |
---|
| 3700 | + if (!event_update_userpage(group_event)) |
---|
| 3701 | + return; |
---|
| 3702 | + |
---|
| 3703 | + for_each_sibling_event(event, group_event) |
---|
| 3704 | + event_update_userpage(event); |
---|
| 3705 | +} |
---|
| 3706 | + |
---|
| 3707 | +static int merge_sched_in(struct perf_event *event, void *data) |
---|
| 3708 | +{ |
---|
| 3709 | + struct perf_event_context *ctx = event->ctx; |
---|
| 3710 | + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
---|
| 3711 | + int *can_add_hw = data; |
---|
| 3712 | + |
---|
| 3713 | + if (event->state <= PERF_EVENT_STATE_OFF) |
---|
| 3714 | + return 0; |
---|
| 3715 | + |
---|
| 3716 | + if (!event_filter_match(event)) |
---|
| 3717 | + return 0; |
---|
| 3718 | + |
---|
| 3719 | + if (group_can_go_on(event, cpuctx, *can_add_hw)) { |
---|
| 3720 | + if (!group_sched_in(event, cpuctx, ctx)) |
---|
| 3721 | + list_add_tail(&event->active_list, get_event_list(event)); |
---|
| 3722 | + } |
---|
| 3723 | + |
---|
| 3724 | + if (event->state == PERF_EVENT_STATE_INACTIVE) { |
---|
| 3725 | + *can_add_hw = 0; |
---|
| 3726 | + if (event->attr.pinned) { |
---|
| 3727 | + perf_cgroup_event_disable(event, ctx); |
---|
| 3728 | + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); |
---|
| 3729 | + } else { |
---|
| 3730 | + ctx->rotate_necessary = 1; |
---|
| 3731 | + perf_mux_hrtimer_restart(cpuctx); |
---|
| 3732 | + group_update_userpage(event); |
---|
| 3733 | + } |
---|
3331 | 3734 | } |
---|
3332 | 3735 | |
---|
3333 | 3736 | return 0; |
---|
.. | .. |
---|
3337 | 3740 | ctx_pinned_sched_in(struct perf_event_context *ctx, |
---|
3338 | 3741 | struct perf_cpu_context *cpuctx) |
---|
3339 | 3742 | { |
---|
3340 | | - struct sched_in_data sid = { |
---|
3341 | | - .ctx = ctx, |
---|
3342 | | - .cpuctx = cpuctx, |
---|
3343 | | - .can_add_hw = 1, |
---|
3344 | | - }; |
---|
| 3743 | + int can_add_hw = 1; |
---|
3345 | 3744 | |
---|
3346 | | - visit_groups_merge(&ctx->pinned_groups, |
---|
| 3745 | + if (ctx != &cpuctx->ctx) |
---|
| 3746 | + cpuctx = NULL; |
---|
| 3747 | + |
---|
| 3748 | + visit_groups_merge(cpuctx, &ctx->pinned_groups, |
---|
3347 | 3749 | smp_processor_id(), |
---|
3348 | | - pinned_sched_in, &sid); |
---|
| 3750 | + merge_sched_in, &can_add_hw); |
---|
3349 | 3751 | } |
---|
3350 | 3752 | |
---|
3351 | 3753 | static void |
---|
3352 | 3754 | ctx_flexible_sched_in(struct perf_event_context *ctx, |
---|
3353 | 3755 | struct perf_cpu_context *cpuctx) |
---|
3354 | 3756 | { |
---|
3355 | | - struct sched_in_data sid = { |
---|
3356 | | - .ctx = ctx, |
---|
3357 | | - .cpuctx = cpuctx, |
---|
3358 | | - .can_add_hw = 1, |
---|
3359 | | - }; |
---|
| 3757 | + int can_add_hw = 1; |
---|
3360 | 3758 | |
---|
3361 | | - visit_groups_merge(&ctx->flexible_groups, |
---|
| 3759 | + if (ctx != &cpuctx->ctx) |
---|
| 3760 | + cpuctx = NULL; |
---|
| 3761 | + |
---|
| 3762 | + visit_groups_merge(cpuctx, &ctx->flexible_groups, |
---|
3362 | 3763 | smp_processor_id(), |
---|
3363 | | - flexible_sched_in, &sid); |
---|
| 3764 | + merge_sched_in, &can_add_hw); |
---|
3364 | 3765 | } |
---|
3365 | 3766 | |
---|
3366 | 3767 | static void |
---|
.. | .. |
---|
3419 | 3820 | struct task_struct *task) |
---|
3420 | 3821 | { |
---|
3421 | 3822 | struct perf_cpu_context *cpuctx; |
---|
| 3823 | + struct pmu *pmu = ctx->pmu; |
---|
3422 | 3824 | |
---|
3423 | 3825 | cpuctx = __get_cpu_context(ctx); |
---|
3424 | | - if (cpuctx->task_ctx == ctx) |
---|
| 3826 | + if (cpuctx->task_ctx == ctx) { |
---|
| 3827 | + if (cpuctx->sched_cb_usage) |
---|
| 3828 | + __perf_pmu_sched_task(cpuctx, true); |
---|
3425 | 3829 | return; |
---|
| 3830 | + } |
---|
3426 | 3831 | |
---|
3427 | 3832 | perf_ctx_lock(cpuctx, ctx); |
---|
3428 | 3833 | /* |
---|
.. | .. |
---|
3432 | 3837 | if (!ctx->nr_events) |
---|
3433 | 3838 | goto unlock; |
---|
3434 | 3839 | |
---|
3435 | | - perf_pmu_disable(ctx->pmu); |
---|
| 3840 | + perf_pmu_disable(pmu); |
---|
3436 | 3841 | /* |
---|
3437 | 3842 | * We want to keep the following priority order: |
---|
3438 | 3843 | * cpu pinned (that don't need to move), task pinned, |
---|
.. | .. |
---|
3444 | 3849 | if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) |
---|
3445 | 3850 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
---|
3446 | 3851 | perf_event_sched_in(cpuctx, ctx, task); |
---|
3447 | | - perf_pmu_enable(ctx->pmu); |
---|
| 3852 | + |
---|
| 3853 | + if (cpuctx->sched_cb_usage && pmu->sched_task) |
---|
| 3854 | + pmu->sched_task(cpuctx->task_ctx, true); |
---|
| 3855 | + |
---|
| 3856 | + perf_pmu_enable(pmu); |
---|
3448 | 3857 | |
---|
3449 | 3858 | unlock: |
---|
3450 | 3859 | perf_ctx_unlock(cpuctx, ctx); |
---|
.. | .. |
---|
3685 | 4094 | perf_event_groups_insert(&ctx->flexible_groups, event); |
---|
3686 | 4095 | } |
---|
3687 | 4096 | |
---|
| 4097 | +/* pick an event from the flexible_groups to rotate */ |
---|
3688 | 4098 | static inline struct perf_event * |
---|
3689 | | -ctx_first_active(struct perf_event_context *ctx) |
---|
| 4099 | +ctx_event_to_rotate(struct perf_event_context *ctx) |
---|
3690 | 4100 | { |
---|
3691 | | - return list_first_entry_or_null(&ctx->flexible_active, |
---|
3692 | | - struct perf_event, active_list); |
---|
| 4101 | + struct perf_event *event; |
---|
| 4102 | + |
---|
| 4103 | + /* pick the first active flexible event */ |
---|
| 4104 | + event = list_first_entry_or_null(&ctx->flexible_active, |
---|
| 4105 | + struct perf_event, active_list); |
---|
| 4106 | + |
---|
| 4107 | + /* if no active flexible event, pick the first event */ |
---|
| 4108 | + if (!event) { |
---|
| 4109 | + event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), |
---|
| 4110 | + typeof(*event), group_node); |
---|
| 4111 | + } |
---|
| 4112 | + |
---|
| 4113 | + /* |
---|
| 4114 | + * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() |
---|
| 4115 | + * finds there are unschedulable events, it will set it again. |
---|
| 4116 | + */ |
---|
| 4117 | + ctx->rotate_necessary = 0; |
---|
| 4118 | + |
---|
| 4119 | + return event; |
---|
3693 | 4120 | } |
---|
3694 | 4121 | |
---|
3695 | 4122 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx) |
---|
3696 | 4123 | { |
---|
3697 | 4124 | struct perf_event *cpu_event = NULL, *task_event = NULL; |
---|
3698 | | - bool cpu_rotate = false, task_rotate = false; |
---|
3699 | | - struct perf_event_context *ctx = NULL; |
---|
| 4125 | + struct perf_event_context *task_ctx = NULL; |
---|
| 4126 | + int cpu_rotate, task_rotate; |
---|
3700 | 4127 | |
---|
3701 | 4128 | /* |
---|
3702 | 4129 | * Since we run this from IRQ context, nobody can install new |
---|
3703 | 4130 | * events, thus the event count values are stable. |
---|
3704 | 4131 | */ |
---|
3705 | 4132 | |
---|
3706 | | - if (cpuctx->ctx.nr_events) { |
---|
3707 | | - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
---|
3708 | | - cpu_rotate = true; |
---|
3709 | | - } |
---|
3710 | | - |
---|
3711 | | - ctx = cpuctx->task_ctx; |
---|
3712 | | - if (ctx && ctx->nr_events) { |
---|
3713 | | - if (ctx->nr_events != ctx->nr_active) |
---|
3714 | | - task_rotate = true; |
---|
3715 | | - } |
---|
| 4133 | + cpu_rotate = cpuctx->ctx.rotate_necessary; |
---|
| 4134 | + task_ctx = cpuctx->task_ctx; |
---|
| 4135 | + task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; |
---|
3716 | 4136 | |
---|
3717 | 4137 | if (!(cpu_rotate || task_rotate)) |
---|
3718 | 4138 | return false; |
---|
.. | .. |
---|
3721 | 4141 | perf_pmu_disable(cpuctx->ctx.pmu); |
---|
3722 | 4142 | |
---|
3723 | 4143 | if (task_rotate) |
---|
3724 | | - task_event = ctx_first_active(ctx); |
---|
| 4144 | + task_event = ctx_event_to_rotate(task_ctx); |
---|
3725 | 4145 | if (cpu_rotate) |
---|
3726 | | - cpu_event = ctx_first_active(&cpuctx->ctx); |
---|
| 4146 | + cpu_event = ctx_event_to_rotate(&cpuctx->ctx); |
---|
3727 | 4147 | |
---|
3728 | 4148 | /* |
---|
3729 | 4149 | * As per the order given at ctx_resched() first 'pop' task flexible |
---|
3730 | 4150 | * and then, if needed CPU flexible. |
---|
3731 | 4151 | */ |
---|
3732 | | - if (task_event || (ctx && cpu_event)) |
---|
3733 | | - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); |
---|
| 4152 | + if (task_event || (task_ctx && cpu_event)) |
---|
| 4153 | + ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); |
---|
3734 | 4154 | if (cpu_event) |
---|
3735 | 4155 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
---|
3736 | 4156 | |
---|
3737 | 4157 | if (task_event) |
---|
3738 | | - rotate_ctx(ctx, task_event); |
---|
| 4158 | + rotate_ctx(task_ctx, task_event); |
---|
3739 | 4159 | if (cpu_event) |
---|
3740 | 4160 | rotate_ctx(&cpuctx->ctx, cpu_event); |
---|
3741 | 4161 | |
---|
3742 | | - perf_event_sched_in(cpuctx, ctx, current); |
---|
| 4162 | + perf_event_sched_in(cpuctx, task_ctx, current); |
---|
3743 | 4163 | |
---|
3744 | 4164 | perf_pmu_enable(cpuctx->ctx.pmu); |
---|
3745 | 4165 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
---|
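Note on the hunk above: rotation is now driven by a per-context rotate_necessary flag (set by ctx_flexible_sched_in() when a flexible event cannot be scheduled, cleared in ctx_event_to_rotate()) instead of the old nr_events != nr_active comparison. A toy user-space model of the selection and flag handling; all names here (toy_ctx, toy_event_to_rotate) are illustrative, not kernel API:

	#include <stdio.h>
	#include <stdbool.h>

	/* toy model: a context with flexible events, some of them active */
	struct toy_event { const char *name; bool active; };
	struct toy_ctx {
		struct toy_event *events;
		int nr;
		bool rotate_necessary;	/* set when an event could not be scheduled */
	};

	/* mirrors ctx_event_to_rotate(): first active event, else first event overall */
	static struct toy_event *toy_event_to_rotate(struct toy_ctx *ctx)
	{
		struct toy_event *ev = NULL;

		for (int i = 0; i < ctx->nr; i++) {
			if (ctx->events[i].active) {
				ev = &ctx->events[i];
				break;
			}
		}
		if (!ev && ctx->nr)
			ev = &ctx->events[0];

		ctx->rotate_necessary = false;	/* cleared unconditionally, like the real code */
		return ev;
	}

	int main(void)
	{
		struct toy_event evs[] = { { "cycles", false }, { "instructions", true } };
		struct toy_ctx ctx = { evs, 2, true };

		if (ctx.rotate_necessary) {	/* the new trigger: a flag, not nr_events != nr_active */
			struct toy_event *ev = toy_event_to_rotate(&ctx);
			printf("rotate starting at %s\n", ev ? ev->name : "(none)");
		}
		return 0;
	}

Picking the first active event keeps the round-robin behaviour; falling back to the first event in the flexible group tree covers the case where nothing flexible is currently on the PMU.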
.. | .. |
---|
3983 | 4403 | |
---|
3984 | 4404 | return ret; |
---|
3985 | 4405 | } |
---|
| 4406 | +EXPORT_SYMBOL_GPL(perf_event_read_local); |
---|
3986 | 4407 | |
---|
3987 | 4408 | static int perf_event_read(struct perf_event *event, bool group) |
---|
3988 | 4409 | { |
---|
.. | .. |
---|
4074 | 4495 | INIT_LIST_HEAD(&ctx->event_list); |
---|
4075 | 4496 | INIT_LIST_HEAD(&ctx->pinned_active); |
---|
4076 | 4497 | INIT_LIST_HEAD(&ctx->flexible_active); |
---|
4077 | | - atomic_set(&ctx->refcount, 1); |
---|
| 4498 | + refcount_set(&ctx->refcount, 1); |
---|
4078 | 4499 | } |
---|
4079 | 4500 | |
---|
4080 | 4501 | static struct perf_event_context * |
---|
.. | .. |
---|
4087 | 4508 | return NULL; |
---|
4088 | 4509 | |
---|
4089 | 4510 | __perf_event_init_context(ctx); |
---|
4090 | | - if (task) { |
---|
4091 | | - ctx->task = task; |
---|
4092 | | - get_task_struct(task); |
---|
4093 | | - } |
---|
| 4511 | + if (task) |
---|
| 4512 | + ctx->task = get_task_struct(task); |
---|
4094 | 4513 | ctx->pmu = pmu; |
---|
4095 | 4514 | |
---|
4096 | 4515 | return ctx; |
---|
.. | .. |
---|
4152 | 4571 | goto errout; |
---|
4153 | 4572 | |
---|
4154 | 4573 | if (event->attach_state & PERF_ATTACH_TASK_DATA) { |
---|
4155 | | - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); |
---|
| 4574 | + task_ctx_data = alloc_task_ctx_data(pmu); |
---|
4156 | 4575 | if (!task_ctx_data) { |
---|
4157 | 4576 | err = -ENOMEM; |
---|
4158 | 4577 | goto errout; |
---|
.. | .. |
---|
4210 | 4629 | } |
---|
4211 | 4630 | } |
---|
4212 | 4631 | |
---|
4213 | | - kfree(task_ctx_data); |
---|
| 4632 | + free_task_ctx_data(pmu, task_ctx_data); |
---|
4214 | 4633 | return ctx; |
---|
4215 | 4634 | |
---|
4216 | 4635 | errout: |
---|
4217 | | - kfree(task_ctx_data); |
---|
| 4636 | + free_task_ctx_data(pmu, task_ctx_data); |
---|
4218 | 4637 | return ERR_PTR(err); |
---|
4219 | 4638 | } |
---|
4220 | 4639 | |
---|
.. | .. |
---|
4233 | 4652 | } |
---|
4234 | 4653 | |
---|
4235 | 4654 | static void ring_buffer_attach(struct perf_event *event, |
---|
4236 | | - struct ring_buffer *rb); |
---|
| 4655 | + struct perf_buffer *rb); |
---|
4237 | 4656 | |
---|
4238 | 4657 | static void detach_sb_event(struct perf_event *event) |
---|
4239 | 4658 | { |
---|
.. | .. |
---|
4256 | 4675 | |
---|
4257 | 4676 | if (attr->mmap || attr->mmap_data || attr->mmap2 || |
---|
4258 | 4677 | attr->comm || attr->comm_exec || |
---|
4259 | | - attr->task || |
---|
4260 | | - attr->context_switch) |
---|
| 4678 | + attr->task || attr->ksymbol || |
---|
| 4679 | + attr->context_switch || attr->text_poke || |
---|
| 4680 | + attr->bpf_event) |
---|
4261 | 4681 | return true; |
---|
4262 | 4682 | return false; |
---|
4263 | 4683 | } |
---|
.. | .. |
---|
4306 | 4726 | if (event->parent) |
---|
4307 | 4727 | return; |
---|
4308 | 4728 | |
---|
4309 | | - if (event->attach_state & PERF_ATTACH_TASK) |
---|
| 4729 | + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) |
---|
4310 | 4730 | dec = true; |
---|
4311 | 4731 | if (event->attr.mmap || event->attr.mmap_data) |
---|
4312 | 4732 | atomic_dec(&nr_mmap_events); |
---|
.. | .. |
---|
4314 | 4734 | atomic_dec(&nr_comm_events); |
---|
4315 | 4735 | if (event->attr.namespaces) |
---|
4316 | 4736 | atomic_dec(&nr_namespaces_events); |
---|
| 4737 | + if (event->attr.cgroup) |
---|
| 4738 | + atomic_dec(&nr_cgroup_events); |
---|
4317 | 4739 | if (event->attr.task) |
---|
4318 | 4740 | atomic_dec(&nr_task_events); |
---|
4319 | 4741 | if (event->attr.freq) |
---|
.. | .. |
---|
4326 | 4748 | dec = true; |
---|
4327 | 4749 | if (has_branch_stack(event)) |
---|
4328 | 4750 | dec = true; |
---|
| 4751 | + if (event->attr.ksymbol) |
---|
| 4752 | + atomic_dec(&nr_ksymbol_events); |
---|
| 4753 | + if (event->attr.bpf_event) |
---|
| 4754 | + atomic_dec(&nr_bpf_events); |
---|
| 4755 | + if (event->attr.text_poke) |
---|
| 4756 | + atomic_dec(&nr_text_poke_events); |
---|
4329 | 4757 | |
---|
4330 | 4758 | if (dec) { |
---|
4331 | 4759 | if (!atomic_add_unless(&perf_sched_count, -1, 1)) |
---|
.. | .. |
---|
4909 | 5337 | static __poll_t perf_poll(struct file *file, poll_table *wait) |
---|
4910 | 5338 | { |
---|
4911 | 5339 | struct perf_event *event = file->private_data; |
---|
4912 | | - struct ring_buffer *rb; |
---|
| 5340 | + struct perf_buffer *rb; |
---|
4913 | 5341 | __poll_t events = EPOLLHUP; |
---|
4914 | 5342 | |
---|
4915 | 5343 | poll_wait(file, &event->waitq, wait); |
---|
.. | .. |
---|
4935 | 5363 | local64_set(&event->count, 0); |
---|
4936 | 5364 | perf_event_update_userpage(event); |
---|
4937 | 5365 | } |
---|
| 5366 | + |
---|
| 5367 | +/* Assume it's not an event with inherit set. */ |
---|
| 5368 | +u64 perf_event_pause(struct perf_event *event, bool reset) |
---|
| 5369 | +{ |
---|
| 5370 | + struct perf_event_context *ctx; |
---|
| 5371 | + u64 count; |
---|
| 5372 | + |
---|
| 5373 | + ctx = perf_event_ctx_lock(event); |
---|
| 5374 | + WARN_ON_ONCE(event->attr.inherit); |
---|
| 5375 | + _perf_event_disable(event); |
---|
| 5376 | + count = local64_read(&event->count); |
---|
| 5377 | + if (reset) |
---|
| 5378 | + local64_set(&event->count, 0); |
---|
| 5379 | + perf_event_ctx_unlock(event, ctx); |
---|
| 5380 | + |
---|
| 5381 | + return count; |
---|
| 5382 | +} |
---|
| 5383 | +EXPORT_SYMBOL_GPL(perf_event_pause); |
---|
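perf_event_pause() packages disable + read + optional reset under the ctx lock for in-kernel users (hence the EXPORT_SYMBOL_GPL). There is no dedicated "pause" ioctl; the closest user-space equivalent is the existing disable/read/reset sequence, sketched below. The counter choice and the pause_counter() helper are purely illustrative and error handling is trimmed:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	/* pause a counter: disable it, read its value, optionally reset it */
	static uint64_t pause_counter(int fd, int reset)
	{
		uint64_t count = 0;

		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
		if (read(fd, &count, sizeof(count)) != (ssize_t)sizeof(count))
			count = 0;
		if (reset)
			ioctl(fd, PERF_EVENT_IOC_RESET, 0);
		return count;
	}

	int main(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.type = PERF_TYPE_HARDWARE;
		attr.size = sizeof(attr);
		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
		attr.disabled = 1;
		attr.exclude_kernel = 1;

		int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
		for (volatile int i = 0; i < 1000000; i++)
			;
		printf("instructions: %llu\n",
		       (unsigned long long)pause_counter(fd, /*reset=*/1));
		close(fd);
		return 0;
	}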
4938 | 5384 | |
---|
4939 | 5385 | /* |
---|
4940 | 5386 | * Holding the top-level event's child_mutex means that any |
---|
.. | .. |
---|
5013 | 5459 | return event->pmu->check_period(event, value); |
---|
5014 | 5460 | } |
---|
5015 | 5461 | |
---|
5016 | | -static int perf_event_period(struct perf_event *event, u64 __user *arg) |
---|
| 5462 | +static int _perf_event_period(struct perf_event *event, u64 value) |
---|
5017 | 5463 | { |
---|
5018 | | - u64 value; |
---|
5019 | | - |
---|
5020 | 5464 | if (!is_sampling_event(event)) |
---|
5021 | 5465 | return -EINVAL; |
---|
5022 | | - |
---|
5023 | | - if (copy_from_user(&value, arg, sizeof(value))) |
---|
5024 | | - return -EFAULT; |
---|
5025 | 5466 | |
---|
5026 | 5467 | if (!value) |
---|
5027 | 5468 | return -EINVAL; |
---|
.. | .. |
---|
5039 | 5480 | |
---|
5040 | 5481 | return 0; |
---|
5041 | 5482 | } |
---|
| 5483 | + |
---|
| 5484 | +int perf_event_period(struct perf_event *event, u64 value) |
---|
| 5485 | +{ |
---|
| 5486 | + struct perf_event_context *ctx; |
---|
| 5487 | + int ret; |
---|
| 5488 | + |
---|
| 5489 | + ctx = perf_event_ctx_lock(event); |
---|
| 5490 | + ret = _perf_event_period(event, value); |
---|
| 5491 | + perf_event_ctx_unlock(event, ctx); |
---|
| 5492 | + |
---|
| 5493 | + return ret; |
---|
| 5494 | +} |
---|
| 5495 | +EXPORT_SYMBOL_GPL(perf_event_period); |
---|
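perf_event_period() exposes to in-kernel callers the same period update that user space reaches through PERF_EVENT_IOC_PERIOD; the ioctl case further down now merely copies the value in and calls _perf_event_period(). A minimal user-space sketch of driving that path (attribute and period values are arbitrary examples):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t new_period = 200000;	/* example value; must be non-zero */

		memset(&attr, 0, sizeof(attr));
		attr.type = PERF_TYPE_HARDWARE;
		attr.size = sizeof(attr);
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 100000;	/* makes this a sampling event */
		attr.disabled = 1;
		attr.exclude_kernel = 1;

		int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		/* _perf_event_period() rejects zero and non-sampling events with -EINVAL */
		if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
			perror("PERF_EVENT_IOC_PERIOD");
		close(fd);
		return 0;
	}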
5042 | 5496 | |
---|
5043 | 5497 | static const struct file_operations perf_fops; |
---|
5044 | 5498 | |
---|
.. | .. |
---|
5083 | 5537 | return _perf_event_refresh(event, arg); |
---|
5084 | 5538 | |
---|
5085 | 5539 | case PERF_EVENT_IOC_PERIOD: |
---|
5086 | | - return perf_event_period(event, (u64 __user *)arg); |
---|
| 5540 | + { |
---|
| 5541 | + u64 value; |
---|
5087 | 5542 | |
---|
| 5543 | + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) |
---|
| 5544 | + return -EFAULT; |
---|
| 5545 | + |
---|
| 5546 | + return _perf_event_period(event, value); |
---|
| 5547 | + } |
---|
5088 | 5548 | case PERF_EVENT_IOC_ID: |
---|
5089 | 5549 | { |
---|
5090 | 5550 | u64 id = primary_event_id(event); |
---|
.. | .. |
---|
5119 | 5579 | return perf_event_set_bpf_prog(event, arg); |
---|
5120 | 5580 | |
---|
5121 | 5581 | case PERF_EVENT_IOC_PAUSE_OUTPUT: { |
---|
5122 | | - struct ring_buffer *rb; |
---|
| 5582 | + struct perf_buffer *rb; |
---|
5123 | 5583 | |
---|
5124 | 5584 | rcu_read_lock(); |
---|
5125 | 5585 | rb = rcu_dereference(event->rb); |
---|
.. | .. |
---|
5255 | 5715 | static void perf_event_init_userpage(struct perf_event *event) |
---|
5256 | 5716 | { |
---|
5257 | 5717 | struct perf_event_mmap_page *userpg; |
---|
5258 | | - struct ring_buffer *rb; |
---|
| 5718 | + struct perf_buffer *rb; |
---|
5259 | 5719 | |
---|
5260 | 5720 | rcu_read_lock(); |
---|
5261 | 5721 | rb = rcu_dereference(event->rb); |
---|
.. | .. |
---|
5287 | 5747 | void perf_event_update_userpage(struct perf_event *event) |
---|
5288 | 5748 | { |
---|
5289 | 5749 | struct perf_event_mmap_page *userpg; |
---|
5290 | | - struct ring_buffer *rb; |
---|
| 5750 | + struct perf_buffer *rb; |
---|
5291 | 5751 | u64 enabled, running, now; |
---|
5292 | 5752 | |
---|
5293 | 5753 | rcu_read_lock(); |
---|
.. | .. |
---|
5338 | 5798 | static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) |
---|
5339 | 5799 | { |
---|
5340 | 5800 | struct perf_event *event = vmf->vma->vm_file->private_data; |
---|
5341 | | - struct ring_buffer *rb; |
---|
| 5801 | + struct perf_buffer *rb; |
---|
5342 | 5802 | vm_fault_t ret = VM_FAULT_SIGBUS; |
---|
5343 | 5803 | |
---|
5344 | 5804 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
---|
.. | .. |
---|
5371 | 5831 | } |
---|
5372 | 5832 | |
---|
5373 | 5833 | static void ring_buffer_attach(struct perf_event *event, |
---|
5374 | | - struct ring_buffer *rb) |
---|
| 5834 | + struct perf_buffer *rb) |
---|
5375 | 5835 | { |
---|
5376 | | - struct ring_buffer *old_rb = NULL; |
---|
| 5836 | + struct perf_buffer *old_rb = NULL; |
---|
5377 | 5837 | unsigned long flags; |
---|
| 5838 | + |
---|
| 5839 | + WARN_ON_ONCE(event->parent); |
---|
5378 | 5840 | |
---|
5379 | 5841 | if (event->rb) { |
---|
5380 | 5842 | /* |
---|
.. | .. |
---|
5431 | 5893 | |
---|
5432 | 5894 | static void ring_buffer_wakeup(struct perf_event *event) |
---|
5433 | 5895 | { |
---|
5434 | | - struct ring_buffer *rb; |
---|
| 5896 | + struct perf_buffer *rb; |
---|
| 5897 | + |
---|
| 5898 | + if (event->parent) |
---|
| 5899 | + event = event->parent; |
---|
5435 | 5900 | |
---|
5436 | 5901 | rcu_read_lock(); |
---|
5437 | 5902 | rb = rcu_dereference(event->rb); |
---|
.. | .. |
---|
5442 | 5907 | rcu_read_unlock(); |
---|
5443 | 5908 | } |
---|
5444 | 5909 | |
---|
5445 | | -struct ring_buffer *ring_buffer_get(struct perf_event *event) |
---|
| 5910 | +struct perf_buffer *ring_buffer_get(struct perf_event *event) |
---|
5446 | 5911 | { |
---|
5447 | | - struct ring_buffer *rb; |
---|
| 5912 | + struct perf_buffer *rb; |
---|
| 5913 | + |
---|
| 5914 | + if (event->parent) |
---|
| 5915 | + event = event->parent; |
---|
5448 | 5916 | |
---|
5449 | 5917 | rcu_read_lock(); |
---|
5450 | 5918 | rb = rcu_dereference(event->rb); |
---|
5451 | 5919 | if (rb) { |
---|
5452 | | - if (!atomic_inc_not_zero(&rb->refcount)) |
---|
| 5920 | + if (!refcount_inc_not_zero(&rb->refcount)) |
---|
5453 | 5921 | rb = NULL; |
---|
5454 | 5922 | } |
---|
5455 | 5923 | rcu_read_unlock(); |
---|
.. | .. |
---|
5457 | 5925 | return rb; |
---|
5458 | 5926 | } |
---|
5459 | 5927 | |
---|
5460 | | -void ring_buffer_put(struct ring_buffer *rb) |
---|
| 5928 | +void ring_buffer_put(struct perf_buffer *rb) |
---|
5461 | 5929 | { |
---|
5462 | | - if (!atomic_dec_and_test(&rb->refcount)) |
---|
| 5930 | + if (!refcount_dec_and_test(&rb->refcount)) |
---|
5463 | 5931 | return; |
---|
5464 | 5932 | |
---|
5465 | 5933 | WARN_ON_ONCE(!list_empty(&rb->event_list)); |
---|
.. | .. |
---|
5494 | 5962 | static void perf_mmap_close(struct vm_area_struct *vma) |
---|
5495 | 5963 | { |
---|
5496 | 5964 | struct perf_event *event = vma->vm_file->private_data; |
---|
5497 | | - struct ring_buffer *rb = ring_buffer_get(event); |
---|
| 5965 | + struct perf_buffer *rb = ring_buffer_get(event); |
---|
5498 | 5966 | struct user_struct *mmap_user = rb->mmap_user; |
---|
5499 | 5967 | int mmap_locked = rb->mmap_locked; |
---|
5500 | 5968 | unsigned long size = perf_data_size(rb); |
---|
.. | .. |
---|
5519 | 5987 | perf_pmu_output_stop(event); |
---|
5520 | 5988 | |
---|
5521 | 5989 | /* now it's safe to free the pages */ |
---|
5522 | | - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); |
---|
5523 | | - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; |
---|
| 5990 | + atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); |
---|
| 5991 | + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); |
---|
5524 | 5992 | |
---|
5525 | 5993 | /* this has to be the last one */ |
---|
5526 | 5994 | rb_free_aux(rb); |
---|
5527 | | - WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); |
---|
| 5995 | + WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); |
---|
5528 | 5996 | |
---|
5529 | 5997 | mutex_unlock(&event->mmap_mutex); |
---|
5530 | 5998 | } |
---|
.. | .. |
---|
5593 | 6061 | * undo the VM accounting. |
---|
5594 | 6062 | */ |
---|
5595 | 6063 | |
---|
5596 | | - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); |
---|
5597 | | - vma->vm_mm->pinned_vm -= mmap_locked; |
---|
| 6064 | + atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, |
---|
| 6065 | + &mmap_user->locked_vm); |
---|
| 6066 | + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); |
---|
5598 | 6067 | free_uid(mmap_user); |
---|
5599 | 6068 | |
---|
5600 | 6069 | out_put: |
---|
.. | .. |
---|
5603 | 6072 | |
---|
5604 | 6073 | static const struct vm_operations_struct perf_mmap_vmops = { |
---|
5605 | 6074 | .open = perf_mmap_open, |
---|
5606 | | - .close = perf_mmap_close, /* non mergable */ |
---|
| 6075 | + .close = perf_mmap_close, /* non mergeable */ |
---|
5607 | 6076 | .fault = perf_mmap_fault, |
---|
5608 | 6077 | .page_mkwrite = perf_mmap_fault, |
---|
5609 | 6078 | }; |
---|
.. | .. |
---|
5613 | 6082 | struct perf_event *event = file->private_data; |
---|
5614 | 6083 | unsigned long user_locked, user_lock_limit; |
---|
5615 | 6084 | struct user_struct *user = current_user(); |
---|
| 6085 | + struct perf_buffer *rb = NULL; |
---|
5616 | 6086 | unsigned long locked, lock_limit; |
---|
5617 | | - struct ring_buffer *rb = NULL; |
---|
5618 | 6087 | unsigned long vma_size; |
---|
5619 | 6088 | unsigned long nr_pages; |
---|
5620 | 6089 | long user_extra = 0, extra = 0; |
---|
.. | .. |
---|
5711 | 6180 | again: |
---|
5712 | 6181 | mutex_lock(&event->mmap_mutex); |
---|
5713 | 6182 | if (event->rb) { |
---|
5714 | | - if (event->rb->nr_pages != nr_pages) { |
---|
| 6183 | + if (data_page_nr(event->rb) != nr_pages) { |
---|
5715 | 6184 | ret = -EINVAL; |
---|
5716 | 6185 | goto unlock; |
---|
5717 | 6186 | } |
---|
5718 | 6187 | |
---|
5719 | 6188 | if (!atomic_inc_not_zero(&event->rb->mmap_count)) { |
---|
5720 | 6189 | /* |
---|
5721 | | - * Raced against perf_mmap_close() through |
---|
5722 | | - * perf_event_set_output(). Try again, hope for better |
---|
5723 | | - * luck. |
---|
| 6190 | + * Raced against perf_mmap_close(); remove the |
---|
| 6191 | + * event and try again. |
---|
5724 | 6192 | */ |
---|
| 6193 | + ring_buffer_attach(event, NULL); |
---|
5725 | 6194 | mutex_unlock(&event->mmap_mutex); |
---|
5726 | 6195 | goto again; |
---|
5727 | 6196 | } |
---|
.. | .. |
---|
5749 | 6218 | user_locked = user_lock_limit; |
---|
5750 | 6219 | user_locked += user_extra; |
---|
5751 | 6220 | |
---|
5752 | | - if (user_locked > user_lock_limit) |
---|
| 6221 | + if (user_locked > user_lock_limit) { |
---|
| 6222 | + /* |
---|
| 6223 | + * charge locked_vm until it hits user_lock_limit; |
---|
| 6224 | + * charge the rest from pinned_vm |
---|
| 6225 | + */ |
---|
5753 | 6226 | extra = user_locked - user_lock_limit; |
---|
| 6227 | + user_extra -= extra; |
---|
| 6228 | + } |
---|
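The hunk above changes the mmap accounting so that only the pages within the per-user mlock allowance are charged to user->locked_vm and the overflow goes to mm->pinned_vm; previously the whole buffer was added to locked_vm and the overflow was charged to pinned_vm on top of that. A standalone sketch of the split; the page counts are invented and the clamping of a pre-existing locked_vm at the limit is left out for brevity:

	#include <stdio.h>

	int main(void)
	{
		/* all values in pages; purely illustrative */
		unsigned long user_lock_limit = 128;	/* per-user mlock allowance */
		unsigned long already_locked  = 100;	/* user->locked_vm so far */
		unsigned long user_extra      = 64;	/* pages this mmap wants */
		unsigned long extra           = 0;	/* part charged to mm->pinned_vm */

		unsigned long user_locked = already_locked + user_extra;

		if (user_locked > user_lock_limit) {
			/* charge locked_vm up to the limit, the rest goes to pinned_vm */
			extra = user_locked - user_lock_limit;
			user_extra -= extra;
		}

		printf("locked_vm += %lu, pinned_vm += %lu\n", user_extra, extra);
		/* prints: locked_vm += 28, pinned_vm += 36 */
		return 0;
	}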
5754 | 6229 | |
---|
5755 | 6230 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
---|
5756 | 6231 | lock_limit >>= PAGE_SHIFT; |
---|
5757 | | - locked = vma->vm_mm->pinned_vm + extra; |
---|
| 6232 | + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; |
---|
5758 | 6233 | |
---|
5759 | 6234 | if ((locked > lock_limit) && perf_is_paranoid() && |
---|
5760 | 6235 | !capable(CAP_IPC_LOCK)) { |
---|
.. | .. |
---|
5783 | 6258 | |
---|
5784 | 6259 | ring_buffer_attach(event, rb); |
---|
5785 | 6260 | |
---|
| 6261 | + perf_event_update_time(event); |
---|
| 6262 | + perf_set_shadow_time(event, event->ctx); |
---|
5786 | 6263 | perf_event_init_userpage(event); |
---|
5787 | 6264 | perf_event_update_userpage(event); |
---|
5788 | 6265 | } else { |
---|
.. | .. |
---|
5795 | 6272 | unlock: |
---|
5796 | 6273 | if (!ret) { |
---|
5797 | 6274 | atomic_long_add(user_extra, &user->locked_vm); |
---|
5798 | | - vma->vm_mm->pinned_vm += extra; |
---|
| 6275 | + atomic64_add(extra, &vma->vm_mm->pinned_vm); |
---|
5799 | 6276 | |
---|
5800 | 6277 | atomic_inc(&event->mmap_count); |
---|
5801 | 6278 | } else if (rb) { |
---|
.. | .. |
---|
5932 | 6409 | * Later on, we might change it to a list if there is |
---|
5933 | 6410 | * another virtualization implementation supporting the callbacks. |
---|
5934 | 6411 | */ |
---|
5935 | | -struct perf_guest_info_callbacks *perf_guest_cbs; |
---|
| 6412 | +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; |
---|
5936 | 6413 | |
---|
5937 | 6414 | int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) |
---|
5938 | 6415 | { |
---|
5939 | | - perf_guest_cbs = cbs; |
---|
| 6416 | + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) |
---|
| 6417 | + return -EBUSY; |
---|
| 6418 | + |
---|
| 6419 | + rcu_assign_pointer(perf_guest_cbs, cbs); |
---|
5940 | 6420 | return 0; |
---|
5941 | 6421 | } |
---|
5942 | 6422 | EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); |
---|
5943 | 6423 | |
---|
5944 | 6424 | int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) |
---|
5945 | 6425 | { |
---|
5946 | | - perf_guest_cbs = NULL; |
---|
| 6426 | + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) |
---|
| 6427 | + return -EINVAL; |
---|
| 6428 | + |
---|
| 6429 | + rcu_assign_pointer(perf_guest_cbs, NULL); |
---|
| 6430 | + synchronize_rcu(); |
---|
5947 | 6431 | return 0; |
---|
5948 | 6432 | } |
---|
5949 | 6433 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); |
---|
.. | .. |
---|
5965 | 6449 | } |
---|
5966 | 6450 | |
---|
5967 | 6451 | static void perf_sample_regs_user(struct perf_regs *regs_user, |
---|
5968 | | - struct pt_regs *regs, |
---|
5969 | | - struct pt_regs *regs_user_copy) |
---|
| 6452 | + struct pt_regs *regs) |
---|
5970 | 6453 | { |
---|
5971 | 6454 | if (user_mode(regs)) { |
---|
5972 | 6455 | regs_user->abi = perf_reg_abi(current); |
---|
5973 | 6456 | regs_user->regs = regs; |
---|
5974 | 6457 | } else if (!(current->flags & PF_KTHREAD)) { |
---|
5975 | | - perf_get_regs_user(regs_user, regs, regs_user_copy); |
---|
| 6458 | + perf_get_regs_user(regs_user, regs); |
---|
5976 | 6459 | } else { |
---|
5977 | 6460 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; |
---|
5978 | 6461 | regs_user->regs = NULL; |
---|
.. | .. |
---|
5991 | 6474 | * Get remaining task size from user stack pointer. |
---|
5992 | 6475 | * |
---|
5993 | 6476 | * It'd be better to take stack vma map and limit this more |
---|
5994 | | - * precisly, but there's no way to get it safely under interrupt, |
---|
| 6477 | + * precisely, but there's no way to get it safely under interrupt, |
---|
5995 | 6478 | * so using TASK_SIZE as limit. |
---|
5996 | 6479 | */ |
---|
5997 | 6480 | static u64 perf_ustack_task_size(struct pt_regs *regs) |
---|
.. | .. |
---|
6073 | 6556 | |
---|
6074 | 6557 | /* Data. */ |
---|
6075 | 6558 | sp = perf_user_stack_pointer(regs); |
---|
6076 | | - fs = get_fs(); |
---|
6077 | | - set_fs(USER_DS); |
---|
| 6559 | + fs = force_uaccess_begin(); |
---|
6078 | 6560 | rem = __output_copy_user(handle, (void *) sp, dump_size); |
---|
6079 | | - set_fs(fs); |
---|
| 6561 | + force_uaccess_end(fs); |
---|
6080 | 6562 | dyn_size = dump_size - rem; |
---|
6081 | 6563 | |
---|
6082 | 6564 | perf_output_skip(handle, rem); |
---|
.. | .. |
---|
6084 | 6566 | /* Dynamic size. */ |
---|
6085 | 6567 | perf_output_put(handle, dyn_size); |
---|
6086 | 6568 | } |
---|
| 6569 | +} |
---|
| 6570 | + |
---|
| 6571 | +static unsigned long perf_prepare_sample_aux(struct perf_event *event, |
---|
| 6572 | + struct perf_sample_data *data, |
---|
| 6573 | + size_t size) |
---|
| 6574 | +{ |
---|
| 6575 | + struct perf_event *sampler = event->aux_event; |
---|
| 6576 | + struct perf_buffer *rb; |
---|
| 6577 | + |
---|
| 6578 | + data->aux_size = 0; |
---|
| 6579 | + |
---|
| 6580 | + if (!sampler) |
---|
| 6581 | + goto out; |
---|
| 6582 | + |
---|
| 6583 | + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) |
---|
| 6584 | + goto out; |
---|
| 6585 | + |
---|
| 6586 | + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) |
---|
| 6587 | + goto out; |
---|
| 6588 | + |
---|
| 6589 | + rb = ring_buffer_get(sampler); |
---|
| 6590 | + if (!rb) |
---|
| 6591 | + goto out; |
---|
| 6592 | + |
---|
| 6593 | + /* |
---|
| 6594 | + * If this is an NMI hit inside sampling code, don't take |
---|
| 6595 | + * the sample. See also perf_aux_sample_output(). |
---|
| 6596 | + */ |
---|
| 6597 | + if (READ_ONCE(rb->aux_in_sampling)) { |
---|
| 6598 | + data->aux_size = 0; |
---|
| 6599 | + } else { |
---|
| 6600 | + size = min_t(size_t, size, perf_aux_size(rb)); |
---|
| 6601 | + data->aux_size = ALIGN(size, sizeof(u64)); |
---|
| 6602 | + } |
---|
| 6603 | + ring_buffer_put(rb); |
---|
| 6604 | + |
---|
| 6605 | +out: |
---|
| 6606 | + return data->aux_size; |
---|
| 6607 | +} |
---|
| 6608 | + |
---|
| 6609 | +long perf_pmu_snapshot_aux(struct perf_buffer *rb, |
---|
| 6610 | + struct perf_event *event, |
---|
| 6611 | + struct perf_output_handle *handle, |
---|
| 6612 | + unsigned long size) |
---|
| 6613 | +{ |
---|
| 6614 | + unsigned long flags; |
---|
| 6615 | + long ret; |
---|
| 6616 | + |
---|
| 6617 | + /* |
---|
| 6618 | + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler |
---|
| 6619 | + * paths. If we start calling them in NMI context, they may race with |
---|
| 6620 | + * the IRQ ones, that is, for example, re-starting an event that's just |
---|
| 6621 | + * been stopped, which is why we're using a separate callback that |
---|
| 6622 | + * doesn't change the event state. |
---|
| 6623 | + * |
---|
| 6624 | + * IRQs need to be disabled to prevent IPIs from racing with us. |
---|
| 6625 | + */ |
---|
| 6626 | + local_irq_save(flags); |
---|
| 6627 | + /* |
---|
| 6628 | + * Guard against NMI hits inside the critical section; |
---|
| 6629 | + * see also perf_prepare_sample_aux(). |
---|
| 6630 | + */ |
---|
| 6631 | + WRITE_ONCE(rb->aux_in_sampling, 1); |
---|
| 6632 | + barrier(); |
---|
| 6633 | + |
---|
| 6634 | + ret = event->pmu->snapshot_aux(event, handle, size); |
---|
| 6635 | + |
---|
| 6636 | + barrier(); |
---|
| 6637 | + WRITE_ONCE(rb->aux_in_sampling, 0); |
---|
| 6638 | + local_irq_restore(flags); |
---|
| 6639 | + |
---|
| 6640 | + return ret; |
---|
| 6641 | +} |
---|
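The aux_in_sampling flag above, together with disabled IRQs, is what lets an NMI that lands while the AUX snapshot is in progress notice the in-flight copy and bail out (perf_prepare_sample_aux() then reports a zero aux_size). A reduced user-space analogy of that re-entrancy guard, with a signal standing in for the NMI; this is a model of the idea, not the kernel mechanism:

	#include <stdio.h>
	#include <signal.h>

	static volatile sig_atomic_t in_sampling;	/* stands in for rb->aux_in_sampling */
	static volatile sig_atomic_t skipped;

	/* stands in for the NMI-driven sample path */
	static void sample_handler(int sig)
	{
		(void)sig;
		if (in_sampling) {
			skipped++;		/* same effect as data->aux_size = 0 in the kernel */
			return;
		}
		/* ... otherwise the sample would be taken here ... */
	}

	int main(void)
	{
		signal(SIGALRM, sample_handler);

		in_sampling = 1;		/* WRITE_ONCE(rb->aux_in_sampling, 1) */
		raise(SIGALRM);			/* the "NMI" arrives mid-snapshot */
		/* ... snapshot work would happen here ... */
		in_sampling = 0;		/* WRITE_ONCE(rb->aux_in_sampling, 0) */

		printf("samples skipped while snapshotting: %d\n", (int)skipped);
		return 0;
	}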
| 6642 | + |
---|
| 6643 | +static void perf_aux_sample_output(struct perf_event *event, |
---|
| 6644 | + struct perf_output_handle *handle, |
---|
| 6645 | + struct perf_sample_data *data) |
---|
| 6646 | +{ |
---|
| 6647 | + struct perf_event *sampler = event->aux_event; |
---|
| 6648 | + struct perf_buffer *rb; |
---|
| 6649 | + unsigned long pad; |
---|
| 6650 | + long size; |
---|
| 6651 | + |
---|
| 6652 | + if (WARN_ON_ONCE(!sampler || !data->aux_size)) |
---|
| 6653 | + return; |
---|
| 6654 | + |
---|
| 6655 | + rb = ring_buffer_get(sampler); |
---|
| 6656 | + if (!rb) |
---|
| 6657 | + return; |
---|
| 6658 | + |
---|
| 6659 | + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); |
---|
| 6660 | + |
---|
| 6661 | + /* |
---|
| 6662 | + * An error here means that perf_output_copy() failed (returned a |
---|
| 6663 | + * non-zero surplus that it didn't copy), which in its current |
---|
| 6664 | + * enlightened implementation is not possible. If that changes, we'd |
---|
| 6665 | + * like to know. |
---|
| 6666 | + */ |
---|
| 6667 | + if (WARN_ON_ONCE(size < 0)) |
---|
| 6668 | + goto out_put; |
---|
| 6669 | + |
---|
| 6670 | + /* |
---|
| 6671 | + * The pad comes from ALIGN()ing data->aux_size up to u64 in |
---|
| 6672 | + * perf_prepare_sample_aux(), so should not be more than that. |
---|
| 6673 | + */ |
---|
| 6674 | + pad = data->aux_size - size; |
---|
| 6675 | + if (WARN_ON_ONCE(pad >= sizeof(u64))) |
---|
| 6676 | + pad = 8; |
---|
| 6677 | + |
---|
| 6678 | + if (pad) { |
---|
| 6679 | + u64 zero = 0; |
---|
| 6680 | + perf_output_copy(handle, &zero, pad); |
---|
| 6681 | + } |
---|
| 6682 | + |
---|
| 6683 | +out_put: |
---|
| 6684 | + ring_buffer_put(rb); |
---|
6087 | 6685 | } |
---|
6088 | 6686 | |
---|
6089 | 6687 | static void __perf_event_header__init_id(struct perf_event_header *header, |
---|
.. | .. |
---|
6255 | 6853 | perf_output_read_one(handle, event, enabled, running); |
---|
6256 | 6854 | } |
---|
6257 | 6855 | |
---|
| 6856 | +static inline bool perf_sample_save_hw_index(struct perf_event *event) |
---|
| 6857 | +{ |
---|
| 6858 | + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; |
---|
| 6859 | +} |
---|
| 6860 | + |
---|
6258 | 6861 | void perf_output_sample(struct perf_output_handle *handle, |
---|
6259 | 6862 | struct perf_event_header *header, |
---|
6260 | 6863 | struct perf_sample_data *data, |
---|
.. | .. |
---|
6343 | 6946 | * sizeof(struct perf_branch_entry); |
---|
6344 | 6947 | |
---|
6345 | 6948 | perf_output_put(handle, data->br_stack->nr); |
---|
| 6949 | + if (perf_sample_save_hw_index(event)) |
---|
| 6950 | + perf_output_put(handle, data->br_stack->hw_idx); |
---|
6346 | 6951 | perf_output_copy(handle, data->br_stack->entries, size); |
---|
6347 | 6952 | } else { |
---|
6348 | 6953 | /* |
---|
.. | .. |
---|
6405 | 7010 | if (sample_type & PERF_SAMPLE_PHYS_ADDR) |
---|
6406 | 7011 | perf_output_put(handle, data->phys_addr); |
---|
6407 | 7012 | |
---|
| 7013 | + if (sample_type & PERF_SAMPLE_CGROUP) |
---|
| 7014 | + perf_output_put(handle, data->cgroup); |
---|
| 7015 | + |
---|
| 7016 | + if (sample_type & PERF_SAMPLE_AUX) { |
---|
| 7017 | + perf_output_put(handle, data->aux_size); |
---|
| 7018 | + |
---|
| 7019 | + if (data->aux_size) |
---|
| 7020 | + perf_aux_sample_output(event, handle, data); |
---|
| 7021 | + } |
---|
| 7022 | + |
---|
6408 | 7023 | if (!event->attr.watermark) { |
---|
6409 | 7024 | int wakeup_events = event->attr.wakeup_events; |
---|
6410 | 7025 | |
---|
6411 | 7026 | if (wakeup_events) { |
---|
6412 | | - struct ring_buffer *rb = handle->rb; |
---|
| 7027 | + struct perf_buffer *rb = handle->rb; |
---|
6413 | 7028 | int events = local_inc_return(&rb->events); |
---|
6414 | 7029 | |
---|
6415 | 7030 | if (events >= wakeup_events) { |
---|
.. | .. |
---|
6437 | 7052 | * Walking the pages tables for user address. |
---|
6438 | 7053 | * Interrupts are disabled, so it prevents any tear down |
---|
6439 | 7054 | * of the page tables. |
---|
6440 | | - * Try IRQ-safe __get_user_pages_fast first. |
---|
| 7055 | + * Try IRQ-safe get_user_page_fast_only first. |
---|
6441 | 7056 | * If failed, leave phys_addr as 0. |
---|
6442 | 7057 | */ |
---|
6443 | 7058 | if (current->mm != NULL) { |
---|
6444 | 7059 | struct page *p; |
---|
6445 | 7060 | |
---|
6446 | 7061 | pagefault_disable(); |
---|
6447 | | - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) { |
---|
| 7062 | + if (get_user_page_fast_only(virt, 0, &p)) { |
---|
6448 | 7063 | phys_addr = page_to_phys(p) + virt % PAGE_SIZE; |
---|
6449 | 7064 | put_page(p); |
---|
6450 | 7065 | } |
---|
.. | .. |
---|
6532 | 7147 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
---|
6533 | 7148 | int size = sizeof(u64); /* nr */ |
---|
6534 | 7149 | if (data->br_stack) { |
---|
| 7150 | + if (perf_sample_save_hw_index(event)) |
---|
| 7151 | + size += sizeof(u64); |
---|
| 7152 | + |
---|
6535 | 7153 | size += data->br_stack->nr |
---|
6536 | 7154 | * sizeof(struct perf_branch_entry); |
---|
6537 | 7155 | } |
---|
.. | .. |
---|
6539 | 7157 | } |
---|
6540 | 7158 | |
---|
6541 | 7159 | if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) |
---|
6542 | | - perf_sample_regs_user(&data->regs_user, regs, |
---|
6543 | | - &data->regs_user_copy); |
---|
| 7160 | + perf_sample_regs_user(&data->regs_user, regs); |
---|
6544 | 7161 | |
---|
6545 | 7162 | if (sample_type & PERF_SAMPLE_REGS_USER) { |
---|
6546 | 7163 | /* regs dump ABI info */ |
---|
.. | .. |
---|
6556 | 7173 | |
---|
6557 | 7174 | if (sample_type & PERF_SAMPLE_STACK_USER) { |
---|
6558 | 7175 | /* |
---|
6559 | | - * Either we need PERF_SAMPLE_STACK_USER bit to be allways |
---|
| 7176 | + * Either we need PERF_SAMPLE_STACK_USER bit to be always |
---|
6560 | 7177 | * processed as the last one or have additional check added |
---|
6561 | 7178 | * in case new sample type is added, because we could eat |
---|
6562 | 7179 | * up the rest of the sample size. |
---|
.. | .. |
---|
6596 | 7213 | |
---|
6597 | 7214 | if (sample_type & PERF_SAMPLE_PHYS_ADDR) |
---|
6598 | 7215 | data->phys_addr = perf_virt_to_phys(data->addr); |
---|
| 7216 | + |
---|
| 7217 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 7218 | + if (sample_type & PERF_SAMPLE_CGROUP) { |
---|
| 7219 | + struct cgroup *cgrp; |
---|
| 7220 | + |
---|
| 7221 | + /* protected by RCU */ |
---|
| 7222 | + cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; |
---|
| 7223 | + data->cgroup = cgroup_id(cgrp); |
---|
| 7224 | + } |
---|
| 7225 | +#endif |
---|
| 7226 | + |
---|
| 7227 | + if (sample_type & PERF_SAMPLE_AUX) { |
---|
| 7228 | + u64 size; |
---|
| 7229 | + |
---|
| 7230 | + header->size += sizeof(u64); /* size */ |
---|
| 7231 | + |
---|
| 7232 | + /* |
---|
| 7233 | + * Given the 16bit nature of header::size, an AUX sample can |
---|
| 7234 | + * easily overflow it, what with all the preceding sample bits. |
---|
| 7235 | + * Make sure this doesn't happen by using up to U16_MAX bytes |
---|
| 7236 | + * per sample in total (rounded down to 8 byte boundary). |
---|
| 7237 | + */ |
---|
| 7238 | + size = min_t(size_t, U16_MAX - header->size, |
---|
| 7239 | + event->attr.aux_sample_size); |
---|
| 7240 | + size = rounddown(size, 8); |
---|
| 7241 | + size = perf_prepare_sample_aux(event, data, size); |
---|
| 7242 | + |
---|
| 7243 | + WARN_ON_ONCE(size + header->size > U16_MAX); |
---|
| 7244 | + header->size += size; |
---|
| 7245 | + } |
---|
| 7246 | + /* |
---|
| 7247 | + * If you're adding more sample types here, you likely need to do |
---|
| 7248 | + * something about the overflowing header::size, like repurpose the |
---|
| 7249 | + * lowest 3 bits of size, which should be always zero at the moment. |
---|
| 7250 | + * This raises a more important question, do we really need 512k sized |
---|
| 7251 | + * samples and why, so good argumentation is in order for whatever you |
---|
| 7252 | + * do here next. |
---|
| 7253 | + */ |
---|
| 7254 | + WARN_ON_ONCE(header->size & 7); |
---|
6599 | 7255 | } |
---|
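Putting the PERF_SAMPLE_AUX pieces together: perf_prepare_sample() clamps the requested AUX payload so the whole record stays under the 16-bit header::size and rounds it down to 8 bytes, perf_prepare_sample_aux() further limits it to the AUX buffer and ALIGN()s it up, and perf_aux_sample_output() zero-pads whatever the PMU copied short of that aligned size. A standalone walk-through of the arithmetic with invented values:

	#include <stdio.h>
	#include <stdint.h>

	#define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
	#define ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t header_size     = 64;		/* bytes already in the sample (example) */
		uint64_t aux_sample_size = 70000;	/* attr.aux_sample_size (example) */
		uint64_t aux_buf_size    = 16380;	/* perf_aux_size(rb) (example) */

		header_size += sizeof(uint64_t);	/* the aux_size field itself */

		/* perf_prepare_sample(): keep the whole record under U16_MAX */
		uint64_t size = aux_sample_size;
		if (size > UINT16_MAX - header_size)
			size = UINT16_MAX - header_size;
		size = ROUND_DOWN(size, 8);

		/* perf_prepare_sample_aux(): limit to the AUX buffer, align up to u64 */
		if (size > aux_buf_size)
			size = aux_buf_size;
		uint64_t aux_size = ALIGN_UP(size, 8);

		/* perf_aux_sample_output(): the PMU copies 'size', the ALIGN() slack is zero padding */
		uint64_t copied = size;
		uint64_t pad = aux_size - copied;	/* always < sizeof(u64) */

		printf("aux_size=%llu pad=%llu record=%llu\n",
		       (unsigned long long)aux_size, (unsigned long long)pad,
		       (unsigned long long)(header_size + aux_size));
		return 0;
	}

The WARN_ON_ONCE(header->size & 7) at the end of perf_prepare_sample() is the sanity check that this sizing keeps the whole record u64-aligned.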
6600 | 7256 | |
---|
6601 | | -static __always_inline void |
---|
| 7257 | +static __always_inline int |
---|
6602 | 7258 | __perf_event_output(struct perf_event *event, |
---|
6603 | 7259 | struct perf_sample_data *data, |
---|
6604 | 7260 | struct pt_regs *regs, |
---|
6605 | 7261 | int (*output_begin)(struct perf_output_handle *, |
---|
| 7262 | + struct perf_sample_data *, |
---|
6606 | 7263 | struct perf_event *, |
---|
6607 | 7264 | unsigned int)) |
---|
6608 | 7265 | { |
---|
6609 | 7266 | struct perf_output_handle handle; |
---|
6610 | 7267 | struct perf_event_header header; |
---|
| 7268 | + int err; |
---|
6611 | 7269 | |
---|
6612 | 7270 | /* protect the callchain buffers */ |
---|
6613 | 7271 | rcu_read_lock(); |
---|
6614 | 7272 | |
---|
6615 | 7273 | perf_prepare_sample(&header, data, event, regs); |
---|
6616 | 7274 | |
---|
6617 | | - if (output_begin(&handle, event, header.size)) |
---|
| 7275 | + err = output_begin(&handle, data, event, header.size); |
---|
| 7276 | + if (err) |
---|
6618 | 7277 | goto exit; |
---|
6619 | 7278 | |
---|
6620 | 7279 | perf_output_sample(&handle, &header, data, event); |
---|
.. | .. |
---|
6623 | 7282 | |
---|
6624 | 7283 | exit: |
---|
6625 | 7284 | rcu_read_unlock(); |
---|
| 7285 | + return err; |
---|
6626 | 7286 | } |
---|
6627 | 7287 | |
---|
6628 | 7288 | void |
---|
.. | .. |
---|
6641 | 7301 | __perf_event_output(event, data, regs, perf_output_begin_backward); |
---|
6642 | 7302 | } |
---|
6643 | 7303 | |
---|
6644 | | -void |
---|
| 7304 | +int |
---|
6645 | 7305 | perf_event_output(struct perf_event *event, |
---|
6646 | 7306 | struct perf_sample_data *data, |
---|
6647 | 7307 | struct pt_regs *regs) |
---|
6648 | 7308 | { |
---|
6649 | | - __perf_event_output(event, data, regs, perf_output_begin); |
---|
| 7309 | + return __perf_event_output(event, data, regs, perf_output_begin); |
---|
6650 | 7310 | } |
---|
6651 | 7311 | |
---|
6652 | 7312 | /* |
---|
.. | .. |
---|
6678 | 7338 | int ret; |
---|
6679 | 7339 | |
---|
6680 | 7340 | perf_event_header__init_id(&read_event.header, &sample, event); |
---|
6681 | | - ret = perf_output_begin(&handle, event, read_event.header.size); |
---|
| 7341 | + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); |
---|
6682 | 7342 | if (ret) |
---|
6683 | 7343 | return; |
---|
6684 | 7344 | |
---|
.. | .. |
---|
6823 | 7483 | } |
---|
6824 | 7484 | |
---|
6825 | 7485 | struct remote_output { |
---|
6826 | | - struct ring_buffer *rb; |
---|
| 7486 | + struct perf_buffer *rb; |
---|
6827 | 7487 | int err; |
---|
6828 | 7488 | }; |
---|
6829 | 7489 | |
---|
.. | .. |
---|
6831 | 7491 | { |
---|
6832 | 7492 | struct perf_event *parent = event->parent; |
---|
6833 | 7493 | struct remote_output *ro = data; |
---|
6834 | | - struct ring_buffer *rb = ro->rb; |
---|
| 7494 | + struct perf_buffer *rb = ro->rb; |
---|
6835 | 7495 | struct stop_event_data sd = { |
---|
6836 | 7496 | .event = event, |
---|
6837 | 7497 | }; |
---|
.. | .. |
---|
6947 | 7607 | |
---|
6948 | 7608 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
---|
6949 | 7609 | |
---|
6950 | | - ret = perf_output_begin(&handle, event, |
---|
| 7610 | + ret = perf_output_begin(&handle, &sample, event, |
---|
6951 | 7611 | task_event->event_id.header.size); |
---|
6952 | 7612 | if (ret) |
---|
6953 | 7613 | goto out; |
---|
.. | .. |
---|
7050 | 7710 | return; |
---|
7051 | 7711 | |
---|
7052 | 7712 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
---|
7053 | | - ret = perf_output_begin(&handle, event, |
---|
| 7713 | + ret = perf_output_begin(&handle, &sample, event, |
---|
7054 | 7714 | comm_event->event_id.header.size); |
---|
7055 | 7715 | |
---|
7056 | 7716 | if (ret) |
---|
.. | .. |
---|
7150 | 7810 | |
---|
7151 | 7811 | perf_event_header__init_id(&namespaces_event->event_id.header, |
---|
7152 | 7812 | &sample, event); |
---|
7153 | | - ret = perf_output_begin(&handle, event, |
---|
| 7813 | + ret = perf_output_begin(&handle, &sample, event, |
---|
7154 | 7814 | namespaces_event->event_id.header.size); |
---|
7155 | 7815 | if (ret) |
---|
7156 | 7816 | goto out; |
---|
.. | .. |
---|
7175 | 7835 | { |
---|
7176 | 7836 | struct path ns_path; |
---|
7177 | 7837 | struct inode *ns_inode; |
---|
7178 | | - void *error; |
---|
| 7838 | + int error; |
---|
7179 | 7839 | |
---|
7180 | 7840 | error = ns_get_path(&ns_path, task, ns_ops); |
---|
7181 | 7841 | if (!error) { |
---|
.. | .. |
---|
7245 | 7905 | } |
---|
7246 | 7906 | |
---|
7247 | 7907 | /* |
---|
| 7908 | + * cgroup tracking |
---|
| 7909 | + */ |
---|
| 7910 | +#ifdef CONFIG_CGROUP_PERF |
---|
| 7911 | + |
---|
| 7912 | +struct perf_cgroup_event { |
---|
| 7913 | + char *path; |
---|
| 7914 | + int path_size; |
---|
| 7915 | + struct { |
---|
| 7916 | + struct perf_event_header header; |
---|
| 7917 | + u64 id; |
---|
| 7918 | + char path[]; |
---|
| 7919 | + } event_id; |
---|
| 7920 | +}; |
---|
| 7921 | + |
---|
| 7922 | +static int perf_event_cgroup_match(struct perf_event *event) |
---|
| 7923 | +{ |
---|
| 7924 | + return event->attr.cgroup; |
---|
| 7925 | +} |
---|
| 7926 | + |
---|
| 7927 | +static void perf_event_cgroup_output(struct perf_event *event, void *data) |
---|
| 7928 | +{ |
---|
| 7929 | + struct perf_cgroup_event *cgroup_event = data; |
---|
| 7930 | + struct perf_output_handle handle; |
---|
| 7931 | + struct perf_sample_data sample; |
---|
| 7932 | + u16 header_size = cgroup_event->event_id.header.size; |
---|
| 7933 | + int ret; |
---|
| 7934 | + |
---|
| 7935 | + if (!perf_event_cgroup_match(event)) |
---|
| 7936 | + return; |
---|
| 7937 | + |
---|
| 7938 | + perf_event_header__init_id(&cgroup_event->event_id.header, |
---|
| 7939 | + &sample, event); |
---|
| 7940 | + ret = perf_output_begin(&handle, &sample, event, |
---|
| 7941 | + cgroup_event->event_id.header.size); |
---|
| 7942 | + if (ret) |
---|
| 7943 | + goto out; |
---|
| 7944 | + |
---|
| 7945 | + perf_output_put(&handle, cgroup_event->event_id); |
---|
| 7946 | + __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); |
---|
| 7947 | + |
---|
| 7948 | + perf_event__output_id_sample(event, &handle, &sample); |
---|
| 7949 | + |
---|
| 7950 | + perf_output_end(&handle); |
---|
| 7951 | +out: |
---|
| 7952 | + cgroup_event->event_id.header.size = header_size; |
---|
| 7953 | +} |
---|
| 7954 | + |
---|
| 7955 | +static void perf_event_cgroup(struct cgroup *cgrp) |
---|
| 7956 | +{ |
---|
| 7957 | + struct perf_cgroup_event cgroup_event; |
---|
| 7958 | + char path_enomem[16] = "//enomem"; |
---|
| 7959 | + char *pathname; |
---|
| 7960 | + size_t size; |
---|
| 7961 | + |
---|
| 7962 | + if (!atomic_read(&nr_cgroup_events)) |
---|
| 7963 | + return; |
---|
| 7964 | + |
---|
| 7965 | + cgroup_event = (struct perf_cgroup_event){ |
---|
| 7966 | + .event_id = { |
---|
| 7967 | + .header = { |
---|
| 7968 | + .type = PERF_RECORD_CGROUP, |
---|
| 7969 | + .misc = 0, |
---|
| 7970 | + .size = sizeof(cgroup_event.event_id), |
---|
| 7971 | + }, |
---|
| 7972 | + .id = cgroup_id(cgrp), |
---|
| 7973 | + }, |
---|
| 7974 | + }; |
---|
| 7975 | + |
---|
| 7976 | + pathname = kmalloc(PATH_MAX, GFP_KERNEL); |
---|
| 7977 | + if (pathname == NULL) { |
---|
| 7978 | + cgroup_event.path = path_enomem; |
---|
| 7979 | + } else { |
---|
| 7980 | + /* just to be sure to have enough space for alignment */ |
---|
| 7981 | + cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); |
---|
| 7982 | + cgroup_event.path = pathname; |
---|
| 7983 | + } |
---|
| 7984 | + |
---|
| 7985 | + /* |
---|
| 7986 | + * Since our buffer works in 8 byte units we need to align our string |
---|
| 7987 | + * size to a multiple of 8. However, we must guarantee the tail end is |
---|
| 7988 | + * zero'd out to avoid leaking random bits to userspace. |
---|
| 7989 | + */ |
---|
| 7990 | + size = strlen(cgroup_event.path) + 1; |
---|
| 7991 | + while (!IS_ALIGNED(size, sizeof(u64))) |
---|
| 7992 | + cgroup_event.path[size++] = '\0'; |
---|
| 7993 | + |
---|
| 7994 | + cgroup_event.event_id.header.size += size; |
---|
| 7995 | + cgroup_event.path_size = size; |
---|
| 7996 | + |
---|
| 7997 | + perf_iterate_sb(perf_event_cgroup_output, |
---|
| 7998 | + &cgroup_event, |
---|
| 7999 | + NULL); |
---|
| 8000 | + |
---|
| 8001 | + kfree(pathname); |
---|
| 8002 | +} |
---|
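The PERF_RECORD_CGROUP path is copied inline, so its length is padded up to a multiple of 8 with explicit NULs rather than leaving the tail of the last u64 uninitialized; the same pattern is used for PERF_RECORD_KSYMBOL names further down. A minimal user-space sketch of that padding (the path string is just an example):

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>

	#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

	int main(void)
	{
		char path[64] = "/user.slice/demo";	/* example path; buffer has slack for padding */
		size_t size = strlen(path) + 1;		/* include the terminating NUL */

		/* pad with NULs until the copied length is a multiple of sizeof(u64) */
		while (!IS_ALIGNED(size, sizeof(uint64_t)))
			path[size++] = '\0';

		printf("strlen=%zu copied=%zu\n", strlen(path), size);	/* 16 and 24 */
		return 0;
	}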
| 8003 | + |
---|
| 8004 | +#endif |
---|
| 8005 | + |
---|
| 8006 | +/* |
---|
7248 | 8007 | * mmap tracking |
---|
7249 | 8008 | */ |
---|
7250 | 8009 | |
---|
.. | .. |
---|
7304 | 8063 | } |
---|
7305 | 8064 | |
---|
7306 | 8065 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
---|
7307 | | - ret = perf_output_begin(&handle, event, |
---|
| 8066 | + ret = perf_output_begin(&handle, &sample, event, |
---|
7308 | 8067 | mmap_event->event_id.header.size); |
---|
7309 | 8068 | if (ret) |
---|
7310 | 8069 | goto out; |
---|
.. | .. |
---|
7364 | 8123 | flags |= MAP_EXECUTABLE; |
---|
7365 | 8124 | if (vma->vm_flags & VM_LOCKED) |
---|
7366 | 8125 | flags |= MAP_LOCKED; |
---|
7367 | | - if (vma->vm_flags & VM_HUGETLB) |
---|
| 8126 | + if (is_vm_hugetlb_page(vma)) |
---|
7368 | 8127 | flags |= MAP_HUGETLB; |
---|
7369 | 8128 | |
---|
7370 | 8129 | if (file) { |
---|
.. | .. |
---|
7614 | 8373 | int ret; |
---|
7615 | 8374 | |
---|
7616 | 8375 | perf_event_header__init_id(&rec.header, &sample, event); |
---|
7617 | | - ret = perf_output_begin(&handle, event, rec.header.size); |
---|
| 8376 | + ret = perf_output_begin(&handle, &sample, event, rec.header.size); |
---|
7618 | 8377 | |
---|
7619 | 8378 | if (ret) |
---|
7620 | 8379 | return; |
---|
.. | .. |
---|
7648 | 8407 | |
---|
7649 | 8408 | perf_event_header__init_id(&lost_samples_event.header, &sample, event); |
---|
7650 | 8409 | |
---|
7651 | | - ret = perf_output_begin(&handle, event, |
---|
| 8410 | + ret = perf_output_begin(&handle, &sample, event, |
---|
7652 | 8411 | lost_samples_event.header.size); |
---|
7653 | 8412 | if (ret) |
---|
7654 | 8413 | return; |
---|
.. | .. |
---|
7703 | 8462 | |
---|
7704 | 8463 | perf_event_header__init_id(&se->event_id.header, &sample, event); |
---|
7705 | 8464 | |
---|
7706 | | - ret = perf_output_begin(&handle, event, se->event_id.header.size); |
---|
| 8465 | + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); |
---|
7707 | 8466 | if (ret) |
---|
7708 | 8467 | return; |
---|
7709 | 8468 | |
---|
.. | .. |
---|
7778 | 8537 | |
---|
7779 | 8538 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
---|
7780 | 8539 | |
---|
7781 | | - ret = perf_output_begin(&handle, event, |
---|
| 8540 | + ret = perf_output_begin(&handle, &sample, event, |
---|
7782 | 8541 | throttle_event.header.size); |
---|
7783 | 8542 | if (ret) |
---|
7784 | 8543 | return; |
---|
.. | .. |
---|
7786 | 8545 | perf_output_put(&handle, throttle_event); |
---|
7787 | 8546 | perf_event__output_id_sample(event, &handle, &sample); |
---|
7788 | 8547 | perf_output_end(&handle); |
---|
| 8548 | +} |
---|
| 8549 | + |
---|
| 8550 | +/* |
---|
| 8551 | + * ksymbol register/unregister tracking |
---|
| 8552 | + */ |
---|
| 8553 | + |
---|
| 8554 | +struct perf_ksymbol_event { |
---|
| 8555 | + const char *name; |
---|
| 8556 | + int name_len; |
---|
| 8557 | + struct { |
---|
| 8558 | + struct perf_event_header header; |
---|
| 8559 | + u64 addr; |
---|
| 8560 | + u32 len; |
---|
| 8561 | + u16 ksym_type; |
---|
| 8562 | + u16 flags; |
---|
| 8563 | + } event_id; |
---|
| 8564 | +}; |
---|
| 8565 | + |
---|
| 8566 | +static int perf_event_ksymbol_match(struct perf_event *event) |
---|
| 8567 | +{ |
---|
| 8568 | + return event->attr.ksymbol; |
---|
| 8569 | +} |
---|
| 8570 | + |
---|
| 8571 | +static void perf_event_ksymbol_output(struct perf_event *event, void *data) |
---|
| 8572 | +{ |
---|
| 8573 | + struct perf_ksymbol_event *ksymbol_event = data; |
---|
| 8574 | + struct perf_output_handle handle; |
---|
| 8575 | + struct perf_sample_data sample; |
---|
| 8576 | + int ret; |
---|
| 8577 | + |
---|
| 8578 | + if (!perf_event_ksymbol_match(event)) |
---|
| 8579 | + return; |
---|
| 8580 | + |
---|
| 8581 | + perf_event_header__init_id(&ksymbol_event->event_id.header, |
---|
| 8582 | + &sample, event); |
---|
| 8583 | + ret = perf_output_begin(&handle, &sample, event, |
---|
| 8584 | + ksymbol_event->event_id.header.size); |
---|
| 8585 | + if (ret) |
---|
| 8586 | + return; |
---|
| 8587 | + |
---|
| 8588 | + perf_output_put(&handle, ksymbol_event->event_id); |
---|
| 8589 | + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); |
---|
| 8590 | + perf_event__output_id_sample(event, &handle, &sample); |
---|
| 8591 | + |
---|
| 8592 | + perf_output_end(&handle); |
---|
| 8593 | +} |
---|
| 8594 | + |
---|
| 8595 | +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, |
---|
| 8596 | + const char *sym) |
---|
| 8597 | +{ |
---|
| 8598 | + struct perf_ksymbol_event ksymbol_event; |
---|
| 8599 | + char name[KSYM_NAME_LEN]; |
---|
| 8600 | + u16 flags = 0; |
---|
| 8601 | + int name_len; |
---|
| 8602 | + |
---|
| 8603 | + if (!atomic_read(&nr_ksymbol_events)) |
---|
| 8604 | + return; |
---|
| 8605 | + |
---|
| 8606 | + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || |
---|
| 8607 | + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) |
---|
| 8608 | + goto err; |
---|
| 8609 | + |
---|
| 8610 | + strlcpy(name, sym, KSYM_NAME_LEN); |
---|
| 8611 | + name_len = strlen(name) + 1; |
---|
| 8612 | + while (!IS_ALIGNED(name_len, sizeof(u64))) |
---|
| 8613 | + name[name_len++] = '\0'; |
---|
| 8614 | + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); |
---|
| 8615 | + |
---|
| 8616 | + if (unregister) |
---|
| 8617 | + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; |
---|
| 8618 | + |
---|
| 8619 | + ksymbol_event = (struct perf_ksymbol_event){ |
---|
| 8620 | + .name = name, |
---|
| 8621 | + .name_len = name_len, |
---|
| 8622 | + .event_id = { |
---|
| 8623 | + .header = { |
---|
| 8624 | + .type = PERF_RECORD_KSYMBOL, |
---|
| 8625 | + .size = sizeof(ksymbol_event.event_id) + |
---|
| 8626 | + name_len, |
---|
| 8627 | + }, |
---|
| 8628 | + .addr = addr, |
---|
| 8629 | + .len = len, |
---|
| 8630 | + .ksym_type = ksym_type, |
---|
| 8631 | + .flags = flags, |
---|
| 8632 | + }, |
---|
| 8633 | + }; |
---|
| 8634 | + |
---|
| 8635 | + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); |
---|
| 8636 | + return; |
---|
| 8637 | +err: |
---|
| 8638 | + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); |
---|
| 8639 | +} |
---|
| 8640 | + |
---|
| 8641 | +/* |
---|
| 8642 | + * bpf program load/unload tracking |
---|
| 8643 | + */ |
---|
| 8644 | + |
---|
| 8645 | +struct perf_bpf_event { |
---|
| 8646 | + struct bpf_prog *prog; |
---|
| 8647 | + struct { |
---|
| 8648 | + struct perf_event_header header; |
---|
| 8649 | + u16 type; |
---|
| 8650 | + u16 flags; |
---|
| 8651 | + u32 id; |
---|
| 8652 | + u8 tag[BPF_TAG_SIZE]; |
---|
| 8653 | + } event_id; |
---|
| 8654 | +}; |
---|
| 8655 | + |
---|
| 8656 | +static int perf_event_bpf_match(struct perf_event *event) |
---|
| 8657 | +{ |
---|
| 8658 | + return event->attr.bpf_event; |
---|
| 8659 | +} |
---|
| 8660 | + |
---|
| 8661 | +static void perf_event_bpf_output(struct perf_event *event, void *data) |
---|
| 8662 | +{ |
---|
| 8663 | + struct perf_bpf_event *bpf_event = data; |
---|
| 8664 | + struct perf_output_handle handle; |
---|
| 8665 | + struct perf_sample_data sample; |
---|
| 8666 | + int ret; |
---|
| 8667 | + |
---|
| 8668 | + if (!perf_event_bpf_match(event)) |
---|
| 8669 | + return; |
---|
| 8670 | + |
---|
| 8671 | + perf_event_header__init_id(&bpf_event->event_id.header, |
---|
| 8672 | + &sample, event); |
---|
| 8673 | + ret = perf_output_begin(&handle, &sample, event, |
---|
| 8674 | + bpf_event->event_id.header.size); |
---|
| 8675 | + if (ret) |
---|
| 8676 | + return; |
---|
| 8677 | + |
---|
| 8678 | + perf_output_put(&handle, bpf_event->event_id); |
---|
| 8679 | + perf_event__output_id_sample(event, &handle, &sample); |
---|
| 8680 | + |
---|
| 8681 | + perf_output_end(&handle); |
---|
| 8682 | +} |
---|
| 8683 | + |
---|
| 8684 | +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, |
---|
| 8685 | + enum perf_bpf_event_type type) |
---|
| 8686 | +{ |
---|
| 8687 | + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; |
---|
| 8688 | + int i; |
---|
| 8689 | + |
---|
| 8690 | + if (prog->aux->func_cnt == 0) { |
---|
| 8691 | + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, |
---|
| 8692 | + (u64)(unsigned long)prog->bpf_func, |
---|
| 8693 | + prog->jited_len, unregister, |
---|
| 8694 | + prog->aux->ksym.name); |
---|
| 8695 | + } else { |
---|
| 8696 | + for (i = 0; i < prog->aux->func_cnt; i++) { |
---|
| 8697 | + struct bpf_prog *subprog = prog->aux->func[i]; |
---|
| 8698 | + |
---|
| 8699 | + perf_event_ksymbol( |
---|
| 8700 | + PERF_RECORD_KSYMBOL_TYPE_BPF, |
---|
| 8701 | + (u64)(unsigned long)subprog->bpf_func, |
---|
| 8702 | + subprog->jited_len, unregister, |
---|
| 8703 | + subprog->aux->ksym.name); |
---|
| 8704 | + } |
---|
| 8705 | + } |
---|
| 8706 | +} |
---|
| 8707 | + |
---|
| 8708 | +void perf_event_bpf_event(struct bpf_prog *prog, |
---|
| 8709 | + enum perf_bpf_event_type type, |
---|
| 8710 | + u16 flags) |
---|
| 8711 | +{ |
---|
| 8712 | + struct perf_bpf_event bpf_event; |
---|
| 8713 | + |
---|
| 8714 | + if (type <= PERF_BPF_EVENT_UNKNOWN || |
---|
| 8715 | + type >= PERF_BPF_EVENT_MAX) |
---|
| 8716 | + return; |
---|
| 8717 | + |
---|
| 8718 | + switch (type) { |
---|
| 8719 | + case PERF_BPF_EVENT_PROG_LOAD: |
---|
| 8720 | + case PERF_BPF_EVENT_PROG_UNLOAD: |
---|
| 8721 | + if (atomic_read(&nr_ksymbol_events)) |
---|
| 8722 | + perf_event_bpf_emit_ksymbols(prog, type); |
---|
| 8723 | + break; |
---|
| 8724 | + default: |
---|
| 8725 | + break; |
---|
| 8726 | + } |
---|
| 8727 | + |
---|
| 8728 | + if (!atomic_read(&nr_bpf_events)) |
---|
| 8729 | + return; |
---|
| 8730 | + |
---|
| 8731 | + bpf_event = (struct perf_bpf_event){ |
---|
| 8732 | + .prog = prog, |
---|
| 8733 | + .event_id = { |
---|
| 8734 | + .header = { |
---|
| 8735 | + .type = PERF_RECORD_BPF_EVENT, |
---|
| 8736 | + .size = sizeof(bpf_event.event_id), |
---|
| 8737 | + }, |
---|
| 8738 | + .type = type, |
---|
| 8739 | + .flags = flags, |
---|
| 8740 | + .id = prog->aux->id, |
---|
| 8741 | + }, |
---|
| 8742 | + }; |
---|
| 8743 | + |
---|
| 8744 | + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); |
---|
| 8745 | + |
---|
| 8746 | + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); |
---|
| 8747 | + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); |
---|
| 8748 | +} |
---|
| 8749 | + |
---|
| 8750 | +struct perf_text_poke_event { |
---|
| 8751 | + const void *old_bytes; |
---|
| 8752 | + const void *new_bytes; |
---|
| 8753 | + size_t pad; |
---|
| 8754 | + u16 old_len; |
---|
| 8755 | + u16 new_len; |
---|
| 8756 | + |
---|
| 8757 | + struct { |
---|
| 8758 | + struct perf_event_header header; |
---|
| 8759 | + |
---|
| 8760 | + u64 addr; |
---|
| 8761 | + } event_id; |
---|
| 8762 | +}; |
---|
| 8763 | + |
---|
| 8764 | +static int perf_event_text_poke_match(struct perf_event *event) |
---|
| 8765 | +{ |
---|
| 8766 | + return event->attr.text_poke; |
---|
| 8767 | +} |
---|
| 8768 | + |
---|
| 8769 | +static void perf_event_text_poke_output(struct perf_event *event, void *data) |
---|
| 8770 | +{ |
---|
| 8771 | + struct perf_text_poke_event *text_poke_event = data; |
---|
| 8772 | + struct perf_output_handle handle; |
---|
| 8773 | + struct perf_sample_data sample; |
---|
| 8774 | + u64 padding = 0; |
---|
| 8775 | + int ret; |
---|
| 8776 | + |
---|
| 8777 | + if (!perf_event_text_poke_match(event)) |
---|
| 8778 | + return; |
---|
| 8779 | + |
---|
| 8780 | + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); |
---|
| 8781 | + |
---|
| 8782 | + ret = perf_output_begin(&handle, &sample, event, |
---|
| 8783 | + text_poke_event->event_id.header.size); |
---|
| 8784 | + if (ret) |
---|
| 8785 | + return; |
---|
| 8786 | + |
---|
| 8787 | + perf_output_put(&handle, text_poke_event->event_id); |
---|
| 8788 | + perf_output_put(&handle, text_poke_event->old_len); |
---|
| 8789 | + perf_output_put(&handle, text_poke_event->new_len); |
---|
| 8790 | + |
---|
| 8791 | + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); |
---|
| 8792 | + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); |
---|
| 8793 | + |
---|
| 8794 | + if (text_poke_event->pad) |
---|
| 8795 | + __output_copy(&handle, &padding, text_poke_event->pad); |
---|
| 8796 | + |
---|
| 8797 | + perf_event__output_id_sample(event, &handle, &sample); |
---|
| 8798 | + |
---|
| 8799 | + perf_output_end(&handle); |
---|
| 8800 | +} |
---|
| 8801 | + |
---|
| 8802 | +void perf_event_text_poke(const void *addr, const void *old_bytes, |
---|
| 8803 | + size_t old_len, const void *new_bytes, size_t new_len) |
---|
| 8804 | +{ |
---|
| 8805 | + struct perf_text_poke_event text_poke_event; |
---|
| 8806 | + size_t tot, pad; |
---|
| 8807 | + |
---|
| 8808 | + if (!atomic_read(&nr_text_poke_events)) |
---|
| 8809 | + return; |
---|
| 8810 | + |
---|
| 8811 | + tot = sizeof(text_poke_event.old_len) + old_len; |
---|
| 8812 | + tot += sizeof(text_poke_event.new_len) + new_len; |
---|
| 8813 | + pad = ALIGN(tot, sizeof(u64)) - tot; |
---|
| 8814 | + |
---|
| 8815 | + text_poke_event = (struct perf_text_poke_event){ |
---|
| 8816 | + .old_bytes = old_bytes, |
---|
| 8817 | + .new_bytes = new_bytes, |
---|
| 8818 | + .pad = pad, |
---|
| 8819 | + .old_len = old_len, |
---|
| 8820 | + .new_len = new_len, |
---|
| 8821 | + .event_id = { |
---|
| 8822 | + .header = { |
---|
| 8823 | + .type = PERF_RECORD_TEXT_POKE, |
---|
| 8824 | + .misc = PERF_RECORD_MISC_KERNEL, |
---|
| 8825 | + .size = sizeof(text_poke_event.event_id) + tot + pad, |
---|
| 8826 | + }, |
---|
| 8827 | + .addr = (unsigned long)addr, |
---|
| 8828 | + }, |
---|
| 8829 | + }; |
---|
| 8830 | + |
---|
| 8831 | + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); |
---|
7789 | 8832 | } |
---|
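A PERF_RECORD_TEXT_POKE record carries two u16 lengths followed by the old and new instruction bytes, so the variable part is padded out to a u64 boundary before the id sample is appended. A quick standalone check of that arithmetic; the 5-byte lengths are only an example (e.g. a 5-byte call patched with another 5-byte instruction):

	#include <stdio.h>
	#include <stdint.h>

	#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

	int main(void)
	{
		size_t old_len = 5;
		size_t new_len = 5;

		/* two u16 length fields plus both byte sequences */
		size_t tot = sizeof(uint16_t) + old_len + sizeof(uint16_t) + new_len;
		size_t pad = ALIGN_UP(tot, sizeof(uint64_t)) - tot;

		printf("tot=%zu pad=%zu record payload=%zu\n", tot, pad, tot + pad);
		/* tot=14 pad=2 record payload=16 */
		return 0;
	}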
7790 | 8833 | |
---|
7791 | 8834 | void perf_event_itrace_started(struct perf_event *event) |
---|
.. | .. |
---|
7818 | 8861 | rec.tid = perf_event_tid(event, current); |
---|
7819 | 8862 | |
---|
7820 | 8863 | perf_event_header__init_id(&rec.header, &sample, event); |
---|
7821 | | - ret = perf_output_begin(&handle, event, rec.header.size); |
---|
| 8864 | + ret = perf_output_begin(&handle, &sample, event, rec.header.size); |
---|
7822 | 8865 | |
---|
7823 | 8866 | if (ret) |
---|
7824 | 8867 | return; |
---|
.. | .. |
---|
7842 | 8885 | hwc->interrupts = 1; |
---|
7843 | 8886 | } else { |
---|
7844 | 8887 | hwc->interrupts++; |
---|
7845 | | - if (unlikely(throttle |
---|
7846 | | - && hwc->interrupts >= max_samples_per_tick)) { |
---|
| 8888 | + if (unlikely(throttle && |
---|
| 8889 | + hwc->interrupts > max_samples_per_tick)) { |
---|
7847 | 8890 | __this_cpu_inc(perf_throttled_count); |
---|
7848 | 8891 | tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); |
---|
7849 | 8892 | hwc->interrupts = MAX_INTERRUPTS; |
---|
.. | .. |
---|
8386 | 9429 | if (event->hw.state & PERF_HES_STOPPED) |
---|
8387 | 9430 | return 0; |
---|
8388 | 9431 | /* |
---|
8389 | | - * All tracepoints are from kernel-space. |
---|
| 9432 | + * If exclude_kernel, only trace user-space tracepoints (uprobes) |
---|
8390 | 9433 | */ |
---|
8391 | | - if (event->attr.exclude_kernel) |
---|
| 9434 | + if (event->attr.exclude_kernel && !user_mode(regs)) |
---|
8392 | 9435 | return 0; |
---|
8393 | 9436 | |
---|
8394 | 9437 | if (!perf_tp_filter_match(event, data)) |
---|
.. | .. |
---|
8514 | 9557 | * |
---|
8515 | 9558 | * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe |
---|
8516 | 9559 | * if not set, create kprobe/uprobe |
---|
| 9560 | + * |
---|
| 9561 | + * The following values specify a reference counter (or semaphore in the |
---|
| 9562 | + * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
---|
| 9563 | + * Defined Tracepoints (USDT). Currently, we use 32 bits for the offset.
---|
| 9564 | + * |
---|
| 9565 | + * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset
---|
| 9566 | + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left |
---|
8517 | 9567 | */ |
---|
8518 | 9568 | enum perf_probe_config { |
---|
8519 | 9569 | PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ |
---|
| 9570 | + PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, |
---|
| 9571 | + PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, |
---|
8520 | 9572 | }; |
---|
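From the userspace side, the bit layout described above can be illustrated with the hedged sketch below. It is not kernel code: fill_uprobe_attr() is a hypothetical helper, the PMU type value would come from /sys/bus/event_source/devices/uprobe/type, and the field names follow the uapi perf_event_attr unions (uprobe_path aliasing config1, probe_offset aliasing config2).

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>

/* Sketch: pack a uprobe/uretprobe request for the "uprobe" PMU, with the
 * retprobe flag in config:0 and the USDT reference-counter (semaphore)
 * offset in config:32-63, matching PERF_UPROBE_REF_CTR_OFFSET_SHIFT above. */
static void fill_uprobe_attr(struct perf_event_attr *attr, uint32_t pmu_type,
                             const char *binary_path, uint64_t probe_offset,
                             uint32_t ref_ctr_offset, int is_retprobe)
{
        memset(attr, 0, sizeof(*attr));
        attr->size = sizeof(*attr);
        attr->type = pmu_type;          /* from sysfs: .../devices/uprobe/type */
        attr->config = ((uint64_t)ref_ctr_offset << 32) | (is_retprobe ? 1ULL : 0);
        attr->uprobe_path = (uint64_t)(uintptr_t)binary_path;  /* config1 */
        attr->probe_offset = probe_offset;                     /* config2 */
}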
8521 | 9573 | |
---|
8522 | 9574 | PMU_FORMAT_ATTR(retprobe, "config:0"); |
---|
| 9575 | +#endif |
---|
8523 | 9576 | |
---|
8524 | | -static struct attribute *probe_attrs[] = { |
---|
| 9577 | +#ifdef CONFIG_KPROBE_EVENTS |
---|
| 9578 | +static struct attribute *kprobe_attrs[] = { |
---|
8525 | 9579 | &format_attr_retprobe.attr, |
---|
8526 | 9580 | NULL, |
---|
8527 | 9581 | }; |
---|
8528 | 9582 | |
---|
8529 | | -static struct attribute_group probe_format_group = { |
---|
| 9583 | +static struct attribute_group kprobe_format_group = { |
---|
8530 | 9584 | .name = "format", |
---|
8531 | | - .attrs = probe_attrs, |
---|
| 9585 | + .attrs = kprobe_attrs, |
---|
8532 | 9586 | }; |
---|
8533 | 9587 | |
---|
8534 | | -static const struct attribute_group *probe_attr_groups[] = { |
---|
8535 | | - &probe_format_group, |
---|
| 9588 | +static const struct attribute_group *kprobe_attr_groups[] = { |
---|
| 9589 | + &kprobe_format_group, |
---|
8536 | 9590 | NULL, |
---|
8537 | 9591 | }; |
---|
8538 | | -#endif |
---|
8539 | 9592 | |
---|
8540 | | -#ifdef CONFIG_KPROBE_EVENTS |
---|
8541 | 9593 | static int perf_kprobe_event_init(struct perf_event *event); |
---|
8542 | 9594 | static struct pmu perf_kprobe = { |
---|
8543 | 9595 | .task_ctx_nr = perf_sw_context, |
---|
.. | .. |
---|
8547 | 9599 | .start = perf_swevent_start, |
---|
8548 | 9600 | .stop = perf_swevent_stop, |
---|
8549 | 9601 | .read = perf_swevent_read, |
---|
8550 | | - .attr_groups = probe_attr_groups, |
---|
| 9602 | + .attr_groups = kprobe_attr_groups, |
---|
8551 | 9603 | }; |
---|
8552 | 9604 | |
---|
8553 | 9605 | static int perf_kprobe_event_init(struct perf_event *event) |
---|
.. | .. |
---|
8558 | 9610 | if (event->attr.type != perf_kprobe.type) |
---|
8559 | 9611 | return -ENOENT; |
---|
8560 | 9612 | |
---|
8561 | | - if (!capable(CAP_SYS_ADMIN)) |
---|
| 9613 | + if (!perfmon_capable()) |
---|
8562 | 9614 | return -EACCES; |
---|
8563 | 9615 | |
---|
8564 | 9616 | /* |
---|
.. | .. |
---|
8579 | 9631 | #endif /* CONFIG_KPROBE_EVENTS */ |
---|
8580 | 9632 | |
---|
8581 | 9633 | #ifdef CONFIG_UPROBE_EVENTS |
---|
| 9634 | +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); |
---|
| 9635 | + |
---|
| 9636 | +static struct attribute *uprobe_attrs[] = { |
---|
| 9637 | + &format_attr_retprobe.attr, |
---|
| 9638 | + &format_attr_ref_ctr_offset.attr, |
---|
| 9639 | + NULL, |
---|
| 9640 | +}; |
---|
| 9641 | + |
---|
| 9642 | +static struct attribute_group uprobe_format_group = { |
---|
| 9643 | + .name = "format", |
---|
| 9644 | + .attrs = uprobe_attrs, |
---|
| 9645 | +}; |
---|
| 9646 | + |
---|
| 9647 | +static const struct attribute_group *uprobe_attr_groups[] = { |
---|
| 9648 | + &uprobe_format_group, |
---|
| 9649 | + NULL, |
---|
| 9650 | +}; |
---|
| 9651 | + |
---|
8582 | 9652 | static int perf_uprobe_event_init(struct perf_event *event); |
---|
8583 | 9653 | static struct pmu perf_uprobe = { |
---|
8584 | 9654 | .task_ctx_nr = perf_sw_context, |
---|
.. | .. |
---|
8588 | 9658 | .start = perf_swevent_start, |
---|
8589 | 9659 | .stop = perf_swevent_stop, |
---|
8590 | 9660 | .read = perf_swevent_read, |
---|
8591 | | - .attr_groups = probe_attr_groups, |
---|
| 9661 | + .attr_groups = uprobe_attr_groups, |
---|
8592 | 9662 | }; |
---|
8593 | 9663 | |
---|
8594 | 9664 | static int perf_uprobe_event_init(struct perf_event *event) |
---|
8595 | 9665 | { |
---|
8596 | 9666 | int err; |
---|
| 9667 | + unsigned long ref_ctr_offset; |
---|
8597 | 9668 | bool is_retprobe; |
---|
8598 | 9669 | |
---|
8599 | 9670 | if (event->attr.type != perf_uprobe.type) |
---|
8600 | 9671 | return -ENOENT; |
---|
8601 | 9672 | |
---|
8602 | | - if (!capable(CAP_SYS_ADMIN)) |
---|
| 9673 | + if (!perfmon_capable()) |
---|
8603 | 9674 | return -EACCES; |
---|
8604 | 9675 | |
---|
8605 | 9676 | /* |
---|
.. | .. |
---|
8609 | 9680 | return -EOPNOTSUPP; |
---|
8610 | 9681 | |
---|
8611 | 9682 | is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; |
---|
8612 | | - err = perf_uprobe_init(event, is_retprobe); |
---|
| 9683 | + ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; |
---|
| 9684 | + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); |
---|
8613 | 9685 | if (err) |
---|
8614 | 9686 | return err; |
---|
8615 | 9687 | |
---|
.. | .. |
---|
8647 | 9719 | int ret = 0; |
---|
8648 | 9720 | |
---|
8649 | 9721 | ctx.regs = perf_arch_bpf_user_pt_regs(regs); |
---|
8650 | | - preempt_disable(); |
---|
8651 | 9722 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) |
---|
8652 | 9723 | goto out; |
---|
8653 | 9724 | rcu_read_lock(); |
---|
.. | .. |
---|
8655 | 9726 | rcu_read_unlock(); |
---|
8656 | 9727 | out: |
---|
8657 | 9728 | __this_cpu_dec(bpf_prog_active); |
---|
8658 | | - preempt_enable(); |
---|
8659 | 9729 | if (!ret) |
---|
8660 | 9730 | return; |
---|
8661 | 9731 | |
---|
.. | .. |
---|
8676 | 9746 | prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); |
---|
8677 | 9747 | if (IS_ERR(prog)) |
---|
8678 | 9748 | return PTR_ERR(prog); |
---|
| 9749 | + |
---|
| 9750 | + if (event->attr.precise_ip && |
---|
| 9751 | + prog->call_get_stack && |
---|
| 9752 | + (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || |
---|
| 9753 | + event->attr.exclude_callchain_kernel || |
---|
| 9754 | + event->attr.exclude_callchain_user)) { |
---|
| 9755 | + /* |
---|
| 9756 | + * On perf_event with precise_ip, calling bpf_get_stack() |
---|
| 9757 | + * may trigger unwinder warnings and occasional crashes. |
---|
| 9758 | + * bpf_get_[stack|stackid] works around this issue by using |
---|
| 9759 | + * callchain attached to perf_sample_data. If the |
---|
| 9760 | + * perf_event does not have full (kernel and user) callchain
---|
| 9761 | + * attached to perf_sample_data, do not allow attaching BPF |
---|
| 9762 | + * program that calls bpf_get_[stack|stackid]. |
---|
| 9763 | + */ |
---|
| 9764 | + bpf_prog_put(prog); |
---|
| 9765 | + return -EPROTO; |
---|
| 9766 | + } |
---|
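For context, the configuration the check above is looking for can be sketched from the userspace side. This is an illustration under stated assumptions: roughly speaking, __PERF_SAMPLE_CALLCHAIN_EARLY is an in-kernel flag set when a precise_ip event requests PERF_SAMPLE_CALLCHAIN, so the practical requirement on the opener is to ask for the full callchain and not exclude either half of it; the helper name and sample values are illustrative only.

#include <linux/perf_event.h>
#include <string.h>

/* Sketch: a precise_ip (e.g. PEBS) event that carries a full kernel+user
 * callchain in its samples, so a BPF program calling bpf_get_stack() /
 * bpf_get_stackid() can reuse that callchain instead of unwinding in the
 * overflow handler. */
static void fill_precise_event_for_bpf_stacks(struct perf_event_attr *attr)
{
        memset(attr, 0, sizeof(*attr));
        attr->size = sizeof(*attr);
        attr->type = PERF_TYPE_HARDWARE;
        attr->config = PERF_COUNT_HW_CPU_CYCLES;
        attr->precise_ip = 2;
        attr->sample_period = 100000;
        attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN;
        attr->exclude_callchain_kernel = 0;     /* must not exclude either */
        attr->exclude_callchain_user = 0;       /* half of the callchain   */
}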
8679 | 9767 | |
---|
8680 | 9768 | event->prog = prog; |
---|
8681 | 9769 | event->orig_overflow_handler = READ_ONCE(event->overflow_handler); |
---|
.. | .. |
---|
8875 | 9963 | /* |
---|
8876 | 9964 | * Scan through mm's vmas and see if one of them matches the |
---|
8877 | 9965 | * @filter; if so, adjust filter's address range. |
---|
8878 | | - * Called with mm::mmap_sem down for reading. |
---|
| 9966 | + * Called with mm::mmap_lock down for reading. |
---|
8879 | 9967 | */ |
---|
8880 | 9968 | static void perf_addr_filter_apply(struct perf_addr_filter *filter, |
---|
8881 | 9969 | struct mm_struct *mm, |
---|
.. | .. |
---|
8917 | 10005 | if (!mm) |
---|
8918 | 10006 | goto restart; |
---|
8919 | 10007 | |
---|
8920 | | - down_read(&mm->mmap_sem); |
---|
| 10008 | + mmap_read_lock(mm); |
---|
8921 | 10009 | } |
---|
8922 | 10010 | |
---|
8923 | 10011 | raw_spin_lock_irqsave(&ifh->lock, flags); |
---|
.. | .. |
---|
8943 | 10031 | raw_spin_unlock_irqrestore(&ifh->lock, flags); |
---|
8944 | 10032 | |
---|
8945 | 10033 | if (ifh->nr_file_filters) { |
---|
8946 | | - up_read(&mm->mmap_sem); |
---|
| 10034 | + mmap_read_unlock(mm); |
---|
8947 | 10035 | |
---|
8948 | 10036 | mmput(mm); |
---|
8949 | 10037 | } |
---|
.. | .. |
---|
9050 | 10138 | case IF_SRC_KERNELADDR: |
---|
9051 | 10139 | case IF_SRC_KERNEL: |
---|
9052 | 10140 | kernel = 1; |
---|
| 10141 | + fallthrough; |
---|
9053 | 10142 | |
---|
9054 | 10143 | case IF_SRC_FILEADDR: |
---|
9055 | 10144 | case IF_SRC_FILE: |
---|
.. | .. |
---|
9136 | 10225 | } |
---|
9137 | 10226 | |
---|
9138 | 10227 | /* ready to consume more filters */ |
---|
| 10228 | + kfree(filename); |
---|
| 10229 | + filename = NULL; |
---|
9139 | 10230 | state = IF_STATE_ACTION; |
---|
9140 | 10231 | filter = NULL; |
---|
| 10232 | + kernel = 0; |
---|
9141 | 10233 | } |
---|
9142 | 10234 | } |
---|
9143 | 10235 | |
---|
.. | .. |
---|
9285 | 10377 | period = max_t(u64, 10000, hwc->sample_period); |
---|
9286 | 10378 | } |
---|
9287 | 10379 | hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), |
---|
9288 | | - HRTIMER_MODE_REL_PINNED); |
---|
| 10380 | + HRTIMER_MODE_REL_PINNED_HARD); |
---|
9289 | 10381 | } |
---|
9290 | 10382 | |
---|
9291 | 10383 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
---|
.. | .. |
---|
9640 | 10732 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
---|
9641 | 10733 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); |
---|
9642 | 10734 | |
---|
9643 | | - cpu_function_call(cpu, |
---|
9644 | | - (remote_function_f)perf_mux_hrtimer_restart, cpuctx); |
---|
| 10735 | + cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx); |
---|
9645 | 10736 | } |
---|
9646 | 10737 | cpus_read_unlock(); |
---|
9647 | 10738 | mutex_unlock(&mux_interval_mutex); |
---|
.. | .. |
---|
9678 | 10769 | |
---|
9679 | 10770 | pmu->dev->groups = pmu->attr_groups; |
---|
9680 | 10771 | device_initialize(pmu->dev); |
---|
9681 | | - ret = dev_set_name(pmu->dev, "%s", pmu->name); |
---|
9682 | | - if (ret) |
---|
9683 | | - goto free_dev; |
---|
9684 | 10772 | |
---|
9685 | 10773 | dev_set_drvdata(pmu->dev, pmu); |
---|
9686 | 10774 | pmu->dev->bus = &pmu_bus; |
---|
9687 | 10775 | pmu->dev->release = pmu_dev_release; |
---|
| 10776 | + |
---|
| 10777 | + ret = dev_set_name(pmu->dev, "%s", pmu->name); |
---|
| 10778 | + if (ret) |
---|
| 10779 | + goto free_dev; |
---|
| 10780 | + |
---|
9688 | 10781 | ret = device_add(pmu->dev); |
---|
9689 | 10782 | if (ret) |
---|
9690 | 10783 | goto free_dev; |
---|
.. | .. |
---|
9692 | 10785 | /* For PMUs with address filters, throw in an extra attribute: */ |
---|
9693 | 10786 | if (pmu->nr_addr_filters) |
---|
9694 | 10787 | ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); |
---|
| 10788 | + |
---|
| 10789 | + if (ret) |
---|
| 10790 | + goto del_dev; |
---|
| 10791 | + |
---|
| 10792 | + if (pmu->attr_update) |
---|
| 10793 | + ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); |
---|
9695 | 10794 | |
---|
9696 | 10795 | if (ret) |
---|
9697 | 10796 | goto del_dev; |
---|
.. | .. |
---|
9712 | 10811 | |
---|
9713 | 10812 | int perf_pmu_register(struct pmu *pmu, const char *name, int type) |
---|
9714 | 10813 | { |
---|
9715 | | - int cpu, ret; |
---|
| 10814 | + int cpu, ret, max = PERF_TYPE_MAX; |
---|
9716 | 10815 | |
---|
9717 | 10816 | mutex_lock(&pmus_lock); |
---|
9718 | 10817 | ret = -ENOMEM; |
---|
.. | .. |
---|
9725 | 10824 | goto skip_type; |
---|
9726 | 10825 | pmu->name = name; |
---|
9727 | 10826 | |
---|
9728 | | - if (type < 0) { |
---|
9729 | | - type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); |
---|
9730 | | - if (type < 0) { |
---|
9731 | | - ret = type; |
---|
| 10827 | + if (type != PERF_TYPE_SOFTWARE) { |
---|
| 10828 | + if (type >= 0) |
---|
| 10829 | + max = type; |
---|
| 10830 | + |
---|
| 10831 | + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); |
---|
| 10832 | + if (ret < 0) |
---|
9732 | 10833 | goto free_pdc; |
---|
9733 | | - } |
---|
| 10834 | + |
---|
| 10835 | + WARN_ON(type >= 0 && ret != type); |
---|
| 10836 | + |
---|
| 10837 | + type = ret; |
---|
9734 | 10838 | } |
---|
9735 | 10839 | pmu->type = type; |
---|
9736 | 10840 | |
---|
.. | .. |
---|
9776 | 10880 | cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); |
---|
9777 | 10881 | |
---|
9778 | 10882 | __perf_mux_hrtimer_init(cpuctx, cpu); |
---|
| 10883 | + |
---|
| 10884 | + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); |
---|
| 10885 | + cpuctx->heap = cpuctx->heap_default; |
---|
9779 | 10886 | } |
---|
9780 | 10887 | |
---|
9781 | 10888 | got_cpu_context: |
---|
.. | .. |
---|
9807 | 10914 | if (!pmu->event_idx) |
---|
9808 | 10915 | pmu->event_idx = perf_event_idx_default; |
---|
9809 | 10916 | |
---|
9810 | | - list_add_rcu(&pmu->entry, &pmus); |
---|
| 10917 | + /* |
---|
| 10918 | + * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, |
---|
| 10919 | + * since these cannot be in the IDR. This way the linear search |
---|
| 10920 | + * is fast, provided a valid software event is given.
---|
| 10921 | + */ |
---|
| 10922 | + if (type == PERF_TYPE_SOFTWARE || !name) |
---|
| 10923 | + list_add_rcu(&pmu->entry, &pmus); |
---|
| 10924 | + else |
---|
| 10925 | + list_add_tail_rcu(&pmu->entry, &pmus); |
---|
| 10926 | + |
---|
9811 | 10927 | atomic_set(&pmu->exclusive_cnt, 0); |
---|
9812 | 10928 | ret = 0; |
---|
9813 | 10929 | unlock: |
---|
.. | .. |
---|
9820 | 10936 | put_device(pmu->dev); |
---|
9821 | 10937 | |
---|
9822 | 10938 | free_idr: |
---|
9823 | | - if (pmu->type >= PERF_TYPE_MAX) |
---|
| 10939 | + if (pmu->type != PERF_TYPE_SOFTWARE) |
---|
9824 | 10940 | idr_remove(&pmu_idr, pmu->type); |
---|
9825 | 10941 | |
---|
9826 | 10942 | free_pdc: |
---|
.. | .. |
---|
9842 | 10958 | synchronize_rcu(); |
---|
9843 | 10959 | |
---|
9844 | 10960 | free_percpu(pmu->pmu_disable_count); |
---|
9845 | | - if (pmu->type >= PERF_TYPE_MAX) |
---|
| 10961 | + if (pmu->type != PERF_TYPE_SOFTWARE) |
---|
9846 | 10962 | idr_remove(&pmu_idr, pmu->type); |
---|
9847 | 10963 | if (pmu_bus_running) { |
---|
9848 | 10964 | if (pmu->nr_addr_filters) |
---|
.. | .. |
---|
9854 | 10970 | mutex_unlock(&pmus_lock); |
---|
9855 | 10971 | } |
---|
9856 | 10972 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); |
---|
| 10973 | + |
---|
| 10974 | +static inline bool has_extended_regs(struct perf_event *event) |
---|
| 10975 | +{ |
---|
| 10976 | + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || |
---|
| 10977 | + (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); |
---|
| 10978 | +} |
---|
9857 | 10979 | |
---|
9858 | 10980 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) |
---|
9859 | 10981 | { |
---|
.. | .. |
---|
9885 | 11007 | if (ctx) |
---|
9886 | 11008 | perf_event_ctx_unlock(event->group_leader, ctx); |
---|
9887 | 11009 | |
---|
| 11010 | + if (!ret) { |
---|
| 11011 | + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && |
---|
| 11012 | + has_extended_regs(event)) |
---|
| 11013 | + ret = -EOPNOTSUPP; |
---|
| 11014 | + |
---|
| 11015 | + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && |
---|
| 11016 | + event_has_any_exclude_flag(event)) |
---|
| 11017 | + ret = -EINVAL; |
---|
| 11018 | + |
---|
| 11019 | + if (ret && event->destroy) |
---|
| 11020 | + event->destroy(event); |
---|
| 11021 | + } |
---|
| 11022 | + |
---|
9888 | 11023 | if (ret) |
---|
9889 | 11024 | module_put(pmu->module); |
---|
9890 | 11025 | |
---|
.. | .. |
---|
9893 | 11028 | |
---|
9894 | 11029 | static struct pmu *perf_init_event(struct perf_event *event) |
---|
9895 | 11030 | { |
---|
| 11031 | + int idx, type, ret; |
---|
9896 | 11032 | struct pmu *pmu; |
---|
9897 | | - int idx; |
---|
9898 | | - int ret; |
---|
9899 | 11033 | |
---|
9900 | 11034 | idx = srcu_read_lock(&pmus_srcu); |
---|
9901 | 11035 | |
---|
.. | .. |
---|
9907 | 11041 | goto unlock; |
---|
9908 | 11042 | } |
---|
9909 | 11043 | |
---|
| 11044 | + /* |
---|
| 11045 | + * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE |
---|
| 11046 | + * are often aliases for PERF_TYPE_RAW. |
---|
| 11047 | + */ |
---|
| 11048 | + type = event->attr.type; |
---|
| 11049 | + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) |
---|
| 11050 | + type = PERF_TYPE_RAW; |
---|
| 11051 | + |
---|
| 11052 | +again: |
---|
9910 | 11053 | rcu_read_lock(); |
---|
9911 | | - pmu = idr_find(&pmu_idr, event->attr.type); |
---|
| 11054 | + pmu = idr_find(&pmu_idr, type); |
---|
9912 | 11055 | rcu_read_unlock(); |
---|
9913 | 11056 | if (pmu) { |
---|
9914 | 11057 | ret = perf_try_init_event(pmu, event); |
---|
| 11058 | + if (ret == -ENOENT && event->attr.type != type) { |
---|
| 11059 | + type = event->attr.type; |
---|
| 11060 | + goto again; |
---|
| 11061 | + } |
---|
| 11062 | + |
---|
9915 | 11063 | if (ret) |
---|
9916 | 11064 | pmu = ERR_PTR(ret); |
---|
| 11065 | + |
---|
9917 | 11066 | goto unlock; |
---|
9918 | 11067 | } |
---|
9919 | 11068 | |
---|
9920 | | - list_for_each_entry_rcu(pmu, &pmus, entry) { |
---|
| 11069 | + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { |
---|
9921 | 11070 | ret = perf_try_init_event(pmu, event); |
---|
9922 | 11071 | if (!ret) |
---|
9923 | 11072 | goto unlock; |
---|
.. | .. |
---|
9993 | 11142 | if (event->parent) |
---|
9994 | 11143 | return; |
---|
9995 | 11144 | |
---|
9996 | | - if (event->attach_state & PERF_ATTACH_TASK) |
---|
| 11145 | + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) |
---|
9997 | 11146 | inc = true; |
---|
9998 | 11147 | if (event->attr.mmap || event->attr.mmap_data) |
---|
9999 | 11148 | atomic_inc(&nr_mmap_events); |
---|
.. | .. |
---|
10001 | 11150 | atomic_inc(&nr_comm_events); |
---|
10002 | 11151 | if (event->attr.namespaces) |
---|
10003 | 11152 | atomic_inc(&nr_namespaces_events); |
---|
| 11153 | + if (event->attr.cgroup) |
---|
| 11154 | + atomic_inc(&nr_cgroup_events); |
---|
10004 | 11155 | if (event->attr.task) |
---|
10005 | 11156 | atomic_inc(&nr_task_events); |
---|
10006 | 11157 | if (event->attr.freq) |
---|
.. | .. |
---|
10013 | 11164 | inc = true; |
---|
10014 | 11165 | if (is_cgroup_event(event)) |
---|
10015 | 11166 | inc = true; |
---|
| 11167 | + if (event->attr.ksymbol) |
---|
| 11168 | + atomic_inc(&nr_ksymbol_events); |
---|
| 11169 | + if (event->attr.bpf_event) |
---|
| 11170 | + atomic_inc(&nr_bpf_events); |
---|
| 11171 | + if (event->attr.text_poke) |
---|
| 11172 | + atomic_inc(&nr_text_poke_events); |
---|
10016 | 11173 | |
---|
10017 | 11174 | if (inc) { |
---|
10018 | 11175 | /* |
---|
.. | .. |
---|
10031 | 11188 | * call the perf scheduling hooks before proceeding to |
---|
10032 | 11189 | * install events that need them. |
---|
10033 | 11190 | */ |
---|
10034 | | - synchronize_sched(); |
---|
| 11191 | + synchronize_rcu(); |
---|
10035 | 11192 | } |
---|
10036 | 11193 | /* |
---|
10037 | 11194 | * Now that we have waited for the synchronize_rcu(), allow further
---|
.. | .. |
---|
10120 | 11277 | * and we cannot use the ctx information because we need the |
---|
10121 | 11278 | * pmu before we get a ctx. |
---|
10122 | 11279 | */ |
---|
10123 | | - get_task_struct(task); |
---|
10124 | | - event->hw.target = task; |
---|
| 11280 | + event->hw.target = get_task_struct(task); |
---|
10125 | 11281 | } |
---|
10126 | 11282 | |
---|
10127 | 11283 | event->clock = &local_clock; |
---|
.. | .. |
---|
10133 | 11289 | context = parent_event->overflow_handler_context; |
---|
10134 | 11290 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) |
---|
10135 | 11291 | if (overflow_handler == bpf_overflow_handler) { |
---|
10136 | | - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); |
---|
| 11292 | + struct bpf_prog *prog = parent_event->prog; |
---|
10137 | 11293 | |
---|
10138 | | - if (IS_ERR(prog)) { |
---|
10139 | | - err = PTR_ERR(prog); |
---|
10140 | | - goto err_ns; |
---|
10141 | | - } |
---|
| 11294 | + bpf_prog_inc(prog); |
---|
10142 | 11295 | event->prog = prog; |
---|
10143 | 11296 | event->orig_overflow_handler = |
---|
10144 | 11297 | parent_event->orig_overflow_handler; |
---|
.. | .. |
---|
10179 | 11332 | if (!has_branch_stack(event)) |
---|
10180 | 11333 | event->attr.branch_sample_type = 0; |
---|
10181 | 11334 | |
---|
10182 | | - if (cgroup_fd != -1) { |
---|
10183 | | - err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); |
---|
10184 | | - if (err) |
---|
10185 | | - goto err_ns; |
---|
10186 | | - } |
---|
10187 | | - |
---|
10188 | 11335 | pmu = perf_init_event(event); |
---|
10189 | 11336 | if (IS_ERR(pmu)) { |
---|
10190 | 11337 | err = PTR_ERR(pmu); |
---|
10191 | 11338 | goto err_ns; |
---|
| 11339 | + } |
---|
| 11340 | + |
---|
| 11341 | + /* |
---|
| 11342 | + * Disallow uncore-cgroup events; they don't make sense as the cgroup will
---|
| 11343 | + * be different on other CPUs in the uncore mask. |
---|
| 11344 | + */ |
---|
| 11345 | + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { |
---|
| 11346 | + err = -EINVAL; |
---|
| 11347 | + goto err_pmu; |
---|
| 11348 | + } |
---|
| 11349 | + |
---|
| 11350 | + if (event->attr.aux_output && |
---|
| 11351 | + !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { |
---|
| 11352 | + err = -EOPNOTSUPP; |
---|
| 11353 | + goto err_pmu; |
---|
| 11354 | + } |
---|
| 11355 | + |
---|
| 11356 | + if (cgroup_fd != -1) { |
---|
| 11357 | + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); |
---|
| 11358 | + if (err) |
---|
| 11359 | + goto err_pmu; |
---|
10192 | 11360 | } |
---|
10193 | 11361 | |
---|
10194 | 11362 | err = exclusive_event_init(event); |
---|
.. | .. |
---|
10251 | 11419 | exclusive_event_destroy(event); |
---|
10252 | 11420 | |
---|
10253 | 11421 | err_pmu: |
---|
| 11422 | + if (is_cgroup_event(event)) |
---|
| 11423 | + perf_detach_cgroup(event); |
---|
10254 | 11424 | if (event->destroy) |
---|
10255 | 11425 | event->destroy(event); |
---|
10256 | 11426 | module_put(pmu->module); |
---|
10257 | 11427 | err_ns: |
---|
10258 | | - if (is_cgroup_event(event)) |
---|
10259 | | - perf_detach_cgroup(event); |
---|
10260 | 11428 | if (event->ns) |
---|
10261 | 11429 | put_pid_ns(event->ns); |
---|
10262 | 11430 | if (event->hw.target) |
---|
.. | .. |
---|
10272 | 11440 | u32 size; |
---|
10273 | 11441 | int ret; |
---|
10274 | 11442 | |
---|
10275 | | - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) |
---|
10276 | | - return -EFAULT; |
---|
10277 | | - |
---|
10278 | | - /* |
---|
10279 | | - * zero the full structure, so that a short copy will be nice. |
---|
10280 | | - */ |
---|
| 11443 | + /* Zero the full structure, so that a short copy will be nice. */ |
---|
10281 | 11444 | memset(attr, 0, sizeof(*attr)); |
---|
10282 | 11445 | |
---|
10283 | 11446 | ret = get_user(size, &uattr->size); |
---|
10284 | 11447 | if (ret) |
---|
10285 | 11448 | return ret; |
---|
10286 | 11449 | |
---|
10287 | | - if (size > PAGE_SIZE) /* silly large */ |
---|
10288 | | - goto err_size; |
---|
10289 | | - |
---|
10290 | | - if (!size) /* abi compat */ |
---|
| 11450 | + /* ABI compatibility quirk: */ |
---|
| 11451 | + if (!size) |
---|
10291 | 11452 | size = PERF_ATTR_SIZE_VER0; |
---|
10292 | | - |
---|
10293 | | - if (size < PERF_ATTR_SIZE_VER0) |
---|
| 11453 | + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) |
---|
10294 | 11454 | goto err_size; |
---|
10295 | 11455 | |
---|
10296 | | - /* |
---|
10297 | | - * If we're handed a bigger struct than we know of, |
---|
10298 | | - * ensure all the unknown bits are 0 - i.e. new |
---|
10299 | | - * user-space does not rely on any kernel feature |
---|
10300 | | - * extensions we dont know about yet. |
---|
10301 | | - */ |
---|
10302 | | - if (size > sizeof(*attr)) { |
---|
10303 | | - unsigned char __user *addr; |
---|
10304 | | - unsigned char __user *end; |
---|
10305 | | - unsigned char val; |
---|
10306 | | - |
---|
10307 | | - addr = (void __user *)uattr + sizeof(*attr); |
---|
10308 | | - end = (void __user *)uattr + size; |
---|
10309 | | - |
---|
10310 | | - for (; addr < end; addr++) { |
---|
10311 | | - ret = get_user(val, addr); |
---|
10312 | | - if (ret) |
---|
10313 | | - return ret; |
---|
10314 | | - if (val) |
---|
10315 | | - goto err_size; |
---|
10316 | | - } |
---|
10317 | | - size = sizeof(*attr); |
---|
| 11456 | + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); |
---|
| 11457 | + if (ret) { |
---|
| 11458 | + if (ret == -E2BIG) |
---|
| 11459 | + goto err_size; |
---|
| 11460 | + return ret; |
---|
10318 | 11461 | } |
---|
10319 | | - |
---|
10320 | | - ret = copy_from_user(attr, uattr, size); |
---|
10321 | | - if (ret) |
---|
10322 | | - return -EFAULT; |
---|
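The single copy_struct_from_user() call above replaces the open-coded trailing-byte check that the removed lines used to perform. A rough sketch of the semantics it provides, per its kernel-doc, simplified for illustration (the helper name below is hypothetical):

/* Roughly what copy_struct_from_user(attr, sizeof(*attr), uattr, size) does:
 *   - newer userspace (size > sizeof(*attr)): the unknown tail must be all
 *     zeroes, otherwise -E2BIG (mapped to err_size above);
 *   - older userspace (size < sizeof(*attr)): only the prefix is copied and
 *     the rest of *attr keeps the zeroes from the memset above. */
static int copy_attr_sketch(struct perf_event_attr *attr,
                            struct perf_event_attr __user *uattr, u32 size)
{
        size_t ksize = sizeof(*attr);

        if (size > ksize) {
                int ret = check_zeroed_user((char __user *)uattr + ksize,
                                            size - ksize);
                if (ret <= 0)
                        return ret ?: -E2BIG;
                size = ksize;
        }
        if (copy_from_user(attr, uattr, size))
                return -EFAULT;
        return 0;
}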
10323 | 11462 | |
---|
10324 | 11463 | attr->size = size; |
---|
10325 | 11464 | |
---|
10326 | | - if (attr->__reserved_1) |
---|
| 11465 | + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) |
---|
10327 | 11466 | return -EINVAL; |
---|
10328 | 11467 | |
---|
10329 | 11468 | if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) |
---|
.. | .. |
---|
10394 | 11533 | |
---|
10395 | 11534 | if (attr->sample_type & PERF_SAMPLE_REGS_INTR) |
---|
10396 | 11535 | ret = perf_reg_validate(attr->sample_regs_intr); |
---|
| 11536 | + |
---|
| 11537 | +#ifndef CONFIG_CGROUP_PERF |
---|
| 11538 | + if (attr->sample_type & PERF_SAMPLE_CGROUP) |
---|
| 11539 | + return -EINVAL; |
---|
| 11540 | +#endif |
---|
| 11541 | + |
---|
10397 | 11542 | out: |
---|
10398 | 11543 | return ret; |
---|
10399 | 11544 | |
---|
.. | .. |
---|
10403 | 11548 | goto out; |
---|
10404 | 11549 | } |
---|
10405 | 11550 | |
---|
| 11551 | +static void mutex_lock_double(struct mutex *a, struct mutex *b) |
---|
| 11552 | +{ |
---|
| 11553 | + if (b < a) |
---|
| 11554 | + swap(a, b); |
---|
| 11555 | + |
---|
| 11556 | + mutex_lock(a); |
---|
| 11557 | + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); |
---|
| 11558 | +} |
---|
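The helper moved above is the usual address-ordered locking idiom: the lower-addressed mutex is always taken first, regardless of argument order, so two paths locking the same pair in opposite order cannot ABBA-deadlock. A minimal usage sketch with hypothetical wrapper names, for illustration only:

/* The lock wrapper always takes the lower-addressed mmap_mutex first;
 * unlock order is irrelevant for deadlock avoidance. */
static void lock_both_mmap_mutexes(struct perf_event *a, struct perf_event *b)
{
        mutex_lock_double(&a->mmap_mutex, &b->mmap_mutex);
}

static void unlock_both_mmap_mutexes(struct perf_event *a, struct perf_event *b)
{
        mutex_unlock(&a->mmap_mutex);
        mutex_unlock(&b->mmap_mutex);
}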
| 11559 | + |
---|
10406 | 11560 | static int |
---|
10407 | 11561 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
---|
10408 | 11562 | { |
---|
10409 | | - struct ring_buffer *rb = NULL; |
---|
| 11563 | + struct perf_buffer *rb = NULL; |
---|
10410 | 11564 | int ret = -EINVAL; |
---|
10411 | 11565 | |
---|
10412 | | - if (!output_event) |
---|
| 11566 | + if (!output_event) { |
---|
| 11567 | + mutex_lock(&event->mmap_mutex); |
---|
10413 | 11568 | goto set; |
---|
| 11569 | + } |
---|
10414 | 11570 | |
---|
10415 | 11571 | /* don't allow circular references */ |
---|
10416 | 11572 | if (event == output_event) |
---|
.. | .. |
---|
10425 | 11581 | /* |
---|
10426 | 11582 | * If it's not a per-cpu rb, it must be the same task.
---|
10427 | 11583 | */ |
---|
10428 | | - if (output_event->cpu == -1 && output_event->ctx != event->ctx) |
---|
| 11584 | + if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) |
---|
10429 | 11585 | goto out; |
---|
10430 | 11586 | |
---|
10431 | 11587 | /* |
---|
.. | .. |
---|
10448 | 11604 | event->pmu != output_event->pmu) |
---|
10449 | 11605 | goto out; |
---|
10450 | 11606 | |
---|
| 11607 | + /* |
---|
| 11608 | + * Hold both mmap_mutexes to serialize against perf_mmap_close(). Since
---|
| 11609 | + * output_event is already on rb->event_list, and the list iteration |
---|
| 11610 | + * restarts after every removal, it is guaranteed this new event is |
---|
| 11611 | + * observed *OR* if output_event is already removed, it's guaranteed we |
---|
| 11612 | + * observe !rb->mmap_count. |
---|
| 11613 | + */ |
---|
| 11614 | + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); |
---|
10451 | 11615 | set: |
---|
10452 | | - mutex_lock(&event->mmap_mutex); |
---|
10453 | 11616 | /* Can't redirect output if we've got an active mmap() */ |
---|
10454 | 11617 | if (atomic_read(&event->mmap_count)) |
---|
10455 | 11618 | goto unlock; |
---|
.. | .. |
---|
10459 | 11622 | rb = ring_buffer_get(output_event); |
---|
10460 | 11623 | if (!rb) |
---|
10461 | 11624 | goto unlock; |
---|
| 11625 | + |
---|
| 11626 | + /* did we race against perf_mmap_close() */ |
---|
| 11627 | + if (!atomic_read(&rb->mmap_count)) { |
---|
| 11628 | + ring_buffer_put(rb); |
---|
| 11629 | + goto unlock; |
---|
| 11630 | + } |
---|
10462 | 11631 | } |
---|
10463 | 11632 | |
---|
10464 | 11633 | ring_buffer_attach(event, rb); |
---|
.. | .. |
---|
10466 | 11635 | ret = 0; |
---|
10467 | 11636 | unlock: |
---|
10468 | 11637 | mutex_unlock(&event->mmap_mutex); |
---|
| 11638 | + if (output_event) |
---|
| 11639 | + mutex_unlock(&output_event->mmap_mutex); |
---|
10469 | 11640 | |
---|
10470 | 11641 | out: |
---|
10471 | 11642 | return ret; |
---|
10472 | | -} |
---|
10473 | | - |
---|
10474 | | -static void mutex_lock_double(struct mutex *a, struct mutex *b) |
---|
10475 | | -{ |
---|
10476 | | - if (b < a) |
---|
10477 | | - swap(a, b); |
---|
10478 | | - |
---|
10479 | | - mutex_lock(a); |
---|
10480 | | - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); |
---|
10481 | 11643 | } |
---|
10482 | 11644 | |
---|
10483 | 11645 | static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) |
---|
.. | .. |
---|
10500 | 11662 | break; |
---|
10501 | 11663 | |
---|
10502 | 11664 | case CLOCK_BOOTTIME: |
---|
10503 | | - event->clock = &ktime_get_boot_ns; |
---|
| 11665 | + event->clock = &ktime_get_boottime_ns; |
---|
10504 | 11666 | break; |
---|
10505 | 11667 | |
---|
10506 | 11668 | case CLOCK_TAI: |
---|
10507 | | - event->clock = &ktime_get_tai_ns; |
---|
| 11669 | + event->clock = &ktime_get_clocktai_ns; |
---|
10508 | 11670 | break; |
---|
10509 | 11671 | |
---|
10510 | 11672 | default: |
---|
.. | .. |
---|
10530 | 11692 | again: |
---|
10531 | 11693 | rcu_read_lock(); |
---|
10532 | 11694 | gctx = READ_ONCE(group_leader->ctx); |
---|
10533 | | - if (!atomic_inc_not_zero(&gctx->refcount)) { |
---|
| 11695 | + if (!refcount_inc_not_zero(&gctx->refcount)) { |
---|
10534 | 11696 | rcu_read_unlock(); |
---|
10535 | 11697 | goto again; |
---|
10536 | 11698 | } |
---|
.. | .. |
---|
10563 | 11725 | struct perf_event *group_leader = NULL, *output_event = NULL; |
---|
10564 | 11726 | struct perf_event *event, *sibling; |
---|
10565 | 11727 | struct perf_event_attr attr; |
---|
10566 | | - struct perf_event_context *ctx, *uninitialized_var(gctx); |
---|
| 11728 | + struct perf_event_context *ctx, *gctx; |
---|
10567 | 11729 | struct file *event_file = NULL; |
---|
10568 | 11730 | struct fd group = {NULL, 0}; |
---|
10569 | 11731 | struct task_struct *task = NULL; |
---|
.. | .. |
---|
10578 | 11740 | if (flags & ~PERF_FLAG_ALL) |
---|
10579 | 11741 | return -EINVAL; |
---|
10580 | 11742 | |
---|
10581 | | - if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) |
---|
10582 | | - return -EACCES; |
---|
10583 | | - |
---|
10584 | | - /* Do we allow access to perf_event_open(2) ? */ |
---|
10585 | | - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); |
---|
| 11743 | + err = perf_copy_attr(attr_uptr, &attr); |
---|
10586 | 11744 | if (err) |
---|
10587 | 11745 | return err; |
---|
10588 | 11746 | |
---|
10589 | | - err = perf_copy_attr(attr_uptr, &attr); |
---|
| 11747 | + /* Do we allow access to perf_event_open(2) ? */ |
---|
| 11748 | + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); |
---|
10590 | 11749 | if (err) |
---|
10591 | 11750 | return err; |
---|
10592 | 11751 | |
---|
.. | .. |
---|
10597 | 11756 | } |
---|
10598 | 11757 | |
---|
10599 | 11758 | if (attr.namespaces) { |
---|
10600 | | - if (!capable(CAP_SYS_ADMIN)) |
---|
| 11759 | + if (!perfmon_capable()) |
---|
10601 | 11760 | return -EACCES; |
---|
10602 | 11761 | } |
---|
10603 | 11762 | |
---|
.. | .. |
---|
10612 | 11771 | /* Only privileged users can get physical addresses */ |
---|
10613 | 11772 | if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { |
---|
10614 | 11773 | err = perf_allow_kernel(&attr); |
---|
| 11774 | + if (err) |
---|
| 11775 | + return err; |
---|
| 11776 | + } |
---|
| 11777 | + |
---|
| 11778 | + /* REGS_INTR can leak data, lockdown must prevent this */ |
---|
| 11779 | + if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { |
---|
| 11780 | + err = security_locked_down(LOCKDOWN_PERF); |
---|
10615 | 11781 | if (err) |
---|
10616 | 11782 | return err; |
---|
10617 | 11783 | } |
---|
.. | .. |
---|
10657 | 11823 | goto err_task; |
---|
10658 | 11824 | } |
---|
10659 | 11825 | |
---|
10660 | | - if (task) { |
---|
10661 | | - err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); |
---|
10662 | | - if (err) |
---|
10663 | | - goto err_task; |
---|
10664 | | - |
---|
10665 | | - /* |
---|
10666 | | - * Reuse ptrace permission checks for now. |
---|
10667 | | - * |
---|
10668 | | - * We must hold cred_guard_mutex across this and any potential |
---|
10669 | | - * perf_install_in_context() call for this new event to |
---|
10670 | | - * serialize against exec() altering our credentials (and the |
---|
10671 | | - * perf_event_exit_task() that could imply). |
---|
10672 | | - */ |
---|
10673 | | - err = -EACCES; |
---|
10674 | | - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) |
---|
10675 | | - goto err_cred; |
---|
10676 | | - } |
---|
10677 | | - |
---|
10678 | 11826 | if (flags & PERF_FLAG_PID_CGROUP) |
---|
10679 | 11827 | cgroup_fd = pid; |
---|
10680 | 11828 | |
---|
.. | .. |
---|
10682 | 11830 | NULL, NULL, cgroup_fd); |
---|
10683 | 11831 | if (IS_ERR(event)) { |
---|
10684 | 11832 | err = PTR_ERR(event); |
---|
10685 | | - goto err_cred; |
---|
| 11833 | + goto err_task; |
---|
10686 | 11834 | } |
---|
10687 | 11835 | |
---|
10688 | 11836 | if (is_sampling_event(event)) { |
---|
.. | .. |
---|
10776 | 11924 | * Do not allow to attach to a group in a different task |
---|
10777 | 11925 | * or CPU context. If we're moving SW events, we'll fix |
---|
10778 | 11926 | * this up later, so allow that. |
---|
| 11927 | + * |
---|
| 11928 | + * Racy, not holding group_leader->ctx->mutex, see comment with |
---|
| 11929 | + * perf_event_ctx_lock(). |
---|
10779 | 11930 | */ |
---|
10780 | 11931 | if (!move_group && group_leader->ctx != ctx) |
---|
10781 | 11932 | goto err_context; |
---|
.. | .. |
---|
10799 | 11950 | err = PTR_ERR(event_file); |
---|
10800 | 11951 | event_file = NULL; |
---|
10801 | 11952 | goto err_context; |
---|
| 11953 | + } |
---|
| 11954 | + |
---|
| 11955 | + if (task) { |
---|
| 11956 | + err = down_read_interruptible(&task->signal->exec_update_lock); |
---|
| 11957 | + if (err) |
---|
| 11958 | + goto err_file; |
---|
| 11959 | + |
---|
| 11960 | + /* |
---|
| 11961 | + * Preserve ptrace permission check for backwards compatibility. |
---|
| 11962 | + * |
---|
| 11963 | + * We must hold exec_update_lock across this and any potential |
---|
| 11964 | + * perf_install_in_context() call for this new event to |
---|
| 11965 | + * serialize against exec() altering our credentials (and the |
---|
| 11966 | + * perf_event_exit_task() that could imply). |
---|
| 11967 | + */ |
---|
| 11968 | + err = -EACCES; |
---|
| 11969 | + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) |
---|
| 11970 | + goto err_cred; |
---|
10802 | 11971 | } |
---|
10803 | 11972 | |
---|
10804 | 11973 | if (move_group) { |
---|
.. | .. |
---|
10825 | 11994 | } else { |
---|
10826 | 11995 | perf_event_ctx_unlock(group_leader, gctx); |
---|
10827 | 11996 | move_group = 0; |
---|
| 11997 | + goto not_move_group; |
---|
10828 | 11998 | } |
---|
10829 | 11999 | } |
---|
10830 | 12000 | |
---|
.. | .. |
---|
10841 | 12011 | } |
---|
10842 | 12012 | } else { |
---|
10843 | 12013 | mutex_lock(&ctx->mutex); |
---|
| 12014 | + |
---|
| 12015 | + /* |
---|
| 12016 | + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, |
---|
| 12017 | + * see the group_leader && !move_group test earlier. |
---|
| 12018 | + */ |
---|
| 12019 | + if (group_leader && group_leader->ctx != ctx) { |
---|
| 12020 | + err = -EINVAL; |
---|
| 12021 | + goto err_locked; |
---|
| 12022 | + } |
---|
10844 | 12023 | } |
---|
| 12024 | +not_move_group: |
---|
10845 | 12025 | |
---|
10846 | 12026 | if (ctx->task == TASK_TOMBSTONE) { |
---|
10847 | 12027 | err = -ESRCH; |
---|
.. | .. |
---|
10869 | 12049 | } |
---|
10870 | 12050 | } |
---|
10871 | 12051 | |
---|
| 12052 | + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { |
---|
| 12053 | + err = -EINVAL; |
---|
| 12054 | + goto err_locked; |
---|
| 12055 | + } |
---|
10872 | 12056 | |
---|
10873 | 12057 | /* |
---|
10874 | 12058 | * Must be under the same ctx::mutex as perf_install_in_context(), |
---|
.. | .. |
---|
10950 | 12134 | mutex_unlock(&ctx->mutex); |
---|
10951 | 12135 | |
---|
10952 | 12136 | if (task) { |
---|
10953 | | - mutex_unlock(&task->signal->cred_guard_mutex); |
---|
| 12137 | + up_read(&task->signal->exec_update_lock); |
---|
10954 | 12138 | put_task_struct(task); |
---|
10955 | 12139 | } |
---|
10956 | 12140 | |
---|
.. | .. |
---|
10972 | 12156 | if (move_group) |
---|
10973 | 12157 | perf_event_ctx_unlock(group_leader, gctx); |
---|
10974 | 12158 | mutex_unlock(&ctx->mutex); |
---|
10975 | | -/* err_file: */ |
---|
| 12159 | +err_cred: |
---|
| 12160 | + if (task) |
---|
| 12161 | + up_read(&task->signal->exec_update_lock); |
---|
| 12162 | +err_file: |
---|
10976 | 12163 | fput(event_file); |
---|
10977 | 12164 | err_context: |
---|
10978 | 12165 | perf_unpin_context(ctx); |
---|
.. | .. |
---|
10984 | 12171 | */ |
---|
10985 | 12172 | if (!event_file) |
---|
10986 | 12173 | free_event(event); |
---|
10987 | | -err_cred: |
---|
10988 | | - if (task) |
---|
10989 | | - mutex_unlock(&task->signal->cred_guard_mutex); |
---|
10990 | 12174 | err_task: |
---|
10991 | 12175 | if (task) |
---|
10992 | 12176 | put_task_struct(task); |
---|
.. | .. |
---|
11015 | 12199 | int err; |
---|
11016 | 12200 | |
---|
11017 | 12201 | /* |
---|
11018 | | - * Get the target context (task or percpu): |
---|
| 12202 | + * Grouping is not supported for kernel events, neither is 'AUX', |
---|
| 12203 | + * make sure the caller's intentions are adjusted. |
---|
11019 | 12204 | */ |
---|
| 12205 | + if (attr->aux_output) |
---|
| 12206 | + return ERR_PTR(-EINVAL); |
---|
11020 | 12207 | |
---|
11021 | 12208 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, |
---|
11022 | 12209 | overflow_handler, context, -1); |
---|
.. | .. |
---|
11028 | 12215 | /* Mark owner so we could distinguish it from user events. */ |
---|
11029 | 12216 | event->owner = TASK_TOMBSTONE; |
---|
11030 | 12217 | |
---|
| 12218 | + /* |
---|
| 12219 | + * Get the target context (task or percpu): |
---|
| 12220 | + */ |
---|
11031 | 12221 | ctx = find_get_context(event->pmu, task, event); |
---|
11032 | 12222 | if (IS_ERR(ctx)) { |
---|
11033 | 12223 | err = PTR_ERR(ctx); |
---|
.. | .. |
---|
11285 | 12475 | /* |
---|
11286 | 12476 | * When a child task exits, feed back event values to parent events. |
---|
11287 | 12477 | * |
---|
11288 | | - * Can be called with cred_guard_mutex held when called from |
---|
11289 | | - * install_exec_creds(). |
---|
| 12478 | + * Can be called with exec_update_lock held when called from |
---|
| 12479 | + * setup_new_exec(). |
---|
11290 | 12480 | */ |
---|
11291 | 12481 | void perf_event_exit_task(struct task_struct *child) |
---|
11292 | 12482 | { |
---|
.. | .. |
---|
11390 | 12580 | * |
---|
11391 | 12581 | * Wait for all events to drop their context reference. |
---|
11392 | 12582 | */ |
---|
11393 | | - wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1); |
---|
| 12583 | + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); |
---|
11394 | 12584 | put_ctx(ctx); /* must be last */ |
---|
11395 | 12585 | } |
---|
11396 | 12586 | } |
---|
.. | .. |
---|
11405 | 12595 | |
---|
11406 | 12596 | struct file *perf_event_get(unsigned int fd) |
---|
11407 | 12597 | { |
---|
11408 | | - struct file *file; |
---|
11409 | | - |
---|
11410 | | - file = fget_raw(fd); |
---|
| 12598 | + struct file *file = fget(fd); |
---|
11411 | 12599 | if (!file) |
---|
11412 | 12600 | return ERR_PTR(-EBADF); |
---|
11413 | 12601 | |
---|
.. | .. |
---|
11477 | 12665 | !child_ctx->task_ctx_data) { |
---|
11478 | 12666 | struct pmu *pmu = child_event->pmu; |
---|
11479 | 12667 | |
---|
11480 | | - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, |
---|
11481 | | - GFP_KERNEL); |
---|
| 12668 | + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); |
---|
11482 | 12669 | if (!child_ctx->task_ctx_data) { |
---|
11483 | 12670 | free_event(child_event); |
---|
11484 | 12671 | return ERR_PTR(-ENOMEM); |
---|
.. | .. |
---|
11583 | 12770 | child, leader, child_ctx); |
---|
11584 | 12771 | if (IS_ERR(child_ctr)) |
---|
11585 | 12772 | return PTR_ERR(child_ctr); |
---|
| 12773 | + |
---|
| 12774 | + if (sub->aux_event == parent_event && child_ctr && |
---|
| 12775 | + !perf_get_aux_event(child_ctr, leader)) |
---|
| 12776 | + return -EINVAL; |
---|
11586 | 12777 | } |
---|
11587 | 12778 | return 0; |
---|
11588 | 12779 | } |
---|
.. | .. |
---|
11778 | 12969 | } |
---|
11779 | 12970 | } |
---|
11780 | 12971 | |
---|
11781 | | -void perf_swevent_init_cpu(unsigned int cpu) |
---|
| 12972 | +static void perf_swevent_init_cpu(unsigned int cpu) |
---|
11782 | 12973 | { |
---|
11783 | 12974 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
---|
11784 | 12975 | |
---|
.. | .. |
---|
11975 | 13166 | kfree(jc); |
---|
11976 | 13167 | } |
---|
11977 | 13168 | |
---|
| 13169 | +static int perf_cgroup_css_online(struct cgroup_subsys_state *css) |
---|
| 13170 | +{ |
---|
| 13171 | + perf_event_cgroup(css->cgroup); |
---|
| 13172 | + return 0; |
---|
| 13173 | +} |
---|
| 13174 | + |
---|
11978 | 13175 | static int __perf_cgroup_move(void *info) |
---|
11979 | 13176 | { |
---|
11980 | 13177 | struct task_struct *task = info; |
---|
.. | .. |
---|
11996 | 13193 | struct cgroup_subsys perf_event_cgrp_subsys = { |
---|
11997 | 13194 | .css_alloc = perf_cgroup_css_alloc, |
---|
11998 | 13195 | .css_free = perf_cgroup_css_free, |
---|
| 13196 | + .css_online = perf_cgroup_css_online, |
---|
11999 | 13197 | .attach = perf_cgroup_attach, |
---|
12000 | 13198 | /* |
---|
12001 | 13199 | * Implicitly enable on dfl hierarchy so that perf events can |
---|