From 9999e48639b3cecb08ffb37358bcba3b48161b29 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 10 May 2024 08:50:17 +0000
Subject: [PATCH] add ax88772_rst
---
kernel/kernel/events/core.c | 2080 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 1639 insertions(+), 441 deletions(-)
diff --git a/kernel/kernel/events/core.c b/kernel/kernel/events/core.c
index a2899b6..80ab33a 100644
--- a/kernel/kernel/events/core.c
+++ b/kernel/kernel/events/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events core code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -29,6 +28,7 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
@@ -50,6 +50,7 @@
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
+#include <linux/min_heap.h>
#include "internal.h"
@@ -265,7 +266,7 @@
if (!event->parent) {
/*
* If this is a !child event, we must hold ctx::mutex to
- * stabilize the the event->ctx relation. See
+ * stabilize the event->ctx relation. See
* perf_event_ctx_lock().
*/
lockdep_assert_held(&ctx->mutex);
@@ -391,6 +392,10 @@
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -403,13 +408,8 @@
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
- * 3 - disallow all unpriv perf event use
*/
-#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
-int sysctl_perf_event_paranoid __read_mostly = 3;
-#else
int sysctl_perf_event_paranoid __read_mostly = 2;
-#endif
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -444,8 +444,7 @@
static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -469,8 +468,7 @@
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -761,7 +759,7 @@
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@@ -901,6 +899,47 @@
rcu_read_unlock();
}
+static int perf_cgroup_ensure_storage(struct perf_event *event,
+ struct cgroup_subsys_state *css)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event **storage;
+ int cpu, heap_size, ret = 0;
+
+ /*
+ * Allow storage to have sufficient space for an iterator for each
+ * possibly nested cgroup plus an iterator for events with no cgroup.
+ */
+ for (heap_size = 1; css; css = css->parent)
+ heap_size++;
+
+ for_each_possible_cpu(cpu) {
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ if (heap_size <= cpuctx->heap_size)
+ continue;
+
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!storage) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
+ if (cpuctx->heap_size < heap_size) {
+ swap(cpuctx->heap, storage);
+ if (storage == cpuctx->heap_default)
+ storage = NULL;
+ cpuctx->heap_size = heap_size;
+ }
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
+
+ kfree(storage);
+ }
+
+ return ret;
+}
+
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -919,6 +958,10 @@
ret = PTR_ERR(css);
goto out;
}
+
+ ret = perf_cgroup_ensure_storage(event, css);
+ if (ret)
+ goto out;
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -945,25 +988,19 @@
event->shadow_ctx_time = now - t->timestamp;
}
-/*
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
- * cleared when last cgroup event is removed.
- */
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
- struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
/*
* Because cgroup events are always per-cpu events,
- * this will always be called from the right CPU.
+ * @ctx == &cpuctx->ctx.
*/
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
/*
* Since setting cpuctx->cgrp is conditional on the current @cgrp
@@ -971,27 +1008,41 @@
* because if the first would mismatch, the second would not try again
* and we would leave cpuctx->cgrp unset.
*/
- if (add && !cpuctx->cgrp) {
+ if (ctx->is_active && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
}
- if (add && ctx->nr_cgroups++)
- return;
- else if (!add && --ctx->nr_cgroups)
+ if (ctx->nr_cgroups++)
return;
- /* no cgroup running */
- if (!add)
+ list_add(&cpuctx->cgrp_cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+}
+
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ /*
+ * Because cgroup events are always per-cpu events,
+ * @ctx == &cpuctx->ctx.
+ */
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (--ctx->nr_cgroups)
+ return;
+
+ if (ctx->is_active && cpuctx->cgrp)
cpuctx->cgrp = NULL;
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
- if (add)
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
- else
- list_del(cpuctx_entry);
+ list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1041,7 +1092,7 @@
{
}
-void
+static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
@@ -1057,11 +1108,14 @@
}
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+}
#endif
/*
@@ -1113,7 +1167,7 @@
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpuctx->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
@@ -1131,11 +1185,16 @@
if (!cpuctx->hrtimer_active) {
cpuctx->hrtimer_active = 1;
hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
return 0;
+}
+
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
}
void perf_pmu_disable(struct pmu *pmu)
@@ -1182,7 +1241,21 @@
static void get_ctx(struct perf_event_context *ctx)
{
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+ refcount_inc(&ctx->refcount);
+}
+
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+ if (pmu->task_ctx_cache)
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
+
+ return NULL;
+}
+
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+{
+ if (pmu->task_ctx_cache && task_ctx_data)
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}
static void free_ctx(struct rcu_head *head)
@@ -1190,13 +1263,13 @@
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- kfree(ctx->task_ctx_data);
+ free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
static void put_ctx(struct perf_event_context *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task && ctx->task != TASK_TOMBSTONE)
@@ -1232,7 +1305,7 @@
* life-time rules separate them. That is an exiting task cannot fork, and a
* spawning task cannot (yet) exit.
*
- * But remember that that these are parent<->child context relations, and
+ * But remember that these are parent<->child context relations, and
* migration does not affect children, therefore these two orderings should not
* interact.
*
@@ -1258,13 +1331,13 @@
* function.
*
* Lock order:
- * cred_guard_mutex
+ * exec_update_lock
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
* perf_event::mmap_mutex
- * mmap_sem
+ * mmap_lock
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -1279,7 +1352,7 @@
again:
rcu_read_lock();
ctx = READ_ONCE(event->ctx);
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -1371,7 +1444,7 @@
/*
* Get the perf_event_context for a task and lock it.
*
- * This has to cope with with the fact that until it is locked,
+ * This has to cope with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
@@ -1412,7 +1485,7 @@
}
if (ctx->task == TASK_TOMBSTONE ||
- !atomic_inc_not_zero(&ctx->refcount)) {
+ !refcount_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
} else {
@@ -1540,6 +1613,30 @@
if (left->cpu > right->cpu)
return false;
+#ifdef CONFIG_CGROUP_PERF
+ if (left->cgrp != right->cgrp) {
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
+ /*
+ * Left has no cgroup but right does, no cgroups come
+ * first.
+ */
+ return true;
+ }
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
+ /*
+ * Right has no cgroup but left does, no cgroups come
+ * first.
+ */
+ return false;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
+ return true;
+
+ return false;
+ }
+#endif
+
if (left->group_index < right->group_index)
return true;
if (left->group_index > right->group_index)
@@ -1619,25 +1716,48 @@
}
/*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
*/
static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+ struct cgroup *cgrp)
{
struct perf_event *node_event = NULL, *match = NULL;
struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+ u64 node_cgrp_id, cgrp_id = 0;
+
+ if (cgrp)
+ cgrp_id = cgrp->kn->id;
+#endif
while (node) {
node_event = container_of(node, struct perf_event, group_node);
if (cpu < node_event->cpu) {
node = node->rb_left;
- } else if (cpu > node_event->cpu) {
- node = node->rb_right;
- } else {
- match = node_event;
- node = node->rb_left;
+ continue;
}
+ if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ continue;
+ }
+#ifdef CONFIG_CGROUP_PERF
+ node_cgrp_id = 0;
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+
+ if (cgrp_id < node_cgrp_id) {
+ node = node->rb_left;
+ continue;
+ }
+ if (cgrp_id > node_cgrp_id) {
+ node = node->rb_right;
+ continue;
+ }
+#endif
+ match = node_event;
+ node = node->rb_left;
}
return match;
@@ -1650,12 +1770,26 @@
perf_event_groups_next(struct perf_event *event)
{
struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+ u64 curr_cgrp_id = 0;
+ u64 next_cgrp_id = 0;
+#endif
next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next && next->cpu == event->cpu)
- return next;
+ if (next == NULL || next->cpu != event->cpu)
+ return NULL;
- return NULL;
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp && event->cgrp->css.cgroup)
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+
+ if (next->cgrp && next->cgrp->css.cgroup)
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+
+ if (curr_cgrp_id != next_cgrp_id)
+ return NULL;
+#endif
+ return next;
}
/*
@@ -1691,12 +1825,13 @@
add_event_to_groups(event, ctx);
}
- list_update_cgroup_event(event, ctx, true);
-
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_enable(event, ctx);
ctx->generation++;
}
@@ -1762,6 +1897,9 @@
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);
+
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ size += sizeof(data->cgroup);
event->header_size = size;
}
@@ -1873,8 +2011,6 @@
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- list_update_cgroup_event(event, ctx, false);
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1891,14 +2027,136 @@
* of error state is by explicit re-enabling
* of the event
*/
- if (event->state > PERF_EVENT_STATE_OFF)
+ if (event->state > PERF_EVENT_STATE_OFF) {
+ perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ }
ctx->generation++;
}
+static int
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+{
+ if (!has_aux(aux_event))
+ return 0;
+
+ if (!event->pmu->aux_output_match)
+ return 0;
+
+ return event->pmu->aux_output_match(aux_event);
+}
+
+static void put_event(struct perf_event *event);
+static void event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx);
+
+static void perf_put_aux_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *iter;
+
+ /*
+ * If event uses aux_event tear down the link
+ */
+ if (event->aux_event) {
+ iter = event->aux_event;
+ event->aux_event = NULL;
+ put_event(iter);
+ return;
+ }
+
+ /*
+ * If the event is an aux_event, tear down all links to
+ * it from other events.
+ */
+ for_each_sibling_event(iter, event->group_leader) {
+ if (iter->aux_event != event)
+ continue;
+
+ iter->aux_event = NULL;
+ put_event(event);
+
+ /*
+ * If it's ACTIVE, schedule it out and put it into ERROR
+ * state so that we don't try to schedule it again. Note
+ * that perf_event_enable() will clear the ERROR status.
+ */
+ event_sched_out(iter, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
+}
+
+static bool perf_need_aux_event(struct perf_event *event)
+{
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
+static int perf_get_aux_event(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ /*
+ * Our group leader must be an aux event if we want to be
+ * an aux_output. This way, the aux event will precede its
+ * aux_output events in the group, and therefore will always
+ * schedule first.
+ */
+ if (!group_leader)
+ return 0;
+
+ /*
+ * aux_output and aux_sample_size are mutually exclusive.
+ */
+ if (event->attr.aux_output && event->attr.aux_sample_size)
+ return 0;
+
+ if (event->attr.aux_output &&
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
+ return 0;
+
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ return 0;
+
+ /*
+ * Link aux_outputs to their aux event; this is undone in
+ * perf_group_detach() by perf_put_aux_event(). When the
+ * group is torn down, the aux_output events lose their
+ * link to the aux_event and can't schedule any more.
+ */
+ event->aux_event = group_leader;
+
+ return 1;
+}
+
+static inline struct list_head *get_event_list(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+}
+
+/*
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
+ * cannot exist on their own, schedule them out and move them into the ERROR
+ * state. Also see _perf_event_enable(), it will not be able to recover
+ * this ERROR state.
+ */
+static inline void perf_remove_sibling_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ event_sched_out(event, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+}
+
static void perf_group_detach(struct perf_event *event)
{
+ struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
struct perf_event_context *ctx = event->ctx;
@@ -1912,10 +2170,12 @@
event->attach_state &= ~PERF_ATTACH_GROUP;
+ perf_put_aux_event(event);
+
/*
* If this is a sibling, remove it from its group.
*/
- if (event->group_leader != event) {
+ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
goto out;
@@ -1928,6 +2188,9 @@
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+ perf_remove_sibling_event(sibling);
+
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -1937,22 +2200,18 @@
if (!RB_EMPTY_NODE(&event->group_node)) {
add_event_to_groups(sibling, event->ctx);
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
- struct list_head *list = sibling->attr.pinned ?
- &ctx->pinned_active : &ctx->flexible_active;
-
- list_add_tail(&sibling->active_list, list);
- }
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
}
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
- perf_event__header_size(event->group_leader);
-
- for_each_sibling_event(tmp, event->group_leader)
+ for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
+
+ perf_event__header_size(leader);
}
static bool is_orphaned_event(struct perf_event *event)
@@ -2021,6 +2280,7 @@
if (READ_ONCE(event->pending_disable) >= 0) {
WRITE_ONCE(event->pending_disable, -1);
+ perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2058,9 +2318,6 @@
event_sched_out(event, cpuctx, ctx);
perf_pmu_enable(ctx->pmu);
-
- if (group_event->attr.exclusive)
- cpuctx->exclusive = 0;
}
#define DETACH_GROUP 0x01UL
@@ -2091,6 +2348,7 @@
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
+ ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2157,6 +2415,7 @@
event_sched_out(event, cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ perf_cgroup_event_disable(event, ctx);
}
/*
@@ -2164,7 +2423,7 @@
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
- * remains valid. This condition is satisifed when called through
+ * remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
@@ -2238,7 +2497,7 @@
* But this is a bit hairy.
*
* So instead, we have an explicit cgroup call to remain
- * within the time time source all along. We believe it
+ * within the time source all along. We believe it
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
@@ -2258,6 +2517,8 @@
struct perf_event_context *ctx)
{
int ret = 0;
+
+ WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2325,11 +2586,8 @@
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx)) {
- pmu->cancel_txn(pmu);
- perf_mux_hrtimer_restart(cpuctx);
- return -EAGAIN;
- }
+ if (event_sched_in(group_event, cpuctx, ctx))
+ goto error;
/*
* Schedule in siblings as one group (if any):
@@ -2358,10 +2616,8 @@
}
event_sched_out(group_event, cpuctx, ctx);
+error:
pmu->cancel_txn(pmu);
-
- perf_mux_hrtimer_restart(cpuctx);
-
return -EAGAIN;
}
@@ -2387,7 +2643,7 @@
* If this group is exclusive and there are already
* events on the CPU, it can't go on.
*/
- if (event->attr.exclusive && cpuctx->active_oncpu)
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
return 0;
/*
* Otherwise, try to add it if all previous groups were able
@@ -2488,6 +2744,16 @@
perf_pmu_enable(cpuctx->ctx.pmu);
}
+void perf_pmu_resched(struct pmu *pmu)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
/*
* Cross CPU call to install and enable a performance event
*
@@ -2528,7 +2794,7 @@
}
#ifdef CONFIG_CGROUP_PERF
- if (is_cgroup_event(event)) {
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2580,6 +2846,25 @@
* will be 'complete'. See perf_iterate_sb_cpu().
*/
smp_store_release(&event->ctx, ctx);
+
+ /*
+ * perf_event_attr::disabled events will not run and can be initialized
+ * without IPI. Except when this is the first event for the context, in
+ * that case we need the magic of the IPI to set ctx->is_active.
+ *
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
+ * event will issue the IPI and reprogram the hardware.
+ */
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
if (!task) {
cpu_function_call(cpu, __perf_install_in_context, event);
@@ -2669,6 +2954,7 @@
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
@@ -2710,6 +2996,7 @@
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state < PERF_EVENT_STATE_ERROR) {
+out:
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2721,8 +3008,16 @@
* has gone back into error state, as distinct from the task having
* been scheduled away before the cross-call arrived.
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state == PERF_EVENT_STATE_ERROR) {
+ /*
+ * Detached SIBLING events cannot leave ERROR state.
+ */
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
+ event->group_leader == event)
+ goto out;
+
event->state = PERF_EVENT_STATE_OFF;
+ }
raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_enable, NULL);
@@ -2826,7 +3121,7 @@
* pre-existing mappings, called once when new filters arrive via SET_FILTER
* ioctl;
* (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
* for reading;
* (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
* of exec.
@@ -2966,6 +3261,13 @@
if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
group_sched_out(event, cpuctx, ctx);
+
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, it will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ ctx->rotate_necessary = 0;
}
perf_pmu_enable(ctx->pmu);
}
@@ -3080,10 +3382,12 @@
struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
+ struct pmu *pmu;
if (likely(!ctx))
return;
+ pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
if (!cpuctx->task_ctx)
return;
@@ -3113,10 +3417,27 @@
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_pmu_disable(pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(ctx, false);
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (pmu->swap_task_ctx)
+ pmu->swap_task_ctx(ctx, next_ctx);
+ else
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
+ perf_pmu_enable(pmu);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3140,7 +3461,13 @@
if (do_switch) {
raw_spin_lock(&ctx->lock);
+ perf_pmu_disable(pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+
+ perf_pmu_enable(pmu);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3176,29 +3503,39 @@
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+{
+ struct pmu *pmu;
+
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ return;
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
if (prev == next)
return;
list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
-
- if (WARN_ON_ONCE(!pmu->sched_task))
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
continue;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(pmu);
-
- pmu->sched_task(cpuctx->task_ctx, sched_in);
-
- perf_pmu_enable(pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ __perf_pmu_sched_task(cpuctx, sched_in);
}
}
@@ -3251,83 +3588,149 @@
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
- int (*func)(struct perf_event *, void *), void *data)
+static bool perf_less_group_idx(const void *l, const void *r)
{
- struct perf_event **evt, *evt1, *evt2;
+ const struct perf_event *le = *(const struct perf_event **)l;
+ const struct perf_event *re = *(const struct perf_event **)r;
+
+ return le->group_index < re->group_index;
+}
+
+static void swap_ptr(void *l, void *r)
+{
+ void **lp = l, **rp = r;
+
+ swap(*lp, *rp);
+}
+
+static const struct min_heap_callbacks perf_min_heap = {
+ .elem_size = sizeof(struct perf_event *),
+ .less = perf_less_group_idx,
+ .swp = swap_ptr,
+};
+
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
+{
+ struct perf_event **itrs = heap->data;
+
+ if (event) {
+ itrs[heap->nr] = event;
+ heap->nr++;
+ }
+}
+
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+ struct perf_event_groups *groups, int cpu,
+ int (*func)(struct perf_event *, void *),
+ void *data)
+{
+#ifdef CONFIG_CGROUP_PERF
+ struct cgroup_subsys_state *css = NULL;
+#endif
+ /* Space for per CPU and/or any CPU event iterators. */
+ struct perf_event *itrs[2];
+ struct min_heap event_heap;
+ struct perf_event **evt;
int ret;
- evt1 = perf_event_groups_first(groups, -1);
- evt2 = perf_event_groups_first(groups, cpu);
+ if (cpuctx) {
+ event_heap = (struct min_heap){
+ .data = cpuctx->heap,
+ .nr = 0,
+ .size = cpuctx->heap_size,
+ };
- while (evt1 || evt2) {
- if (evt1 && evt2) {
- if (evt1->group_index < evt2->group_index)
- evt = &evt1;
- else
- evt = &evt2;
- } else if (evt1) {
- evt = &evt1;
- } else {
- evt = &evt2;
- }
+ lockdep_assert_held(&cpuctx->ctx.lock);
+#ifdef CONFIG_CGROUP_PERF
+ if (cpuctx->cgrp)
+ css = &cpuctx->cgrp->css;
+#endif
+ } else {
+ event_heap = (struct min_heap){
+ .data = itrs,
+ .nr = 0,
+ .size = ARRAY_SIZE(itrs),
+ };
+ /* Events not within a CPU context may be on any CPU. */
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ }
+ evt = event_heap.data;
+
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+
+#ifdef CONFIG_CGROUP_PERF
+ for (; css; css = css->parent)
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+#endif
+
+ min_heapify_all(&event_heap, &perf_min_heap);
+
+ while (event_heap.nr) {
ret = func(*evt, data);
if (ret)
return ret;
*evt = perf_event_groups_next(*evt);
- }
-
- return 0;
-}
-
-struct sched_in_data {
- struct perf_event_context *ctx;
- struct perf_cpu_context *cpuctx;
- int can_add_hw;
-};
-
-static int pinned_sched_in(struct perf_event *event, void *data)
-{
- struct sched_in_data *sid = data;
-
- if (event->state <= PERF_EVENT_STATE_OFF)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
- }
-
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-
- return 0;
-}
-
-static int flexible_sched_in(struct perf_event *event, void *data)
-{
- struct sched_in_data *sid = data;
-
- if (event->state <= PERF_EVENT_STATE_OFF)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+ if (*evt)
+ min_heapify(&event_heap, 0, &perf_min_heap);
else
- sid->can_add_hw = 0;
+ min_heap_pop(&event_heap, &perf_min_heap);
+ }
+
+ return 0;
+}
+
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
+static int merge_sched_in(struct perf_event *event, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ int *can_add_hw = data;
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ if (!event_filter_match(event))
+ return 0;
+
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+ if (!group_sched_in(event, cpuctx, ctx))
+ list_add_tail(&event->active_list, get_event_list(event));
+ }
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
+ if (event->attr.pinned) {
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ } else {
+ ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpuctx);
+ group_update_userpage(event);
+ }
}
return 0;
@@ -3337,30 +3740,28 @@
ctx_pinned_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct sched_in_data sid = {
- .ctx = ctx,
- .cpuctx = cpuctx,
- .can_add_hw = 1,
- };
+ int can_add_hw = 1;
- visit_groups_merge(&ctx->pinned_groups,
+ if (ctx != &cpuctx->ctx)
+ cpuctx = NULL;
+
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
smp_processor_id(),
- pinned_sched_in, &sid);
+ merge_sched_in, &can_add_hw);
}
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct sched_in_data sid = {
- .ctx = ctx,
- .cpuctx = cpuctx,
- .can_add_hw = 1,
- };
+ int can_add_hw = 1;
- visit_groups_merge(&ctx->flexible_groups,
+ if (ctx != &cpuctx->ctx)
+ cpuctx = NULL;
+
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
smp_processor_id(),
- flexible_sched_in, &sid);
+ merge_sched_in, &can_add_hw);
}
static void
@@ -3419,10 +3820,14 @@
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
+ struct pmu *pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
- if (cpuctx->task_ctx == ctx)
+ if (cpuctx->task_ctx == ctx) {
+ if (cpuctx->sched_cb_usage)
+ __perf_pmu_sched_task(cpuctx, true);
return;
+ }
perf_ctx_lock(cpuctx, ctx);
/*
@@ -3432,7 +3837,7 @@
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(pmu);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3444,7 +3849,11 @@
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
- perf_pmu_enable(ctx->pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(cpuctx->task_ctx, true);
+
+ perf_pmu_enable(pmu);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -3685,34 +4094,45 @@
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event) {
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ }
+
+ /*
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
+ * finds there are unschedulable events, it will set it again.
+ */
+ ctx->rotate_necessary = 0;
+
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event *cpu_event = NULL, *task_event = NULL;
- bool cpu_rotate = false, task_rotate = false;
- struct perf_event_context *ctx = NULL;
+ struct perf_event_context *task_ctx = NULL;
+ int cpu_rotate, task_rotate;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- if (cpuctx->ctx.nr_events) {
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- cpu_rotate = true;
- }
-
- ctx = cpuctx->task_ctx;
- if (ctx && ctx->nr_events) {
- if (ctx->nr_events != ctx->nr_active)
- task_rotate = true;
- }
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
+ task_ctx = cpuctx->task_ctx;
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
@@ -3721,25 +4141,25 @@
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_event_to_rotate(task_ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (ctx && cpu_event))
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_ctx && cpu_event))
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
if (cpu_event)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (task_event)
- rotate_ctx(ctx, task_event);
+ rotate_ctx(task_ctx, task_event);
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3983,6 +4403,7 @@
return ret;
}
+EXPORT_SYMBOL_GPL(perf_event_read_local);
static int perf_event_read(struct perf_event *event, bool group)
{
@@ -4074,7 +4495,7 @@
INIT_LIST_HEAD(&ctx->event_list);
INIT_LIST_HEAD(&ctx->pinned_active);
INIT_LIST_HEAD(&ctx->flexible_active);
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
}
static struct perf_event_context *
@@ -4087,10 +4508,8 @@
return NULL;
__perf_event_init_context(ctx);
- if (task) {
- ctx->task = task;
- get_task_struct(task);
- }
+ if (task)
+ ctx->task = get_task_struct(task);
ctx->pmu = pmu;
return ctx;
@@ -4152,7 +4571,7 @@
goto errout;
if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+ task_ctx_data = alloc_task_ctx_data(pmu);
if (!task_ctx_data) {
err = -ENOMEM;
goto errout;
@@ -4210,11 +4629,11 @@
}
}
- kfree(task_ctx_data);
+ free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- kfree(task_ctx_data);
+ free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
@@ -4233,7 +4652,7 @@
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
+ struct perf_buffer *rb);
static void detach_sb_event(struct perf_event *event)
{
@@ -4256,8 +4675,9 @@
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
- attr->task ||
- attr->context_switch)
+ attr->task || attr->ksymbol ||
+ attr->context_switch || attr->text_poke ||
+ attr->bpf_event)
return true;
return false;
}
@@ -4306,7 +4726,7 @@
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
@@ -4314,6 +4734,8 @@
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
atomic_dec(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_dec(&nr_cgroup_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -4326,6 +4748,12 @@
dec = true;
if (has_branch_stack(event))
dec = true;
+ if (event->attr.ksymbol)
+ atomic_dec(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_dec(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_dec(&nr_text_poke_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -4909,7 +5337,7 @@
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
poll_wait(file, &event->waitq, wait);
@@ -4935,6 +5363,24 @@
local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
+
+/* Assume it's not an event with inherit set. */
+u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(event->attr.inherit);
+ _perf_event_disable(event);
+ count = local64_read(&event->count);
+ if (reset)
+ local64_set(&event->count, 0);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(perf_event_pause);
/*
* Holding the top-level event's child_mutex means that any
@@ -5013,15 +5459,10 @@
return event->pmu->check_period(event, value);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
+static int _perf_event_period(struct perf_event *event, u64 value)
{
- u64 value;
-
if (!is_sampling_event(event))
return -EINVAL;
-
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
if (!value)
return -EINVAL;
@@ -5039,6 +5480,19 @@
return 0;
}
+
+int perf_event_period(struct perf_event *event, u64 value)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_period(event, value);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(perf_event_period);
static const struct file_operations perf_fops;
@@ -5083,8 +5537,14 @@
return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
- return perf_event_period(event, (u64 __user *)arg);
+ {
+ u64 value;
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
+ return -EFAULT;
+
+ return _perf_event_period(event, value);
+ }
case PERF_EVENT_IOC_ID:
{
u64 id = primary_event_id(event);
@@ -5119,7 +5579,7 @@
return perf_event_set_bpf_prog(event, arg);
case PERF_EVENT_IOC_PAUSE_OUTPUT: {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5255,7 +5715,7 @@
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5287,7 +5747,7 @@
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
@@ -5338,7 +5798,7 @@
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
struct perf_event *event = vmf->vma->vm_file->private_data;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
vm_fault_t ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -5371,10 +5831,12 @@
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb)
+ struct perf_buffer *rb)
{
- struct ring_buffer *old_rb = NULL;
+ struct perf_buffer *old_rb = NULL;
unsigned long flags;
+
+ WARN_ON_ONCE(event->parent);
if (event->rb) {
/*
@@ -5431,7 +5893,10 @@
static void ring_buffer_wakeup(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
+
+ if (event->parent)
+ event = event->parent;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -5442,14 +5907,17 @@
rcu_read_unlock();
}
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
+
+ if (event->parent)
+ event = event->parent;
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
- if (!atomic_inc_not_zero(&rb->refcount))
+ if (!refcount_inc_not_zero(&rb->refcount))
rb = NULL;
}
rcu_read_unlock();
@@ -5457,9 +5925,9 @@
return rb;
}
-void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct perf_buffer *rb)
{
- if (!atomic_dec_and_test(&rb->refcount))
+ if (!refcount_dec_and_test(&rb->refcount))
return;
WARN_ON_ONCE(!list_empty(&rb->event_list));
@@ -5494,7 +5962,7 @@
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = ring_buffer_get(event);
+ struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -5519,12 +5987,12 @@
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
rb_free_aux(rb);
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
mutex_unlock(&event->mmap_mutex);
}
@@ -5593,8 +6061,9 @@
* undo the VM accounting.
*/
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= mmap_locked;
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+ &mmap_user->locked_vm);
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
out_put:
@@ -5603,7 +6072,7 @@
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
- .close = perf_mmap_close, /* non mergable */
+ .close = perf_mmap_close, /* non mergeable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
};
@@ -5613,8 +6082,8 @@
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
- struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
@@ -5711,17 +6180,17 @@
again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages != nr_pages) {
+ if (data_page_nr(event->rb) != nr_pages) {
ret = -EINVAL;
goto unlock;
}
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -5749,12 +6218,18 @@
user_locked = user_lock_limit;
user_locked += user_extra;
- if (user_locked > user_lock_limit)
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
extra = user_locked - user_lock_limit;
+ user_extra -= extra;
+ }
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
- locked = vma->vm_mm->pinned_vm + extra;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
@@ -5783,6 +6258,8 @@
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -5795,7 +6272,7 @@
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
atomic_inc(&event->mmap_count);
} else if (rb) {
@@ -5932,18 +6409,25 @@
* Later on, we might change it to a list if there is
* another virtualization implementation supporting the callbacks.
*/
-struct perf_guest_info_callbacks *perf_guest_cbs;
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return -EBUSY;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return -EINVAL;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ synchronize_rcu();
return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
@@ -5965,14 +6449,13 @@
}
static void perf_sample_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
- perf_get_regs_user(regs_user, regs, regs_user_copy);
+ perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
@@ -5991,7 +6474,7 @@
* Get remaining task size from user stack pointer.
*
* It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
* so using TASK_SIZE as limit.
*/
static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -6073,10 +6556,9 @@
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- set_fs(fs);
+ force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -6084,6 +6566,122 @@
/* Dynamic size. */
perf_output_put(handle, dyn_size);
}
+}
+
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_prepare_sample_aux().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+ barrier();
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ barrier();
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+ unsigned long pad;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler);
+ if (!rb)
+ return;
+
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_put;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_prepare_sample_aux(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_put:
+ ring_buffer_put(rb);
}
static void __perf_event_header__init_id(struct perf_event_header *header,
@@ -6255,6 +6853,11 @@
perf_output_read_one(handle, event, enabled, running);
}
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
+}
+
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -6343,6 +6946,8 @@
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
+ if (perf_sample_save_hw_index(event))
+ perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
/*
@@ -6405,11 +7010,21 @@
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ perf_output_put(handle, data->cgroup);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events);
if (events >= wakeup_events) {
@@ -6437,14 +7052,14 @@
* Walking the pages tables for user address.
* Interrupts are disabled, so it prevents any tear down
* of the page tables.
- * Try IRQ-safe __get_user_pages_fast first.
+ * Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
struct page *p;
pagefault_disable();
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
+ if (get_user_page_fast_only(virt, 0, &p)) {
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
put_page(p);
}
@@ -6532,6 +7147,9 @@
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
if (data->br_stack) {
+ if (perf_sample_save_hw_index(event))
+ size += sizeof(u64);
+
size += data->br_stack->nr
* sizeof(struct perf_branch_entry);
}
@@ -6539,8 +7157,7 @@
}
if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs,
- &data->regs_user_copy);
+ perf_sample_regs_user(&data->regs_user, regs);
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
@@ -6556,7 +7173,7 @@
if (sample_type & PERF_SAMPLE_STACK_USER) {
/*
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
* in case new sample type is added, because we could eat
* up the rest of the sample size.
@@ -6596,25 +7213,67 @@
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+#ifdef CONFIG_CGROUP_PERF
+ if (sample_type & PERF_SAMPLE_CGROUP) {
+ struct cgroup *cgrp;
+
+ /* protected by RCU */
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+ data->cgroup = cgroup_id(cgrp);
+ }
+#endif
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_prepare_sample_aux(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should always be zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
-static __always_inline void
+static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
int (*output_begin)(struct perf_output_handle *,
+ struct perf_sample_data *,
struct perf_event *,
unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
+ int err;
/* protect the callchain buffers */
rcu_read_lock();
perf_prepare_sample(&header, data, event, regs);
- if (output_begin(&handle, event, header.size))
+ err = output_begin(&handle, data, event, header.size);
+ if (err)
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -6623,6 +7282,7 @@
exit:
rcu_read_unlock();
+ return err;
}
void
@@ -6641,12 +7301,12 @@
__perf_event_output(event, data, regs, perf_output_begin_backward);
}
-void
+int
perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- __perf_event_output(event, data, regs, perf_output_begin);
+ return __perf_event_output(event, data, regs, perf_output_begin);
}
/*
@@ -6678,7 +7338,7 @@
int ret;
perf_event_header__init_id(&read_event.header, &sample, event);
- ret = perf_output_begin(&handle, event, read_event.header.size);
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
if (ret)
return;
@@ -6823,7 +7483,7 @@
}
struct remote_output {
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
int err;
};
@@ -6831,7 +7491,7 @@
{
struct perf_event *parent = event->parent;
struct remote_output *ro = data;
- struct ring_buffer *rb = ro->rb;
+ struct perf_buffer *rb = ro->rb;
struct stop_event_data sd = {
.event = event,
};
@@ -6947,7 +7607,7 @@
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
task_event->event_id.header.size);
if (ret)
goto out;
@@ -7050,7 +7710,7 @@
return;
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
comm_event->event_id.header.size);
if (ret)
@@ -7150,7 +7810,7 @@
perf_event_header__init_id(&namespaces_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
namespaces_event->event_id.header.size);
if (ret)
goto out;
@@ -7175,7 +7835,7 @@
{
struct path ns_path;
struct inode *ns_inode;
- void *error;
+ int error;
error = ns_get_path(&ns_path, task, ns_ops);
if (!error) {
@@ -7245,6 +7905,105 @@
}
/*
+ * cgroup tracking
+ */
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_event {
+ char *path;
+ int path_size;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ char path[];
+ } event_id;
+};
+
+static int perf_event_cgroup_match(struct perf_event *event)
+{
+ return event->attr.cgroup;
+}
+
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
+{
+ struct perf_cgroup_event *cgroup_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u16 header_size = cgroup_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_cgroup_match(event))
+ return;
+
+ perf_event_header__init_id(&cgroup_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ cgroup_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, cgroup_event->event_id);
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ cgroup_event->event_id.header.size = header_size;
+}
+
+static void perf_event_cgroup(struct cgroup *cgrp)
+{
+ struct perf_cgroup_event cgroup_event;
+ char path_enomem[16] = "//enomem";
+ char *pathname;
+ size_t size;
+
+ if (!atomic_read(&nr_cgroup_events))
+ return;
+
+ cgroup_event = (struct perf_cgroup_event){
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_CGROUP,
+ .misc = 0,
+ .size = sizeof(cgroup_event.event_id),
+ },
+ .id = cgroup_id(cgrp),
+ },
+ };
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (pathname == NULL) {
+ cgroup_event.path = path_enomem;
+ } else {
+ /* just to be sure to have enough space for alignment */
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
+ cgroup_event.path = pathname;
+ }
+
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(cgroup_event.path) + 1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ cgroup_event.path[size++] = '\0';
+
+ cgroup_event.event_id.header.size += size;
+ cgroup_event.path_size = size;
+
+ perf_iterate_sb(perf_event_cgroup_output,
+ &cgroup_event,
+ NULL);
+
+ kfree(pathname);
+}
+
+#endif
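
The path handling in perf_event_cgroup() above pads the NUL-terminated cgroup path to a multiple of 8 bytes and zero-fills the tail so no stale bytes reach userspace. A standalone sketch of that padding step, with an arbitrary example buffer and path:

#include <stdio.h>
#include <string.h>

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

/* Pad a NUL-terminated string to a multiple of 8 bytes, zero-filling the tail. */
static size_t pad_to_u64(char *path)
{
	size_t size = strlen(path) + 1;		/* include the terminating NUL */

	while (!IS_ALIGNED(size, sizeof(unsigned long long)))
		path[size++] = '\0';		/* never leak uninitialized bytes */
	return size;
}

int main(void)
{
	char buf[32] = "/user.slice";		/* 11 chars + NUL = 12, padded to 16 */

	printf("padded size: %zu\n", pad_to_u64(buf));
	return 0;
}
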
+
+/*
* mmap tracking
*/
@@ -7304,7 +8063,7 @@
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
mmap_event->event_id.header.size);
if (ret)
goto out;
@@ -7364,7 +8123,7 @@
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
+ if (is_vm_hugetlb_page(vma))
flags |= MAP_HUGETLB;
if (file) {
@@ -7614,7 +8373,7 @@
int ret;
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -7648,7 +8407,7 @@
perf_event_header__init_id(&lost_samples_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
lost_samples_event.header.size);
if (ret)
return;
@@ -7703,7 +8462,7 @@
perf_event_header__init_id(&se->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
if (ret)
return;
@@ -7778,7 +8537,7 @@
perf_event_header__init_id(&throttle_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
throttle_event.header.size);
if (ret)
return;
@@ -7786,6 +8545,290 @@
perf_output_put(&handle, throttle_event);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
+}
+
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+ const char *name;
+ int name_len;
+ struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ } event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+ return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+ struct perf_ksymbol_event *ksymbol_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_ksymbol_match(event))
+ return;
+
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ ksymbol_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, ksymbol_event->event_id);
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+ const char *sym)
+{
+ struct perf_ksymbol_event ksymbol_event;
+ char name[KSYM_NAME_LEN];
+ u16 flags = 0;
+ int name_len;
+
+ if (!atomic_read(&nr_ksymbol_events))
+ return;
+
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+ goto err;
+
+ strlcpy(name, sym, KSYM_NAME_LEN);
+ name_len = strlen(name) + 1;
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
+ name[name_len++] = '\0';
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+ if (unregister)
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+ ksymbol_event = (struct perf_ksymbol_event){
+ .name = name,
+ .name_len = name_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_KSYMBOL,
+ .size = sizeof(ksymbol_event.event_id) +
+ name_len,
+ },
+ .addr = addr,
+ .len = len,
+ .ksym_type = ksym_type,
+ .flags = flags,
+ },
+ };
+
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+ return;
+err:
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+ struct bpf_prog *prog;
+ struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ } event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+ return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+ struct perf_bpf_event *bpf_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_bpf_match(event))
+ return;
+
+ perf_event_header__init_id(&bpf_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ bpf_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, bpf_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+ enum perf_bpf_event_type type)
+{
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+ int i;
+
+ if (prog->aux->func_cnt == 0) {
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister,
+ prog->aux->ksym.name);
+ } else {
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister,
+ subprog->aux->ksym.name);
+ }
+ }
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags)
+{
+ struct perf_bpf_event bpf_event;
+
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
+ type >= PERF_BPF_EVENT_MAX)
+ return;
+
+ switch (type) {
+ case PERF_BPF_EVENT_PROG_LOAD:
+ case PERF_BPF_EVENT_PROG_UNLOAD:
+ if (atomic_read(&nr_ksymbol_events))
+ perf_event_bpf_emit_ksymbols(prog, type);
+ break;
+ default:
+ break;
+ }
+
+ if (!atomic_read(&nr_bpf_events))
+ return;
+
+ bpf_event = (struct perf_bpf_event){
+ .prog = prog,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_BPF_EVENT,
+ .size = sizeof(bpf_event.event_id),
+ },
+ .type = type,
+ .flags = flags,
+ .id = prog->aux->id,
+ },
+ };
+
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
+
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
+
+struct perf_text_poke_event {
+ const void *old_bytes;
+ const void *new_bytes;
+ size_t pad;
+ u16 old_len;
+ u16 new_len;
+
+ struct {
+ struct perf_event_header header;
+
+ u64 addr;
+ } event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+ return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+ struct perf_text_poke_event *text_poke_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u64 padding = 0;
+ int ret;
+
+ if (!perf_event_text_poke_match(event))
+ return;
+
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ text_poke_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, text_poke_event->event_id);
+ perf_output_put(&handle, text_poke_event->old_len);
+ perf_output_put(&handle, text_poke_event->new_len);
+
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+ if (text_poke_event->pad)
+ __output_copy(&handle, &padding, text_poke_event->pad);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+ size_t old_len, const void *new_bytes, size_t new_len)
+{
+ struct perf_text_poke_event text_poke_event;
+ size_t tot, pad;
+
+ if (!atomic_read(&nr_text_poke_events))
+ return;
+
+ tot = sizeof(text_poke_event.old_len) + old_len;
+ tot += sizeof(text_poke_event.new_len) + new_len;
+ pad = ALIGN(tot, sizeof(u64)) - tot;
+
+ text_poke_event = (struct perf_text_poke_event){
+ .old_bytes = old_bytes,
+ .new_bytes = new_bytes,
+ .pad = pad,
+ .old_len = old_len,
+ .new_len = new_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_TEXT_POKE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
+ },
+ .addr = (unsigned long)addr,
+ },
+ };
+
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
}
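
perf_event_text_poke() above sizes the variable part of the record as the two u16 lengths plus both byte sequences, then pads the total to an 8-byte boundary. A quick illustration of that bookkeeping, using hypothetical 5-byte old and new instruction sequences:

#include <stdint.h>
#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	uint16_t old_len = 5, new_len = 5;	/* e.g. a 5-byte call patched in place */
	size_t tot = sizeof(old_len) + old_len + sizeof(new_len) + new_len;
	size_t pad = ALIGN(tot, sizeof(uint64_t)) - tot;

	/* 2 + 5 + 2 + 5 = 14 bytes of payload, padded with 2 bytes to 16 */
	printf("tot=%zu pad=%zu body=%zu\n", tot, pad, tot + pad);
	return 0;
}
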
void perf_event_itrace_started(struct perf_event *event)
@@ -7818,7 +8861,7 @@
rec.tid = perf_event_tid(event, current);
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -7842,8 +8885,8 @@
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
+ if (unlikely(throttle &&
+ hwc->interrupts > max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
@@ -8386,9 +9429,9 @@
if (event->hw.state & PERF_HES_STOPPED)
return 0;
/*
- * All tracepoints are from kernel-space.
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
*/
- if (event->attr.exclude_kernel)
+ if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
if (!perf_tp_filter_match(event, data))
@@ -8514,30 +9557,39 @@
*
* PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
* if not set, create kprobe/uprobe
+ *
+ * The following values specify a reference counter (or semaphore in the
+ * terminology of tools like dtrace, systemtap, etc.) used by Userspace
+ * Statically Defined Tracepoints (USDT). Currently, we use 32 bits of
+ * config for the offset.
+ *
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
*/
enum perf_probe_config {
PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};
PMU_FORMAT_ATTR(retprobe, "config:0");
+#endif
-static struct attribute *probe_attrs[] = {
+#ifdef CONFIG_KPROBE_EVENTS
+static struct attribute *kprobe_attrs[] = {
&format_attr_retprobe.attr,
NULL,
};
-static struct attribute_group probe_format_group = {
+static struct attribute_group kprobe_format_group = {
.name = "format",
- .attrs = probe_attrs,
+ .attrs = kprobe_attrs,
};
-static const struct attribute_group *probe_attr_groups[] = {
- &probe_format_group,
+static const struct attribute_group *kprobe_attr_groups[] = {
+ &kprobe_format_group,
NULL,
};
-#endif
-#ifdef CONFIG_KPROBE_EVENTS
static int perf_kprobe_event_init(struct perf_event *event);
static struct pmu perf_kprobe = {
.task_ctx_nr = perf_sw_context,
@@ -8547,7 +9599,7 @@
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
- .attr_groups = probe_attr_groups,
+ .attr_groups = kprobe_attr_groups,
};
static int perf_kprobe_event_init(struct perf_event *event)
@@ -8558,7 +9610,7 @@
if (event->attr.type != perf_kprobe.type)
return -ENOENT;
- if (!capable(CAP_SYS_ADMIN))
+ if (!perfmon_capable())
return -EACCES;
/*
@@ -8579,6 +9631,24 @@
#endif /* CONFIG_KPROBE_EVENTS */
#ifdef CONFIG_UPROBE_EVENTS
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
+
+static struct attribute *uprobe_attrs[] = {
+ &format_attr_retprobe.attr,
+ &format_attr_ref_ctr_offset.attr,
+ NULL,
+};
+
+static struct attribute_group uprobe_format_group = {
+ .name = "format",
+ .attrs = uprobe_attrs,
+};
+
+static const struct attribute_group *uprobe_attr_groups[] = {
+ &uprobe_format_group,
+ NULL,
+};
+
static int perf_uprobe_event_init(struct perf_event *event);
static struct pmu perf_uprobe = {
.task_ctx_nr = perf_sw_context,
@@ -8588,18 +9658,19 @@
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
- .attr_groups = probe_attr_groups,
+ .attr_groups = uprobe_attr_groups,
};
static int perf_uprobe_event_init(struct perf_event *event)
{
int err;
+ unsigned long ref_ctr_offset;
bool is_retprobe;
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!capable(CAP_SYS_ADMIN))
+ if (!perfmon_capable())
return -EACCES;
/*
@@ -8609,7 +9680,8 @@
return -EOPNOTSUPP;
is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
- err = perf_uprobe_init(event, is_retprobe);
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
if (err)
return err;
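
Given the format strings above (retprobe is config:0, ref_ctr_offset is config:32-63), a tool opening a uprobe through this PMU would pack attr.config roughly as below; the offset value is a made-up example:

#include <stdint.h>
#include <stdio.h>

#define UPROBE_REF_CTR_OFFSET_SHIFT	32	/* mirrors PERF_UPROBE_REF_CTR_OFFSET_SHIFT */
#define UPROBE_IS_RETPROBE		(1ULL << 0)

static uint64_t pack_uprobe_config(uint32_t ref_ctr_offset, int is_retprobe)
{
	return ((uint64_t)ref_ctr_offset << UPROBE_REF_CTR_OFFSET_SHIFT) |
	       (is_retprobe ? UPROBE_IS_RETPROBE : 0);
}

int main(void)
{
	uint64_t config = pack_uprobe_config(0x1c0f8, 0);	/* hypothetical USDT semaphore offset */

	printf("config=%#llx, decoded offset=%#llx\n",
	       (unsigned long long)config,
	       (unsigned long long)(config >> UPROBE_REF_CTR_OFFSET_SHIFT));
	return 0;
}
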
@@ -8647,7 +9719,6 @@
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
- preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
@@ -8655,7 +9726,6 @@
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
- preempt_enable();
if (!ret)
return;
@@ -8676,6 +9746,24 @@
prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
if (IS_ERR(prog))
return PTR_ERR(prog);
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+	 * perf_event does not have a full (kernel and user) callchain
+	 * attached to perf_sample_data, do not allow attaching a BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ bpf_prog_put(prog);
+ return -EPROTO;
+ }
event->prog = prog;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
@@ -8875,7 +9963,7 @@
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
- * Called with mm::mmap_sem down for reading.
+ * Called with mm::mmap_lock down for reading.
*/
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct mm_struct *mm,
@@ -8917,7 +10005,7 @@
if (!mm)
goto restart;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
raw_spin_lock_irqsave(&ifh->lock, flags);
@@ -8943,7 +10031,7 @@
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (ifh->nr_file_filters) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
}
@@ -9050,6 +10138,7 @@
case IF_SRC_KERNELADDR:
case IF_SRC_KERNEL:
kernel = 1;
+ fallthrough;
case IF_SRC_FILEADDR:
case IF_SRC_FILE:
@@ -9136,8 +10225,11 @@
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
@@ -9285,7 +10377,7 @@
period = max_t(u64, 10000, hwc->sample_period);
}
hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -9307,7 +10399,7 @@
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hwc->hrtimer.function = perf_swevent_hrtimer;
/*
@@ -9640,8 +10732,7 @@
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -9678,13 +10769,15 @@
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -9692,6 +10785,12 @@
/* For PMUs with address filters, throw in an extra attribute: */
if (pmu->nr_addr_filters)
ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
+
+ if (ret)
+ goto del_dev;
+
+ if (pmu->attr_update)
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
if (ret)
goto del_dev;
@@ -9712,7 +10811,7 @@
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
- int cpu, ret;
+ int cpu, ret, max = PERF_TYPE_MAX;
mutex_lock(&pmus_lock);
ret = -ENOMEM;
@@ -9725,12 +10824,17 @@
goto skip_type;
pmu->name = name;
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
+ if (type != PERF_TYPE_SOFTWARE) {
+ if (type >= 0)
+ max = type;
+
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
+ if (ret < 0)
goto free_pdc;
- }
+
+ WARN_ON(type >= 0 && ret != type);
+
+ type = ret;
}
pmu->type = type;
@@ -9776,6 +10880,9 @@
cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
__perf_mux_hrtimer_init(cpuctx, cpu);
+
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
got_cpu_context:
@@ -9807,7 +10914,16 @@
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- list_add_rcu(&pmu->entry, &pmus);
+ /*
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
+ * since these cannot be in the IDR. This way the linear search
+	 * is fast when a valid software event is provided.
+ */
+ if (type == PERF_TYPE_SOFTWARE || !name)
+ list_add_rcu(&pmu->entry, &pmus);
+ else
+ list_add_tail_rcu(&pmu->entry, &pmus);
+
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
@@ -9820,7 +10936,7 @@
put_device(pmu->dev);
free_idr:
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
free_pdc:
@@ -9842,7 +10958,7 @@
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->type != PERF_TYPE_SOFTWARE)
idr_remove(&pmu_idr, pmu->type);
if (pmu_bus_running) {
if (pmu->nr_addr_filters)
@@ -9854,6 +10970,12 @@
mutex_unlock(&pmus_lock);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+
+static inline bool has_extended_regs(struct perf_event *event)
+{
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
+}
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
@@ -9885,6 +11007,19 @@
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
+ if (!ret) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event))
+ ret = -EOPNOTSUPP;
+
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event))
+ ret = -EINVAL;
+
+ if (ret && event->destroy)
+ event->destroy(event);
+ }
+
if (ret)
module_put(pmu->module);
@@ -9893,9 +11028,8 @@
static struct pmu *perf_init_event(struct perf_event *event)
{
+ int idx, type, ret;
struct pmu *pmu;
- int idx;
- int ret;
idx = srcu_read_lock(&pmus_srcu);
@@ -9907,17 +11041,32 @@
goto unlock;
}
+ /*
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * are often aliases for PERF_TYPE_RAW.
+ */
+ type = event->attr.type;
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
+ type = PERF_TYPE_RAW;
+
+again:
rcu_read_lock();
- pmu = idr_find(&pmu_idr, event->attr.type);
+ pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
if (pmu) {
ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type) {
+ type = event->attr.type;
+ goto again;
+ }
+
if (ret)
pmu = ERR_PTR(ret);
+
goto unlock;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
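
The lookup added above first tries the RAW PMU for HARDWARE/HW_CACHE events and only falls back to the type the caller actually asked for if that attempt returns -ENOENT. A compressed userspace model of that control flow; the type table and return values are invented purely for illustration:

#include <errno.h>
#include <stdio.h>

enum { TYPE_HARDWARE, TYPE_SOFTWARE, TYPE_TRACEPOINT, TYPE_HW_CACHE, TYPE_RAW };

/* Stand-in for idr_find() + perf_try_init_event(): only a RAW PMU exists here. */
static int try_init(int type)
{
	return type == TYPE_RAW ? 0 : -ENOENT;
}

static int init_event(int attr_type)
{
	int type = attr_type, ret;

	if (type == TYPE_HARDWARE || type == TYPE_HW_CACHE)
		type = TYPE_RAW;		/* often just aliases for RAW */
again:
	ret = try_init(type);
	if (ret == -ENOENT && type != attr_type) {
		type = attr_type;		/* retry with the requested type */
		goto again;
	}
	return ret;
}

int main(void)
{
	printf("HARDWARE -> %d, TRACEPOINT -> %d\n",
	       init_event(TYPE_HARDWARE), init_event(TYPE_TRACEPOINT));
	return 0;
}
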
@@ -9993,7 +11142,7 @@
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
@@ -10001,6 +11150,8 @@
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
atomic_inc(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_inc(&nr_cgroup_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -10013,6 +11164,12 @@
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (event->attr.ksymbol)
+ atomic_inc(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_inc(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_inc(&nr_text_poke_events);
if (inc) {
/*
@@ -10031,7 +11188,7 @@
* call the perf scheduling hooks before proceeding to
* install events that need them.
*/
- synchronize_sched();
+ synchronize_rcu();
}
/*
* Now that we have waited for the sync_sched(), allow further
@@ -10120,8 +11277,7 @@
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
- get_task_struct(task);
- event->hw.target = task;
+ event->hw.target = get_task_struct(task);
}
event->clock = &local_clock;
@@ -10133,12 +11289,9 @@
context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
if (overflow_handler == bpf_overflow_handler) {
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+ struct bpf_prog *prog = parent_event->prog;
- if (IS_ERR(prog)) {
- err = PTR_ERR(prog);
- goto err_ns;
- }
+ bpf_prog_inc(prog);
event->prog = prog;
event->orig_overflow_handler =
parent_event->orig_overflow_handler;
@@ -10179,16 +11332,31 @@
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
- if (cgroup_fd != -1) {
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
- if (err)
- goto err_ns;
- }
-
pmu = perf_init_event(event);
if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
+ }
+
+ /*
+	 * Disallow uncore-cgroup events; they don't make sense, as the cgroup will
+ * be different on other CPUs in the uncore mask.
+ */
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ err = -EINVAL;
+ goto err_pmu;
+ }
+
+ if (event->attr.aux_output &&
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_pmu;
}
err = exclusive_event_init(event);
@@ -10251,12 +11419,12 @@
exclusive_event_destroy(event);
err_pmu:
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
if (event->destroy)
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
if (event->hw.target)
@@ -10272,58 +11440,29 @@
u32 size;
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice. */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = PERF_ATTR_SIZE_VER0;
-
- if (size < PERF_ATTR_SIZE_VER0)
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
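
The conversion above leans on copy_struct_from_user(), which zero-fills the kernel struct when userspace passes a shorter one and rejects a longer one whose trailing bytes are not all zero. A userspace approximation of that contract; the struct layout and values are arbitrary, and the printed fields assume a little-endian machine:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Approximation of the copy_struct_from_user() rules used above. */
static int copy_struct(void *dst, size_t ksize, const void *src, size_t usize)
{
	const unsigned char *tail = (const unsigned char *)src + ksize;
	size_t n = usize < ksize ? usize : ksize;

	if (usize > ksize) {
		for (size_t i = 0; i < usize - ksize; i++)
			if (tail[i])
				return -E2BIG;	/* unknown trailing bits must be zero */
	}
	memset(dst, 0, ksize);			/* short copies leave the rest zeroed */
	memcpy(dst, src, n);
	return 0;
}

int main(void)
{
	struct { int a, b; } attr;
	unsigned char blob[12] = { 7, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0 };
	int ret = copy_struct(&attr, sizeof(attr), blob, sizeof(blob));

	printf("ret=%d a=%d b=%d\n", ret, attr.a, attr.b);
	return 0;
}
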
-
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
attr->size = size;
- if (attr->__reserved_1)
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -10394,6 +11533,12 @@
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
+ return -EINVAL;
+#endif
+
out:
return ret;
@@ -10403,14 +11548,25 @@
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
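
mutex_lock_double() above avoids an ABBA deadlock by always taking the lower-addressed mutex first (the nested annotation only exists to satisfy lockdep). The same idea in plain pthreads, as an illustration rather than the kernel primitive itself:

#include <pthread.h>
#include <stdio.h>

/* Acquire two mutexes in a canonical (address) order so that two threads
 * locking the same pair can never deadlock against each other. */
static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if ((void *)b < (void *)a) {
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_double(&m1, &m2);
	puts("both mutexes held, in address order");
	pthread_mutex_unlock(&m2);
	pthread_mutex_unlock(&m1);
	return 0;
}
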
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL;
+ struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -10425,7 +11581,7 @@
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -10448,8 +11604,15 @@
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -10459,6 +11622,12 @@
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -10466,18 +11635,11 @@
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
-}
-
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
@@ -10500,11 +11662,11 @@
break;
case CLOCK_BOOTTIME:
- event->clock = &ktime_get_boot_ns;
+ event->clock = &ktime_get_boottime_ns;
break;
case CLOCK_TAI:
- event->clock = &ktime_get_tai_ns;
+ event->clock = &ktime_get_clocktai_ns;
break;
default:
@@ -10530,7 +11692,7 @@
again:
rcu_read_lock();
gctx = READ_ONCE(group_leader->ctx);
- if (!atomic_inc_not_zero(&gctx->refcount)) {
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -10563,7 +11725,7 @@
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx, *gctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -10578,15 +11740,12 @@
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
- if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- /* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
- err = perf_copy_attr(attr_uptr, &attr);
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
if (err)
return err;
@@ -10597,7 +11756,7 @@
}
if (attr.namespaces) {
- if (!capable(CAP_SYS_ADMIN))
+ if (!perfmon_capable())
return -EACCES;
}
@@ -10612,6 +11771,13 @@
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
+ }
+
+ /* REGS_INTR can leak data, lockdown must prevent this */
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
+ err = security_locked_down(LOCKDOWN_PERF);
if (err)
return err;
}
@@ -10657,24 +11823,6 @@
goto err_task;
}
- if (task) {
- err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
- if (err)
- goto err_task;
-
- /*
- * Reuse ptrace permission checks for now.
- *
- * We must hold cred_guard_mutex across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto err_cred;
- }
-
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -10682,7 +11830,7 @@
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cred;
+ goto err_task;
}
if (is_sampling_event(event)) {
@@ -10776,6 +11924,9 @@
* Do not allow to attach to a group in a different task
* or CPU context. If we're moving SW events, we'll fix
* this up later, so allow that.
+ *
+ * Racy, not holding group_leader->ctx->mutex, see comment with
+ * perf_event_ctx_lock().
*/
if (!move_group && group_leader->ctx != ctx)
goto err_context;
@@ -10799,6 +11950,24 @@
err = PTR_ERR(event_file);
event_file = NULL;
goto err_context;
+ }
+
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_file;
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility.
+ *
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
}
if (move_group) {
@@ -10825,6 +11994,7 @@
} else {
perf_event_ctx_unlock(group_leader, gctx);
move_group = 0;
+ goto not_move_group;
}
}
@@ -10841,7 +12011,17 @@
}
} else {
mutex_lock(&ctx->mutex);
+
+ /*
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
+ * see the group_leader && !move_group test earlier.
+ */
+ if (group_leader && group_leader->ctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ }
}
+not_move_group:
if (ctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
@@ -10869,6 +12049,10 @@
}
}
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
+ err = -EINVAL;
+ goto err_locked;
+ }
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
@@ -10950,7 +12134,7 @@
mutex_unlock(&ctx->mutex);
if (task) {
- mutex_unlock(&task->signal->cred_guard_mutex);
+ up_read(&task->signal->exec_update_lock);
put_task_struct(task);
}
@@ -10972,7 +12156,10 @@
if (move_group)
perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
-/* err_file: */
+err_cred:
+ if (task)
+ up_read(&task->signal->exec_update_lock);
+err_file:
fput(event_file);
err_context:
perf_unpin_context(ctx);
@@ -10984,9 +12171,6 @@
*/
if (!event_file)
free_event(event);
-err_cred:
- if (task)
- mutex_unlock(&task->signal->cred_guard_mutex);
err_task:
if (task)
put_task_struct(task);
@@ -11015,8 +12199,11 @@
int err;
/*
- * Get the target context (task or percpu):
+	 * Grouping is not supported for kernel events, and neither is 'AUX';
+ * make sure the caller's intentions are adjusted.
*/
+ if (attr->aux_output)
+ return ERR_PTR(-EINVAL);
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
@@ -11028,6 +12215,9 @@
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ /*
+ * Get the target context (task or percpu):
+ */
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -11285,8 +12475,8 @@
/*
* When a child task exits, feed back event values to parent events.
*
- * Can be called with cred_guard_mutex held when called from
- * install_exec_creds().
+ * Can be called with exec_update_lock held when called from
+ * setup_new_exec().
*/
void perf_event_exit_task(struct task_struct *child)
{
@@ -11390,7 +12580,7 @@
*
* Wait for all events to drop their context reference.
*/
- wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
put_ctx(ctx); /* must be last */
}
}
@@ -11405,9 +12595,7 @@
struct file *perf_event_get(unsigned int fd)
{
- struct file *file;
-
- file = fget_raw(fd);
+ struct file *file = fget(fd);
if (!file)
return ERR_PTR(-EBADF);
@@ -11477,8 +12665,7 @@
!child_ctx->task_ctx_data) {
struct pmu *pmu = child_event->pmu;
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
- GFP_KERNEL);
+ child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
return ERR_PTR(-ENOMEM);
@@ -11583,6 +12770,10 @@
child, leader, child_ctx);
if (IS_ERR(child_ctr))
return PTR_ERR(child_ctr);
+
+ if (sub->aux_event == parent_event && child_ctr &&
+ !perf_get_aux_event(child_ctr, leader))
+ return -EINVAL;
}
return 0;
}
@@ -11778,7 +12969,7 @@
}
}
-void perf_swevent_init_cpu(unsigned int cpu)
+static void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -11975,6 +13166,12 @@
kfree(jc);
}
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ perf_event_cgroup(css->cgroup);
+ return 0;
+}
+
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
@@ -11996,6 +13193,7 @@
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
+ .css_online = perf_cgroup_css_online,
.attach = perf_cgroup_attach,
/*
* Implicitly enable on dfl hierarchy so that perf events can
--
Gitblit v1.6.2