From 10ebd8556b7990499c896a550e3d416b444211e6 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 10 May 2024 02:23:07 +0000
Subject: [PATCH] add led
---
kernel/include/linux/perf_event.h | 295 ++++++++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 222 insertions(+), 73 deletions(-)
diff --git a/kernel/include/linux/perf_event.h b/kernel/include/linux/perf_event.h
index 89d6a8c..346d24a 100644
--- a/kernel/include/linux/perf_event.h
+++ b/kernel/include/linux/perf_event.h
@@ -30,6 +30,7 @@
int (*is_in_guest)(void);
int (*is_user_mode)(void);
unsigned long (*get_guest_ip)(void);
+ void (*handle_intel_pt_intr)(void);
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -53,14 +54,14 @@
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
-#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/refcount.h>
#include <linux/security.h>
#include <asm/local.h>
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
+ __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_callchain_entry_ctx {
@@ -92,15 +93,27 @@
/*
* branch stack layout:
* nr: number of taken branches stored in entries[]
+ * hw_idx: The low level index of raw branch records
+ * for the most recent branch.
+ * -1ULL means invalid/unknown.
*
* Note that nr can vary from sample to sample
* branches (to, from) are stored from most recent
* to least recent, i.e., entries[0] contains the most
* recent branch.
+ * The entries[] is an abstraction of raw branch records,
+ * which may not be stored in age order in HW, e.g. Intel LBR.
+ * The hw_idx is to expose the low level index of raw
+ * branch record for the most recent branch aka entries[0].
+ * The hw_idx index is between -1 (unknown) and max depth,
+ * which can be retrieved in /sys/devices/cpu/caps/branches.
+ * For the architectures whose raw branch records are
+ * already stored in age order, the hw_idx should be 0.
*/
struct perf_branch_stack {
__u64 nr;
- struct perf_branch_entry entries[0];
+ __u64 hw_idx;
+ struct perf_branch_entry entries[];
};
struct task_struct;
@@ -199,17 +212,26 @@
*/
u64 sample_period;
- /*
- * The period we started this sample with.
- */
- u64 last_period;
+ union {
+ struct { /* Sampling */
+ /*
+ * The period we started this sample with.
+ */
+ u64 last_period;
- /*
- * However much is left of the current period; note that this is
- * a full 64bit value and allows for generation of periods longer
- * than hardware might allow.
- */
- local64_t period_left;
+ /*
+ * However much is left of the current period;
+ * note that this is a full 64bit value and
+ * allows for generation of periods longer
+ * than hardware might allow.
+ */
+ local64_t period_left;
+ };
+ struct { /* Topdown events counting for context switch */
+ u64 saved_metric;
+ u64 saved_slots;
+ };
+ };
/*
* State for throttling the event, see __perf_event_overflow() and
@@ -241,10 +263,14 @@
#define PERF_PMU_CAP_NO_INTERRUPT 0x01
#define PERF_PMU_CAP_NO_NMI 0x02
#define PERF_PMU_CAP_AUX_NO_SG 0x04
-#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08
+#define PERF_PMU_CAP_EXTENDED_REGS 0x08
#define PERF_PMU_CAP_EXCLUSIVE 0x10
#define PERF_PMU_CAP_ITRACE 0x20
#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40
+#define PERF_PMU_CAP_NO_EXCLUDE 0x80
+#define PERF_PMU_CAP_AUX_OUTPUT 0x100
+
+struct perf_output_handle;
/**
* struct pmu - generic performance monitoring unit
@@ -255,6 +281,7 @@
struct module *module;
struct device *dev;
const struct attribute_group **attr_groups;
+ const struct attribute_group **attr_update;
const char *name;
int type;
@@ -263,13 +290,11 @@
*/
int capabilities;
- int * __percpu pmu_disable_count;
- struct perf_cpu_context * __percpu pmu_cpu_context;
+ int __percpu *pmu_disable_count;
+ struct perf_cpu_context __percpu *pmu_cpu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
int task_ctx_nr;
int hrtimer_interval_ms;
- u32 events_across_hotplug:1,
- reserved:31;
/* number of address filters this PMU can do */
unsigned int nr_addr_filters;
@@ -291,7 +316,7 @@
* -EBUSY -- @event is for this PMU but PMU temporarily unavailable
* -EINVAL -- @event is for this PMU but @event is not valid
* -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
- * -EACCESS -- @event is for this PMU, @event is valid, but no privilidges
+ * -EACCES -- @event is for this PMU, @event is valid, but no privileges
*
* 0 -- @event is for this PMU and valid
*
@@ -350,7 +375,7 @@
* ->stop() with PERF_EF_UPDATE will read the counter and update
* period/count values like ->read() would.
*
- * ->start() with PERF_EF_RELOAD will reprogram the the counter
+ * ->start() with PERF_EF_RELOAD will reprogram the counter
* value, must be preceded by a ->stop() with PERF_EF_UPDATE.
*/
void (*start) (struct perf_event *event, int flags);
@@ -403,11 +428,21 @@
*/
void (*sched_task) (struct perf_event_context *ctx,
bool sched_in);
- /*
- * PMU specific data size
- */
- size_t task_ctx_size;
+ /*
+ * Kmem cache of PMU specific data
+ */
+ struct kmem_cache *task_ctx_cache;
+
+ /*
+ * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
+ * can be synchronized using this function. See Intel LBR callstack support
+ * implementation and Perf core context switch handling callbacks for usage
+ * examples.
+ */
+ void (*swap_task_ctx) (struct perf_event_context *prev,
+ struct perf_event_context *next);
+ /* optional */
/*
* Set up pmu-private data structures for an AUX area
@@ -420,6 +455,19 @@
* Free pmu-private AUX data structures
*/
void (*free_aux) (void *aux); /* optional */
+
+ /*
+ * Take a snapshot of the AUX buffer without touching the event
+ * state, so that preempting ->start()/->stop() callbacks does
+ * not interfere with their logic. Called in PMI context.
+ *
+ * Returns the size of AUX data copied to the output handle.
+ *
+ * Optional.
+ */
+ long (*snapshot_aux) (struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size);
/*
* Validate address range filters: make sure the HW supports the
@@ -447,6 +495,16 @@
/* optional */
/*
+ * Check if event can be used for aux_output purposes for
+ * events of this PMU.
+ *
+ * Runs from perf_event_open(). Should return 0 for "no match"
+ * or non-zero for "match".
+ */
+ int (*aux_output_match) (struct perf_event *event);
+ /* optional */
+
+ /*
* Filter events for PMU-specific reasons.
*/
int (*filter_match) (struct perf_event *event); /* optional */
@@ -466,7 +524,7 @@
/**
* struct perf_addr_filter - address range filter definition
* @entry: event's filter list linkage
- * @inode: object file's inode for file-based filters
+ * @path: object file's path for file-based filters
* @offset: filter range offset
* @size: filter range size (size==0 means single address trigger)
* @action: filter/start/stop
@@ -506,7 +564,6 @@
* enum perf_event_state - the states of an event:
*/
enum perf_event_state {
- PERF_EVENT_STATE_DORMANT = -5,
PERF_EVENT_STATE_DEAD = -4,
PERF_EVENT_STATE_EXIT = -3,
PERF_EVENT_STATE_ERROR = -2,
@@ -528,9 +585,13 @@
* PERF_EV_CAP_SOFTWARE: Is a software event.
* PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
* from any CPU in the package where it is active.
+ * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
+ * cannot be a group leader. If an event with this flag is detached from the
+ * group it is scheduled out and moved into an unrecoverable ERROR state.
*/
#define PERF_EV_CAP_SOFTWARE BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
+#define PERF_EV_CAP_SIBLING BIT(2)
#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
@@ -545,9 +606,10 @@
#define PERF_ATTACH_TASK 0x04
#define PERF_ATTACH_TASK_DATA 0x08
#define PERF_ATTACH_ITRACE 0x10
+#define PERF_ATTACH_SCHED_CB 0x20
struct perf_cgroup;
-struct ring_buffer;
+struct perf_buffer;
struct pmu_event_list {
raw_spinlock_t lock;
@@ -619,7 +681,9 @@
/*
* timestamp shadows the actual context timing but it can
* be safely used in NMI interrupt context. It reflects the
- * context time as it was when the event was last scheduled in.
+ * context time as it was when the event was last scheduled in,
+ * or when ctx_sched_in failed to schedule the event because we
+ * run out of PMC.
*
* ctx_time already accounts for ctx->timestamp. Therefore to
* compute ctx_time for a sample, simply add perf_clock().
@@ -651,7 +715,6 @@
int oncpu;
int cpu;
- cpumask_t readable_on_cpus;
struct list_head owner_entry;
struct task_struct *owner;
@@ -660,7 +723,7 @@
struct mutex mmap_mutex;
atomic_t mmap_count;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
struct list_head rb_entry;
unsigned long rcu_batches;
int rcu_pending;
@@ -682,6 +745,9 @@
/* vma address array for file-based filders */
struct perf_addr_filter_range *addr_filter_ranges;
unsigned long addr_filters_gen;
+
+ /* for aux_output events */
+ struct perf_event *aux_event;
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
@@ -713,12 +779,6 @@
void *security;
#endif
struct list_head sb_list;
- /* Is this event shared with other events */
- bool shared;
-
- /* TODO: need to cherry-pick 3d3eb5fb85d97. This is just padding for now
- * to reduce the ABI diff */
- struct list_head dormant_event_entry;
#endif /* CONFIG_PERF_EVENTS */
};
@@ -761,7 +821,12 @@
int nr_stat;
int nr_freq;
int rotate_disable;
- atomic_t refcount;
+ /*
+ * Set when nr_events != nr_active, except tolerant to events not
+ * necessary to be active due to scheduling constraints, such as cgroups.
+ */
+ int rotate_necessary;
+ refcount_t refcount;
struct task_struct *task;
/*
@@ -814,11 +879,18 @@
int sched_cb_usage;
int online;
+ /*
+ * Per-CPU storage for iterators used in visit_groups_merge. The default
+ * storage is of size 2 to hold the CPU and any CPU event iterators.
+ */
+ int heap_size;
+ struct perf_event **heap;
+ struct perf_event *heap_default[2];
};
struct perf_output_handle {
struct perf_event *event;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long wakeup;
unsigned long size;
u64 aux_flags;
@@ -901,6 +973,9 @@
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
+
+extern void perf_pmu_resched(struct pmu *pmu);
+
extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
@@ -949,18 +1024,14 @@
u32 reserved;
} cpu_entry;
struct perf_callchain_entry *callchain;
+ u64 aux_size;
- /*
- * regs_user may point to task_pt_regs or to regs_user_copy, depending
- * on arch details.
- */
struct perf_regs regs_user;
- struct pt_regs regs_user_copy;
-
struct perf_regs regs_intr;
u64 stack_user_size;
u64 phys_addr;
+ u64 cgroup;
} ____cacheline_aligned;
/* default value for data source */
@@ -1002,19 +1073,35 @@
extern void perf_event_output_backward(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs);
-extern void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs);
+extern int perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
static inline bool
-is_default_overflow_handler(struct perf_event *event)
+__is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
{
- if (likely(event->overflow_handler == perf_event_output_forward))
+ if (likely(overflow_handler == perf_event_output_forward))
return true;
- if (unlikely(event->overflow_handler == perf_event_output_backward))
+ if (unlikely(overflow_handler == perf_event_output_backward))
return true;
return false;
}
+
+#define is_default_overflow_handler(event) \
+ __is_default_overflow_handler((event)->overflow_handler)
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline bool uses_default_overflow_handler(struct perf_event *event)
+{
+ if (likely(is_default_overflow_handler(event)))
+ return true;
+
+ return __is_default_overflow_handler(event->orig_overflow_handler);
+}
+#else
+#define uses_default_overflow_handler(event) \
+ is_default_overflow_handler(event)
+#endif
extern void
perf_event_header__init_id(struct perf_event_header *header,
@@ -1027,6 +1114,15 @@
extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);
+
+static inline bool event_has_any_exclude_flag(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ return attr->exclude_idle || attr->exclude_user ||
+ attr->exclude_kernel || attr->exclude_hv ||
+ attr->exclude_guest || attr->exclude_host;
+}
static inline bool is_sampling_event(struct perf_event *event)
{
@@ -1064,12 +1160,18 @@
#endif
/*
- * Take a snapshot of the regs. Skip ip and frame pointer to
- * the nth caller. We only need a few of the regs:
+ * When generating a perf sample in-line, instead of from an interrupt /
+ * exception, we lack a pt_regs. This is typically used from software events
+ * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
+ *
+ * We typically don't need a full set, but (for x86) do require:
* - ip for PERF_SAMPLE_IP
* - cs for user_mode() tests
- * - bp for callchains
- * - eflags, for future purposes, just in case
+ * - sp for PERF_SAMPLE_CALLCHAIN
+ * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
+ *
+ * NOTE: assumes @regs is otherwise already 0 filled; this is important for
+ * things like PERF_SAMPLE_REGS_INTR.
*/
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
@@ -1142,7 +1244,25 @@
}
extern void perf_event_mmap(struct vm_area_struct *vma);
-extern struct perf_guest_info_callbacks *perf_guest_cbs;
+
+extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+ bool unregister, const char *sym);
+extern void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags);
+
+extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
+{
+ /*
+ * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading
+ * the callbacks between a !NULL check and dereferences, to ensure
+ * pending stores/changes to the callback pointers are visible before a
+ * non-NULL perf_guest_cbs is visible to readers, and to prevent a
+ * module from unloading callbacks while readers are active.
+ */
+ return rcu_dereference(perf_guest_cbs);
+}
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1150,6 +1270,9 @@
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
+extern void perf_event_text_poke(const void *addr,
+ const void *old_bytes, size_t old_len,
+ const void *new_bytes, size_t new_len);
/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
@@ -1162,6 +1285,8 @@
extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
+extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
+extern void put_callchain_entry(int rctx);
extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;
@@ -1198,15 +1323,12 @@
extern void perf_sample_event_took(u64 sample_len_ns);
-extern int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos);
+ void *buffer, size_t *lenp, loff_t *ppos);
/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN 0
@@ -1216,11 +1338,6 @@
#define PERF_SECURITY_KERNEL 2
#define PERF_SECURITY_TRACEPOINT 3
-static inline bool perf_paranoid_any(void)
-{
- return sysctl_perf_event_paranoid > 2;
-}
-
static inline int perf_is_paranoid(void)
{
return sysctl_perf_event_paranoid > -1;
@@ -1228,7 +1345,7 @@
static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
- if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+ if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
return -EACCES;
return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
@@ -1236,7 +1353,7 @@
static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
- if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+ if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
return -EACCES;
return security_perf_event_open(attr, PERF_SECURITY_CPU);
@@ -1244,7 +1361,7 @@
static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
{
- if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+ if (sysctl_perf_event_paranoid > -1 && !perfmon_capable())
return -EPERM;
return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
@@ -1308,11 +1425,14 @@
extern void perf_event_addr_filters_sync(struct perf_event *event);
extern int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
- struct perf_event *event,
- unsigned int size);
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event,
unsigned int size);
@@ -1321,6 +1441,9 @@
const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
unsigned int len);
+extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ struct perf_output_handle *handle,
+ unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
@@ -1330,6 +1453,8 @@
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
+extern int perf_event_period(struct perf_event *event, u64 value);
+extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1389,10 +1514,22 @@
(struct perf_guest_info_callbacks *callbacks) { return 0; }
static inline void perf_event_mmap(struct vm_area_struct *vma) { }
+
+typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
+static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
+ bool unregister, const char *sym) { }
+static inline void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags) { }
static inline void perf_event_exec(void) { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
static inline void perf_event_namespaces(struct task_struct *tsk) { }
static inline void perf_event_fork(struct task_struct *tsk) { }
+static inline void perf_event_text_poke(const void *addr,
+ const void *old_bytes,
+ size_t old_len,
+ const void *new_bytes,
+ size_t new_len) { }
static inline void perf_event_init(void) { }
static inline int perf_swevent_get_recursion_context(void) { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx) { }
@@ -1402,6 +1539,14 @@
static inline int __perf_event_disable(void *info) { return -1; }
static inline void perf_event_task_tick(void) { }
static inline int perf_event_release_kernel(struct perf_event *event) { return 0; }
+static inline int perf_event_period(struct perf_event *event, u64 value)
+{
+ return -EINVAL;
+}
+static inline u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ return 0;
+}
#endif
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
@@ -1467,4 +1612,8 @@
#define perf_event_exit_cpu NULL
#endif
+extern void __weak arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg,
+ u64 now);
+
#endif /* _LINUX_PERF_EVENT_H */
--
Gitblit v1.6.2