From 10ebd8556b7990499c896a550e3d416b444211e6 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 02:23:07 +0000 Subject: [PATCH] add led --- kernel/include/linux/perf_event.h | 295 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 files changed, 222 insertions(+), 73 deletions(-) diff --git a/kernel/include/linux/perf_event.h b/kernel/include/linux/perf_event.h index 89d6a8c..346d24a 100644 --- a/kernel/include/linux/perf_event.h +++ b/kernel/include/linux/perf_event.h @@ -30,6 +30,7 @@ int (*is_in_guest)(void); int (*is_user_mode)(void); unsigned long (*get_guest_ip)(void); + void (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -53,14 +54,14 @@ #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> -#include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/refcount.h> #include <linux/security.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; - __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ + __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { @@ -92,15 +93,27 @@ /* * branch stack layout: * nr: number of taken branches stored in entries[] + * hw_idx: The low level index of raw branch records + * for the most recent branch. + * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. + * The entries[] is an abstraction of raw branch records, + * which may not be stored in age order in HW, e.g. Intel LBR. + * The hw_idx is to expose the low level index of raw + * branch record for the most recent branch aka entries[0]. + * The hw_idx index is between -1 (unknown) and max depth, + * which can be retrieved in /sys/devices/cpu/caps/branches. + * For the architectures whose raw branch records are + * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; - struct perf_branch_entry entries[0]; + __u64 hw_idx; + struct perf_branch_entry entries[]; }; struct task_struct; @@ -199,17 +212,26 @@ */ u64 sample_period; - /* - * The period we started this sample with. - */ - u64 last_period; + union { + struct { /* Sampling */ + /* + * The period we started this sample with. + */ + u64 last_period; - /* - * However much is left of the current period; note that this is - * a full 64bit value and allows for generation of periods longer - * than hardware might allow. - */ - local64_t period_left; + /* + * However much is left of the current period; + * note that this is a full 64bit value and + * allows for generation of periods longer + * than hardware might allow. + */ + local64_t period_left; + }; + struct { /* Topdown events counting for context switch */ + u64 saved_metric; + u64 saved_slots; + }; + }; /* * State for throttling the event, see __perf_event_overflow() and @@ -241,10 +263,14 @@ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 -#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 +#define PERF_PMU_CAP_EXTENDED_REGS 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 #define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 +#define PERF_PMU_CAP_NO_EXCLUDE 0x80 +#define PERF_PMU_CAP_AUX_OUTPUT 0x100 + +struct perf_output_handle; /** * struct pmu - generic performance monitoring unit @@ -255,6 +281,7 @@ struct module *module; struct device *dev; const struct attribute_group **attr_groups; + const struct attribute_group **attr_update; const char *name; int type; @@ -263,13 +290,11 @@ */ int capabilities; - int * __percpu pmu_disable_count; - struct perf_cpu_context * __percpu pmu_cpu_context; + int __percpu *pmu_disable_count; + struct perf_cpu_context __percpu *pmu_cpu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; - u32 events_across_hotplug:1, - reserved:31; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; @@ -291,7 +316,7 @@ * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported - * -EACCESS -- @event is for this PMU, @event is valid, but no privilidges + * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * @@ -350,7 +375,7 @@ * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * - * ->start() with PERF_EF_RELOAD will reprogram the the counter + * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. */ void (*start) (struct perf_event *event, int flags); @@ -403,11 +428,21 @@ */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); - /* - * PMU specific data size - */ - size_t task_ctx_size; + /* + * Kmem cache of PMU specific data + */ + struct kmem_cache *task_ctx_cache; + + /* + * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) + * can be synchronized using this function. See Intel LBR callstack support + * implementation and Perf core context switch handling callbacks for usage + * examples. + */ + void (*swap_task_ctx) (struct perf_event_context *prev, + struct perf_event_context *next); + /* optional */ /* * Set up pmu-private data structures for an AUX area @@ -420,6 +455,19 @@ * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ + + /* + * Take a snapshot of the AUX buffer without touching the event + * state, so that preempting ->start()/->stop() callbacks does + * not interfere with their logic. Called in PMI context. + * + * Returns the size of AUX data copied to the output handle. + * + * Optional. + */ + long (*snapshot_aux) (struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size); /* * Validate address range filters: make sure the HW supports the @@ -447,6 +495,16 @@ /* optional */ /* + * Check if event can be used for aux_output purposes for + * events of this PMU. + * + * Runs from perf_event_open(). Should return 0 for "no match" + * or non-zero for "match". + */ + int (*aux_output_match) (struct perf_event *event); + /* optional */ + + /* * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ @@ -466,7 +524,7 @@ /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage - * @inode: object file's inode for file-based filters + * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop @@ -506,7 +564,6 @@ * enum perf_event_state - the states of an event: */ enum perf_event_state { - PERF_EVENT_STATE_DORMANT = -5, PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, @@ -528,9 +585,13 @@ * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. + * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and + * cannot be a group leader. If an event with this flag is detached from the + * group it is scheduled out and moved into an unrecoverable ERROR state. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) +#define PERF_EV_CAP_SIBLING BIT(2) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) @@ -545,9 +606,10 @@ #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 +#define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; -struct ring_buffer; +struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; @@ -619,7 +681,9 @@ /* * timestamp shadows the actual context timing but it can * be safely used in NMI interrupt context. It reflects the - * context time as it was when the event was last scheduled in. + * context time as it was when the event was last scheduled in, + * or when ctx_sched_in failed to schedule the event because we + * run out of PMC. * * ctx_time already accounts for ctx->timestamp. Therefore to * compute ctx_time for a sample, simply add perf_clock(). @@ -651,7 +715,6 @@ int oncpu; int cpu; - cpumask_t readable_on_cpus; struct list_head owner_entry; struct task_struct *owner; @@ -660,7 +723,7 @@ struct mutex mmap_mutex; atomic_t mmap_count; - struct ring_buffer *rb; + struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; @@ -682,6 +745,9 @@ /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; + + /* for aux_output events */ + struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; @@ -713,12 +779,6 @@ void *security; #endif struct list_head sb_list; - /* Is this event shared with other events */ - bool shared; - - /* TODO: need to cherry-pick 3d3eb5fb85d97. This is just padding for now - * to reduce the ABI diff */ - struct list_head dormant_event_entry; #endif /* CONFIG_PERF_EVENTS */ }; @@ -761,7 +821,12 @@ int nr_stat; int nr_freq; int rotate_disable; - atomic_t refcount; + /* + * Set when nr_events != nr_active, except tolerant to events not + * necessary to be active due to scheduling constraints, such as cgroups. + */ + int rotate_necessary; + refcount_t refcount; struct task_struct *task; /* @@ -814,11 +879,18 @@ int sched_cb_usage; int online; + /* + * Per-CPU storage for iterators used in visit_groups_merge. The default + * storage is of size 2 to hold the CPU and any CPU event iterators. + */ + int heap_size; + struct perf_event **heap; + struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; - struct ring_buffer *rb; + struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; @@ -901,6 +973,9 @@ extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); + +extern void perf_pmu_resched(struct pmu *pmu); + extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); @@ -949,18 +1024,14 @@ u32 reserved; } cpu_entry; struct perf_callchain_entry *callchain; + u64 aux_size; - /* - * regs_user may point to task_pt_regs or to regs_user_copy, depending - * on arch details. - */ struct perf_regs regs_user; - struct pt_regs regs_user_copy; - struct perf_regs regs_intr; u64 stack_user_size; u64 phys_addr; + u64 cgroup; } ____cacheline_aligned; /* default value for data source */ @@ -1002,19 +1073,35 @@ extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); -extern void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs); +extern int perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); static inline bool -is_default_overflow_handler(struct perf_event *event) +__is_default_overflow_handler(perf_overflow_handler_t overflow_handler) { - if (likely(event->overflow_handler == perf_event_output_forward)) + if (likely(overflow_handler == perf_event_output_forward)) return true; - if (unlikely(event->overflow_handler == perf_event_output_backward)) + if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } + +#define is_default_overflow_handler(event) \ + __is_default_overflow_handler((event)->overflow_handler) + +#ifdef CONFIG_BPF_SYSCALL +static inline bool uses_default_overflow_handler(struct perf_event *event) +{ + if (likely(is_default_overflow_handler(event))) + return true; + + return __is_default_overflow_handler(event->orig_overflow_handler); +} +#else +#define uses_default_overflow_handler(event) \ + is_default_overflow_handler(event) +#endif extern void perf_event_header__init_id(struct perf_event_header *header, @@ -1027,6 +1114,15 @@ extern void perf_log_lost_samples(struct perf_event *event, u64 lost); + +static inline bool event_has_any_exclude_flag(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + return attr->exclude_idle || attr->exclude_user || + attr->exclude_kernel || attr->exclude_hv || + attr->exclude_guest || attr->exclude_host; +} static inline bool is_sampling_event(struct perf_event *event) { @@ -1064,12 +1160,18 @@ #endif /* - * Take a snapshot of the regs. Skip ip and frame pointer to - * the nth caller. We only need a few of the regs: + * When generating a perf sample in-line, instead of from an interrupt / + * exception, we lack a pt_regs. This is typically used from software events + * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. + * + * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests - * - bp for callchains - * - eflags, for future purposes, just in case + * - sp for PERF_SAMPLE_CALLCHAIN + * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) + * + * NOTE: assumes @regs is otherwise already 0 filled; this is important for + * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { @@ -1142,7 +1244,25 @@ } extern void perf_event_mmap(struct vm_area_struct *vma); -extern struct perf_guest_info_callbacks *perf_guest_cbs; + +extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym); +extern void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags); + +extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; +static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) +{ + /* + * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading + * the callbacks between a !NULL check and dereferences, to ensure + * pending stores/changes to the callback pointers are visible before a + * non-NULL perf_guest_cbs is visible to readers, and to prevent a + * module from unloading callbacks while readers are active. + */ + return rcu_dereference(perf_guest_cbs); +} extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -1150,6 +1270,9 @@ extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); +extern void perf_event_text_poke(const void *addr, + const void *old_bytes, size_t old_len, + const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); @@ -1162,6 +1285,8 @@ extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); +extern struct perf_callchain_entry *get_callchain_entry(int *rctx); +extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; @@ -1198,15 +1323,12 @@ extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 @@ -1216,11 +1338,6 @@ #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 -static inline bool perf_paranoid_any(void) -{ - return sysctl_perf_event_paranoid > 2; -} - static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; @@ -1228,7 +1345,7 @@ static inline int perf_allow_kernel(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); @@ -1236,7 +1353,7 @@ static inline int perf_allow_cpu(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); @@ -1244,7 +1361,7 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); @@ -1308,11 +1425,14 @@ extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, - unsigned int size); + struct perf_sample_data *data, + struct perf_event *event, + unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); @@ -1321,6 +1441,9 @@ const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); +extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); @@ -1330,6 +1453,8 @@ extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); +extern int perf_event_period(struct perf_event *event, u64 value); +extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, @@ -1389,10 +1514,22 @@ (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } + +typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); +static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym) { } +static inline void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_text_poke(const void *addr, + const void *old_bytes, + size_t old_len, + const void *new_bytes, + size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } @@ -1402,6 +1539,14 @@ static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } +static inline int perf_event_period(struct perf_event *event, u64 value) +{ + return -EINVAL; +} +static inline u64 perf_event_pause(struct perf_event *event, bool reset) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) @@ -1467,4 +1612,8 @@ #define perf_event_exit_cpu NULL #endif +extern void __weak arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, + u64 now); + #endif /* _LINUX_PERF_EVENT_H */ -- Gitblit v1.6.2