| .. | .. | 
|---|
 | 1 | +// SPDX-License-Identifier: GPL-2.0  | 
|---|
| 1 | 2 |  /* | 
|---|
| 2 | 3 |   * Performance events core code: | 
|---|
| 3 | 4 |   * | 
|---|
| .. | .. | 
|---|
| 5 | 6 |   *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | 
|---|
| 6 | 7 |   *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra | 
|---|
| 7 | 8 |   *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 
|---|
| 8 |  | - *  | 
|---|
| 9 |  | - * For licensing details see kernel-base/COPYING  | 
|---|
| 10 | 9 |   */ | 
|---|
| 11 | 10 |   | 
|---|
| 12 | 11 |  #include <linux/fs.h> | 
|---|
| .. | .. | 
|---|
| 29 | 28 |  #include <linux/export.h> | 
|---|
| 30 | 29 |  #include <linux/vmalloc.h> | 
|---|
| 31 | 30 |  #include <linux/hardirq.h> | 
|---|
 | 31 | +#include <linux/hugetlb.h>  | 
|---|
| 32 | 32 |  #include <linux/rculist.h> | 
|---|
| 33 | 33 |  #include <linux/uaccess.h> | 
|---|
| 34 | 34 |  #include <linux/syscalls.h> | 
|---|
| .. | .. | 
|---|
| 50 | 50 |  #include <linux/sched/mm.h> | 
|---|
| 51 | 51 |  #include <linux/proc_ns.h> | 
|---|
| 52 | 52 |  #include <linux/mount.h> | 
|---|
 | 53 | +#include <linux/min_heap.h>  | 
|---|
| 53 | 54 |   | 
|---|
| 54 | 55 |  #include "internal.h" | 
|---|
| 55 | 56 |   | 
|---|
| .. | .. | 
|---|
| 265 | 266 |  	if (!event->parent) { | 
|---|
| 266 | 267 |  		/* | 
|---|
| 267 | 268 |  		 * If this is a !child event, we must hold ctx::mutex to | 
|---|
| 268 |  | -		 * stabilize the the event->ctx relation. See  | 
|---|
 | 269 | +		 * stabilize the event->ctx relation. See  | 
|---|
| 269 | 270 |  		 * perf_event_ctx_lock(). | 
|---|
| 270 | 271 |  		 */ | 
|---|
| 271 | 272 |  		lockdep_assert_held(&ctx->mutex); | 
|---|
| .. | .. | 
|---|
| 391 | 392 |  static atomic_t nr_task_events __read_mostly; | 
|---|
| 392 | 393 |  static atomic_t nr_freq_events __read_mostly; | 
|---|
| 393 | 394 |  static atomic_t nr_switch_events __read_mostly; | 
|---|
 | 395 | +static atomic_t nr_ksymbol_events __read_mostly;  | 
|---|
 | 396 | +static atomic_t nr_bpf_events __read_mostly;  | 
|---|
 | 397 | +static atomic_t nr_cgroup_events __read_mostly;  | 
|---|
 | 398 | +static atomic_t nr_text_poke_events __read_mostly;  | 
|---|
| 394 | 399 |   | 
|---|
| 395 | 400 |  static LIST_HEAD(pmus); | 
|---|
| 396 | 401 |  static DEFINE_MUTEX(pmus_lock); | 
|---|
| .. | .. | 
|---|
| 403 | 408 |   *   0 - disallow raw tracepoint access for unpriv | 
|---|
| 404 | 409 |   *   1 - disallow cpu events for unpriv | 
|---|
| 405 | 410 |   *   2 - disallow kernel profiling for unpriv | 
|---|
| 406 |  | - *   3 - disallow all unpriv perf event use  | 
|---|
| 407 | 411 |   */ | 
|---|
| 408 |  | -#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT  | 
|---|
| 409 |  | -int sysctl_perf_event_paranoid __read_mostly = 3;  | 
|---|
| 410 |  | -#else  | 
|---|
| 411 | 412 |  int sysctl_perf_event_paranoid __read_mostly = 2; | 
|---|
| 412 |  | -#endif  | 
|---|
| 413 | 413 |   | 
|---|
| 414 | 414 |  /* Minimum for 512 kiB + 1 user control page */ | 
|---|
| 415 | 415 |  int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ | 
|---|
| .. | .. | 
|---|
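The hunk above drops the out-of-tree `CONFIG_SECURITY_PERF_EVENTS_RESTRICT` default of 3, leaving the levels documented in the comment (plus -1 for no restrictions) with a default of 2. As a rough illustration of how the knob is consumed from userspace, here is a minimal sketch (not part of the patch, names and wording mirror the comment above) that reads `/proc/sys/kernel/perf_event_paranoid` and reports the strictest restriction in effect:

```c
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
	int level;

	if (!f) {
		perror("perf_event_paranoid");
		return 1;
	}
	if (fscanf(f, "%d", &level) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("perf_event_paranoid = %d: %s\n", level,
	       level >= 2 ? "unprivileged kernel profiling disallowed" :
	       level == 1 ? "unprivileged CPU-wide events disallowed" :
	       level == 0 ? "unprivileged raw tracepoint access disallowed" :
			    "no restrictions");
	return 0;
}
```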
| 444 | 444 |  static bool perf_rotate_context(struct perf_cpu_context *cpuctx); | 
|---|
| 445 | 445 |   | 
|---|
| 446 | 446 |  int perf_proc_update_handler(struct ctl_table *table, int write, | 
|---|
| 447 |  | -		void __user *buffer, size_t *lenp,  | 
|---|
| 448 |  | -		loff_t *ppos)  | 
|---|
 | 447 | +		void *buffer, size_t *lenp, loff_t *ppos)  | 
|---|
| 449 | 448 |  { | 
|---|
| 450 | 449 |  	int ret; | 
|---|
| 451 | 450 |  	int perf_cpu = sysctl_perf_cpu_time_max_percent; | 
|---|
| .. | .. | 
|---|
| 469 | 468 |  int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; | 
|---|
| 470 | 469 |   | 
|---|
| 471 | 470 |  int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | 
|---|
| 472 |  | -				void __user *buffer, size_t *lenp,  | 
|---|
| 473 |  | -				loff_t *ppos)  | 
|---|
 | 471 | +		void *buffer, size_t *lenp, loff_t *ppos)  | 
|---|
| 474 | 472 |  { | 
|---|
| 475 | 473 |  	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 
|---|
| 476 | 474 |   | 
|---|
| .. | .. | 
|---|
| 761 | 759 |  	/* | 
|---|
| 762 | 760 |  	 * Do not update time when cgroup is not active | 
|---|
| 763 | 761 |  	 */ | 
|---|
| 764 |  | -       if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))  | 
|---|
 | 762 | +	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))  | 
|---|
| 765 | 763 |  		__update_cgrp_time(event->cgrp); | 
|---|
| 766 | 764 |  } | 
|---|
| 767 | 765 |   | 
|---|
| .. | .. | 
|---|
| 901 | 899 |  	rcu_read_unlock(); | 
|---|
| 902 | 900 |  } | 
|---|
| 903 | 901 |   | 
|---|
 | 902 | +static int perf_cgroup_ensure_storage(struct perf_event *event,  | 
|---|
 | 903 | +				struct cgroup_subsys_state *css)  | 
|---|
 | 904 | +{  | 
|---|
 | 905 | +	struct perf_cpu_context *cpuctx;  | 
|---|
 | 906 | +	struct perf_event **storage;  | 
|---|
 | 907 | +	int cpu, heap_size, ret = 0;  | 
|---|
 | 908 | +  | 
|---|
 | 909 | +	/*  | 
|---|
 | 910 | +	 * Allow storage to have sufficient space for an iterator for each  | 
|---|
 | 911 | +	 * possibly nested cgroup plus an iterator for events with no cgroup.  | 
|---|
 | 912 | +	 */  | 
|---|
 | 913 | +	for (heap_size = 1; css; css = css->parent)  | 
|---|
 | 914 | +		heap_size++;  | 
|---|
 | 915 | +  | 
|---|
 | 916 | +	for_each_possible_cpu(cpu) {  | 
|---|
 | 917 | +		cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);  | 
|---|
 | 918 | +		if (heap_size <= cpuctx->heap_size)  | 
|---|
 | 919 | +			continue;  | 
|---|
 | 920 | +  | 
|---|
 | 921 | +		storage = kmalloc_node(heap_size * sizeof(struct perf_event *),  | 
|---|
 | 922 | +				       GFP_KERNEL, cpu_to_node(cpu));  | 
|---|
 | 923 | +		if (!storage) {  | 
|---|
 | 924 | +			ret = -ENOMEM;  | 
|---|
 | 925 | +			break;  | 
|---|
 | 926 | +		}  | 
|---|
 | 927 | +  | 
|---|
 | 928 | +		raw_spin_lock_irq(&cpuctx->ctx.lock);  | 
|---|
 | 929 | +		if (cpuctx->heap_size < heap_size) {  | 
|---|
 | 930 | +			swap(cpuctx->heap, storage);  | 
|---|
 | 931 | +			if (storage == cpuctx->heap_default)  | 
|---|
 | 932 | +				storage = NULL;  | 
|---|
 | 933 | +			cpuctx->heap_size = heap_size;  | 
|---|
 | 934 | +		}  | 
|---|
 | 935 | +		raw_spin_unlock_irq(&cpuctx->ctx.lock);  | 
|---|
 | 936 | +  | 
|---|
 | 937 | +		kfree(storage);  | 
|---|
 | 938 | +	}  | 
|---|
 | 939 | +  | 
|---|
 | 940 | +	return ret;  | 
|---|
 | 941 | +}  | 
|---|
 | 942 | +  | 
|---|
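`perf_cgroup_ensure_storage()` above grows the per-CPU iterator heap with a common pattern: allocate the larger buffer while no lock is held, swap it in under `ctx.lock` only if it is still needed, and free whichever buffer lost the race after dropping the lock (the kernel additionally avoids freeing its embedded `heap_default`). The following userspace sketch, with a pthread mutex standing in for the spinlock and purely illustrative names, shows the same shape:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct heap_storage {
	pthread_mutex_t lock;	/* stands in for cpuctx->ctx.lock */
	void **heap;		/* current iterator storage */
	int heap_size;		/* number of slots in @heap */
};

static int ensure_storage(struct heap_storage *hs, int heap_size)
{
	void **storage;

	if (heap_size <= hs->heap_size)	/* unlocked hint, rechecked below */
		return 0;

	/* Allocate while no lock is held; allocation may block. */
	storage = calloc(heap_size, sizeof(*storage));
	if (!storage)
		return -1;

	pthread_mutex_lock(&hs->lock);
	if (hs->heap_size < heap_size) {
		/* Publish the larger buffer, keep the old one for freeing. */
		void **old = hs->heap;

		hs->heap = storage;
		hs->heap_size = heap_size;
		storage = old;
	}
	pthread_mutex_unlock(&hs->lock);

	/* Old buffer, or ours if a concurrent caller won the race. */
	free(storage);
	return 0;
}

int main(void)
{
	struct heap_storage hs = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
	};

	/* e.g. three nested cgroups plus the "no cgroup" iterator */
	if (ensure_storage(&hs, 4))
		return 1;
	printf("heap_size = %d\n", hs.heap_size);
	free(hs.heap);
	return 0;
}
```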
| 904 | 943 |  static inline int perf_cgroup_connect(int fd, struct perf_event *event, | 
|---|
| 905 | 944 |  				      struct perf_event_attr *attr, | 
|---|
| 906 | 945 |  				      struct perf_event *group_leader) | 
|---|
| .. | .. | 
|---|
| 919 | 958 |  		ret = PTR_ERR(css); | 
|---|
| 920 | 959 |  		goto out; | 
|---|
| 921 | 960 |  	} | 
|---|
 | 961 | +  | 
|---|
 | 962 | +	ret = perf_cgroup_ensure_storage(event, css);  | 
|---|
 | 963 | +	if (ret)  | 
|---|
 | 964 | +		goto out;  | 
|---|
| 922 | 965 |   | 
|---|
| 923 | 966 |  	cgrp = container_of(css, struct perf_cgroup, css); | 
|---|
| 924 | 967 |  	event->cgrp = cgrp; | 
|---|
| .. | .. | 
|---|
| 945 | 988 |  	event->shadow_ctx_time = now - t->timestamp; | 
|---|
| 946 | 989 |  } | 
|---|
| 947 | 990 |   | 
|---|
| 948 |  | -/*  | 
|---|
| 949 |  | - * Update cpuctx->cgrp so that it is set when first cgroup event is added and  | 
|---|
| 950 |  | - * cleared when last cgroup event is removed.  | 
|---|
| 951 |  | - */  | 
|---|
| 952 | 991 |  static inline void | 
|---|
| 953 |  | -list_update_cgroup_event(struct perf_event *event,  | 
|---|
| 954 |  | -			 struct perf_event_context *ctx, bool add)  | 
|---|
 | 992 | +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)  | 
|---|
| 955 | 993 |  { | 
|---|
| 956 | 994 |  	struct perf_cpu_context *cpuctx; | 
|---|
| 957 |  | -	struct list_head *cpuctx_entry;  | 
|---|
| 958 | 995 |   | 
|---|
| 959 | 996 |  	if (!is_cgroup_event(event)) | 
|---|
| 960 | 997 |  		return; | 
|---|
| 961 | 998 |   | 
|---|
| 962 | 999 |  	/* | 
|---|
| 963 | 1000 |  	 * Because cgroup events are always per-cpu events, | 
|---|
| 964 |  | -	 * this will always be called from the right CPU.  | 
|---|
 | 1001 | +	 * @ctx == &cpuctx->ctx.  | 
|---|
| 965 | 1002 |  	 */ | 
|---|
| 966 |  | -	cpuctx = __get_cpu_context(ctx);  | 
|---|
 | 1003 | +	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);  | 
|---|
| 967 | 1004 |   | 
|---|
| 968 | 1005 |  	/* | 
|---|
| 969 | 1006 |  	 * Since setting cpuctx->cgrp is conditional on the current @cgrp | 
|---|
| .. | .. | 
|---|
| 971 | 1008 |  	 * because if the first would mismatch, the second would not try again | 
|---|
| 972 | 1009 |  	 * and we would leave cpuctx->cgrp unset. | 
|---|
| 973 | 1010 |  	 */ | 
|---|
| 974 |  | -	if (add && !cpuctx->cgrp) {  | 
|---|
 | 1011 | +	if (ctx->is_active && !cpuctx->cgrp) {  | 
|---|
| 975 | 1012 |  		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); | 
|---|
| 976 | 1013 |   | 
|---|
| 977 | 1014 |  		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) | 
|---|
| 978 | 1015 |  			cpuctx->cgrp = cgrp; | 
|---|
| 979 | 1016 |  	} | 
|---|
| 980 | 1017 |   | 
|---|
| 981 |  | -	if (add && ctx->nr_cgroups++)  | 
|---|
| 982 |  | -		return;  | 
|---|
| 983 |  | -	else if (!add && --ctx->nr_cgroups)  | 
|---|
 | 1018 | +	if (ctx->nr_cgroups++)  | 
|---|
| 984 | 1019 |  		return; | 
|---|
| 985 | 1020 |   | 
|---|
| 986 |  | -	/* no cgroup running */  | 
|---|
| 987 |  | -	if (!add)  | 
|---|
 | 1021 | +	list_add(&cpuctx->cgrp_cpuctx_entry,  | 
|---|
 | 1022 | +			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));  | 
|---|
 | 1023 | +}  | 
|---|
 | 1024 | +  | 
|---|
 | 1025 | +static inline void  | 
|---|
 | 1026 | +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)  | 
|---|
 | 1027 | +{  | 
|---|
 | 1028 | +	struct perf_cpu_context *cpuctx;  | 
|---|
 | 1029 | +  | 
|---|
 | 1030 | +	if (!is_cgroup_event(event))  | 
|---|
 | 1031 | +		return;  | 
|---|
 | 1032 | +  | 
|---|
 | 1033 | +	/*  | 
|---|
 | 1034 | +	 * Because cgroup events are always per-cpu events,  | 
|---|
 | 1035 | +	 * @ctx == &cpuctx->ctx.  | 
|---|
 | 1036 | +	 */  | 
|---|
 | 1037 | +	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);  | 
|---|
 | 1038 | +  | 
|---|
 | 1039 | +	if (--ctx->nr_cgroups)  | 
|---|
 | 1040 | +		return;  | 
|---|
 | 1041 | +  | 
|---|
 | 1042 | +	if (ctx->is_active && cpuctx->cgrp)  | 
|---|
| 988 | 1043 |  		cpuctx->cgrp = NULL; | 
|---|
| 989 | 1044 |   | 
|---|
| 990 |  | -	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;  | 
|---|
| 991 |  | -	if (add)  | 
|---|
| 992 |  | -		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));  | 
|---|
| 993 |  | -	else  | 
|---|
| 994 |  | -		list_del(cpuctx_entry);  | 
|---|
 | 1045 | +	list_del(&cpuctx->cgrp_cpuctx_entry);  | 
|---|
| 995 | 1046 |  } | 
|---|
| 996 | 1047 |   | 
|---|
| 997 | 1048 |  #else /* !CONFIG_CGROUP_PERF */ | 
|---|
| .. | .. | 
|---|
| 1041 | 1092 |  { | 
|---|
| 1042 | 1093 |  } | 
|---|
| 1043 | 1094 |   | 
|---|
| 1044 |  | -void  | 
|---|
 | 1095 | +static inline void  | 
|---|
| 1045 | 1096 |  perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | 
|---|
| 1046 | 1097 |  { | 
|---|
| 1047 | 1098 |  } | 
|---|
| .. | .. | 
|---|
| 1057 | 1108 |  } | 
|---|
| 1058 | 1109 |   | 
|---|
| 1059 | 1110 |  static inline void | 
|---|
| 1060 |  | -list_update_cgroup_event(struct perf_event *event,  | 
|---|
| 1061 |  | -			 struct perf_event_context *ctx, bool add)  | 
|---|
 | 1111 | +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)  | 
|---|
| 1062 | 1112 |  { | 
|---|
| 1063 | 1113 |  } | 
|---|
| 1064 | 1114 |   | 
|---|
 | 1115 | +static inline void  | 
|---|
 | 1116 | +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)  | 
|---|
 | 1117 | +{  | 
|---|
 | 1118 | +}  | 
|---|
| 1065 | 1119 |  #endif | 
|---|
| 1066 | 1120 |   | 
|---|
| 1067 | 1121 |  /* | 
|---|
| .. | .. | 
|---|
| 1113 | 1167 |  	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); | 
|---|
| 1114 | 1168 |   | 
|---|
| 1115 | 1169 |  	raw_spin_lock_init(&cpuctx->hrtimer_lock); | 
|---|
| 1116 |  | -	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);  | 
|---|
 | 1170 | +	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);  | 
|---|
| 1117 | 1171 |  	timer->function = perf_mux_hrtimer_handler; | 
|---|
| 1118 | 1172 |  } | 
|---|
| 1119 | 1173 |   | 
|---|
| .. | .. | 
|---|
| 1131 | 1185 |  	if (!cpuctx->hrtimer_active) { | 
|---|
| 1132 | 1186 |  		cpuctx->hrtimer_active = 1; | 
|---|
| 1133 | 1187 |  		hrtimer_forward_now(timer, cpuctx->hrtimer_interval); | 
|---|
| 1134 |  | -		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);  | 
|---|
 | 1188 | +		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);  | 
|---|
| 1135 | 1189 |  	} | 
|---|
| 1136 | 1190 |  	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); | 
|---|
| 1137 | 1191 |   | 
|---|
| .. | .. | 
|---|
| 1182 | 1236 |   | 
|---|
| 1183 | 1237 |  static void get_ctx(struct perf_event_context *ctx) | 
|---|
| 1184 | 1238 |  { | 
|---|
| 1185 |  | -	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));  | 
|---|
 | 1239 | +	refcount_inc(&ctx->refcount);  | 
|---|
 | 1240 | +}  | 
|---|
 | 1241 | +  | 
|---|
 | 1242 | +static void *alloc_task_ctx_data(struct pmu *pmu)  | 
|---|
 | 1243 | +{  | 
|---|
 | 1244 | +	if (pmu->task_ctx_cache)  | 
|---|
 | 1245 | +		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);  | 
|---|
 | 1246 | +  | 
|---|
 | 1247 | +	return NULL;  | 
|---|
 | 1248 | +}  | 
|---|
 | 1249 | +  | 
|---|
 | 1250 | +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)  | 
|---|
 | 1251 | +{  | 
|---|
 | 1252 | +	if (pmu->task_ctx_cache && task_ctx_data)  | 
|---|
 | 1253 | +		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);  | 
|---|
| 1186 | 1254 |  } | 
|---|
| 1187 | 1255 |   | 
|---|
| 1188 | 1256 |  static void free_ctx(struct rcu_head *head) | 
|---|
| .. | .. | 
|---|
| 1190 | 1258 |  	struct perf_event_context *ctx; | 
|---|
| 1191 | 1259 |   | 
|---|
| 1192 | 1260 |  	ctx = container_of(head, struct perf_event_context, rcu_head); | 
|---|
| 1193 |  | -	kfree(ctx->task_ctx_data);  | 
|---|
 | 1261 | +	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);  | 
|---|
| 1194 | 1262 |  	kfree(ctx); | 
|---|
| 1195 | 1263 |  } | 
|---|
| 1196 | 1264 |   | 
|---|
| 1197 | 1265 |  static void put_ctx(struct perf_event_context *ctx) | 
|---|
| 1198 | 1266 |  { | 
|---|
| 1199 |  | -	if (atomic_dec_and_test(&ctx->refcount)) {  | 
|---|
 | 1267 | +	if (refcount_dec_and_test(&ctx->refcount)) {  | 
|---|
| 1200 | 1268 |  		if (ctx->parent_ctx) | 
|---|
| 1201 | 1269 |  			put_ctx(ctx->parent_ctx); | 
|---|
| 1202 | 1270 |  		if (ctx->task && ctx->task != TASK_TOMBSTONE) | 
|---|
| .. | .. | 
|---|
| 1232 | 1300 |   * life-time rules separate them. That is an exiting task cannot fork, and a | 
|---|
| 1233 | 1301 |   * spawning task cannot (yet) exit. | 
|---|
| 1234 | 1302 |   * | 
|---|
| 1235 |  | - * But remember that that these are parent<->child context relations, and  | 
|---|
 | 1303 | + * But remember that these are parent<->child context relations, and  | 
|---|
| 1236 | 1304 |   * migration does not affect children, therefore these two orderings should not | 
|---|
| 1237 | 1305 |   * interact. | 
|---|
| 1238 | 1306 |   * | 
|---|
| .. | .. | 
|---|
| 1258 | 1326 |   * function. | 
|---|
| 1259 | 1327 |   * | 
|---|
| 1260 | 1328 |   * Lock order: | 
|---|
| 1261 |  | - *    cred_guard_mutex  | 
|---|
 | 1329 | + *    exec_update_lock  | 
|---|
| 1262 | 1330 |   *	task_struct::perf_event_mutex | 
|---|
| 1263 | 1331 |   *	  perf_event_context::mutex | 
|---|
| 1264 | 1332 |   *	    perf_event::child_mutex; | 
|---|
| 1265 | 1333 |   *	      perf_event_context::lock | 
|---|
| 1266 | 1334 |   *	    perf_event::mmap_mutex | 
|---|
| 1267 |  | - *	    mmap_sem  | 
|---|
 | 1335 | + *	    mmap_lock  | 
|---|
| 1268 | 1336 |   *	      perf_addr_filters_head::lock | 
|---|
| 1269 | 1337 |   * | 
|---|
| 1270 | 1338 |   *    cpu_hotplug_lock | 
|---|
| .. | .. | 
|---|
| 1279 | 1347 |  again: | 
|---|
| 1280 | 1348 |  	rcu_read_lock(); | 
|---|
| 1281 | 1349 |  	ctx = READ_ONCE(event->ctx); | 
|---|
| 1282 |  | -	if (!atomic_inc_not_zero(&ctx->refcount)) {  | 
|---|
 | 1350 | +	if (!refcount_inc_not_zero(&ctx->refcount)) {  | 
|---|
| 1283 | 1351 |  		rcu_read_unlock(); | 
|---|
| 1284 | 1352 |  		goto again; | 
|---|
| 1285 | 1353 |  	} | 
|---|
| .. | .. | 
|---|
| 1371 | 1439 |  /* | 
|---|
| 1372 | 1440 |   * Get the perf_event_context for a task and lock it. | 
|---|
| 1373 | 1441 |   * | 
|---|
| 1374 |  | - * This has to cope with with the fact that until it is locked,  | 
|---|
 | 1442 | + * This has to cope with the fact that until it is locked,  | 
|---|
| 1375 | 1443 |   * the context could get moved to another task. | 
|---|
| 1376 | 1444 |   */ | 
|---|
| 1377 | 1445 |  static struct perf_event_context * | 
|---|
| .. | .. | 
|---|
| 1412 | 1480 |  		} | 
|---|
| 1413 | 1481 |   | 
|---|
| 1414 | 1482 |  		if (ctx->task == TASK_TOMBSTONE || | 
|---|
| 1415 |  | -		    !atomic_inc_not_zero(&ctx->refcount)) {  | 
|---|
 | 1483 | +		    !refcount_inc_not_zero(&ctx->refcount)) {  | 
|---|
| 1416 | 1484 |  			raw_spin_unlock(&ctx->lock); | 
|---|
| 1417 | 1485 |  			ctx = NULL; | 
|---|
| 1418 | 1486 |  		} else { | 
|---|
| .. | .. | 
|---|
| 1540 | 1608 |  	if (left->cpu > right->cpu) | 
|---|
| 1541 | 1609 |  		return false; | 
|---|
| 1542 | 1610 |   | 
|---|
 | 1611 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 1612 | +	if (left->cgrp != right->cgrp) {  | 
|---|
 | 1613 | +		if (!left->cgrp || !left->cgrp->css.cgroup) {  | 
|---|
 | 1614 | +			/*  | 
|---|
 | 1615 | +			 * Left has no cgroup but right does, no cgroups come  | 
|---|
 | 1616 | +			 * first.  | 
|---|
 | 1617 | +			 */  | 
|---|
 | 1618 | +			return true;  | 
|---|
 | 1619 | +		}  | 
|---|
 | 1620 | +		if (!right->cgrp || !right->cgrp->css.cgroup) {  | 
|---|
 | 1621 | +			/*  | 
|---|
 | 1622 | +			 * Right has no cgroup but left does, no cgroups come  | 
|---|
 | 1623 | +			 * first.  | 
|---|
 | 1624 | +			 */  | 
|---|
 | 1625 | +			return false;  | 
|---|
 | 1626 | +		}  | 
|---|
 | 1627 | +		/* Two dissimilar cgroups, order by id. */  | 
|---|
 | 1628 | +		if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)  | 
|---|
 | 1629 | +			return true;  | 
|---|
 | 1630 | +  | 
|---|
 | 1631 | +		return false;  | 
|---|
 | 1632 | +	}  | 
|---|
 | 1633 | +#endif  | 
|---|
 | 1634 | +  | 
|---|
| 1543 | 1635 |  	if (left->group_index < right->group_index) | 
|---|
| 1544 | 1636 |  		return true; | 
|---|
| 1545 | 1637 |  	if (left->group_index > right->group_index) | 
|---|
| .. | .. | 
|---|
| 1619 | 1711 |  } | 
|---|
| 1620 | 1712 |   | 
|---|
| 1621 | 1713 |  /* | 
|---|
| 1622 |  | - * Get the leftmost event in the @cpu subtree.  | 
|---|
 | 1714 | + * Get the leftmost event in the cpu/cgroup subtree.  | 
|---|
| 1623 | 1715 |   */ | 
|---|
| 1624 | 1716 |  static struct perf_event * | 
|---|
| 1625 |  | -perf_event_groups_first(struct perf_event_groups *groups, int cpu)  | 
|---|
 | 1717 | +perf_event_groups_first(struct perf_event_groups *groups, int cpu,  | 
|---|
 | 1718 | +			struct cgroup *cgrp)  | 
|---|
| 1626 | 1719 |  { | 
|---|
| 1627 | 1720 |  	struct perf_event *node_event = NULL, *match = NULL; | 
|---|
| 1628 | 1721 |  	struct rb_node *node = groups->tree.rb_node; | 
|---|
 | 1722 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 1723 | +	u64 node_cgrp_id, cgrp_id = 0;  | 
|---|
 | 1724 | +  | 
|---|
 | 1725 | +	if (cgrp)  | 
|---|
 | 1726 | +		cgrp_id = cgrp->kn->id;  | 
|---|
 | 1727 | +#endif  | 
|---|
| 1629 | 1728 |   | 
|---|
| 1630 | 1729 |  	while (node) { | 
|---|
| 1631 | 1730 |  		node_event = container_of(node, struct perf_event, group_node); | 
|---|
| 1632 | 1731 |   | 
|---|
| 1633 | 1732 |  		if (cpu < node_event->cpu) { | 
|---|
| 1634 | 1733 |  			node = node->rb_left; | 
|---|
| 1635 |  | -		} else if (cpu > node_event->cpu) {  | 
|---|
| 1636 |  | -			node = node->rb_right;  | 
|---|
| 1637 |  | -		} else {  | 
|---|
| 1638 |  | -			match = node_event;  | 
|---|
| 1639 |  | -			node = node->rb_left;  | 
|---|
 | 1734 | +			continue;  | 
|---|
| 1640 | 1735 |  		} | 
|---|
 | 1736 | +		if (cpu > node_event->cpu) {  | 
|---|
 | 1737 | +			node = node->rb_right;  | 
|---|
 | 1738 | +			continue;  | 
|---|
 | 1739 | +		}  | 
|---|
 | 1740 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 1741 | +		node_cgrp_id = 0;  | 
|---|
 | 1742 | +		if (node_event->cgrp && node_event->cgrp->css.cgroup)  | 
|---|
 | 1743 | +			node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;  | 
|---|
 | 1744 | +  | 
|---|
 | 1745 | +		if (cgrp_id < node_cgrp_id) {  | 
|---|
 | 1746 | +			node = node->rb_left;  | 
|---|
 | 1747 | +			continue;  | 
|---|
 | 1748 | +		}  | 
|---|
 | 1749 | +		if (cgrp_id > node_cgrp_id) {  | 
|---|
 | 1750 | +			node = node->rb_right;  | 
|---|
 | 1751 | +			continue;  | 
|---|
 | 1752 | +		}  | 
|---|
 | 1753 | +#endif  | 
|---|
 | 1754 | +		match = node_event;  | 
|---|
 | 1755 | +		node = node->rb_left;  | 
|---|
| 1641 | 1756 |  	} | 
|---|
| 1642 | 1757 |   | 
|---|
| 1643 | 1758 |  	return match; | 
|---|
| .. | .. | 
|---|
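With the cgroup id folded into the sort key, the groups tree is ordered by `{cpu, cgroup id, group_index}` and `perf_event_groups_first()` keeps walking left on every exact `{cpu, cgroup}` match to land on the smallest `group_index`. The sketch below is not from the kernel; a sorted array and binary search stand in for the rbtree, and all names are illustrative, but it shows the same leftmost-match lookup over the composite key:

```c
#include <stdio.h>
#include <stdint.h>

struct ev {
	int cpu;
	uint64_t cgrp_id;	/* 0 means "no cgroup" */
	uint64_t group_index;
};

/* -1, 0, +1 on the {cpu, cgroup id} prefix only. */
static int key_cmp(const struct ev *e, int cpu, uint64_t cgrp_id)
{
	if (cpu != e->cpu)
		return cpu < e->cpu ? -1 : 1;
	if (cgrp_id != e->cgrp_id)
		return cgrp_id < e->cgrp_id ? -1 : 1;
	return 0;
}

/* Leftmost element matching {cpu, cgrp_id}, or -1 if none. */
static int groups_first(const struct ev *evs, int nr, int cpu, uint64_t cgrp_id)
{
	int lo = 0, hi = nr - 1, match = -1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		int c = key_cmp(&evs[mid], cpu, cgrp_id);

		if (c < 0) {
			hi = mid - 1;		/* node = node->rb_left  */
		} else if (c > 0) {
			lo = mid + 1;		/* node = node->rb_right */
		} else {
			match = mid;		/* remember, keep going left */
			hi = mid - 1;
		}
	}
	return match;
}

int main(void)
{
	/* Sorted by {cpu, cgrp_id, group_index}; cpu == -1 is "any CPU". */
	static const struct ev evs[] = {
		{ -1, 0, 1 }, { 0, 0, 2 }, { 0, 7, 3 }, { 0, 7, 5 }, { 1, 0, 4 },
	};

	/* First event for CPU 0 in cgroup id 7 -> index 2 (group_index 3). */
	printf("%d\n", groups_first(evs, 5, 0, 7));
	return 0;
}
```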
| 1650 | 1765 |  perf_event_groups_next(struct perf_event *event) | 
|---|
| 1651 | 1766 |  { | 
|---|
| 1652 | 1767 |  	struct perf_event *next; | 
|---|
 | 1768 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 1769 | +	u64 curr_cgrp_id = 0;  | 
|---|
 | 1770 | +	u64 next_cgrp_id = 0;  | 
|---|
 | 1771 | +#endif  | 
|---|
| 1653 | 1772 |   | 
|---|
| 1654 | 1773 |  	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); | 
|---|
| 1655 |  | -	if (next && next->cpu == event->cpu)  | 
|---|
| 1656 |  | -		return next;  | 
|---|
 | 1774 | +	if (next == NULL || next->cpu != event->cpu)  | 
|---|
 | 1775 | +		return NULL;  | 
|---|
| 1657 | 1776 |   | 
|---|
| 1658 |  | -	return NULL;  | 
|---|
 | 1777 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 1778 | +	if (event->cgrp && event->cgrp->css.cgroup)  | 
|---|
 | 1779 | +		curr_cgrp_id = event->cgrp->css.cgroup->kn->id;  | 
|---|
 | 1780 | +  | 
|---|
 | 1781 | +	if (next->cgrp && next->cgrp->css.cgroup)  | 
|---|
 | 1782 | +		next_cgrp_id = next->cgrp->css.cgroup->kn->id;  | 
|---|
 | 1783 | +  | 
|---|
 | 1784 | +	if (curr_cgrp_id != next_cgrp_id)  | 
|---|
 | 1785 | +		return NULL;  | 
|---|
 | 1786 | +#endif  | 
|---|
 | 1787 | +	return next;  | 
|---|
| 1659 | 1788 |  } | 
|---|
| 1660 | 1789 |   | 
|---|
| 1661 | 1790 |  /* | 
|---|
| .. | .. | 
|---|
| 1691 | 1820 |  		add_event_to_groups(event, ctx); | 
|---|
| 1692 | 1821 |  	} | 
|---|
| 1693 | 1822 |   | 
|---|
| 1694 |  | -	list_update_cgroup_event(event, ctx, true);  | 
|---|
| 1695 |  | -  | 
|---|
| 1696 | 1823 |  	list_add_rcu(&event->event_entry, &ctx->event_list); | 
|---|
| 1697 | 1824 |  	ctx->nr_events++; | 
|---|
| 1698 | 1825 |  	if (event->attr.inherit_stat) | 
|---|
| 1699 | 1826 |  		ctx->nr_stat++; | 
|---|
 | 1827 | +  | 
|---|
 | 1828 | +	if (event->state > PERF_EVENT_STATE_OFF)  | 
|---|
 | 1829 | +		perf_cgroup_event_enable(event, ctx);  | 
|---|
| 1700 | 1830 |   | 
|---|
| 1701 | 1831 |  	ctx->generation++; | 
|---|
| 1702 | 1832 |  } | 
|---|
| .. | .. | 
|---|
| 1762 | 1892 |   | 
|---|
| 1763 | 1893 |  	if (sample_type & PERF_SAMPLE_PHYS_ADDR) | 
|---|
| 1764 | 1894 |  		size += sizeof(data->phys_addr); | 
|---|
 | 1895 | +  | 
|---|
 | 1896 | +	if (sample_type & PERF_SAMPLE_CGROUP)  | 
|---|
 | 1897 | +		size += sizeof(data->cgroup);  | 
|---|
| 1765 | 1898 |   | 
|---|
| 1766 | 1899 |  	event->header_size = size; | 
|---|
| 1767 | 1900 |  } | 
|---|
| .. | .. | 
|---|
| 1873 | 2006 |   | 
|---|
| 1874 | 2007 |  	event->attach_state &= ~PERF_ATTACH_CONTEXT; | 
|---|
| 1875 | 2008 |   | 
|---|
| 1876 |  | -	list_update_cgroup_event(event, ctx, false);  | 
|---|
| 1877 |  | -  | 
|---|
| 1878 | 2009 |  	ctx->nr_events--; | 
|---|
| 1879 | 2010 |  	if (event->attr.inherit_stat) | 
|---|
| 1880 | 2011 |  		ctx->nr_stat--; | 
|---|
| .. | .. | 
|---|
| 1891 | 2022 |  	 * of error state is by explicit re-enabling | 
|---|
| 1892 | 2023 |  	 * of the event | 
|---|
| 1893 | 2024 |  	 */ | 
|---|
| 1894 |  | -	if (event->state > PERF_EVENT_STATE_OFF)  | 
|---|
 | 2025 | +	if (event->state > PERF_EVENT_STATE_OFF) {  | 
|---|
 | 2026 | +		perf_cgroup_event_disable(event, ctx);  | 
|---|
| 1895 | 2027 |  		perf_event_set_state(event, PERF_EVENT_STATE_OFF); | 
|---|
 | 2028 | +	}  | 
|---|
| 1896 | 2029 |   | 
|---|
| 1897 | 2030 |  	ctx->generation++; | 
|---|
| 1898 | 2031 |  } | 
|---|
| 1899 | 2032 |   | 
|---|
 | 2033 | +static int  | 
|---|
 | 2034 | +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)  | 
|---|
 | 2035 | +{  | 
|---|
 | 2036 | +	if (!has_aux(aux_event))  | 
|---|
 | 2037 | +		return 0;  | 
|---|
 | 2038 | +  | 
|---|
 | 2039 | +	if (!event->pmu->aux_output_match)  | 
|---|
 | 2040 | +		return 0;  | 
|---|
 | 2041 | +  | 
|---|
 | 2042 | +	return event->pmu->aux_output_match(aux_event);  | 
|---|
 | 2043 | +}  | 
|---|
 | 2044 | +  | 
|---|
 | 2045 | +static void put_event(struct perf_event *event);  | 
|---|
 | 2046 | +static void event_sched_out(struct perf_event *event,  | 
|---|
 | 2047 | +			    struct perf_cpu_context *cpuctx,  | 
|---|
 | 2048 | +			    struct perf_event_context *ctx);  | 
|---|
 | 2049 | +  | 
|---|
 | 2050 | +static void perf_put_aux_event(struct perf_event *event)  | 
|---|
 | 2051 | +{  | 
|---|
 | 2052 | +	struct perf_event_context *ctx = event->ctx;  | 
|---|
 | 2053 | +	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);  | 
|---|
 | 2054 | +	struct perf_event *iter;  | 
|---|
 | 2055 | +  | 
|---|
 | 2056 | +	/*  | 
|---|
 | 2057 | +	 * If event uses aux_event tear down the link  | 
|---|
 | 2058 | +	 */  | 
|---|
 | 2059 | +	if (event->aux_event) {  | 
|---|
 | 2060 | +		iter = event->aux_event;  | 
|---|
 | 2061 | +		event->aux_event = NULL;  | 
|---|
 | 2062 | +		put_event(iter);  | 
|---|
 | 2063 | +		return;  | 
|---|
 | 2064 | +	}  | 
|---|
 | 2065 | +  | 
|---|
 | 2066 | +	/*  | 
|---|
 | 2067 | +	 * If the event is an aux_event, tear down all links to  | 
|---|
 | 2068 | +	 * it from other events.  | 
|---|
 | 2069 | +	 */  | 
|---|
 | 2070 | +	for_each_sibling_event(iter, event->group_leader) {  | 
|---|
 | 2071 | +		if (iter->aux_event != event)  | 
|---|
 | 2072 | +			continue;  | 
|---|
 | 2073 | +  | 
|---|
 | 2074 | +		iter->aux_event = NULL;  | 
|---|
 | 2075 | +		put_event(event);  | 
|---|
 | 2076 | +  | 
|---|
 | 2077 | +		/*  | 
|---|
 | 2078 | +		 * If it's ACTIVE, schedule it out and put it into ERROR  | 
|---|
 | 2079 | +		 * state so that we don't try to schedule it again. Note  | 
|---|
 | 2080 | +		 * that perf_event_enable() will clear the ERROR status.  | 
|---|
 | 2081 | +		 */  | 
|---|
 | 2082 | +		event_sched_out(iter, cpuctx, ctx);  | 
|---|
 | 2083 | +		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);  | 
|---|
 | 2084 | +	}  | 
|---|
 | 2085 | +}  | 
|---|
 | 2086 | +  | 
|---|
 | 2087 | +static bool perf_need_aux_event(struct perf_event *event)  | 
|---|
 | 2088 | +{  | 
|---|
 | 2089 | +	return !!event->attr.aux_output || !!event->attr.aux_sample_size;  | 
|---|
 | 2090 | +}  | 
|---|
 | 2091 | +  | 
|---|
 | 2092 | +static int perf_get_aux_event(struct perf_event *event,  | 
|---|
 | 2093 | +			      struct perf_event *group_leader)  | 
|---|
 | 2094 | +{  | 
|---|
 | 2095 | +	/*  | 
|---|
 | 2096 | +	 * Our group leader must be an aux event if we want to be  | 
|---|
 | 2097 | +	 * an aux_output. This way, the aux event will precede its  | 
|---|
 | 2098 | +	 * aux_output events in the group, and therefore will always  | 
|---|
 | 2099 | +	 * schedule first.  | 
|---|
 | 2100 | +	 */  | 
|---|
 | 2101 | +	if (!group_leader)  | 
|---|
 | 2102 | +		return 0;  | 
|---|
 | 2103 | +  | 
|---|
 | 2104 | +	/*  | 
|---|
 | 2105 | +	 * aux_output and aux_sample_size are mutually exclusive.  | 
|---|
 | 2106 | +	 */  | 
|---|
 | 2107 | +	if (event->attr.aux_output && event->attr.aux_sample_size)  | 
|---|
 | 2108 | +		return 0;  | 
|---|
 | 2109 | +  | 
|---|
 | 2110 | +	if (event->attr.aux_output &&  | 
|---|
 | 2111 | +	    !perf_aux_output_match(event, group_leader))  | 
|---|
 | 2112 | +		return 0;  | 
|---|
 | 2113 | +  | 
|---|
 | 2114 | +	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)  | 
|---|
 | 2115 | +		return 0;  | 
|---|
 | 2116 | +  | 
|---|
 | 2117 | +	if (!atomic_long_inc_not_zero(&group_leader->refcount))  | 
|---|
 | 2118 | +		return 0;  | 
|---|
 | 2119 | +  | 
|---|
 | 2120 | +	/*  | 
|---|
 | 2121 | +	 * Link aux_outputs to their aux event; this is undone in  | 
|---|
 | 2122 | +	 * perf_group_detach() by perf_put_aux_event(). When the  | 
|---|
 | 2123 | +	 * group is torn down, the aux_output events lose their  | 
|---|
 | 2124 | +	 * link to the aux_event and can't schedule any more.  | 
|---|
 | 2125 | +	 */  | 
|---|
 | 2126 | +	event->aux_event = group_leader;  | 
|---|
 | 2127 | +  | 
|---|
 | 2128 | +	return 1;  | 
|---|
 | 2129 | +}  | 
|---|
 | 2130 | +  | 
|---|
 | 2131 | +static inline struct list_head *get_event_list(struct perf_event *event)  | 
|---|
 | 2132 | +{  | 
|---|
 | 2133 | +	struct perf_event_context *ctx = event->ctx;  | 
|---|
 | 2134 | +	return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;  | 
|---|
 | 2135 | +}  | 
|---|
 | 2136 | +  | 
|---|
 | 2137 | +/*  | 
|---|
 | 2138 | + * Events that have PERF_EV_CAP_SIBLING require being part of a group and  | 
|---|
 | 2139 | + * cannot exist on their own, schedule them out and move them into the ERROR  | 
|---|
 | 2140 | + * state. Also see _perf_event_enable(), it will not be able to recover  | 
|---|
 | 2141 | + * this ERROR state.  | 
|---|
 | 2142 | + */  | 
|---|
 | 2143 | +static inline void perf_remove_sibling_event(struct perf_event *event)  | 
|---|
 | 2144 | +{  | 
|---|
 | 2145 | +	struct perf_event_context *ctx = event->ctx;  | 
|---|
 | 2146 | +	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);  | 
|---|
 | 2147 | +  | 
|---|
 | 2148 | +	event_sched_out(event, cpuctx, ctx);  | 
|---|
 | 2149 | +	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);  | 
|---|
 | 2150 | +}  | 
|---|
 | 2151 | +  | 
|---|
| 1900 | 2152 |  static void perf_group_detach(struct perf_event *event) | 
|---|
| 1901 | 2153 |  { | 
|---|
 | 2154 | +	struct perf_event *leader = event->group_leader;  | 
|---|
| 1902 | 2155 |  	struct perf_event *sibling, *tmp; | 
|---|
| 1903 | 2156 |  	struct perf_event_context *ctx = event->ctx; | 
|---|
| 1904 | 2157 |   | 
|---|
| .. | .. | 
|---|
| 1912 | 2165 |   | 
|---|
| 1913 | 2166 |  	event->attach_state &= ~PERF_ATTACH_GROUP; | 
|---|
| 1914 | 2167 |   | 
|---|
 | 2168 | +	perf_put_aux_event(event);  | 
|---|
 | 2169 | +  | 
|---|
| 1915 | 2170 |  	/* | 
|---|
| 1916 | 2171 |  	 * If this is a sibling, remove it from its group. | 
|---|
| 1917 | 2172 |  	 */ | 
|---|
| 1918 |  | -	if (event->group_leader != event) {  | 
|---|
 | 2173 | +	if (leader != event) {  | 
|---|
| 1919 | 2174 |  		list_del_init(&event->sibling_list); | 
|---|
| 1920 | 2175 |  		event->group_leader->nr_siblings--; | 
|---|
| 1921 | 2176 |  		goto out; | 
|---|
| .. | .. | 
|---|
| 1928 | 2183 |  	 */ | 
|---|
| 1929 | 2184 |  	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { | 
|---|
| 1930 | 2185 |   | 
|---|
 | 2186 | +		if (sibling->event_caps & PERF_EV_CAP_SIBLING)  | 
|---|
 | 2187 | +			perf_remove_sibling_event(sibling);  | 
|---|
 | 2188 | +  | 
|---|
| 1931 | 2189 |  		sibling->group_leader = sibling; | 
|---|
| 1932 | 2190 |  		list_del_init(&sibling->sibling_list); | 
|---|
| 1933 | 2191 |   | 
|---|
| .. | .. | 
|---|
| 1937 | 2195 |  		if (!RB_EMPTY_NODE(&event->group_node)) { | 
|---|
| 1938 | 2196 |  			add_event_to_groups(sibling, event->ctx); | 
|---|
| 1939 | 2197 |   | 
|---|
| 1940 |  | -			if (sibling->state == PERF_EVENT_STATE_ACTIVE) {  | 
|---|
| 1941 |  | -				struct list_head *list = sibling->attr.pinned ?  | 
|---|
| 1942 |  | -					&ctx->pinned_active : &ctx->flexible_active;  | 
|---|
| 1943 |  | -  | 
|---|
| 1944 |  | -				list_add_tail(&sibling->active_list, list);  | 
|---|
| 1945 |  | -			}  | 
|---|
 | 2198 | +			if (sibling->state == PERF_EVENT_STATE_ACTIVE)  | 
|---|
 | 2199 | +				list_add_tail(&sibling->active_list, get_event_list(sibling));  | 
|---|
| 1946 | 2200 |  		} | 
|---|
| 1947 | 2201 |   | 
|---|
| 1948 | 2202 |  		WARN_ON_ONCE(sibling->ctx != event->ctx); | 
|---|
| 1949 | 2203 |  	} | 
|---|
| 1950 | 2204 |   | 
|---|
| 1951 | 2205 |  out: | 
|---|
| 1952 |  | -	perf_event__header_size(event->group_leader);  | 
|---|
| 1953 |  | -  | 
|---|
| 1954 |  | -	for_each_sibling_event(tmp, event->group_leader)  | 
|---|
 | 2206 | +	for_each_sibling_event(tmp, leader)  | 
|---|
| 1955 | 2207 |  		perf_event__header_size(tmp); | 
|---|
 | 2208 | +  | 
|---|
 | 2209 | +	perf_event__header_size(leader);  | 
|---|
| 1956 | 2210 |  } | 
|---|
| 1957 | 2211 |   | 
|---|
| 1958 | 2212 |  static bool is_orphaned_event(struct perf_event *event) | 
|---|
| .. | .. | 
|---|
| 2021 | 2275 |   | 
|---|
| 2022 | 2276 |  	if (READ_ONCE(event->pending_disable) >= 0) { | 
|---|
| 2023 | 2277 |  		WRITE_ONCE(event->pending_disable, -1); | 
|---|
 | 2278 | +		perf_cgroup_event_disable(event, ctx);  | 
|---|
| 2024 | 2279 |  		state = PERF_EVENT_STATE_OFF; | 
|---|
| 2025 | 2280 |  	} | 
|---|
| 2026 | 2281 |  	perf_event_set_state(event, state); | 
|---|
| .. | .. | 
|---|
| 2058 | 2313 |  		event_sched_out(event, cpuctx, ctx); | 
|---|
| 2059 | 2314 |   | 
|---|
| 2060 | 2315 |  	perf_pmu_enable(ctx->pmu); | 
|---|
| 2061 |  | -  | 
|---|
| 2062 |  | -	if (group_event->attr.exclusive)  | 
|---|
| 2063 |  | -		cpuctx->exclusive = 0;  | 
|---|
| 2064 | 2316 |  } | 
|---|
| 2065 | 2317 |   | 
|---|
| 2066 | 2318 |  #define DETACH_GROUP	0x01UL | 
|---|
| .. | .. | 
|---|
| 2091 | 2343 |   | 
|---|
| 2092 | 2344 |  	if (!ctx->nr_events && ctx->is_active) { | 
|---|
| 2093 | 2345 |  		ctx->is_active = 0; | 
|---|
 | 2346 | +		ctx->rotate_necessary = 0;  | 
|---|
| 2094 | 2347 |  		if (ctx->task) { | 
|---|
| 2095 | 2348 |  			WARN_ON_ONCE(cpuctx->task_ctx != ctx); | 
|---|
| 2096 | 2349 |  			cpuctx->task_ctx = NULL; | 
|---|
| .. | .. | 
|---|
| 2157 | 2410 |  		event_sched_out(event, cpuctx, ctx); | 
|---|
| 2158 | 2411 |   | 
|---|
| 2159 | 2412 |  	perf_event_set_state(event, PERF_EVENT_STATE_OFF); | 
|---|
 | 2413 | +	perf_cgroup_event_disable(event, ctx);  | 
|---|
| 2160 | 2414 |  } | 
|---|
| 2161 | 2415 |   | 
|---|
| 2162 | 2416 |  /* | 
|---|
| .. | .. | 
|---|
| 2164 | 2418 |   * | 
|---|
| 2165 | 2419 |   * If event->ctx is a cloned context, callers must make sure that | 
|---|
| 2166 | 2420 |   * every task struct that event->ctx->task could possibly point to | 
|---|
| 2167 |  | - * remains valid.  This condition is satisifed when called through  | 
|---|
 | 2421 | + * remains valid.  This condition is satisfied when called through  | 
|---|
| 2168 | 2422 |   * perf_event_for_each_child or perf_event_for_each because they | 
|---|
| 2169 | 2423 |   * hold the top-level event's child_mutex, so any descendant that | 
|---|
| 2170 | 2424 |   * goes to exit will block in perf_event_exit_event(). | 
|---|
| .. | .. | 
|---|
| 2238 | 2492 |  	 * But this is a bit hairy. | 
|---|
| 2239 | 2493 |  	 * | 
|---|
| 2240 | 2494 |  	 * So instead, we have an explicit cgroup call to remain | 
|---|
| 2241 |  | -	 * within the time time source all along. We believe it  | 
|---|
 | 2495 | +	 * within the time source all along. We believe it  | 
|---|
| 2242 | 2496 |  	 * is cleaner and simpler to understand. | 
|---|
| 2243 | 2497 |  	 */ | 
|---|
| 2244 | 2498 |  	if (is_cgroup_event(event)) | 
|---|
| .. | .. | 
|---|
| 2258 | 2512 |  		 struct perf_event_context *ctx) | 
|---|
| 2259 | 2513 |  { | 
|---|
| 2260 | 2514 |  	int ret = 0; | 
|---|
 | 2515 | +  | 
|---|
 | 2516 | +	WARN_ON_ONCE(event->ctx != ctx);  | 
|---|
| 2261 | 2517 |   | 
|---|
| 2262 | 2518 |  	lockdep_assert_held(&ctx->lock); | 
|---|
| 2263 | 2519 |   | 
|---|
| .. | .. | 
|---|
| 2325 | 2581 |   | 
|---|
| 2326 | 2582 |  	pmu->start_txn(pmu, PERF_PMU_TXN_ADD); | 
|---|
| 2327 | 2583 |   | 
|---|
| 2328 |  | -	if (event_sched_in(group_event, cpuctx, ctx)) {  | 
|---|
| 2329 |  | -		pmu->cancel_txn(pmu);  | 
|---|
| 2330 |  | -		perf_mux_hrtimer_restart(cpuctx);  | 
|---|
| 2331 |  | -		return -EAGAIN;  | 
|---|
| 2332 |  | -	}  | 
|---|
 | 2584 | +	if (event_sched_in(group_event, cpuctx, ctx))  | 
|---|
 | 2585 | +		goto error;  | 
|---|
| 2333 | 2586 |   | 
|---|
| 2334 | 2587 |  	/* | 
|---|
| 2335 | 2588 |  	 * Schedule in siblings as one group (if any): | 
|---|
| .. | .. | 
|---|
| 2358 | 2611 |  	} | 
|---|
| 2359 | 2612 |  	event_sched_out(group_event, cpuctx, ctx); | 
|---|
| 2360 | 2613 |   | 
|---|
 | 2614 | +error:  | 
|---|
| 2361 | 2615 |  	pmu->cancel_txn(pmu); | 
|---|
| 2362 |  | -  | 
|---|
| 2363 |  | -	perf_mux_hrtimer_restart(cpuctx);  | 
|---|
| 2364 |  | -  | 
|---|
| 2365 | 2616 |  	return -EAGAIN; | 
|---|
| 2366 | 2617 |  } | 
|---|
| 2367 | 2618 |   | 
|---|
| .. | .. | 
|---|
| 2387 | 2638 |  	 * If this group is exclusive and there are already | 
|---|
| 2388 | 2639 |  	 * events on the CPU, it can't go on. | 
|---|
| 2389 | 2640 |  	 */ | 
|---|
| 2390 |  | -	if (event->attr.exclusive && cpuctx->active_oncpu)  | 
|---|
 | 2641 | +	if (event->attr.exclusive && !list_empty(get_event_list(event)))  | 
|---|
| 2391 | 2642 |  		return 0; | 
|---|
| 2392 | 2643 |  	/* | 
|---|
| 2393 | 2644 |  	 * Otherwise, try to add it if all previous groups were able | 
|---|
| .. | .. | 
|---|
| 2488 | 2739 |  	perf_pmu_enable(cpuctx->ctx.pmu); | 
|---|
| 2489 | 2740 |  } | 
|---|
| 2490 | 2741 |   | 
|---|
 | 2742 | +void perf_pmu_resched(struct pmu *pmu)  | 
|---|
 | 2743 | +{  | 
|---|
 | 2744 | +	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);  | 
|---|
 | 2745 | +	struct perf_event_context *task_ctx = cpuctx->task_ctx;  | 
|---|
 | 2746 | +  | 
|---|
 | 2747 | +	perf_ctx_lock(cpuctx, task_ctx);  | 
|---|
 | 2748 | +	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);  | 
|---|
 | 2749 | +	perf_ctx_unlock(cpuctx, task_ctx);  | 
|---|
 | 2750 | +}  | 
|---|
 | 2751 | +  | 
|---|
| 2491 | 2752 |  /* | 
|---|
| 2492 | 2753 |   * Cross CPU call to install and enable a performance event | 
|---|
| 2493 | 2754 |   * | 
|---|
| .. | .. | 
|---|
| 2528 | 2789 |  	} | 
|---|
| 2529 | 2790 |   | 
|---|
| 2530 | 2791 |  #ifdef CONFIG_CGROUP_PERF | 
|---|
| 2531 |  | -	if (is_cgroup_event(event)) {  | 
|---|
 | 2792 | +	if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {  | 
|---|
| 2532 | 2793 |  		/* | 
|---|
| 2533 | 2794 |  		 * If the current cgroup doesn't match the event's | 
|---|
| 2534 | 2795 |  		 * cgroup, we should not try to schedule it. | 
|---|
| .. | .. | 
|---|
| 2580 | 2841 |  	 * will be 'complete'. See perf_iterate_sb_cpu(). | 
|---|
| 2581 | 2842 |  	 */ | 
|---|
| 2582 | 2843 |  	smp_store_release(&event->ctx, ctx); | 
|---|
 | 2844 | +  | 
|---|
 | 2845 | +	/*  | 
|---|
 | 2846 | +	 * perf_event_attr::disabled events will not run and can be initialized  | 
|---|
 | 2847 | +	 * without IPI. Except when this is the first event for the context, in  | 
|---|
 | 2848 | +	 * that case we need the magic of the IPI to set ctx->is_active.  | 
|---|
 | 2849 | +	 *  | 
|---|
 | 2850 | +	 * The IOC_ENABLE that is sure to follow the creation of a disabled  | 
|---|
 | 2851 | +	 * event will issue the IPI and reprogram the hardware.  | 
|---|
 | 2852 | +	 */  | 
|---|
 | 2853 | +	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {  | 
|---|
 | 2854 | +		raw_spin_lock_irq(&ctx->lock);  | 
|---|
 | 2855 | +		if (ctx->task == TASK_TOMBSTONE) {  | 
|---|
 | 2856 | +			raw_spin_unlock_irq(&ctx->lock);  | 
|---|
 | 2857 | +			return;  | 
|---|
 | 2858 | +		}  | 
|---|
 | 2859 | +		add_event_to_ctx(event, ctx);  | 
|---|
 | 2860 | +		raw_spin_unlock_irq(&ctx->lock);  | 
|---|
 | 2861 | +		return;  | 
|---|
 | 2862 | +	}  | 
|---|
| 2583 | 2863 |   | 
|---|
| 2584 | 2864 |  	if (!task) { | 
|---|
| 2585 | 2865 |  		cpu_function_call(cpu, __perf_install_in_context, event); | 
|---|
| .. | .. | 
|---|
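The new fast path above relies on the usual userspace sequence: a counter created with `perf_event_attr::disabled` set stays OFF and can be added to the context without an IPI, and the `PERF_EVENT_IOC_ENABLE` that typically follows does the cross-CPU work. A minimal, lightly error-checked sketch of that sequence (not part of the patch):

```c
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;		/* created OFF: no install IPI needed yet */
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */,
		     -1 /* any cpu */, -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* this is what issues the IPI */

	for (volatile int i = 0; i < 1000000; i++)
		;	/* some work to count */

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}
```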
| 2669 | 2949 |  		ctx_sched_out(ctx, cpuctx, EVENT_TIME); | 
|---|
| 2670 | 2950 |   | 
|---|
| 2671 | 2951 |  	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); | 
|---|
 | 2952 | +	perf_cgroup_event_enable(event, ctx);  | 
|---|
| 2672 | 2953 |   | 
|---|
| 2673 | 2954 |  	if (!ctx->is_active) | 
|---|
| 2674 | 2955 |  		return; | 
|---|
| .. | .. | 
|---|
| 2710 | 2991 |  	raw_spin_lock_irq(&ctx->lock); | 
|---|
| 2711 | 2992 |  	if (event->state >= PERF_EVENT_STATE_INACTIVE || | 
|---|
| 2712 | 2993 |  	    event->state <  PERF_EVENT_STATE_ERROR) { | 
|---|
 | 2994 | +out:  | 
|---|
| 2713 | 2995 |  		raw_spin_unlock_irq(&ctx->lock); | 
|---|
| 2714 | 2996 |  		return; | 
|---|
| 2715 | 2997 |  	} | 
|---|
| .. | .. | 
|---|
| 2721 | 3003 |  	 * has gone back into error state, as distinct from the task having | 
|---|
| 2722 | 3004 |  	 * been scheduled away before the cross-call arrived. | 
|---|
| 2723 | 3005 |  	 */ | 
|---|
| 2724 |  | -	if (event->state == PERF_EVENT_STATE_ERROR)  | 
|---|
 | 3006 | +	if (event->state == PERF_EVENT_STATE_ERROR) {  | 
|---|
 | 3007 | +		/*  | 
|---|
 | 3008 | +		 * Detached SIBLING events cannot leave ERROR state.  | 
|---|
 | 3009 | +		 */  | 
|---|
 | 3010 | +		if (event->event_caps & PERF_EV_CAP_SIBLING &&  | 
|---|
 | 3011 | +		    event->group_leader == event)  | 
|---|
 | 3012 | +			goto out;  | 
|---|
 | 3013 | +  | 
|---|
| 2725 | 3014 |  		event->state = PERF_EVENT_STATE_OFF; | 
|---|
 | 3015 | +	}  | 
|---|
| 2726 | 3016 |  	raw_spin_unlock_irq(&ctx->lock); | 
|---|
| 2727 | 3017 |   | 
|---|
| 2728 | 3018 |  	event_function_call(event, __perf_event_enable, NULL); | 
|---|
| .. | .. | 
|---|
| 2826 | 3116 |   *     pre-existing mappings, called once when new filters arrive via SET_FILTER | 
|---|
| 2827 | 3117 |   *     ioctl; | 
|---|
| 2828 | 3118 |   * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly | 
|---|
| 2829 |  | - *     registered mapping, called for every new mmap(), with mm::mmap_sem down  | 
|---|
 | 3119 | + *     registered mapping, called for every new mmap(), with mm::mmap_lock down  | 
|---|
| 2830 | 3120 |   *     for reading; | 
|---|
| 2831 | 3121 |   * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process | 
|---|
| 2832 | 3122 |   *     of exec. | 
|---|
| .. | .. | 
|---|
| 2966 | 3256 |  	if (is_active & EVENT_FLEXIBLE) { | 
|---|
| 2967 | 3257 |  		list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) | 
|---|
| 2968 | 3258 |  			group_sched_out(event, cpuctx, ctx); | 
|---|
 | 3259 | +  | 
|---|
 | 3260 | +		/*  | 
|---|
 | 3261 | +		 * Since we cleared EVENT_FLEXIBLE, also clear  | 
|---|
 | 3262 | +		 * rotate_necessary, it will be reset by  | 
|---|
 | 3263 | +		 * ctx_flexible_sched_in() when needed.  | 
|---|
 | 3264 | +		 */  | 
|---|
 | 3265 | +		ctx->rotate_necessary = 0;  | 
|---|
| 2969 | 3266 |  	} | 
|---|
| 2970 | 3267 |  	perf_pmu_enable(ctx->pmu); | 
|---|
| 2971 | 3268 |  } | 
|---|
| .. | .. | 
|---|
| 3080 | 3377 |  	struct perf_event_context *parent, *next_parent; | 
|---|
| 3081 | 3378 |  	struct perf_cpu_context *cpuctx; | 
|---|
| 3082 | 3379 |  	int do_switch = 1; | 
|---|
 | 3380 | +	struct pmu *pmu;  | 
|---|
| 3083 | 3381 |   | 
|---|
| 3084 | 3382 |  	if (likely(!ctx)) | 
|---|
| 3085 | 3383 |  		return; | 
|---|
| 3086 | 3384 |   | 
|---|
 | 3385 | +	pmu = ctx->pmu;  | 
|---|
| 3087 | 3386 |  	cpuctx = __get_cpu_context(ctx); | 
|---|
| 3088 | 3387 |  	if (!cpuctx->task_ctx) | 
|---|
| 3089 | 3388 |  		return; | 
|---|
| .. | .. | 
|---|
| 3113 | 3412 |  		raw_spin_lock(&ctx->lock); | 
|---|
| 3114 | 3413 |  		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); | 
|---|
| 3115 | 3414 |  		if (context_equiv(ctx, next_ctx)) { | 
|---|
 | 3415 | +  | 
|---|
| 3116 | 3416 |  			WRITE_ONCE(ctx->task, next); | 
|---|
| 3117 | 3417 |  			WRITE_ONCE(next_ctx->task, task); | 
|---|
| 3118 | 3418 |   | 
|---|
| 3119 |  | -			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);  | 
|---|
 | 3419 | +			perf_pmu_disable(pmu);  | 
|---|
 | 3420 | +  | 
|---|
 | 3421 | +			if (cpuctx->sched_cb_usage && pmu->sched_task)  | 
|---|
 | 3422 | +				pmu->sched_task(ctx, false);  | 
|---|
 | 3423 | +  | 
|---|
 | 3424 | +			/*  | 
|---|
 | 3425 | +			 * PMU specific parts of task perf context can require  | 
|---|
 | 3426 | +			 * additional synchronization. As an example of such  | 
|---|
 | 3427 | +			 * synchronization see implementation details of Intel  | 
|---|
 | 3428 | +			 * LBR call stack data profiling;  | 
|---|
 | 3429 | +			 */  | 
|---|
 | 3430 | +			if (pmu->swap_task_ctx)  | 
|---|
 | 3431 | +				pmu->swap_task_ctx(ctx, next_ctx);  | 
|---|
 | 3432 | +			else  | 
|---|
 | 3433 | +				swap(ctx->task_ctx_data, next_ctx->task_ctx_data);  | 
|---|
 | 3434 | +  | 
|---|
 | 3435 | +			perf_pmu_enable(pmu);  | 
|---|
| 3120 | 3436 |   | 
|---|
| 3121 | 3437 |  			/* | 
|---|
| 3122 | 3438 |  			 * RCU_INIT_POINTER here is safe because we've not | 
|---|
| .. | .. | 
|---|
| 3140 | 3456 |   | 
|---|
| 3141 | 3457 |  	if (do_switch) { | 
|---|
| 3142 | 3458 |  		raw_spin_lock(&ctx->lock); | 
|---|
 | 3459 | +		perf_pmu_disable(pmu);  | 
|---|
 | 3460 | +  | 
|---|
 | 3461 | +		if (cpuctx->sched_cb_usage && pmu->sched_task)  | 
|---|
 | 3462 | +			pmu->sched_task(ctx, false);  | 
|---|
| 3143 | 3463 |  		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); | 
|---|
 | 3464 | +  | 
|---|
 | 3465 | +		perf_pmu_enable(pmu);  | 
|---|
| 3144 | 3466 |  		raw_spin_unlock(&ctx->lock); | 
|---|
| 3145 | 3467 |  	} | 
|---|
| 3146 | 3468 |  } | 
|---|
| .. | .. | 
|---|
| 3176 | 3498 |   * PEBS requires this to provide PID/TID information. This requires we flush | 
|---|
| 3177 | 3499 |   * all queued PEBS records before we context switch to a new task. | 
|---|
| 3178 | 3500 |   */ | 
|---|
 | 3501 | +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)  | 
|---|
 | 3502 | +{  | 
|---|
 | 3503 | +	struct pmu *pmu;  | 
|---|
 | 3504 | +  | 
|---|
 | 3505 | +	pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */  | 
|---|
 | 3506 | +  | 
|---|
 | 3507 | +	if (WARN_ON_ONCE(!pmu->sched_task))  | 
|---|
 | 3508 | +		return;  | 
|---|
 | 3509 | +  | 
|---|
 | 3510 | +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);  | 
|---|
 | 3511 | +	perf_pmu_disable(pmu);  | 
|---|
 | 3512 | +  | 
|---|
 | 3513 | +	pmu->sched_task(cpuctx->task_ctx, sched_in);  | 
|---|
 | 3514 | +  | 
|---|
 | 3515 | +	perf_pmu_enable(pmu);  | 
|---|
 | 3516 | +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);  | 
|---|
 | 3517 | +}  | 
|---|
 | 3518 | +  | 
|---|
| 3179 | 3519 |  static void perf_pmu_sched_task(struct task_struct *prev, | 
|---|
| 3180 | 3520 |  				struct task_struct *next, | 
|---|
| 3181 | 3521 |  				bool sched_in) | 
|---|
| 3182 | 3522 |  { | 
|---|
| 3183 | 3523 |  	struct perf_cpu_context *cpuctx; | 
|---|
| 3184 |  | -	struct pmu *pmu;  | 
|---|
| 3185 | 3524 |   | 
|---|
| 3186 | 3525 |  	if (prev == next) | 
|---|
| 3187 | 3526 |  		return; | 
|---|
| 3188 | 3527 |   | 
|---|
| 3189 | 3528 |  	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { | 
|---|
| 3190 |  | -		pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */  | 
|---|
| 3191 |  | -  | 
|---|
| 3192 |  | -		if (WARN_ON_ONCE(!pmu->sched_task))  | 
|---|
 | 3529 | +		/* will be handled in perf_event_context_sched_in/out */  | 
|---|
 | 3530 | +		if (cpuctx->task_ctx)  | 
|---|
| 3193 | 3531 |  			continue; | 
|---|
| 3194 | 3532 |   | 
|---|
| 3195 |  | -		perf_ctx_lock(cpuctx, cpuctx->task_ctx);  | 
|---|
| 3196 |  | -		perf_pmu_disable(pmu);  | 
|---|
| 3197 |  | -  | 
|---|
| 3198 |  | -		pmu->sched_task(cpuctx->task_ctx, sched_in);  | 
|---|
| 3199 |  | -  | 
|---|
| 3200 |  | -		perf_pmu_enable(pmu);  | 
|---|
| 3201 |  | -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);  | 
|---|
 | 3533 | +		__perf_pmu_sched_task(cpuctx, sched_in);  | 
|---|
| 3202 | 3534 |  	} | 
|---|
| 3203 | 3535 |  } | 
|---|
| 3204 | 3536 |   | 
|---|
| .. | .. | 
|---|
| 3251 | 3583 |  	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); | 
|---|
| 3252 | 3584 |  } | 
|---|
| 3253 | 3585 |   | 
|---|
| 3254 |  | -static int visit_groups_merge(struct perf_event_groups *groups, int cpu,  | 
|---|
| 3255 |  | -			      int (*func)(struct perf_event *, void *), void *data)  | 
|---|
 | 3586 | +static bool perf_less_group_idx(const void *l, const void *r)  | 
|---|
| 3256 | 3587 |  { | 
|---|
| 3257 |  | -	struct perf_event **evt, *evt1, *evt2;  | 
|---|
 | 3588 | +	const struct perf_event *le = *(const struct perf_event **)l;  | 
|---|
 | 3589 | +	const struct perf_event *re = *(const struct perf_event **)r;  | 
|---|
 | 3590 | +  | 
|---|
 | 3591 | +	return le->group_index < re->group_index;  | 
|---|
 | 3592 | +}  | 
|---|
 | 3593 | +  | 
|---|
 | 3594 | +static void swap_ptr(void *l, void *r)  | 
|---|
 | 3595 | +{  | 
|---|
 | 3596 | +	void **lp = l, **rp = r;  | 
|---|
 | 3597 | +  | 
|---|
 | 3598 | +	swap(*lp, *rp);  | 
|---|
 | 3599 | +}  | 
|---|
 | 3600 | +  | 
|---|
 | 3601 | +static const struct min_heap_callbacks perf_min_heap = {  | 
|---|
 | 3602 | +	.elem_size = sizeof(struct perf_event *),  | 
|---|
 | 3603 | +	.less = perf_less_group_idx,  | 
|---|
 | 3604 | +	.swp = swap_ptr,  | 
|---|
 | 3605 | +};  | 
|---|
 | 3606 | +  | 
|---|
 | 3607 | +static void __heap_add(struct min_heap *heap, struct perf_event *event)  | 
|---|
 | 3608 | +{  | 
|---|
 | 3609 | +	struct perf_event **itrs = heap->data;  | 
|---|
 | 3610 | +  | 
|---|
 | 3611 | +	if (event) {  | 
|---|
 | 3612 | +		itrs[heap->nr] = event;  | 
|---|
 | 3613 | +		heap->nr++;  | 
|---|
 | 3614 | +	}  | 
|---|
 | 3615 | +}  | 
|---|
 | 3616 | +  | 
|---|
 | 3617 | +static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,  | 
|---|
 | 3618 | +				struct perf_event_groups *groups, int cpu,  | 
|---|
 | 3619 | +				int (*func)(struct perf_event *, void *),  | 
|---|
 | 3620 | +				void *data)  | 
|---|
 | 3621 | +{  | 
|---|
 | 3622 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 3623 | +	struct cgroup_subsys_state *css = NULL;  | 
|---|
 | 3624 | +#endif  | 
|---|
 | 3625 | +	/* Space for per CPU and/or any CPU event iterators. */  | 
|---|
 | 3626 | +	struct perf_event *itrs[2];  | 
|---|
 | 3627 | +	struct min_heap event_heap;  | 
|---|
 | 3628 | +	struct perf_event **evt;  | 
|---|
| 3258 | 3629 |  	int ret; | 
|---|
| 3259 | 3630 |   | 
|---|
| 3260 |  | -	evt1 = perf_event_groups_first(groups, -1);  | 
|---|
| 3261 |  | -	evt2 = perf_event_groups_first(groups, cpu);  | 
|---|
 | 3631 | +	if (cpuctx) {  | 
|---|
 | 3632 | +		event_heap = (struct min_heap){  | 
|---|
 | 3633 | +			.data = cpuctx->heap,  | 
|---|
 | 3634 | +			.nr = 0,  | 
|---|
 | 3635 | +			.size = cpuctx->heap_size,  | 
|---|
 | 3636 | +		};  | 
|---|
| 3262 | 3637 |   | 
|---|
| 3263 |  | -	while (evt1 || evt2) {  | 
|---|
| 3264 |  | -		if (evt1 && evt2) {  | 
|---|
| 3265 |  | -			if (evt1->group_index < evt2->group_index)  | 
|---|
| 3266 |  | -				evt = &evt1;  | 
|---|
| 3267 |  | -			else  | 
|---|
| 3268 |  | -				evt = &evt2;  | 
|---|
| 3269 |  | -		} else if (evt1) {  | 
|---|
| 3270 |  | -			evt = &evt1;  | 
|---|
| 3271 |  | -		} else {  | 
|---|
| 3272 |  | -			evt = &evt2;  | 
|---|
| 3273 |  | -		}  | 
|---|
 | 3638 | +		lockdep_assert_held(&cpuctx->ctx.lock);  | 
|---|
| 3274 | 3639 |   | 
|---|
 | 3640 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 3641 | +		if (cpuctx->cgrp)  | 
|---|
 | 3642 | +			css = &cpuctx->cgrp->css;  | 
|---|
 | 3643 | +#endif  | 
|---|
 | 3644 | +	} else {  | 
|---|
 | 3645 | +		event_heap = (struct min_heap){  | 
|---|
 | 3646 | +			.data = itrs,  | 
|---|
 | 3647 | +			.nr = 0,  | 
|---|
 | 3648 | +			.size = ARRAY_SIZE(itrs),  | 
|---|
 | 3649 | +		};  | 
|---|
 | 3650 | +		/* Events not within a CPU context may be on any CPU. */  | 
|---|
 | 3651 | +		__heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));  | 
|---|
 | 3652 | +	}  | 
|---|
 | 3653 | +	evt = event_heap.data;  | 
|---|
 | 3654 | +  | 
|---|
 | 3655 | +	__heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));  | 
|---|
 | 3656 | +  | 
|---|
 | 3657 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 3658 | +	for (; css; css = css->parent)  | 
|---|
 | 3659 | +		__heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));  | 
|---|
 | 3660 | +#endif  | 
|---|
 | 3661 | +  | 
|---|
 | 3662 | +	min_heapify_all(&event_heap, &perf_min_heap);  | 
|---|
 | 3663 | +  | 
|---|
 | 3664 | +	while (event_heap.nr) {  | 
|---|
| 3275 | 3665 |  		ret = func(*evt, data); | 
|---|
| 3276 | 3666 |  		if (ret) | 
|---|
| 3277 | 3667 |  			return ret; | 
|---|
| 3278 | 3668 |   | 
|---|
| 3279 | 3669 |  		*evt = perf_event_groups_next(*evt); | 
|---|
| 3280 |  | -	}  | 
|---|
| 3281 |  | -  | 
|---|
| 3282 |  | -	return 0;  | 
|---|
| 3283 |  | -}  | 
|---|
| 3284 |  | -  | 
|---|
| 3285 |  | -struct sched_in_data {  | 
|---|
| 3286 |  | -	struct perf_event_context *ctx;  | 
|---|
| 3287 |  | -	struct perf_cpu_context *cpuctx;  | 
|---|
| 3288 |  | -	int can_add_hw;  | 
|---|
| 3289 |  | -};  | 
|---|
| 3290 |  | -  | 
|---|
| 3291 |  | -static int pinned_sched_in(struct perf_event *event, void *data)  | 
|---|
| 3292 |  | -{  | 
|---|
| 3293 |  | -	struct sched_in_data *sid = data;  | 
|---|
| 3294 |  | -  | 
|---|
| 3295 |  | -	if (event->state <= PERF_EVENT_STATE_OFF)  | 
|---|
| 3296 |  | -		return 0;  | 
|---|
| 3297 |  | -  | 
|---|
| 3298 |  | -	if (!event_filter_match(event))  | 
|---|
| 3299 |  | -		return 0;  | 
|---|
| 3300 |  | -  | 
|---|
| 3301 |  | -	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {  | 
|---|
| 3302 |  | -		if (!group_sched_in(event, sid->cpuctx, sid->ctx))  | 
|---|
| 3303 |  | -			list_add_tail(&event->active_list, &sid->ctx->pinned_active);  | 
|---|
| 3304 |  | -	}  | 
|---|
| 3305 |  | -  | 
|---|
| 3306 |  | -	/*  | 
|---|
| 3307 |  | -	 * If this pinned group hasn't been scheduled,  | 
|---|
| 3308 |  | -	 * put it in error state.  | 
|---|
| 3309 |  | -	 */  | 
|---|
| 3310 |  | -	if (event->state == PERF_EVENT_STATE_INACTIVE)  | 
|---|
| 3311 |  | -		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);  | 
|---|
| 3312 |  | -  | 
|---|
| 3313 |  | -	return 0;  | 
|---|
| 3314 |  | -}  | 
|---|
| 3315 |  | -  | 
|---|
| 3316 |  | -static int flexible_sched_in(struct perf_event *event, void *data)  | 
|---|
| 3317 |  | -{  | 
|---|
| 3318 |  | -	struct sched_in_data *sid = data;  | 
|---|
| 3319 |  | -  | 
|---|
| 3320 |  | -	if (event->state <= PERF_EVENT_STATE_OFF)  | 
|---|
| 3321 |  | -		return 0;  | 
|---|
| 3322 |  | -  | 
|---|
| 3323 |  | -	if (!event_filter_match(event))  | 
|---|
| 3324 |  | -		return 0;  | 
|---|
| 3325 |  | -  | 
|---|
| 3326 |  | -	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {  | 
|---|
| 3327 |  | -		if (!group_sched_in(event, sid->cpuctx, sid->ctx))  | 
|---|
| 3328 |  | -			list_add_tail(&event->active_list, &sid->ctx->flexible_active);  | 
|---|
 | 3670 | +		if (*evt)  | 
|---|
 | 3671 | +			min_heapify(&event_heap, 0, &perf_min_heap);  | 
|---|
| 3329 | 3672 |  		else | 
|---|
| 3330 |  | -			sid->can_add_hw = 0;  | 
|---|
 | 3673 | +			min_heap_pop(&event_heap, &perf_min_heap);  | 
|---|
 | 3674 | +	}  | 
|---|
 | 3675 | +  | 
|---|
 | 3676 | +	return 0;  | 
|---|
 | 3677 | +}  | 
|---|
 | 3678 | +  | 
|---|
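`visit_groups_merge()` now performs a k-way merge: one iterator per event source (the any-CPU list, the per-CPU list, and one per nested cgroup) sits in a min-heap keyed by `group_index`, and the smallest head is visited, advanced, and re-sifted until every iterator is exhausted. The standalone sketch below mirrors that loop with a hand-rolled sift-down in place of the kernel's `min_heapify_all()`/`min_heapify()`/`min_heap_pop()` helpers; the data and names are illustrative only:

```c
#include <stdio.h>

struct iter {
	const int *pos;	/* current element of one sorted source */
	const int *end;	/* one past its last element */
};

/* Analogue of perf_less_group_idx(): order iterators by their current key. */
static int iter_less(const struct iter *a, const struct iter *b)
{
	return *a->pos < *b->pos;
}

/* Sift element @i down; the kernel's min_heapify() plays this role. */
static void sift_down(struct iter *heap, int nr, int i)
{
	for (;;) {
		int l = 2 * i + 1, r = 2 * i + 2, min = i;
		struct iter tmp;

		if (l < nr && iter_less(&heap[l], &heap[min]))
			min = l;
		if (r < nr && iter_less(&heap[r], &heap[min]))
			min = r;
		if (min == i)
			return;
		tmp = heap[i]; heap[i] = heap[min]; heap[min] = tmp;
		i = min;
	}
}

int main(void)
{
	/* Three sorted "event lists": think any-CPU, per-CPU, one cgroup. */
	static const int a[] = { 1, 4, 9 };
	static const int b[] = { 2, 3, 10 };
	static const int c[] = { 5, 6, 7 };
	struct iter heap[] = { { a, a + 3 }, { b, b + 3 }, { c, c + 3 } };
	int nr = 3;

	/* Build the heap, as min_heapify_all() does after __heap_add(). */
	for (int i = nr / 2 - 1; i >= 0; i--)
		sift_down(heap, nr, i);

	while (nr) {
		/* Visit the globally smallest element (func(*evt, data)). */
		printf("%d ", *heap[0].pos);

		/* Advance the winner, like *evt = perf_event_groups_next(*evt). */
		if (++heap[0].pos < heap[0].end) {
			sift_down(heap, nr, 0);	/* min_heapify()  */
		} else {
			heap[0] = heap[--nr];	/* min_heap_pop() */
			sift_down(heap, nr, 0);
		}
	}
	printf("\n");	/* prints: 1 2 3 4 5 6 7 9 10 */
	return 0;
}
```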
 | 3679 | +static inline bool event_update_userpage(struct perf_event *event)  | 
|---|
 | 3680 | +{  | 
|---|
 | 3681 | +	if (likely(!atomic_read(&event->mmap_count)))  | 
|---|
 | 3682 | +		return false;  | 
|---|
 | 3683 | +  | 
|---|
 | 3684 | +	perf_event_update_time(event);  | 
|---|
 | 3685 | +	perf_set_shadow_time(event, event->ctx);  | 
|---|
 | 3686 | +	perf_event_update_userpage(event);  | 
|---|
 | 3687 | +  | 
|---|
 | 3688 | +	return true;  | 
|---|
 | 3689 | +}  | 
|---|
 | 3690 | +  | 
|---|
 | 3691 | +static inline void group_update_userpage(struct perf_event *group_event)  | 
|---|
 | 3692 | +{  | 
|---|
 | 3693 | +	struct perf_event *event;  | 
|---|
 | 3694 | +  | 
|---|
 | 3695 | +	if (!event_update_userpage(group_event))  | 
|---|
 | 3696 | +		return;  | 
|---|
 | 3697 | +  | 
|---|
 | 3698 | +	for_each_sibling_event(event, group_event)  | 
|---|
 | 3699 | +		event_update_userpage(event);  | 
|---|
 | 3700 | +}  | 
|---|
 | 3701 | +  | 
|---|
 | 3702 | +static int merge_sched_in(struct perf_event *event, void *data)  | 
|---|
 | 3703 | +{  | 
|---|
 | 3704 | +	struct perf_event_context *ctx = event->ctx;  | 
|---|
 | 3705 | +	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);  | 
|---|
 | 3706 | +	int *can_add_hw = data;  | 
|---|
 | 3707 | +  | 
|---|
 | 3708 | +	if (event->state <= PERF_EVENT_STATE_OFF)  | 
|---|
 | 3709 | +		return 0;  | 
|---|
 | 3710 | +  | 
|---|
 | 3711 | +	if (!event_filter_match(event))  | 
|---|
 | 3712 | +		return 0;  | 
|---|
 | 3713 | +  | 
|---|
 | 3714 | +	if (group_can_go_on(event, cpuctx, *can_add_hw)) {  | 
|---|
 | 3715 | +		if (!group_sched_in(event, cpuctx, ctx))  | 
|---|
 | 3716 | +			list_add_tail(&event->active_list, get_event_list(event));  | 
|---|
 | 3717 | +	}  | 
|---|
 | 3718 | +  | 
|---|
 | 3719 | +	if (event->state == PERF_EVENT_STATE_INACTIVE) {  | 
|---|
 | 3720 | +		*can_add_hw = 0;  | 
|---|
 | 3721 | +		if (event->attr.pinned) {  | 
|---|
 | 3722 | +			perf_cgroup_event_disable(event, ctx);  | 
|---|
 | 3723 | +			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);  | 
|---|
 | 3724 | +		} else {  | 
|---|
 | 3725 | +			ctx->rotate_necessary = 1;  | 
|---|
 | 3726 | +			perf_mux_hrtimer_restart(cpuctx);  | 
|---|
 | 3727 | +			group_update_userpage(event);  | 
|---|
 | 3728 | +		}  | 
|---|
| 3331 | 3729 |  	} | 
|---|
| 3332 | 3730 |   | 
|---|
| 3333 | 3731 |  	return 0; | 
|---|
| .. | .. | 
|---|
| 3337 | 3735 |  ctx_pinned_sched_in(struct perf_event_context *ctx, | 
|---|
| 3338 | 3736 |  		    struct perf_cpu_context *cpuctx) | 
|---|
| 3339 | 3737 |  { | 
|---|
| 3340 |  | -	struct sched_in_data sid = {  | 
|---|
| 3341 |  | -		.ctx = ctx,  | 
|---|
| 3342 |  | -		.cpuctx = cpuctx,  | 
|---|
| 3343 |  | -		.can_add_hw = 1,  | 
|---|
| 3344 |  | -	};  | 
|---|
 | 3738 | +	int can_add_hw = 1;  | 
|---|
| 3345 | 3739 |   | 
|---|
| 3346 |  | -	visit_groups_merge(&ctx->pinned_groups,  | 
|---|
 | 3740 | +	if (ctx != &cpuctx->ctx)  | 
|---|
 | 3741 | +		cpuctx = NULL;  | 
|---|
 | 3742 | +  | 
|---|
 | 3743 | +	visit_groups_merge(cpuctx, &ctx->pinned_groups,  | 
|---|
| 3347 | 3744 |  			   smp_processor_id(), | 
|---|
| 3348 |  | -			   pinned_sched_in, &sid);  | 
|---|
 | 3745 | +			   merge_sched_in, &can_add_hw);  | 
|---|
| 3349 | 3746 |  } | 
|---|
| 3350 | 3747 |   | 
|---|
| 3351 | 3748 |  static void | 
|---|
| 3352 | 3749 |  ctx_flexible_sched_in(struct perf_event_context *ctx, | 
|---|
| 3353 | 3750 |  		      struct perf_cpu_context *cpuctx) | 
|---|
| 3354 | 3751 |  { | 
|---|
| 3355 |  | -	struct sched_in_data sid = {  | 
|---|
| 3356 |  | -		.ctx = ctx,  | 
|---|
| 3357 |  | -		.cpuctx = cpuctx,  | 
|---|
| 3358 |  | -		.can_add_hw = 1,  | 
|---|
| 3359 |  | -	};  | 
|---|
 | 3752 | +	int can_add_hw = 1;  | 
|---|
| 3360 | 3753 |   | 
|---|
| 3361 |  | -	visit_groups_merge(&ctx->flexible_groups,  | 
|---|
 | 3754 | +	if (ctx != &cpuctx->ctx)  | 
|---|
 | 3755 | +		cpuctx = NULL;  | 
|---|
 | 3756 | +  | 
|---|
 | 3757 | +	visit_groups_merge(cpuctx, &ctx->flexible_groups,  | 
|---|
| 3362 | 3758 |  			   smp_processor_id(), | 
|---|
| 3363 |  | -			   flexible_sched_in, &sid);  | 
|---|
 | 3759 | +			   merge_sched_in, &can_add_hw);  | 
|---|
| 3364 | 3760 |  } | 
|---|
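The hunk above is the scheduler-side half of the min-heap rework: visit_groups_merge() now walks the group trees through a small min-heap of iterators, visiting the smallest event each round, advancing that iterator, and either re-ordering the heap (min_heapify) or shrinking it (min_heap_pop) once an iterator runs dry, while pinned and flexible groups share a single merge_sched_in() callback driven by a plain can_add_hw flag. The sketch below is a minimal userspace illustration of that k-way merge pattern, not the kernel's min_heap API; `struct it`, `heapify()` and the sample arrays are invented for the example.

```c
#include <stdio.h>

/* One iterator per sorted source, mimicking a group-tree cursor. */
struct it { const int *cur, *end; };

/* Sift the root down so the iterator with the smallest head ends up first. */
static void heapify(struct it **heap, int n)
{
	int i = 0;

	for (;;) {
		int l = 2 * i + 1, r = l + 1, best = i;

		if (l < n && *heap[l]->cur < *heap[best]->cur)
			best = l;
		if (r < n && *heap[r]->cur < *heap[best]->cur)
			best = r;
		if (best == i)
			return;

		struct it *tmp = heap[i];
		heap[i] = heap[best];
		heap[best] = tmp;
		i = best;
	}
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 };
	struct it ia = { a, a + 3 }, ib = { b, b + 3 };
	struct it *heap[] = { &ia, &ib };
	int n = 2;

	heapify(heap, n);			/* two sources: one sift-down builds the heap */

	while (n) {
		printf("%d ", *heap[0]->cur);	/* visit the globally smallest head */
		if (++heap[0]->cur == heap[0]->end)
			heap[0] = heap[--n];	/* iterator exhausted: pop it */
		heapify(heap, n);		/* restore heap order either way */
	}
	printf("\n");				/* prints: 1 2 3 4 9 10 */
	return 0;
}
```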
| 3365 | 3761 |   | 
|---|
| 3366 | 3762 |  static void | 
|---|
| .. | .. | 
|---|
| 3419 | 3815 |  					struct task_struct *task) | 
|---|
| 3420 | 3816 |  { | 
|---|
| 3421 | 3817 |  	struct perf_cpu_context *cpuctx; | 
|---|
 | 3818 | +	struct pmu *pmu = ctx->pmu;  | 
|---|
| 3422 | 3819 |   | 
|---|
| 3423 | 3820 |  	cpuctx = __get_cpu_context(ctx); | 
|---|
| 3424 |  | -	if (cpuctx->task_ctx == ctx)  | 
|---|
 | 3821 | +	if (cpuctx->task_ctx == ctx) {  | 
|---|
 | 3822 | +		if (cpuctx->sched_cb_usage)  | 
|---|
 | 3823 | +			__perf_pmu_sched_task(cpuctx, true);  | 
|---|
| 3425 | 3824 |  		return; | 
|---|
 | 3825 | +	}  | 
|---|
| 3426 | 3826 |   | 
|---|
| 3427 | 3827 |  	perf_ctx_lock(cpuctx, ctx); | 
|---|
| 3428 | 3828 |  	/* | 
|---|
| .. | .. | 
|---|
| 3432 | 3832 |  	if (!ctx->nr_events) | 
|---|
| 3433 | 3833 |  		goto unlock; | 
|---|
| 3434 | 3834 |   | 
|---|
| 3435 |  | -	perf_pmu_disable(ctx->pmu);  | 
|---|
 | 3835 | +	perf_pmu_disable(pmu);  | 
|---|
| 3436 | 3836 |  	/* | 
|---|
| 3437 | 3837 |  	 * We want to keep the following priority order: | 
|---|
| 3438 | 3838 |  	 * cpu pinned (that don't need to move), task pinned, | 
|---|
| .. | .. | 
|---|
| 3444 | 3844 |  	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) | 
|---|
| 3445 | 3845 |  		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 
|---|
| 3446 | 3846 |  	perf_event_sched_in(cpuctx, ctx, task); | 
|---|
| 3447 |  | -	perf_pmu_enable(ctx->pmu);  | 
|---|
 | 3847 | +  | 
|---|
 | 3848 | +	if (cpuctx->sched_cb_usage && pmu->sched_task)  | 
|---|
 | 3849 | +		pmu->sched_task(cpuctx->task_ctx, true);  | 
|---|
 | 3850 | +  | 
|---|
 | 3851 | +	perf_pmu_enable(pmu);  | 
|---|
| 3448 | 3852 |   | 
|---|
| 3449 | 3853 |  unlock: | 
|---|
| 3450 | 3854 |  	perf_ctx_unlock(cpuctx, ctx); | 
|---|
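With this change the switch-in path also calls pmu->sched_task(ctx, true) when cpuctx->sched_cb_usage is non-zero, so a PMU that has to save and restore per-task hardware state (LBR stacks are the classic example) gets a callback on both halves of a context switch, while PMUs that never registered interest pay nothing. A minimal sketch of that "count the interested parties, only then invoke the hook" pattern follows; cb_usage, register_cb() and context_switch() are invented names for the illustration, not kernel API.

```c
#include <stdio.h>

static int cb_usage;			/* how many events asked for the hook */

static void sched_task_hook(int sched_in)
{
	printf("sched_task(%s)\n", sched_in ? "in" : "out");
}

static void register_cb(void)   { cb_usage++; }
static void unregister_cb(void) { cb_usage--; }

static void context_switch(void)
{
	if (cb_usage)			/* pay for the callback only if someone cares */
		sched_task_hook(0);	/* switch out the old task */
	/* ... pick the next task ... */
	if (cb_usage)
		sched_task_hook(1);	/* switch in the new task */
}

int main(void)
{
	context_switch();		/* nothing registered: no output */
	register_cb();
	context_switch();		/* prints sched_task(out) then sched_task(in) */
	unregister_cb();
	return 0;
}
```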
| .. | .. | 
|---|
| 3685 | 4089 |  	perf_event_groups_insert(&ctx->flexible_groups, event); | 
|---|
| 3686 | 4090 |  } | 
|---|
| 3687 | 4091 |   | 
|---|
 | 4092 | +/* pick an event from the flexible_groups to rotate */  | 
|---|
| 3688 | 4093 |  static inline struct perf_event * | 
|---|
| 3689 |  | -ctx_first_active(struct perf_event_context *ctx)  | 
|---|
 | 4094 | +ctx_event_to_rotate(struct perf_event_context *ctx)  | 
|---|
| 3690 | 4095 |  { | 
|---|
| 3691 |  | -	return list_first_entry_or_null(&ctx->flexible_active,  | 
|---|
| 3692 |  | -					struct perf_event, active_list);  | 
|---|
 | 4096 | +	struct perf_event *event;  | 
|---|
 | 4097 | +  | 
|---|
 | 4098 | +	/* pick the first active flexible event */  | 
|---|
 | 4099 | +	event = list_first_entry_or_null(&ctx->flexible_active,  | 
|---|
 | 4100 | +					 struct perf_event, active_list);  | 
|---|
 | 4101 | +  | 
|---|
 | 4102 | +	/* if no active flexible event, pick the first event */  | 
|---|
 | 4103 | +	if (!event) {  | 
|---|
 | 4104 | +		event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),  | 
|---|
 | 4105 | +				      typeof(*event), group_node);  | 
|---|
 | 4106 | +	}  | 
|---|
 | 4107 | +  | 
|---|
 | 4108 | +	/*  | 
|---|
 | 4109 | +	 * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()  | 
|---|
 | 4110 | +	 * finds there are unschedulable events, it will set it again.  | 
|---|
 | 4111 | +	 */  | 
|---|
 | 4112 | +	ctx->rotate_necessary = 0;  | 
|---|
 | 4113 | +  | 
|---|
 | 4114 | +	return event;  | 
|---|
| 3693 | 4115 |  } | 
|---|
| 3694 | 4116 |   | 
|---|
| 3695 | 4117 |  static bool perf_rotate_context(struct perf_cpu_context *cpuctx) | 
|---|
| 3696 | 4118 |  { | 
|---|
| 3697 | 4119 |  	struct perf_event *cpu_event = NULL, *task_event = NULL; | 
|---|
| 3698 |  | -	bool cpu_rotate = false, task_rotate = false;  | 
|---|
| 3699 |  | -	struct perf_event_context *ctx = NULL;  | 
|---|
 | 4120 | +	struct perf_event_context *task_ctx = NULL;  | 
|---|
 | 4121 | +	int cpu_rotate, task_rotate;  | 
|---|
| 3700 | 4122 |   | 
|---|
| 3701 | 4123 |  	/* | 
|---|
| 3702 | 4124 |  	 * Since we run this from IRQ context, nobody can install new | 
|---|
| 3703 | 4125 |  	 * events, thus the event count values are stable. | 
|---|
| 3704 | 4126 |  	 */ | 
|---|
| 3705 | 4127 |   | 
|---|
| 3706 |  | -	if (cpuctx->ctx.nr_events) {  | 
|---|
| 3707 |  | -		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)  | 
|---|
| 3708 |  | -			cpu_rotate = true;  | 
|---|
| 3709 |  | -	}  | 
|---|
| 3710 |  | -  | 
|---|
| 3711 |  | -	ctx = cpuctx->task_ctx;  | 
|---|
| 3712 |  | -	if (ctx && ctx->nr_events) {  | 
|---|
| 3713 |  | -		if (ctx->nr_events != ctx->nr_active)  | 
|---|
| 3714 |  | -			task_rotate = true;  | 
|---|
| 3715 |  | -	}  | 
|---|
 | 4128 | +	cpu_rotate = cpuctx->ctx.rotate_necessary;  | 
|---|
 | 4129 | +	task_ctx = cpuctx->task_ctx;  | 
|---|
 | 4130 | +	task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;  | 
|---|
| 3716 | 4131 |   | 
|---|
| 3717 | 4132 |  	if (!(cpu_rotate || task_rotate)) | 
|---|
| 3718 | 4133 |  		return false; | 
|---|
| .. | .. | 
|---|
| 3721 | 4136 |  	perf_pmu_disable(cpuctx->ctx.pmu); | 
|---|
| 3722 | 4137 |   | 
|---|
| 3723 | 4138 |  	if (task_rotate) | 
|---|
| 3724 |  | -		task_event = ctx_first_active(ctx);  | 
|---|
 | 4139 | +		task_event = ctx_event_to_rotate(task_ctx);  | 
|---|
| 3725 | 4140 |  	if (cpu_rotate) | 
|---|
| 3726 |  | -		cpu_event = ctx_first_active(&cpuctx->ctx);  | 
|---|
 | 4141 | +		cpu_event = ctx_event_to_rotate(&cpuctx->ctx);  | 
|---|
| 3727 | 4142 |   | 
|---|
| 3728 | 4143 |  	/* | 
|---|
| 3729 | 4144 |  	 * As per the order given at ctx_resched() first 'pop' task flexible | 
|---|
| 3730 | 4145 |  	 * and then, if needed CPU flexible. | 
|---|
| 3731 | 4146 |  	 */ | 
|---|
| 3732 |  | -	if (task_event || (ctx && cpu_event))  | 
|---|
| 3733 |  | -		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);  | 
|---|
 | 4147 | +	if (task_event || (task_ctx && cpu_event))  | 
|---|
 | 4148 | +		ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);  | 
|---|
| 3734 | 4149 |  	if (cpu_event) | 
|---|
| 3735 | 4150 |  		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 
|---|
| 3736 | 4151 |   | 
|---|
| 3737 | 4152 |  	if (task_event) | 
|---|
| 3738 |  | -		rotate_ctx(ctx, task_event);  | 
|---|
 | 4153 | +		rotate_ctx(task_ctx, task_event);  | 
|---|
| 3739 | 4154 |  	if (cpu_event) | 
|---|
| 3740 | 4155 |  		rotate_ctx(&cpuctx->ctx, cpu_event); | 
|---|
| 3741 | 4156 |   | 
|---|
| 3742 |  | -	perf_event_sched_in(cpuctx, ctx, current);  | 
|---|
 | 4157 | +	perf_event_sched_in(cpuctx, task_ctx, current);  | 
|---|
| 3743 | 4158 |   | 
|---|
| 3744 | 4159 |  	perf_pmu_enable(cpuctx->ctx.pmu); | 
|---|
| 3745 | 4160 |  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 
|---|
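Rotation now keys off an explicit ctx->rotate_necessary flag instead of comparing nr_events with nr_active: merge_sched_in() sets the flag when a flexible group fails to fit on the PMU, and ctx_event_to_rotate() clears it while picking the event to rotate (the first active flexible event, falling back to the leftmost entry of the flexible tree). The sketch below illustrates the underlying round-robin idea with a plain array standing in for the kernel's rbtree of groups; all names and numbers are invented.

```c
#include <stdio.h>

#define NGROUPS 4

/* Move the group at the head to the tail; everyone else shifts up one slot. */
static void rotate(int *groups)
{
	int moved = groups[0];

	for (int i = 0; i < NGROUPS - 1; i++)
		groups[i] = groups[i + 1];
	groups[NGROUPS - 1] = moved;
}

int main(void)
{
	int groups[NGROUPS] = { 10, 11, 12, 13 };	/* flexible groups, in tree order */
	int rotate_necessary = 1;			/* set when a group failed to schedule */

	for (int tick = 0; tick < 3; tick++) {
		if (rotate_necessary)
			rotate(groups);
		rotate_necessary = 1;	/* pretend the PMU is overcommitted every tick */
		printf("tick %d: group %d gets first shot\n", tick, groups[0]);
	}
	return 0;	/* prints groups 11, 12 and 13 on successive ticks */
}
```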
| .. | .. | 
|---|
| 3983 | 4398 |   | 
|---|
| 3984 | 4399 |  	return ret; | 
|---|
| 3985 | 4400 |  } | 
|---|
 | 4401 | +EXPORT_SYMBOL_GPL(perf_event_read_local);  | 
|---|
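perf_event_read_local() is now exported, so a module holding an event that belongs to the current task or CPU context can read it directly, without sending an IPI to the event's CPU. A rough sketch of a caller, assuming the event was created elsewhere with perf_event_create_kernel_counter(); read_my_counter() is an invented helper and the surrounding module boilerplate is omitted.

```c
#include <linux/perf_event.h>
#include <linux/printk.h>

/*
 * Sketch only: "event" is assumed to have been created elsewhere with
 * perf_event_create_kernel_counter() and to belong to the current task
 * or CPU context, which is what perf_event_read_local() requires.
 */
static u64 read_my_counter(struct perf_event *event)
{
	u64 value, enabled, running;
	int err;

	err = perf_event_read_local(event, &value, &enabled, &running);
	if (err) {
		pr_warn("perf_event_read_local failed: %d\n", err);
		return 0;
	}

	/* enabled/running allow scaling if the event was multiplexed. */
	return value;
}
```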
| 3986 | 4402 |   | 
|---|
| 3987 | 4403 |  static int perf_event_read(struct perf_event *event, bool group) | 
|---|
| 3988 | 4404 |  { | 
|---|
| .. | .. | 
|---|
| 4074 | 4490 |  	INIT_LIST_HEAD(&ctx->event_list); | 
|---|
| 4075 | 4491 |  	INIT_LIST_HEAD(&ctx->pinned_active); | 
|---|
| 4076 | 4492 |  	INIT_LIST_HEAD(&ctx->flexible_active); | 
|---|
| 4077 |  | -	atomic_set(&ctx->refcount, 1);  | 
|---|
 | 4493 | +	refcount_set(&ctx->refcount, 1);  | 
|---|
| 4078 | 4494 |  } | 
|---|
| 4079 | 4495 |   | 
|---|
| 4080 | 4496 |  static struct perf_event_context * | 
|---|
| .. | .. | 
|---|
| 4087 | 4503 |  		return NULL; | 
|---|
| 4088 | 4504 |   | 
|---|
| 4089 | 4505 |  	__perf_event_init_context(ctx); | 
|---|
| 4090 |  | -	if (task) {  | 
|---|
| 4091 |  | -		ctx->task = task;  | 
|---|
| 4092 |  | -		get_task_struct(task);  | 
|---|
| 4093 |  | -	}  | 
|---|
 | 4506 | +	if (task)  | 
|---|
 | 4507 | +		ctx->task = get_task_struct(task);  | 
|---|
| 4094 | 4508 |  	ctx->pmu = pmu; | 
|---|
| 4095 | 4509 |   | 
|---|
| 4096 | 4510 |  	return ctx; | 
|---|
| .. | .. | 
|---|
| 4152 | 4566 |  		goto errout; | 
|---|
| 4153 | 4567 |   | 
|---|
| 4154 | 4568 |  	if (event->attach_state & PERF_ATTACH_TASK_DATA) { | 
|---|
| 4155 |  | -		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);  | 
|---|
 | 4569 | +		task_ctx_data = alloc_task_ctx_data(pmu);  | 
|---|
| 4156 | 4570 |  		if (!task_ctx_data) { | 
|---|
| 4157 | 4571 |  			err = -ENOMEM; | 
|---|
| 4158 | 4572 |  			goto errout; | 
|---|
| .. | .. | 
|---|
| 4210 | 4624 |  		} | 
|---|
| 4211 | 4625 |  	} | 
|---|
| 4212 | 4626 |   | 
|---|
| 4213 |  | -	kfree(task_ctx_data);  | 
|---|
 | 4627 | +	free_task_ctx_data(pmu, task_ctx_data);  | 
|---|
| 4214 | 4628 |  	return ctx; | 
|---|
| 4215 | 4629 |   | 
|---|
| 4216 | 4630 |  errout: | 
|---|
| 4217 |  | -	kfree(task_ctx_data);  | 
|---|
 | 4631 | +	free_task_ctx_data(pmu, task_ctx_data);  | 
|---|
| 4218 | 4632 |  	return ERR_PTR(err); | 
|---|
| 4219 | 4633 |  } | 
|---|
| 4220 | 4634 |   | 
|---|
| .. | .. | 
|---|
| 4233 | 4647 |  } | 
|---|
| 4234 | 4648 |   | 
|---|
| 4235 | 4649 |  static void ring_buffer_attach(struct perf_event *event, | 
|---|
| 4236 |  | -			       struct ring_buffer *rb);  | 
|---|
 | 4650 | +			       struct perf_buffer *rb);  | 
|---|
| 4237 | 4651 |   | 
|---|
| 4238 | 4652 |  static void detach_sb_event(struct perf_event *event) | 
|---|
| 4239 | 4653 |  { | 
|---|
| .. | .. | 
|---|
| 4256 | 4670 |   | 
|---|
| 4257 | 4671 |  	if (attr->mmap || attr->mmap_data || attr->mmap2 || | 
|---|
| 4258 | 4672 |  	    attr->comm || attr->comm_exec || | 
|---|
| 4259 |  | -	    attr->task ||  | 
|---|
| 4260 |  | -	    attr->context_switch)  | 
|---|
 | 4673 | +	    attr->task || attr->ksymbol ||  | 
|---|
 | 4674 | +	    attr->context_switch || attr->text_poke ||  | 
|---|
 | 4675 | +	    attr->bpf_event)  | 
|---|
| 4261 | 4676 |  		return true; | 
|---|
| 4262 | 4677 |  	return false; | 
|---|
| 4263 | 4678 |  } | 
|---|
| .. | .. | 
|---|
| 4306 | 4721 |  	if (event->parent) | 
|---|
| 4307 | 4722 |  		return; | 
|---|
| 4308 | 4723 |   | 
|---|
| 4309 |  | -	if (event->attach_state & PERF_ATTACH_TASK)  | 
|---|
 | 4724 | +	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))  | 
|---|
| 4310 | 4725 |  		dec = true; | 
|---|
| 4311 | 4726 |  	if (event->attr.mmap || event->attr.mmap_data) | 
|---|
| 4312 | 4727 |  		atomic_dec(&nr_mmap_events); | 
|---|
| .. | .. | 
|---|
| 4314 | 4729 |  		atomic_dec(&nr_comm_events); | 
|---|
| 4315 | 4730 |  	if (event->attr.namespaces) | 
|---|
| 4316 | 4731 |  		atomic_dec(&nr_namespaces_events); | 
|---|
 | 4732 | +	if (event->attr.cgroup)  | 
|---|
 | 4733 | +		atomic_dec(&nr_cgroup_events);  | 
|---|
| 4317 | 4734 |  	if (event->attr.task) | 
|---|
| 4318 | 4735 |  		atomic_dec(&nr_task_events); | 
|---|
| 4319 | 4736 |  	if (event->attr.freq) | 
|---|
| .. | .. | 
|---|
| 4326 | 4743 |  		dec = true; | 
|---|
| 4327 | 4744 |  	if (has_branch_stack(event)) | 
|---|
| 4328 | 4745 |  		dec = true; | 
|---|
 | 4746 | +	if (event->attr.ksymbol)  | 
|---|
 | 4747 | +		atomic_dec(&nr_ksymbol_events);  | 
|---|
 | 4748 | +	if (event->attr.bpf_event)  | 
|---|
 | 4749 | +		atomic_dec(&nr_bpf_events);  | 
|---|
 | 4750 | +	if (event->attr.text_poke)  | 
|---|
 | 4751 | +		atomic_dec(&nr_text_poke_events);  | 
|---|
| 4329 | 4752 |   | 
|---|
| 4330 | 4753 |  	if (dec) { | 
|---|
| 4331 | 4754 |  		if (!atomic_add_unless(&perf_sched_count, -1, 1)) | 
|---|
| .. | .. | 
|---|
| 4909 | 5332 |  static __poll_t perf_poll(struct file *file, poll_table *wait) | 
|---|
| 4910 | 5333 |  { | 
|---|
| 4911 | 5334 |  	struct perf_event *event = file->private_data; | 
|---|
| 4912 |  | -	struct ring_buffer *rb;  | 
|---|
 | 5335 | +	struct perf_buffer *rb;  | 
|---|
| 4913 | 5336 |  	__poll_t events = EPOLLHUP; | 
|---|
| 4914 | 5337 |   | 
|---|
| 4915 | 5338 |  	poll_wait(file, &event->waitq, wait); | 
|---|
| .. | .. | 
|---|
| 4935 | 5358 |  	local64_set(&event->count, 0); | 
|---|
| 4936 | 5359 |  	perf_event_update_userpage(event); | 
|---|
| 4937 | 5360 |  } | 
|---|
 | 5361 | +  | 
|---|
 | 5362 | +/* Assume it's not an event with inherit set. */  | 
|---|
 | 5363 | +u64 perf_event_pause(struct perf_event *event, bool reset)  | 
|---|
 | 5364 | +{  | 
|---|
 | 5365 | +	struct perf_event_context *ctx;  | 
|---|
 | 5366 | +	u64 count;  | 
|---|
 | 5367 | +  | 
|---|
 | 5368 | +	ctx = perf_event_ctx_lock(event);  | 
|---|
 | 5369 | +	WARN_ON_ONCE(event->attr.inherit);  | 
|---|
 | 5370 | +	_perf_event_disable(event);  | 
|---|
 | 5371 | +	count = local64_read(&event->count);  | 
|---|
 | 5372 | +	if (reset)  | 
|---|
 | 5373 | +		local64_set(&event->count, 0);  | 
|---|
 | 5374 | +	perf_event_ctx_unlock(event, ctx);  | 
|---|
 | 5375 | +  | 
|---|
 | 5376 | +	return count;  | 
|---|
 | 5377 | +}  | 
|---|
 | 5378 | +EXPORT_SYMBOL_GPL(perf_event_pause);  | 
|---|
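perf_event_pause() packages "disable, read, optionally reset" into one operation done under ctx::mutex, and the export makes it usable by other subsystems that own a kernel counter and want to drain it without racing the event context. Below is a hedged sketch of such a user; the counter creation, the attribute choices and the function names are assumptions made for the example, not something this patch adds.

```c
#include <linux/err.h>
#include <linux/perf_event.h>

/*
 * Sketch: a kernel-owned CPU-cycles counter on one CPU that gets drained
 * periodically. The attribute choices and function names are assumptions
 * for the example; error handling is abbreviated.
 */
static struct perf_event *cycles_event;

static int create_cycles_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(attr),
		.pinned	= 1,
	};

	cycles_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
							NULL, NULL);
	return PTR_ERR_OR_ZERO(cycles_event);
}

static u64 drain_cycles_counter(void)
{
	/* Stop the event, read its count and zero it under the event's ctx lock. */
	return perf_event_pause(cycles_event, true);
}
```

A caller that wants the counter to continue counting afterwards would follow up with perf_event_enable(), since perf_event_pause() leaves the event disabled.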
| 4938 | 5379 |   | 
|---|
| 4939 | 5380 |  /* | 
|---|
| 4940 | 5381 |   * Holding the top-level event's child_mutex means that any | 
|---|
| .. | .. | 
|---|
| 5013 | 5454 |  	return event->pmu->check_period(event, value); | 
|---|
| 5014 | 5455 |  } | 
|---|
| 5015 | 5456 |   | 
|---|
| 5016 |  | -static int perf_event_period(struct perf_event *event, u64 __user *arg)  | 
|---|
 | 5457 | +static int _perf_event_period(struct perf_event *event, u64 value)  | 
|---|
| 5017 | 5458 |  { | 
|---|
| 5018 |  | -	u64 value;  | 
|---|
| 5019 |  | -  | 
|---|
| 5020 | 5459 |  	if (!is_sampling_event(event)) | 
|---|
| 5021 | 5460 |  		return -EINVAL; | 
|---|
| 5022 |  | -  | 
|---|
| 5023 |  | -	if (copy_from_user(&value, arg, sizeof(value)))  | 
|---|
| 5024 |  | -		return -EFAULT;  | 
|---|
| 5025 | 5461 |   | 
|---|
| 5026 | 5462 |  	if (!value) | 
|---|
| 5027 | 5463 |  		return -EINVAL; | 
|---|
| .. | .. | 
|---|
| 5039 | 5475 |   | 
|---|
| 5040 | 5476 |  	return 0; | 
|---|
| 5041 | 5477 |  } | 
|---|
 | 5478 | +  | 
|---|
 | 5479 | +int perf_event_period(struct perf_event *event, u64 value)  | 
|---|
 | 5480 | +{  | 
|---|
 | 5481 | +	struct perf_event_context *ctx;  | 
|---|
 | 5482 | +	int ret;  | 
|---|
 | 5483 | +  | 
|---|
 | 5484 | +	ctx = perf_event_ctx_lock(event);  | 
|---|
 | 5485 | +	ret = _perf_event_period(event, value);  | 
|---|
 | 5486 | +	perf_event_ctx_unlock(event, ctx);  | 
|---|
 | 5487 | +  | 
|---|
 | 5488 | +	return ret;  | 
|---|
 | 5489 | +}  | 
|---|
 | 5490 | +EXPORT_SYMBOL_GPL(perf_event_period);  | 
|---|
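On the ioctl path the copy_from_user() moves into perf_ioctl(), so PERF_EVENT_IOC_PERIOD behaves exactly as before from userspace: the argument is a pointer to a u64 holding the new sampling period. A minimal userspace example; the event type and the period values are arbitrary.

```c
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	__u64 new_period = 100000;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 1000000;		/* initial period */
	attr.exclude_kernel = 1;		/* friendlier to perf_event_paranoid */
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0, -1, -1, 0);	/* current task, any CPU */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/* Later: tighten the sampling period without recreating the event. */
	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
		perror("PERF_EVENT_IOC_PERIOD");

	close(fd);
	return 0;
}
```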
| 5042 | 5491 |   | 
|---|
| 5043 | 5492 |  static const struct file_operations perf_fops; | 
|---|
| 5044 | 5493 |   | 
|---|
| .. | .. | 
|---|
| 5083 | 5532 |  		return _perf_event_refresh(event, arg); | 
|---|
| 5084 | 5533 |   | 
|---|
| 5085 | 5534 |  	case PERF_EVENT_IOC_PERIOD: | 
|---|
| 5086 |  | -		return perf_event_period(event, (u64 __user *)arg);  | 
|---|
 | 5535 | +	{  | 
|---|
 | 5536 | +		u64 value;  | 
|---|
| 5087 | 5537 |   | 
|---|
 | 5538 | +		if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))  | 
|---|
 | 5539 | +			return -EFAULT;  | 
|---|
 | 5540 | +  | 
|---|
 | 5541 | +		return _perf_event_period(event, value);  | 
|---|
 | 5542 | +	}  | 
|---|
| 5088 | 5543 |  	case PERF_EVENT_IOC_ID: | 
|---|
| 5089 | 5544 |  	{ | 
|---|
| 5090 | 5545 |  		u64 id = primary_event_id(event); | 
|---|
| .. | .. | 
|---|
| 5119 | 5574 |  		return perf_event_set_bpf_prog(event, arg); | 
|---|
| 5120 | 5575 |   | 
|---|
| 5121 | 5576 |  	case PERF_EVENT_IOC_PAUSE_OUTPUT: { | 
|---|
| 5122 |  | -		struct ring_buffer *rb;  | 
|---|
 | 5577 | +		struct perf_buffer *rb;  | 
|---|
| 5123 | 5578 |   | 
|---|
| 5124 | 5579 |  		rcu_read_lock(); | 
|---|
| 5125 | 5580 |  		rb = rcu_dereference(event->rb); | 
|---|
| .. | .. | 
|---|
| 5255 | 5710 |  static void perf_event_init_userpage(struct perf_event *event) | 
|---|
| 5256 | 5711 |  { | 
|---|
| 5257 | 5712 |  	struct perf_event_mmap_page *userpg; | 
|---|
| 5258 |  | -	struct ring_buffer *rb;  | 
|---|
 | 5713 | +	struct perf_buffer *rb;  | 
|---|
| 5259 | 5714 |   | 
|---|
| 5260 | 5715 |  	rcu_read_lock(); | 
|---|
| 5261 | 5716 |  	rb = rcu_dereference(event->rb); | 
|---|
| .. | .. | 
|---|
| 5287 | 5742 |  void perf_event_update_userpage(struct perf_event *event) | 
|---|
| 5288 | 5743 |  { | 
|---|
| 5289 | 5744 |  	struct perf_event_mmap_page *userpg; | 
|---|
| 5290 |  | -	struct ring_buffer *rb;  | 
|---|
 | 5745 | +	struct perf_buffer *rb;  | 
|---|
| 5291 | 5746 |  	u64 enabled, running, now; | 
|---|
| 5292 | 5747 |   | 
|---|
| 5293 | 5748 |  	rcu_read_lock(); | 
|---|
| .. | .. | 
|---|
| 5338 | 5793 |  static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) | 
|---|
| 5339 | 5794 |  { | 
|---|
| 5340 | 5795 |  	struct perf_event *event = vmf->vma->vm_file->private_data; | 
|---|
| 5341 |  | -	struct ring_buffer *rb;  | 
|---|
 | 5796 | +	struct perf_buffer *rb;  | 
|---|
| 5342 | 5797 |  	vm_fault_t ret = VM_FAULT_SIGBUS; | 
|---|
| 5343 | 5798 |   | 
|---|
| 5344 | 5799 |  	if (vmf->flags & FAULT_FLAG_MKWRITE) { | 
|---|
| .. | .. | 
|---|
| 5371 | 5826 |  } | 
|---|
| 5372 | 5827 |   | 
|---|
| 5373 | 5828 |  static void ring_buffer_attach(struct perf_event *event, | 
|---|
| 5374 |  | -			       struct ring_buffer *rb)  | 
|---|
 | 5829 | +			       struct perf_buffer *rb)  | 
|---|
| 5375 | 5830 |  { | 
|---|
| 5376 |  | -	struct ring_buffer *old_rb = NULL;  | 
|---|
 | 5831 | +	struct perf_buffer *old_rb = NULL;  | 
|---|
| 5377 | 5832 |  	unsigned long flags; | 
|---|
 | 5833 | +  | 
|---|
 | 5834 | +	WARN_ON_ONCE(event->parent);  | 
|---|
| 5378 | 5835 |   | 
|---|
| 5379 | 5836 |  	if (event->rb) { | 
|---|
| 5380 | 5837 |  		/* | 
|---|
| .. | .. | 
|---|
| 5431 | 5888 |   | 
|---|
| 5432 | 5889 |  static void ring_buffer_wakeup(struct perf_event *event) | 
|---|
| 5433 | 5890 |  { | 
|---|
| 5434 |  | -	struct ring_buffer *rb;  | 
|---|
 | 5891 | +	struct perf_buffer *rb;  | 
|---|
 | 5892 | +  | 
|---|
 | 5893 | +	if (event->parent)  | 
|---|
 | 5894 | +		event = event->parent;  | 
|---|
| 5435 | 5895 |   | 
|---|
| 5436 | 5896 |  	rcu_read_lock(); | 
|---|
| 5437 | 5897 |  	rb = rcu_dereference(event->rb); | 
|---|
| .. | .. | 
|---|
| 5442 | 5902 |  	rcu_read_unlock(); | 
|---|
| 5443 | 5903 |  } | 
|---|
| 5444 | 5904 |   | 
|---|
| 5445 |  | -struct ring_buffer *ring_buffer_get(struct perf_event *event)  | 
|---|
 | 5905 | +struct perf_buffer *ring_buffer_get(struct perf_event *event)  | 
|---|
| 5446 | 5906 |  { | 
|---|
| 5447 |  | -	struct ring_buffer *rb;  | 
|---|
 | 5907 | +	struct perf_buffer *rb;  | 
|---|
 | 5908 | +  | 
|---|
 | 5909 | +	if (event->parent)  | 
|---|
 | 5910 | +		event = event->parent;  | 
|---|
| 5448 | 5911 |   | 
|---|
| 5449 | 5912 |  	rcu_read_lock(); | 
|---|
| 5450 | 5913 |  	rb = rcu_dereference(event->rb); | 
|---|
| 5451 | 5914 |  	if (rb) { | 
|---|
| 5452 |  | -		if (!atomic_inc_not_zero(&rb->refcount))  | 
|---|
 | 5915 | +		if (!refcount_inc_not_zero(&rb->refcount))  | 
|---|
| 5453 | 5916 |  			rb = NULL; | 
|---|
| 5454 | 5917 |  	} | 
|---|
| 5455 | 5918 |  	rcu_read_unlock(); | 
|---|
| .. | .. | 
|---|
| 5457 | 5920 |  	return rb; | 
|---|
| 5458 | 5921 |  } | 
|---|
| 5459 | 5922 |   | 
|---|
| 5460 |  | -void ring_buffer_put(struct ring_buffer *rb)  | 
|---|
 | 5923 | +void ring_buffer_put(struct perf_buffer *rb)  | 
|---|
| 5461 | 5924 |  { | 
|---|
| 5462 |  | -	if (!atomic_dec_and_test(&rb->refcount))  | 
|---|
 | 5925 | +	if (!refcount_dec_and_test(&rb->refcount))  | 
|---|
| 5463 | 5926 |  		return; | 
|---|
| 5464 | 5927 |   | 
|---|
| 5465 | 5928 |  	WARN_ON_ONCE(!list_empty(&rb->event_list)); | 
|---|
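ring_buffer_get()/ring_buffer_put() switch the buffer reference count from atomic_t to refcount_t, which saturates and warns on overflow and underflow, and they now redirect inherited events to their parent because children share the parent's buffer. The lookup itself is the usual RCU pattern: dereference under rcu_read_lock() and keep the object only if refcount_inc_not_zero() wins the race against the final put. A generic sketch of that pattern with invented names (struct buf, buf_get(), buf_put()):

```c
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct buf {
	refcount_t	refcount;
	struct rcu_head	rcu;
	/* ... payload ... */
};

static struct buf __rcu *current_buf;	/* published with rcu_assign_pointer() */

/* Take a reference on the currently published buffer, if there is one. */
static struct buf *buf_get(void)
{
	struct buf *b;

	rcu_read_lock();
	b = rcu_dereference(current_buf);
	if (b && !refcount_inc_not_zero(&b->refcount))
		b = NULL;		/* lost the race against the final put */
	rcu_read_unlock();

	return b;
}

static void buf_put(struct buf *b)
{
	if (refcount_dec_and_test(&b->refcount))
		kfree_rcu(b, rcu);	/* last reference: free after a grace period */
}
```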
| .. | .. | 
|---|
| 5494 | 5957 |  static void perf_mmap_close(struct vm_area_struct *vma) | 
|---|
| 5495 | 5958 |  { | 
|---|
| 5496 | 5959 |  	struct perf_event *event = vma->vm_file->private_data; | 
|---|
| 5497 |  | -	struct ring_buffer *rb = ring_buffer_get(event);  | 
|---|
 | 5960 | +	struct perf_buffer *rb = ring_buffer_get(event);  | 
|---|
| 5498 | 5961 |  	struct user_struct *mmap_user = rb->mmap_user; | 
|---|
| 5499 | 5962 |  	int mmap_locked = rb->mmap_locked; | 
|---|
| 5500 | 5963 |  	unsigned long size = perf_data_size(rb); | 
|---|
| .. | .. | 
|---|
| 5519 | 5982 |  		perf_pmu_output_stop(event); | 
|---|
| 5520 | 5983 |   | 
|---|
| 5521 | 5984 |  		/* now it's safe to free the pages */ | 
|---|
| 5522 |  | -		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);  | 
|---|
| 5523 |  | -		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;  | 
|---|
 | 5985 | +		atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);  | 
|---|
 | 5986 | +		atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);  | 
|---|
| 5524 | 5987 |   | 
|---|
| 5525 | 5988 |  		/* this has to be the last one */ | 
|---|
| 5526 | 5989 |  		rb_free_aux(rb); | 
|---|
| 5527 |  | -		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));  | 
|---|
 | 5990 | +		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));  | 
|---|
| 5528 | 5991 |   | 
|---|
| 5529 | 5992 |  		mutex_unlock(&event->mmap_mutex); | 
|---|
| 5530 | 5993 |  	} | 
|---|
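Both the charge and the uncharge side now split buffer pages the same way: pages within the per-user allowance (sysctl_perf_event_mlock, 512 KiB plus one page by default) stay on user->locked_vm, and only the excess goes to the mm's pinned_vm, which becomes an atomic64_t so it can be adjusted without mmap_sem. A worked example of the split with made-up numbers (and without the extra clamp of the pre-existing locked_vm that the mmap path applies first):

```c
#include <stdio.h>

int main(void)
{
	/* Illustrative numbers only, in units of 4 KiB pages. */
	unsigned long user_lock_limit = (512 + 4) / 4;	/* 516 KiB -> 129 pages */
	unsigned long user_locked_vm  = 100;	/* pages this user already has locked */
	unsigned long user_extra      = 65;	/* 64 data pages + 1 control page */
	unsigned long extra           = 0;

	unsigned long user_locked = user_locked_vm + user_extra;

	if (user_locked > user_lock_limit) {
		extra = user_locked - user_lock_limit;	/* overflow goes to mm->pinned_vm */
		user_extra -= extra;			/* the rest stays on user->locked_vm */
	}

	printf("locked_vm += %lu, pinned_vm += %lu\n", user_extra, extra);
	/* prints: locked_vm += 29, pinned_vm += 36 */
	return 0;
}
```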
| .. | .. | 
|---|
| 5593 | 6056 |  	 * undo the VM accounting. | 
|---|
| 5594 | 6057 |  	 */ | 
|---|
| 5595 | 6058 |   | 
|---|
| 5596 |  | -	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);  | 
|---|
| 5597 |  | -	vma->vm_mm->pinned_vm -= mmap_locked;  | 
|---|
 | 6059 | +	atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,  | 
|---|
 | 6060 | +			&mmap_user->locked_vm);  | 
|---|
 | 6061 | +	atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);  | 
|---|
| 5598 | 6062 |  	free_uid(mmap_user); | 
|---|
| 5599 | 6063 |   | 
|---|
| 5600 | 6064 |  out_put: | 
|---|
| .. | .. | 
|---|
| 5603 | 6067 |   | 
|---|
| 5604 | 6068 |  static const struct vm_operations_struct perf_mmap_vmops = { | 
|---|
| 5605 | 6069 |  	.open		= perf_mmap_open, | 
|---|
| 5606 |  | -	.close		= perf_mmap_close, /* non mergable */  | 
|---|
 | 6070 | +	.close		= perf_mmap_close, /* non mergeable */  | 
|---|
| 5607 | 6071 |  	.fault		= perf_mmap_fault, | 
|---|
| 5608 | 6072 |  	.page_mkwrite	= perf_mmap_fault, | 
|---|
| 5609 | 6073 |  }; | 
|---|
| .. | .. | 
|---|
| 5613 | 6077 |  	struct perf_event *event = file->private_data; | 
|---|
| 5614 | 6078 |  	unsigned long user_locked, user_lock_limit; | 
|---|
| 5615 | 6079 |  	struct user_struct *user = current_user(); | 
|---|
 | 6080 | +	struct perf_buffer *rb = NULL;  | 
|---|
| 5616 | 6081 |  	unsigned long locked, lock_limit; | 
|---|
| 5617 |  | -	struct ring_buffer *rb = NULL;  | 
|---|
| 5618 | 6082 |  	unsigned long vma_size; | 
|---|
| 5619 | 6083 |  	unsigned long nr_pages; | 
|---|
| 5620 | 6084 |  	long user_extra = 0, extra = 0; | 
|---|
| .. | .. | 
|---|
| 5711 | 6175 |  again: | 
|---|
| 5712 | 6176 |  	mutex_lock(&event->mmap_mutex); | 
|---|
| 5713 | 6177 |  	if (event->rb) { | 
|---|
| 5714 |  | -		if (event->rb->nr_pages != nr_pages) {  | 
|---|
 | 6178 | +		if (data_page_nr(event->rb) != nr_pages) {  | 
|---|
| 5715 | 6179 |  			ret = -EINVAL; | 
|---|
| 5716 | 6180 |  			goto unlock; | 
|---|
| 5717 | 6181 |  		} | 
|---|
| 5718 | 6182 |   | 
|---|
| 5719 | 6183 |  		if (!atomic_inc_not_zero(&event->rb->mmap_count)) { | 
|---|
| 5720 | 6184 |  			/* | 
|---|
| 5721 |  | -			 * Raced against perf_mmap_close() through  | 
|---|
| 5722 |  | -			 * perf_event_set_output(). Try again, hope for better  | 
|---|
| 5723 |  | -			 * luck.  | 
|---|
 | 6185 | +			 * Raced against perf_mmap_close(); remove the  | 
|---|
 | 6186 | +			 * event and try again.  | 
|---|
| 5724 | 6187 |  			 */ | 
|---|
 | 6188 | +			ring_buffer_attach(event, NULL);  | 
|---|
| 5725 | 6189 |  			mutex_unlock(&event->mmap_mutex); | 
|---|
| 5726 | 6190 |  			goto again; | 
|---|
| 5727 | 6191 |  		} | 
|---|
| .. | .. | 
|---|
| 5749 | 6213 |  		user_locked = user_lock_limit; | 
|---|
| 5750 | 6214 |  	user_locked += user_extra; | 
|---|
| 5751 | 6215 |   | 
|---|
| 5752 |  | -	if (user_locked > user_lock_limit)  | 
|---|
 | 6216 | +	if (user_locked > user_lock_limit) {  | 
|---|
 | 6217 | +		/*  | 
|---|
 | 6218 | +		 * charge locked_vm until it hits user_lock_limit;  | 
|---|
 | 6219 | +		 * charge the rest from pinned_vm  | 
|---|
 | 6220 | +		 */  | 
|---|
| 5753 | 6221 |  		extra = user_locked - user_lock_limit; | 
|---|
 | 6222 | +		user_extra -= extra;  | 
|---|
 | 6223 | +	}  | 
|---|
| 5754 | 6224 |   | 
|---|
| 5755 | 6225 |  	lock_limit = rlimit(RLIMIT_MEMLOCK); | 
|---|
| 5756 | 6226 |  	lock_limit >>= PAGE_SHIFT; | 
|---|
| 5757 |  | -	locked = vma->vm_mm->pinned_vm + extra;  | 
|---|
 | 6227 | +	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;  | 
|---|
| 5758 | 6228 |   | 
|---|
| 5759 | 6229 |  	if ((locked > lock_limit) && perf_is_paranoid() && | 
|---|
| 5760 | 6230 |  		!capable(CAP_IPC_LOCK)) { | 
|---|
| .. | .. | 
|---|
| 5783 | 6253 |   | 
|---|
| 5784 | 6254 |  		ring_buffer_attach(event, rb); | 
|---|
| 5785 | 6255 |   | 
|---|
 | 6256 | +		perf_event_update_time(event);  | 
|---|
 | 6257 | +		perf_set_shadow_time(event, event->ctx);  | 
|---|
| 5786 | 6258 |  		perf_event_init_userpage(event); | 
|---|
| 5787 | 6259 |  		perf_event_update_userpage(event); | 
|---|
| 5788 | 6260 |  	} else { | 
|---|
| .. | .. | 
|---|
| 5795 | 6267 |  unlock: | 
|---|
| 5796 | 6268 |  	if (!ret) { | 
|---|
| 5797 | 6269 |  		atomic_long_add(user_extra, &user->locked_vm); | 
|---|
| 5798 |  | -		vma->vm_mm->pinned_vm += extra;  | 
|---|
 | 6270 | +		atomic64_add(extra, &vma->vm_mm->pinned_vm);  | 
|---|
| 5799 | 6271 |   | 
|---|
| 5800 | 6272 |  		atomic_inc(&event->mmap_count); | 
|---|
| 5801 | 6273 |  	} else if (rb) { | 
|---|
| .. | .. | 
|---|
| 5932 | 6404 |   * Later on, we might change it to a list if there is | 
|---|
| 5933 | 6405 |   * another virtualization implementation supporting the callbacks. | 
|---|
| 5934 | 6406 |   */ | 
|---|
| 5935 |  | -struct perf_guest_info_callbacks *perf_guest_cbs;  | 
|---|
 | 6407 | +struct perf_guest_info_callbacks __rcu *perf_guest_cbs;  | 
|---|
| 5936 | 6408 |   | 
|---|
| 5937 | 6409 |  int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) | 
|---|
| 5938 | 6410 |  { | 
|---|
| 5939 |  | -	perf_guest_cbs = cbs;  | 
|---|
 | 6411 | +	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))  | 
|---|
 | 6412 | +		return -EBUSY;  | 
|---|
 | 6413 | +  | 
|---|
 | 6414 | +	rcu_assign_pointer(perf_guest_cbs, cbs);  | 
|---|
| 5940 | 6415 |  	return 0; | 
|---|
| 5941 | 6416 |  } | 
|---|
| 5942 | 6417 |  EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); | 
|---|
| 5943 | 6418 |   | 
|---|
| 5944 | 6419 |  int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) | 
|---|
| 5945 | 6420 |  { | 
|---|
| 5946 |  | -	perf_guest_cbs = NULL;  | 
|---|
 | 6421 | +	if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))  | 
|---|
 | 6422 | +		return -EINVAL;  | 
|---|
 | 6423 | +  | 
|---|
 | 6424 | +	rcu_assign_pointer(perf_guest_cbs, NULL);  | 
|---|
 | 6425 | +	synchronize_rcu();  | 
|---|
| 5947 | 6426 |  	return 0; | 
|---|
| 5948 | 6427 |  } | 
|---|
| 5949 | 6428 |  EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | 
|---|
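Turning perf_guest_cbs into an __rcu pointer closes the window where a PMI could still be using the callbacks while their owner (typically KVM) was unloading: unregistering now clears the pointer and waits a grace period before the caller may free the structure, so readers must fetch the pointer once under RCU rules and tolerate NULL. A sketch of the read side; sample_guest_ip() is an illustrative caller, not something this hunk adds.

```c
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/rcupdate.h>

/* Illustrative reader: fetch the callbacks once, under RCU, and test for NULL. */
static unsigned long sample_guest_ip(struct pt_regs *regs)
{
	struct perf_guest_info_callbacks *cbs;
	unsigned long ip = instruction_pointer(regs);

	rcu_read_lock();
	cbs = rcu_dereference(perf_guest_cbs);
	if (cbs && cbs->is_in_guest())
		ip = cbs->get_guest_ip();
	rcu_read_unlock();

	return ip;
}
```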
| .. | .. | 
|---|
| 5965 | 6444 |  } | 
|---|
| 5966 | 6445 |   | 
|---|
| 5967 | 6446 |  static void perf_sample_regs_user(struct perf_regs *regs_user, | 
|---|
| 5968 |  | -				  struct pt_regs *regs,  | 
|---|
| 5969 |  | -				  struct pt_regs *regs_user_copy)  | 
|---|
 | 6447 | +				  struct pt_regs *regs)  | 
|---|
| 5970 | 6448 |  { | 
|---|
| 5971 | 6449 |  	if (user_mode(regs)) { | 
|---|
| 5972 | 6450 |  		regs_user->abi = perf_reg_abi(current); | 
|---|
| 5973 | 6451 |  		regs_user->regs = regs; | 
|---|
| 5974 | 6452 |  	} else if (!(current->flags & PF_KTHREAD)) { | 
|---|
| 5975 |  | -		perf_get_regs_user(regs_user, regs, regs_user_copy);  | 
|---|
 | 6453 | +		perf_get_regs_user(regs_user, regs);  | 
|---|
| 5976 | 6454 |  	} else { | 
|---|
| 5977 | 6455 |  		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; | 
|---|
| 5978 | 6456 |  		regs_user->regs = NULL; | 
|---|
| .. | .. | 
|---|
| 5991 | 6469 |   * Get remaining task size from user stack pointer. | 
|---|
| 5992 | 6470 |   * | 
|---|
| 5993 | 6471 |   * It'd be better to take stack vma map and limit this more | 
|---|
| 5994 |  | - * precisly, but there's no way to get it safely under interrupt,  | 
|---|
 | 6472 | + * precisely, but there's no way to get it safely under interrupt,  | 
|---|
| 5995 | 6473 |   * so using TASK_SIZE as limit. | 
|---|
| 5996 | 6474 |   */ | 
|---|
| 5997 | 6475 |  static u64 perf_ustack_task_size(struct pt_regs *regs) | 
|---|
| .. | .. | 
|---|
| 6073 | 6551 |   | 
|---|
| 6074 | 6552 |  		/* Data. */ | 
|---|
| 6075 | 6553 |  		sp = perf_user_stack_pointer(regs); | 
|---|
| 6076 |  | -		fs = get_fs();  | 
|---|
| 6077 |  | -		set_fs(USER_DS);  | 
|---|
 | 6554 | +		fs = force_uaccess_begin();  | 
|---|
| 6078 | 6555 |  		rem = __output_copy_user(handle, (void *) sp, dump_size); | 
|---|
| 6079 |  | -		set_fs(fs);  | 
|---|
 | 6556 | +		force_uaccess_end(fs);  | 
|---|
| 6080 | 6557 |  		dyn_size = dump_size - rem; | 
|---|
| 6081 | 6558 |   | 
|---|
| 6082 | 6559 |  		perf_output_skip(handle, rem); | 
|---|
| .. | .. | 
|---|
| 6084 | 6561 |  		/* Dynamic size. */ | 
|---|
| 6085 | 6562 |  		perf_output_put(handle, dyn_size); | 
|---|
| 6086 | 6563 |  	} | 
|---|
 | 6564 | +}  | 
|---|
 | 6565 | +  | 
|---|
 | 6566 | +static unsigned long perf_prepare_sample_aux(struct perf_event *event,  | 
|---|
 | 6567 | +					  struct perf_sample_data *data,  | 
|---|
 | 6568 | +					  size_t size)  | 
|---|
 | 6569 | +{  | 
|---|
 | 6570 | +	struct perf_event *sampler = event->aux_event;  | 
|---|
 | 6571 | +	struct perf_buffer *rb;  | 
|---|
 | 6572 | +  | 
|---|
 | 6573 | +	data->aux_size = 0;  | 
|---|
 | 6574 | +  | 
|---|
 | 6575 | +	if (!sampler)  | 
|---|
 | 6576 | +		goto out;  | 
|---|
 | 6577 | +  | 
|---|
 | 6578 | +	if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))  | 
|---|
 | 6579 | +		goto out;  | 
|---|
 | 6580 | +  | 
|---|
 | 6581 | +	if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))  | 
|---|
 | 6582 | +		goto out;  | 
|---|
 | 6583 | +  | 
|---|
 | 6584 | +	rb = ring_buffer_get(sampler);  | 
|---|
 | 6585 | +	if (!rb)  | 
|---|
 | 6586 | +		goto out;  | 
|---|
 | 6587 | +  | 
|---|
 | 6588 | +	/*  | 
|---|
 | 6589 | +	 * If this is an NMI hit inside sampling code, don't take  | 
|---|
 | 6590 | +	 * the sample. See also perf_aux_sample_output().  | 
|---|
 | 6591 | +	 */  | 
|---|
 | 6592 | +	if (READ_ONCE(rb->aux_in_sampling)) {  | 
|---|
 | 6593 | +		data->aux_size = 0;  | 
|---|
 | 6594 | +	} else {  | 
|---|
 | 6595 | +		size = min_t(size_t, size, perf_aux_size(rb));  | 
|---|
 | 6596 | +		data->aux_size = ALIGN(size, sizeof(u64));  | 
|---|
 | 6597 | +	}  | 
|---|
 | 6598 | +	ring_buffer_put(rb);  | 
|---|
 | 6599 | +  | 
|---|
 | 6600 | +out:  | 
|---|
 | 6601 | +	return data->aux_size;  | 
|---|
 | 6602 | +}  | 
|---|
 | 6603 | +  | 
|---|
 | 6604 | +long perf_pmu_snapshot_aux(struct perf_buffer *rb,  | 
|---|
 | 6605 | +			   struct perf_event *event,  | 
|---|
 | 6606 | +			   struct perf_output_handle *handle,  | 
|---|
 | 6607 | +			   unsigned long size)  | 
|---|
 | 6608 | +{  | 
|---|
 | 6609 | +	unsigned long flags;  | 
|---|
 | 6610 | +	long ret;  | 
|---|
 | 6611 | +  | 
|---|
 | 6612 | +	/*  | 
|---|
 | 6613 | +	 * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler  | 
|---|
 | 6614 | +	 * paths. If we start calling them in NMI context, they may race with  | 
|---|
 | 6615 | +	 * the IRQ ones, that is, for example, re-starting an event that's just  | 
|---|
 | 6616 | +	 * been stopped, which is why we're using a separate callback that  | 
|---|
 | 6617 | +	 * doesn't change the event state.  | 
|---|
 | 6618 | +	 *  | 
|---|
 | 6619 | +	 * IRQs need to be disabled to prevent IPIs from racing with us.  | 
|---|
 | 6620 | +	 */  | 
|---|
 | 6621 | +	local_irq_save(flags);  | 
|---|
 | 6622 | +	/*  | 
|---|
 | 6623 | +	 * Guard against NMI hits inside the critical section;  | 
|---|
 | 6624 | +	 * see also perf_prepare_sample_aux().  | 
|---|
 | 6625 | +	 */  | 
|---|
 | 6626 | +	WRITE_ONCE(rb->aux_in_sampling, 1);  | 
|---|
 | 6627 | +	barrier();  | 
|---|
 | 6628 | +  | 
|---|
 | 6629 | +	ret = event->pmu->snapshot_aux(event, handle, size);  | 
|---|
 | 6630 | +  | 
|---|
 | 6631 | +	barrier();  | 
|---|
 | 6632 | +	WRITE_ONCE(rb->aux_in_sampling, 0);  | 
|---|
 | 6633 | +	local_irq_restore(flags);  | 
|---|
 | 6634 | +  | 
|---|
 | 6635 | +	return ret;  | 
|---|
 | 6636 | +}  | 
|---|
 | 6637 | +  | 
|---|
 | 6638 | +static void perf_aux_sample_output(struct perf_event *event,  | 
|---|
 | 6639 | +				   struct perf_output_handle *handle,  | 
|---|
 | 6640 | +				   struct perf_sample_data *data)  | 
|---|
 | 6641 | +{  | 
|---|
 | 6642 | +	struct perf_event *sampler = event->aux_event;  | 
|---|
 | 6643 | +	struct perf_buffer *rb;  | 
|---|
 | 6644 | +	unsigned long pad;  | 
|---|
 | 6645 | +	long size;  | 
|---|
 | 6646 | +  | 
|---|
 | 6647 | +	if (WARN_ON_ONCE(!sampler || !data->aux_size))  | 
|---|
 | 6648 | +		return;  | 
|---|
 | 6649 | +  | 
|---|
 | 6650 | +	rb = ring_buffer_get(sampler);  | 
|---|
 | 6651 | +	if (!rb)  | 
|---|
 | 6652 | +		return;  | 
|---|
 | 6653 | +  | 
|---|
 | 6654 | +	size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);  | 
|---|
 | 6655 | +  | 
|---|
 | 6656 | +	/*  | 
|---|
 | 6657 | +	 * An error here means that perf_output_copy() failed (returned a  | 
|---|
 | 6658 | +	 * non-zero surplus that it didn't copy), which in its current  | 
|---|
 | 6659 | +	 * enlightened implementation is not possible. If that changes, we'd  | 
|---|
 | 6660 | +	 * like to know.  | 
|---|
 | 6661 | +	 */  | 
|---|
 | 6662 | +	if (WARN_ON_ONCE(size < 0))  | 
|---|
 | 6663 | +		goto out_put;  | 
|---|
 | 6664 | +  | 
|---|
 | 6665 | +	/*  | 
|---|
 | 6666 | +	 * The pad comes from ALIGN()ing data->aux_size up to u64 in  | 
|---|
 | 6667 | +	 * perf_prepare_sample_aux(), so should not be more than that.  | 
|---|
 | 6668 | +	 */  | 
|---|
 | 6669 | +	pad = data->aux_size - size;  | 
|---|
 | 6670 | +	if (WARN_ON_ONCE(pad >= sizeof(u64)))  | 
|---|
 | 6671 | +		pad = 8;  | 
|---|
 | 6672 | +  | 
|---|
 | 6673 | +	if (pad) {  | 
|---|
 | 6674 | +		u64 zero = 0;  | 
|---|
 | 6675 | +		perf_output_copy(handle, &zero, pad);  | 
|---|
 | 6676 | +	}  | 
|---|
 | 6677 | +  | 
|---|
 | 6678 | +out_put:  | 
|---|
 | 6679 | +	ring_buffer_put(rb);  | 
|---|
| 6087 | 6680 |  } | 
|---|
| 6088 | 6681 |   | 
|---|
| 6089 | 6682 |  static void __perf_event_header__init_id(struct perf_event_header *header, | 
|---|
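The three helpers above implement PERF_SAMPLE_AUX: at sample time the kernel asks the PMU behind event->aux_event for a snapshot of at most attr.aux_sample_size bytes, clamps it so the 16-bit header size cannot overflow, pads it to a u64 boundary and writes it inline into the sample, with rb->aux_in_sampling guarding against an NMI recursing into the same buffer. From userspace this is driven entirely by the event attributes; a hedged sketch of the relevant fields, assuming the sampling event is grouped under an AUX-capable leader (for instance intel_pt, whose dynamic PMU type would be read from sysfs and is passed in here as aux_pmu_type):

```c
#include <linux/perf_event.h>
#include <string.h>

/*
 * Request an inline AUX snapshot with every sample. aux_pmu_type would be
 * read from /sys/bus/event_source/devices/<pmu>/type; all sizes and the
 * sampling event choice are arbitrary.
 */
static void setup_aux_sampling(struct perf_event_attr *aux_leader,
			       struct perf_event_attr *sampler,
			       __u32 aux_pmu_type)
{
	memset(aux_leader, 0, sizeof(*aux_leader));
	aux_leader->size = sizeof(*aux_leader);
	aux_leader->type = aux_pmu_type;		/* e.g. intel_pt */
	aux_leader->disabled = 1;

	memset(sampler, 0, sizeof(*sampler));
	sampler->size = sizeof(*sampler);
	sampler->type = PERF_TYPE_HARDWARE;
	sampler->config = PERF_COUNT_HW_CPU_CYCLES;
	sampler->sample_period = 100000;
	sampler->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_AUX;
	sampler->aux_sample_size = 4096;		/* bytes of AUX data per sample */
}
```

The sampler is then opened with the AUX event's file descriptor as group_fd, which is roughly how the event->aux_event link used above gets established.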
| .. | .. | 
|---|
| 6255 | 6848 |  		perf_output_read_one(handle, event, enabled, running); | 
|---|
| 6256 | 6849 |  } | 
|---|
| 6257 | 6850 |   | 
|---|
 | 6851 | +static inline bool perf_sample_save_hw_index(struct perf_event *event)  | 
|---|
 | 6852 | +{  | 
|---|
 | 6853 | +	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;  | 
|---|
 | 6854 | +}  | 
|---|
 | 6855 | +  | 
|---|
| 6258 | 6856 |  void perf_output_sample(struct perf_output_handle *handle, | 
|---|
| 6259 | 6857 |  			struct perf_event_header *header, | 
|---|
| 6260 | 6858 |  			struct perf_sample_data *data, | 
|---|
| .. | .. | 
|---|
| 6343 | 6941 |  			     * sizeof(struct perf_branch_entry); | 
|---|
| 6344 | 6942 |   | 
|---|
| 6345 | 6943 |  			perf_output_put(handle, data->br_stack->nr); | 
|---|
 | 6944 | +			if (perf_sample_save_hw_index(event))  | 
|---|
 | 6945 | +				perf_output_put(handle, data->br_stack->hw_idx);  | 
|---|
| 6346 | 6946 |  			perf_output_copy(handle, data->br_stack->entries, size); | 
|---|
| 6347 | 6947 |  		} else { | 
|---|
| 6348 | 6948 |  			/* | 
|---|
| .. | .. | 
|---|
| 6405 | 7005 |  	if (sample_type & PERF_SAMPLE_PHYS_ADDR) | 
|---|
| 6406 | 7006 |  		perf_output_put(handle, data->phys_addr); | 
|---|
| 6407 | 7007 |   | 
|---|
 | 7008 | +	if (sample_type & PERF_SAMPLE_CGROUP)  | 
|---|
 | 7009 | +		perf_output_put(handle, data->cgroup);  | 
|---|
 | 7010 | +  | 
|---|
 | 7011 | +	if (sample_type & PERF_SAMPLE_AUX) {  | 
|---|
 | 7012 | +		perf_output_put(handle, data->aux_size);  | 
|---|
 | 7013 | +  | 
|---|
 | 7014 | +		if (data->aux_size)  | 
|---|
 | 7015 | +			perf_aux_sample_output(event, handle, data);  | 
|---|
 | 7016 | +	}  | 
|---|
 | 7017 | +  | 
|---|
| 6408 | 7018 |  	if (!event->attr.watermark) { | 
|---|
| 6409 | 7019 |  		int wakeup_events = event->attr.wakeup_events; | 
|---|
| 6410 | 7020 |   | 
|---|
| 6411 | 7021 |  		if (wakeup_events) { | 
|---|
| 6412 |  | -			struct ring_buffer *rb = handle->rb;  | 
|---|
 | 7022 | +			struct perf_buffer *rb = handle->rb;  | 
|---|
| 6413 | 7023 |  			int events = local_inc_return(&rb->events); | 
|---|
| 6414 | 7024 |   | 
|---|
| 6415 | 7025 |  			if (events >= wakeup_events) { | 
|---|
| .. | .. | 
|---|
| 6437 | 7047 |  		 * Walking the pages tables for user address. | 
|---|
| 6438 | 7048 |  		 * Interrupts are disabled, so it prevents any tear down | 
|---|
| 6439 | 7049 |  		 * of the page tables. | 
|---|
| 6440 |  | -		 * Try IRQ-safe __get_user_pages_fast first.  | 
|---|
 | 7050 | +		 * Try IRQ-safe get_user_page_fast_only first.  | 
|---|
| 6441 | 7051 |  		 * If failed, leave phys_addr as 0. | 
|---|
| 6442 | 7052 |  		 */ | 
|---|
| 6443 | 7053 |  		if (current->mm != NULL) { | 
|---|
| 6444 | 7054 |  			struct page *p; | 
|---|
| 6445 | 7055 |   | 
|---|
| 6446 | 7056 |  			pagefault_disable(); | 
|---|
| 6447 |  | -			if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {  | 
|---|
 | 7057 | +			if (get_user_page_fast_only(virt, 0, &p)) {  | 
|---|
| 6448 | 7058 |  				phys_addr = page_to_phys(p) + virt % PAGE_SIZE; | 
|---|
| 6449 | 7059 |  				put_page(p); | 
|---|
| 6450 | 7060 |  			} | 
|---|
| .. | .. | 
|---|
| 6532 | 7142 |  	if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 
|---|
| 6533 | 7143 |  		int size = sizeof(u64); /* nr */ | 
|---|
| 6534 | 7144 |  		if (data->br_stack) { | 
|---|
 | 7145 | +			if (perf_sample_save_hw_index(event))  | 
|---|
 | 7146 | +				size += sizeof(u64);  | 
|---|
 | 7147 | +  | 
|---|
| 6535 | 7148 |  			size += data->br_stack->nr | 
|---|
| 6536 | 7149 |  			      * sizeof(struct perf_branch_entry); | 
|---|
| 6537 | 7150 |  		} | 
|---|
| .. | .. | 
|---|
| 6539 | 7152 |  	} | 
|---|
| 6540 | 7153 |   | 
|---|
| 6541 | 7154 |  	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) | 
|---|
| 6542 |  | -		perf_sample_regs_user(&data->regs_user, regs,  | 
|---|
| 6543 |  | -				      &data->regs_user_copy);  | 
|---|
 | 7155 | +		perf_sample_regs_user(&data->regs_user, regs);  | 
|---|
| 6544 | 7156 |   | 
|---|
| 6545 | 7157 |  	if (sample_type & PERF_SAMPLE_REGS_USER) { | 
|---|
| 6546 | 7158 |  		/* regs dump ABI info */ | 
|---|
| .. | .. | 
|---|
| 6556 | 7168 |   | 
|---|
| 6557 | 7169 |  	if (sample_type & PERF_SAMPLE_STACK_USER) { | 
|---|
| 6558 | 7170 |  		/* | 
|---|
| 6559 |  | -		 * Either we need PERF_SAMPLE_STACK_USER bit to be allways  | 
|---|
 | 7171 | +		 * Either we need PERF_SAMPLE_STACK_USER bit to be always  | 
|---|
| 6560 | 7172 |  		 * processed as the last one or have additional check added | 
|---|
| 6561 | 7173 |  		 * in case new sample type is added, because we could eat | 
|---|
| 6562 | 7174 |  		 * up the rest of the sample size. | 
|---|
| .. | .. | 
|---|
| 6596 | 7208 |   | 
|---|
| 6597 | 7209 |  	if (sample_type & PERF_SAMPLE_PHYS_ADDR) | 
|---|
| 6598 | 7210 |  		data->phys_addr = perf_virt_to_phys(data->addr); | 
|---|
 | 7211 | +  | 
|---|
 | 7212 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 7213 | +	if (sample_type & PERF_SAMPLE_CGROUP) {  | 
|---|
 | 7214 | +		struct cgroup *cgrp;  | 
|---|
 | 7215 | +  | 
|---|
 | 7216 | +		/* protected by RCU */  | 
|---|
 | 7217 | +		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;  | 
|---|
 | 7218 | +		data->cgroup = cgroup_id(cgrp);  | 
|---|
 | 7219 | +	}  | 
|---|
 | 7220 | +#endif  | 
|---|
 | 7221 | +  | 
|---|
 | 7222 | +	if (sample_type & PERF_SAMPLE_AUX) {  | 
|---|
 | 7223 | +		u64 size;  | 
|---|
 | 7224 | +  | 
|---|
 | 7225 | +		header->size += sizeof(u64); /* size */  | 
|---|
 | 7226 | +  | 
|---|
 | 7227 | +		/*  | 
|---|
 | 7228 | +		 * Given the 16bit nature of header::size, an AUX sample can  | 
|---|
 | 7229 | +		 * easily overflow it, what with all the preceding sample bits.  | 
|---|
 | 7230 | +		 * Make sure this doesn't happen by using up to U16_MAX bytes  | 
|---|
 | 7231 | +		 * per sample in total (rounded down to 8 byte boundary).  | 
|---|
 | 7232 | +		 */  | 
|---|
 | 7233 | +		size = min_t(size_t, U16_MAX - header->size,  | 
|---|
 | 7234 | +			     event->attr.aux_sample_size);  | 
|---|
 | 7235 | +		size = rounddown(size, 8);  | 
|---|
 | 7236 | +		size = perf_prepare_sample_aux(event, data, size);  | 
|---|
 | 7237 | +  | 
|---|
 | 7238 | +		WARN_ON_ONCE(size + header->size > U16_MAX);  | 
|---|
 | 7239 | +		header->size += size;  | 
|---|
 | 7240 | +	}  | 
|---|
 | 7241 | +	/*  | 
|---|
 | 7242 | +	 * If you're adding more sample types here, you likely need to do  | 
|---|
 | 7243 | +	 * something about the overflowing header::size, like repurpose the  | 
|---|
 | 7244 | +	 * lowest 3 bits of size, which should be always zero at the moment.  | 
|---|
 | 7245 | +	 * This raises a more important question, do we really need 512k sized  | 
|---|
 | 7246 | +	 * samples and why, so good argumentation is in order for whatever you  | 
|---|
 | 7247 | +	 * do here next.  | 
|---|
 | 7248 | +	 */  | 
|---|
 | 7249 | +	WARN_ON_ONCE(header->size & 7);  | 
|---|
| 6599 | 7250 |  } | 
|---|
| 6600 | 7251 |   | 
|---|
| 6601 |  | -static __always_inline void  | 
|---|
 | 7252 | +static __always_inline int  | 
|---|
| 6602 | 7253 |  __perf_event_output(struct perf_event *event, | 
|---|
| 6603 | 7254 |  		    struct perf_sample_data *data, | 
|---|
| 6604 | 7255 |  		    struct pt_regs *regs, | 
|---|
| 6605 | 7256 |  		    int (*output_begin)(struct perf_output_handle *, | 
|---|
 | 7257 | +					struct perf_sample_data *,  | 
|---|
| 6606 | 7258 |  					struct perf_event *, | 
|---|
| 6607 | 7259 |  					unsigned int)) | 
|---|
| 6608 | 7260 |  { | 
|---|
| 6609 | 7261 |  	struct perf_output_handle handle; | 
|---|
| 6610 | 7262 |  	struct perf_event_header header; | 
|---|
 | 7263 | +	int err;  | 
|---|
| 6611 | 7264 |   | 
|---|
| 6612 | 7265 |  	/* protect the callchain buffers */ | 
|---|
| 6613 | 7266 |  	rcu_read_lock(); | 
|---|
| 6614 | 7267 |   | 
|---|
| 6615 | 7268 |  	perf_prepare_sample(&header, data, event, regs); | 
|---|
| 6616 | 7269 |   | 
|---|
| 6617 |  | -	if (output_begin(&handle, event, header.size))  | 
|---|
 | 7270 | +	err = output_begin(&handle, data, event, header.size);  | 
|---|
 | 7271 | +	if (err)  | 
|---|
| 6618 | 7272 |  		goto exit; | 
|---|
| 6619 | 7273 |   | 
|---|
| 6620 | 7274 |  	perf_output_sample(&handle, &header, data, event); | 
|---|
| .. | .. | 
|---|
| 6623 | 7277 |   | 
|---|
| 6624 | 7278 |  exit: | 
|---|
| 6625 | 7279 |  	rcu_read_unlock(); | 
|---|
 | 7280 | +	return err;  | 
|---|
| 6626 | 7281 |  } | 
|---|
| 6627 | 7282 |   | 
|---|
| 6628 | 7283 |  void | 
|---|
| .. | .. | 
|---|
| 6641 | 7296 |  	__perf_event_output(event, data, regs, perf_output_begin_backward); | 
|---|
| 6642 | 7297 |  } | 
|---|
| 6643 | 7298 |   | 
|---|
| 6644 |  | -void  | 
|---|
 | 7299 | +int  | 
|---|
| 6645 | 7300 |  perf_event_output(struct perf_event *event, | 
|---|
| 6646 | 7301 |  		  struct perf_sample_data *data, | 
|---|
| 6647 | 7302 |  		  struct pt_regs *regs) | 
|---|
| 6648 | 7303 |  { | 
|---|
| 6649 |  | -	__perf_event_output(event, data, regs, perf_output_begin);  | 
|---|
 | 7304 | +	return __perf_event_output(event, data, regs, perf_output_begin);  | 
|---|
| 6650 | 7305 |  } | 
|---|
| 6651 | 7306 |   | 
|---|
| 6652 | 7307 |  /* | 
|---|
| .. | .. | 
|---|
| 6678 | 7333 |  	int ret; | 
|---|
| 6679 | 7334 |   | 
|---|
| 6680 | 7335 |  	perf_event_header__init_id(&read_event.header, &sample, event); | 
|---|
| 6681 |  | -	ret = perf_output_begin(&handle, event, read_event.header.size);  | 
|---|
 | 7336 | +	ret = perf_output_begin(&handle, &sample, event, read_event.header.size);  | 
|---|
| 6682 | 7337 |  	if (ret) | 
|---|
| 6683 | 7338 |  		return; | 
|---|
| 6684 | 7339 |   | 
|---|
| .. | .. | 
|---|
| 6823 | 7478 |  } | 
|---|
| 6824 | 7479 |   | 
|---|
| 6825 | 7480 |  struct remote_output { | 
|---|
| 6826 |  | -	struct ring_buffer	*rb;  | 
|---|
 | 7481 | +	struct perf_buffer	*rb;  | 
|---|
| 6827 | 7482 |  	int			err; | 
|---|
| 6828 | 7483 |  }; | 
|---|
| 6829 | 7484 |   | 
|---|
| .. | .. | 
|---|
| 6831 | 7486 |  { | 
|---|
| 6832 | 7487 |  	struct perf_event *parent = event->parent; | 
|---|
| 6833 | 7488 |  	struct remote_output *ro = data; | 
|---|
| 6834 |  | -	struct ring_buffer *rb = ro->rb;  | 
|---|
 | 7489 | +	struct perf_buffer *rb = ro->rb;  | 
|---|
| 6835 | 7490 |  	struct stop_event_data sd = { | 
|---|
| 6836 | 7491 |  		.event	= event, | 
|---|
| 6837 | 7492 |  	}; | 
|---|
| .. | .. | 
|---|
| 6947 | 7602 |   | 
|---|
| 6948 | 7603 |  	perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 
|---|
| 6949 | 7604 |   | 
|---|
| 6950 |  | -	ret = perf_output_begin(&handle, event,  | 
|---|
 | 7605 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
| 6951 | 7606 |  				task_event->event_id.header.size); | 
|---|
| 6952 | 7607 |  	if (ret) | 
|---|
| 6953 | 7608 |  		goto out; | 
|---|
| .. | .. | 
|---|
| 7050 | 7705 |  		return; | 
|---|
| 7051 | 7706 |   | 
|---|
| 7052 | 7707 |  	perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 
|---|
| 7053 |  | -	ret = perf_output_begin(&handle, event,  | 
|---|
 | 7708 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
| 7054 | 7709 |  				comm_event->event_id.header.size); | 
|---|
| 7055 | 7710 |   | 
|---|
| 7056 | 7711 |  	if (ret) | 
|---|
| .. | .. | 
|---|
| 7150 | 7805 |   | 
|---|
| 7151 | 7806 |  	perf_event_header__init_id(&namespaces_event->event_id.header, | 
|---|
| 7152 | 7807 |  				   &sample, event); | 
|---|
| 7153 |  | -	ret = perf_output_begin(&handle, event,  | 
|---|
 | 7808 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
| 7154 | 7809 |  				namespaces_event->event_id.header.size); | 
|---|
| 7155 | 7810 |  	if (ret) | 
|---|
| 7156 | 7811 |  		goto out; | 
|---|
| .. | .. | 
|---|
| 7175 | 7830 |  { | 
|---|
| 7176 | 7831 |  	struct path ns_path; | 
|---|
| 7177 | 7832 |  	struct inode *ns_inode; | 
|---|
| 7178 |  | -	void *error;  | 
|---|
 | 7833 | +	int error;  | 
|---|
| 7179 | 7834 |   | 
|---|
| 7180 | 7835 |  	error = ns_get_path(&ns_path, task, ns_ops); | 
|---|
| 7181 | 7836 |  	if (!error) { | 
|---|
| .. | .. | 
|---|
| 7245 | 7900 |  } | 
|---|
| 7246 | 7901 |   | 
|---|
| 7247 | 7902 |  /* | 
|---|
 | 7903 | + * cgroup tracking  | 
|---|
 | 7904 | + */  | 
|---|
 | 7905 | +#ifdef CONFIG_CGROUP_PERF  | 
|---|
 | 7906 | +  | 
|---|
 | 7907 | +struct perf_cgroup_event {  | 
|---|
 | 7908 | +	char				*path;  | 
|---|
 | 7909 | +	int				path_size;  | 
|---|
 | 7910 | +	struct {  | 
|---|
 | 7911 | +		struct perf_event_header	header;  | 
|---|
 | 7912 | +		u64				id;  | 
|---|
 | 7913 | +		char				path[];  | 
|---|
 | 7914 | +	} event_id;  | 
|---|
 | 7915 | +};  | 
|---|
 | 7916 | +  | 
|---|
 | 7917 | +static int perf_event_cgroup_match(struct perf_event *event)  | 
|---|
 | 7918 | +{  | 
|---|
 | 7919 | +	return event->attr.cgroup;  | 
|---|
 | 7920 | +}  | 
|---|
 | 7921 | +  | 
|---|
 | 7922 | +static void perf_event_cgroup_output(struct perf_event *event, void *data)  | 
|---|
 | 7923 | +{  | 
|---|
 | 7924 | +	struct perf_cgroup_event *cgroup_event = data;  | 
|---|
 | 7925 | +	struct perf_output_handle handle;  | 
|---|
 | 7926 | +	struct perf_sample_data sample;  | 
|---|
 | 7927 | +	u16 header_size = cgroup_event->event_id.header.size;  | 
|---|
 | 7928 | +	int ret;  | 
|---|
 | 7929 | +  | 
|---|
 | 7930 | +	if (!perf_event_cgroup_match(event))  | 
|---|
 | 7931 | +		return;  | 
|---|
 | 7932 | +  | 
|---|
 | 7933 | +	perf_event_header__init_id(&cgroup_event->event_id.header,  | 
|---|
 | 7934 | +				   &sample, event);  | 
|---|
 | 7935 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
 | 7936 | +				cgroup_event->event_id.header.size);  | 
|---|
 | 7937 | +	if (ret)  | 
|---|
 | 7938 | +		goto out;  | 
|---|
 | 7939 | +  | 
|---|
 | 7940 | +	perf_output_put(&handle, cgroup_event->event_id);  | 
|---|
 | 7941 | +	__output_copy(&handle, cgroup_event->path, cgroup_event->path_size);  | 
|---|
 | 7942 | +  | 
|---|
 | 7943 | +	perf_event__output_id_sample(event, &handle, &sample);  | 
|---|
 | 7944 | +  | 
|---|
 | 7945 | +	perf_output_end(&handle);  | 
|---|
 | 7946 | +out:  | 
|---|
 | 7947 | +	cgroup_event->event_id.header.size = header_size;  | 
|---|
 | 7948 | +}  | 
|---|
 | 7949 | +  | 
|---|
 | 7950 | +static void perf_event_cgroup(struct cgroup *cgrp)  | 
|---|
 | 7951 | +{  | 
|---|
 | 7952 | +	struct perf_cgroup_event cgroup_event;  | 
|---|
 | 7953 | +	char path_enomem[16] = "//enomem";  | 
|---|
 | 7954 | +	char *pathname;  | 
|---|
 | 7955 | +	size_t size;  | 
|---|
 | 7956 | +  | 
|---|
 | 7957 | +	if (!atomic_read(&nr_cgroup_events))  | 
|---|
 | 7958 | +		return;  | 
|---|
 | 7959 | +  | 
|---|
 | 7960 | +	cgroup_event = (struct perf_cgroup_event){  | 
|---|
 | 7961 | +		.event_id  = {  | 
|---|
 | 7962 | +			.header = {  | 
|---|
 | 7963 | +				.type = PERF_RECORD_CGROUP,  | 
|---|
 | 7964 | +				.misc = 0,  | 
|---|
 | 7965 | +				.size = sizeof(cgroup_event.event_id),  | 
|---|
 | 7966 | +			},  | 
|---|
 | 7967 | +			.id = cgroup_id(cgrp),  | 
|---|
 | 7968 | +		},  | 
|---|
 | 7969 | +	};  | 
|---|
 | 7970 | +  | 
|---|
 | 7971 | +	pathname = kmalloc(PATH_MAX, GFP_KERNEL);  | 
|---|
 | 7972 | +	if (pathname == NULL) {  | 
|---|
 | 7973 | +		cgroup_event.path = path_enomem;  | 
|---|
 | 7974 | +	} else {  | 
|---|
 | 7975 | +		/* just to be sure to have enough space for alignment */  | 
|---|
 | 7976 | +		cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));  | 
|---|
 | 7977 | +		cgroup_event.path = pathname;  | 
|---|
 | 7978 | +	}  | 
|---|
 | 7979 | +  | 
|---|
 | 7980 | +	/*  | 
|---|
 | 7981 | +	 * Since our buffer works in 8 byte units we need to align our string  | 
|---|
 | 7982 | +	 * size to a multiple of 8. However, we must guarantee the tail end is  | 
|---|
 | 7983 | +	 * zero'd out to avoid leaking random bits to userspace.  | 
|---|
 | 7984 | +	 */  | 
|---|
 | 7985 | +	size = strlen(cgroup_event.path) + 1;  | 
|---|
 | 7986 | +	while (!IS_ALIGNED(size, sizeof(u64)))  | 
|---|
 | 7987 | +		cgroup_event.path[size++] = '\0';  | 
|---|
 | 7988 | +  | 
|---|
 | 7989 | +	cgroup_event.event_id.header.size += size;  | 
|---|
 | 7990 | +	cgroup_event.path_size = size;  | 
|---|
 | 7991 | +  | 
|---|
 | 7992 | +	perf_iterate_sb(perf_event_cgroup_output,  | 
|---|
 | 7993 | +			&cgroup_event,  | 
|---|
 | 7994 | +			NULL);  | 
|---|
 | 7995 | +  | 
|---|
 | 7996 | +	kfree(pathname);  | 
|---|
 | 7997 | +}  | 
|---|
 | 7998 | +  | 
|---|
 | 7999 | +#endif  | 
|---|
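The new PERF_RECORD_CGROUP side-band record complements the PERF_SAMPLE_CGROUP field added earlier in this diff: the side-band record announces a cgroup id together with its path once, and each sample then only carries the 64-bit id. Userspace opts in with two attribute bits; a minimal sketch (the software event is an arbitrary choice, and tools typically also synthesize records for cgroups that already exist):

```c
#include <linux/perf_event.h>
#include <string.h>

/* Cgroup id in every sample, plus id-to-path PERF_RECORD_CGROUP side-band. */
static void setup_cgroup_tracking(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_SOFTWARE;
	attr->config = PERF_COUNT_SW_CPU_CLOCK;
	attr->sample_period = 100000;
	attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CGROUP;
	attr->cgroup = 1;		/* emit PERF_RECORD_CGROUP for new cgroups */
}
```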
 | 8000 | +  | 
|---|
 | 8001 | +/*  | 
|---|
| 7248 | 8002 |   * mmap tracking | 
|---|
| 7249 | 8003 |   */ | 
|---|
| 7250 | 8004 |   | 
|---|
| .. | .. | 
|---|
| 7304 | 8058 |  	} | 
|---|
| 7305 | 8059 |   | 
|---|
| 7306 | 8060 |  	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 
|---|
| 7307 |  | -	ret = perf_output_begin(&handle, event,  | 
|---|
 | 8061 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
| 7308 | 8062 |  				mmap_event->event_id.header.size); | 
|---|
| 7309 | 8063 |  	if (ret) | 
|---|
| 7310 | 8064 |  		goto out; | 
|---|
| .. | .. | 
|---|
| 7364 | 8118 |  		flags |= MAP_EXECUTABLE; | 
|---|
| 7365 | 8119 |  	if (vma->vm_flags & VM_LOCKED) | 
|---|
| 7366 | 8120 |  		flags |= MAP_LOCKED; | 
|---|
| 7367 |  | -	if (vma->vm_flags & VM_HUGETLB)  | 
|---|
 | 8121 | +	if (is_vm_hugetlb_page(vma))  | 
|---|
| 7368 | 8122 |  		flags |= MAP_HUGETLB; | 
|---|
| 7369 | 8123 |   | 
|---|
| 7370 | 8124 |  	if (file) { | 
|---|
| .. | .. | 
|---|
| 7614 | 8368 |  	int ret; | 
|---|
| 7615 | 8369 |   | 
|---|
| 7616 | 8370 |  	perf_event_header__init_id(&rec.header, &sample, event); | 
|---|
| 7617 |  | -	ret = perf_output_begin(&handle, event, rec.header.size);  | 
|---|
 | 8371 | +	ret = perf_output_begin(&handle, &sample, event, rec.header.size);  | 
|---|
| 7618 | 8372 |   | 
|---|
| 7619 | 8373 |  	if (ret) | 
|---|
| 7620 | 8374 |  		return; | 
|---|
| .. | .. | 
|---|
| 7648 | 8402 |   | 
|---|
| 7649 | 8403 |  	perf_event_header__init_id(&lost_samples_event.header, &sample, event); | 
|---|
| 7650 | 8404 |   | 
|---|
| 7651 |  | -	ret = perf_output_begin(&handle, event,  | 
|---|
 | 8405 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
| 7652 | 8406 |  				lost_samples_event.header.size); | 
|---|
| 7653 | 8407 |  	if (ret) | 
|---|
| 7654 | 8408 |  		return; | 
|---|
| .. | .. | 
|---|
| 7703 | 8457 |   | 
|---|
| 7704 | 8458 |  	perf_event_header__init_id(&se->event_id.header, &sample, event); | 
|---|
| 7705 | 8459 |   | 
|---|
| 7706 |  | -	ret = perf_output_begin(&handle, event, se->event_id.header.size);  | 
|---|
 | 8460 | +	ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);  | 
|---|
| 7707 | 8461 |  	if (ret) | 
|---|
| 7708 | 8462 |  		return; | 
|---|
| 7709 | 8463 |   | 
|---|
| .. | .. | 
|---|
| 7778 | 8532 |   | 
|---|
| 7779 | 8533 |  	perf_event_header__init_id(&throttle_event.header, &sample, event); | 
|---|
| 7780 | 8534 |   | 
|---|
| 7781 |  | -	ret = perf_output_begin(&handle, event,  | 
|---|
 | 8535 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
| 7782 | 8536 |  				throttle_event.header.size); | 
|---|
| 7783 | 8537 |  	if (ret) | 
|---|
| 7784 | 8538 |  		return; | 
|---|
| .. | .. | 
|---|
| 7786 | 8540 |  	perf_output_put(&handle, throttle_event); | 
|---|
| 7787 | 8541 |  	perf_event__output_id_sample(event, &handle, &sample); | 
|---|
| 7788 | 8542 |  	perf_output_end(&handle); | 
|---|
 | 8543 | +}  | 
|---|
 | 8544 | +  | 
|---|
 | 8545 | +/*  | 
|---|
 | 8546 | + * ksymbol register/unregister tracking  | 
|---|
 | 8547 | + */  | 
|---|
 | 8548 | +  | 
|---|
 | 8549 | +struct perf_ksymbol_event {  | 
|---|
 | 8550 | +	const char	*name;  | 
|---|
 | 8551 | +	int		name_len;  | 
|---|
 | 8552 | +	struct {  | 
|---|
 | 8553 | +		struct perf_event_header        header;  | 
|---|
 | 8554 | +		u64				addr;  | 
|---|
 | 8555 | +		u32				len;  | 
|---|
 | 8556 | +		u16				ksym_type;  | 
|---|
 | 8557 | +		u16				flags;  | 
|---|
 | 8558 | +	} event_id;  | 
|---|
 | 8559 | +};  | 
|---|
 | 8560 | +  | 
|---|
 | 8561 | +static int perf_event_ksymbol_match(struct perf_event *event)  | 
|---|
 | 8562 | +{  | 
|---|
 | 8563 | +	return event->attr.ksymbol;  | 
|---|
 | 8564 | +}  | 
|---|
 | 8565 | +  | 
|---|
 | 8566 | +static void perf_event_ksymbol_output(struct perf_event *event, void *data)  | 
|---|
 | 8567 | +{  | 
|---|
 | 8568 | +	struct perf_ksymbol_event *ksymbol_event = data;  | 
|---|
 | 8569 | +	struct perf_output_handle handle;  | 
|---|
 | 8570 | +	struct perf_sample_data sample;  | 
|---|
 | 8571 | +	int ret;  | 
|---|
 | 8572 | +  | 
|---|
 | 8573 | +	if (!perf_event_ksymbol_match(event))  | 
|---|
 | 8574 | +		return;  | 
|---|
 | 8575 | +  | 
|---|
 | 8576 | +	perf_event_header__init_id(&ksymbol_event->event_id.header,  | 
|---|
 | 8577 | +				   &sample, event);  | 
|---|
 | 8578 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
 | 8579 | +				ksymbol_event->event_id.header.size);  | 
|---|
 | 8580 | +	if (ret)  | 
|---|
 | 8581 | +		return;  | 
|---|
 | 8582 | +  | 
|---|
 | 8583 | +	perf_output_put(&handle, ksymbol_event->event_id);  | 
|---|
 | 8584 | +	__output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);  | 
|---|
 | 8585 | +	perf_event__output_id_sample(event, &handle, &sample);  | 
|---|
 | 8586 | +  | 
|---|
 | 8587 | +	perf_output_end(&handle);  | 
|---|
 | 8588 | +}  | 
|---|
 | 8589 | +  | 
|---|
 | 8590 | +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,  | 
|---|
 | 8591 | +			const char *sym)  | 
|---|
 | 8592 | +{  | 
|---|
 | 8593 | +	struct perf_ksymbol_event ksymbol_event;  | 
|---|
 | 8594 | +	char name[KSYM_NAME_LEN];  | 
|---|
 | 8595 | +	u16 flags = 0;  | 
|---|
 | 8596 | +	int name_len;  | 
|---|
 | 8597 | +  | 
|---|
 | 8598 | +	if (!atomic_read(&nr_ksymbol_events))  | 
|---|
 | 8599 | +		return;  | 
|---|
 | 8600 | +  | 
|---|
 | 8601 | +	if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||  | 
|---|
 | 8602 | +	    ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)  | 
|---|
 | 8603 | +		goto err;  | 
|---|
 | 8604 | +  | 
|---|
 | 8605 | +	strlcpy(name, sym, KSYM_NAME_LEN);  | 
|---|
 | 8606 | +	name_len = strlen(name) + 1;  | 
|---|
 | 8607 | +	while (!IS_ALIGNED(name_len, sizeof(u64)))  | 
|---|
 | 8608 | +		name[name_len++] = '\0';  | 
|---|
 | 8609 | +	BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));  | 
|---|
 | 8610 | +  | 
|---|
 | 8611 | +	if (unregister)  | 
|---|
 | 8612 | +		flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;  | 
|---|
 | 8613 | +  | 
|---|
 | 8614 | +	ksymbol_event = (struct perf_ksymbol_event){  | 
|---|
 | 8615 | +		.name = name,  | 
|---|
 | 8616 | +		.name_len = name_len,  | 
|---|
 | 8617 | +		.event_id = {  | 
|---|
 | 8618 | +			.header = {  | 
|---|
 | 8619 | +				.type = PERF_RECORD_KSYMBOL,  | 
|---|
 | 8620 | +				.size = sizeof(ksymbol_event.event_id) +  | 
|---|
 | 8621 | +					name_len,  | 
|---|
 | 8622 | +			},  | 
|---|
 | 8623 | +			.addr = addr,  | 
|---|
 | 8624 | +			.len = len,  | 
|---|
 | 8625 | +			.ksym_type = ksym_type,  | 
|---|
 | 8626 | +			.flags = flags,  | 
|---|
 | 8627 | +		},  | 
|---|
 | 8628 | +	};  | 
|---|
 | 8629 | +  | 
|---|
 | 8630 | +	perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);  | 
|---|
 | 8631 | +	return;  | 
|---|
 | 8632 | +err:  | 
|---|
 | 8633 | +	WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);  | 
|---|
 | 8634 | +}  | 
|---|
 | 8635 | +  | 
|---|
 | 8636 | +/*  | 
|---|
 | 8637 | + * bpf program load/unload tracking  | 
|---|
 | 8638 | + */  | 
|---|
 | 8639 | +  | 
|---|
 | 8640 | +struct perf_bpf_event {  | 
|---|
 | 8641 | +	struct bpf_prog	*prog;  | 
|---|
 | 8642 | +	struct {  | 
|---|
 | 8643 | +		struct perf_event_header        header;  | 
|---|
 | 8644 | +		u16				type;  | 
|---|
 | 8645 | +		u16				flags;  | 
|---|
 | 8646 | +		u32				id;  | 
|---|
 | 8647 | +		u8				tag[BPF_TAG_SIZE];  | 
|---|
 | 8648 | +	} event_id;  | 
|---|
 | 8649 | +};  | 
|---|
 | 8650 | +  | 
|---|
 | 8651 | +static int perf_event_bpf_match(struct perf_event *event)  | 
|---|
 | 8652 | +{  | 
|---|
 | 8653 | +	return event->attr.bpf_event;  | 
|---|
 | 8654 | +}  | 
|---|
 | 8655 | +  | 
|---|
 | 8656 | +static void perf_event_bpf_output(struct perf_event *event, void *data)  | 
|---|
 | 8657 | +{  | 
|---|
 | 8658 | +	struct perf_bpf_event *bpf_event = data;  | 
|---|
 | 8659 | +	struct perf_output_handle handle;  | 
|---|
 | 8660 | +	struct perf_sample_data sample;  | 
|---|
 | 8661 | +	int ret;  | 
|---|
 | 8662 | +  | 
|---|
 | 8663 | +	if (!perf_event_bpf_match(event))  | 
|---|
 | 8664 | +		return;  | 
|---|
 | 8665 | +  | 
|---|
 | 8666 | +	perf_event_header__init_id(&bpf_event->event_id.header,  | 
|---|
 | 8667 | +				   &sample, event);  | 
|---|
 | 8668 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
 | 8669 | +				bpf_event->event_id.header.size);  | 
|---|
 | 8670 | +	if (ret)  | 
|---|
 | 8671 | +		return;  | 
|---|
 | 8672 | +  | 
|---|
 | 8673 | +	perf_output_put(&handle, bpf_event->event_id);  | 
|---|
 | 8674 | +	perf_event__output_id_sample(event, &handle, &sample);  | 
|---|
 | 8675 | +  | 
|---|
 | 8676 | +	perf_output_end(&handle);  | 
|---|
 | 8677 | +}  | 
|---|
 | 8678 | +  | 
|---|
 | 8679 | +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,  | 
|---|
 | 8680 | +					 enum perf_bpf_event_type type)  | 
|---|
 | 8681 | +{  | 
|---|
 | 8682 | +	bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;  | 
|---|
 | 8683 | +	int i;  | 
|---|
 | 8684 | +  | 
|---|
 | 8685 | +	if (prog->aux->func_cnt == 0) {  | 
|---|
 | 8686 | +		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,  | 
|---|
 | 8687 | +				   (u64)(unsigned long)prog->bpf_func,  | 
|---|
 | 8688 | +				   prog->jited_len, unregister,  | 
|---|
 | 8689 | +				   prog->aux->ksym.name);  | 
|---|
 | 8690 | +	} else {  | 
|---|
 | 8691 | +		for (i = 0; i < prog->aux->func_cnt; i++) {  | 
|---|
 | 8692 | +			struct bpf_prog *subprog = prog->aux->func[i];  | 
|---|
 | 8693 | +  | 
|---|
 | 8694 | +			perf_event_ksymbol(  | 
|---|
 | 8695 | +				PERF_RECORD_KSYMBOL_TYPE_BPF,  | 
|---|
 | 8696 | +				(u64)(unsigned long)subprog->bpf_func,  | 
|---|
 | 8697 | +				subprog->jited_len, unregister,  | 
|---|
 | 8698 | +				subprog->aux->ksym.name);  | 
|---|
 | 8699 | +		}  | 
|---|
 | 8700 | +	}  | 
|---|
 | 8701 | +}  | 
|---|
 | 8702 | +  | 
|---|
 | 8703 | +void perf_event_bpf_event(struct bpf_prog *prog,  | 
|---|
 | 8704 | +			  enum perf_bpf_event_type type,  | 
|---|
 | 8705 | +			  u16 flags)  | 
|---|
 | 8706 | +{  | 
|---|
 | 8707 | +	struct perf_bpf_event bpf_event;  | 
|---|
 | 8708 | +  | 
|---|
 | 8709 | +	if (type <= PERF_BPF_EVENT_UNKNOWN ||  | 
|---|
 | 8710 | +	    type >= PERF_BPF_EVENT_MAX)  | 
|---|
 | 8711 | +		return;  | 
|---|
 | 8712 | +  | 
|---|
 | 8713 | +	switch (type) {  | 
|---|
 | 8714 | +	case PERF_BPF_EVENT_PROG_LOAD:  | 
|---|
 | 8715 | +	case PERF_BPF_EVENT_PROG_UNLOAD:  | 
|---|
 | 8716 | +		if (atomic_read(&nr_ksymbol_events))  | 
|---|
 | 8717 | +			perf_event_bpf_emit_ksymbols(prog, type);  | 
|---|
 | 8718 | +		break;  | 
|---|
 | 8719 | +	default:  | 
|---|
 | 8720 | +		break;  | 
|---|
 | 8721 | +	}  | 
|---|
 | 8722 | +  | 
|---|
 | 8723 | +	if (!atomic_read(&nr_bpf_events))  | 
|---|
 | 8724 | +		return;  | 
|---|
 | 8725 | +  | 
|---|
 | 8726 | +	bpf_event = (struct perf_bpf_event){  | 
|---|
 | 8727 | +		.prog = prog,  | 
|---|
 | 8728 | +		.event_id = {  | 
|---|
 | 8729 | +			.header = {  | 
|---|
 | 8730 | +				.type = PERF_RECORD_BPF_EVENT,  | 
|---|
 | 8731 | +				.size = sizeof(bpf_event.event_id),  | 
|---|
 | 8732 | +			},  | 
|---|
 | 8733 | +			.type = type,  | 
|---|
 | 8734 | +			.flags = flags,  | 
|---|
 | 8735 | +			.id = prog->aux->id,  | 
|---|
 | 8736 | +		},  | 
|---|
 | 8737 | +	};  | 
|---|
 | 8738 | +  | 
|---|
 | 8739 | +	BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));  | 
|---|
 | 8740 | +  | 
|---|
 | 8741 | +	memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);  | 
|---|
 | 8742 | +	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);  | 
|---|
 | 8743 | +}  | 
|---|
 | 8744 | +  | 
|---|
 | 8745 | +struct perf_text_poke_event {  | 
|---|
 | 8746 | +	const void		*old_bytes;  | 
|---|
 | 8747 | +	const void		*new_bytes;  | 
|---|
 | 8748 | +	size_t			pad;  | 
|---|
 | 8749 | +	u16			old_len;  | 
|---|
 | 8750 | +	u16			new_len;  | 
|---|
 | 8751 | +  | 
|---|
 | 8752 | +	struct {  | 
|---|
 | 8753 | +		struct perf_event_header	header;  | 
|---|
 | 8754 | +  | 
|---|
 | 8755 | +		u64				addr;  | 
|---|
 | 8756 | +	} event_id;  | 
|---|
 | 8757 | +};  | 
|---|
 | 8758 | +  | 
|---|
 | 8759 | +static int perf_event_text_poke_match(struct perf_event *event)  | 
|---|
 | 8760 | +{  | 
|---|
 | 8761 | +	return event->attr.text_poke;  | 
|---|
 | 8762 | +}  | 
|---|
 | 8763 | +  | 
|---|
 | 8764 | +static void perf_event_text_poke_output(struct perf_event *event, void *data)  | 
|---|
 | 8765 | +{  | 
|---|
 | 8766 | +	struct perf_text_poke_event *text_poke_event = data;  | 
|---|
 | 8767 | +	struct perf_output_handle handle;  | 
|---|
 | 8768 | +	struct perf_sample_data sample;  | 
|---|
 | 8769 | +	u64 padding = 0;  | 
|---|
 | 8770 | +	int ret;  | 
|---|
 | 8771 | +  | 
|---|
 | 8772 | +	if (!perf_event_text_poke_match(event))  | 
|---|
 | 8773 | +		return;  | 
|---|
 | 8774 | +  | 
|---|
 | 8775 | +	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);  | 
|---|
 | 8776 | +  | 
|---|
 | 8777 | +	ret = perf_output_begin(&handle, &sample, event,  | 
|---|
 | 8778 | +				text_poke_event->event_id.header.size);  | 
|---|
 | 8779 | +	if (ret)  | 
|---|
 | 8780 | +		return;  | 
|---|
 | 8781 | +  | 
|---|
 | 8782 | +	perf_output_put(&handle, text_poke_event->event_id);  | 
|---|
 | 8783 | +	perf_output_put(&handle, text_poke_event->old_len);  | 
|---|
 | 8784 | +	perf_output_put(&handle, text_poke_event->new_len);  | 
|---|
 | 8785 | +  | 
|---|
 | 8786 | +	__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);  | 
|---|
 | 8787 | +	__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);  | 
|---|
 | 8788 | +  | 
|---|
 | 8789 | +	if (text_poke_event->pad)  | 
|---|
 | 8790 | +		__output_copy(&handle, &padding, text_poke_event->pad);  | 
|---|
 | 8791 | +  | 
|---|
 | 8792 | +	perf_event__output_id_sample(event, &handle, &sample);  | 
|---|
 | 8793 | +  | 
|---|
 | 8794 | +	perf_output_end(&handle);  | 
|---|
 | 8795 | +}  | 
|---|
 | 8796 | +  | 
|---|
 | 8797 | +void perf_event_text_poke(const void *addr, const void *old_bytes,  | 
|---|
 | 8798 | +			  size_t old_len, const void *new_bytes, size_t new_len)  | 
|---|
 | 8799 | +{  | 
|---|
 | 8800 | +	struct perf_text_poke_event text_poke_event;  | 
|---|
 | 8801 | +	size_t tot, pad;  | 
|---|
 | 8802 | +  | 
|---|
 | 8803 | +	if (!atomic_read(&nr_text_poke_events))  | 
|---|
 | 8804 | +		return;  | 
|---|
 | 8805 | +  | 
|---|
 | 8806 | +	tot  = sizeof(text_poke_event.old_len) + old_len;  | 
|---|
 | 8807 | +	tot += sizeof(text_poke_event.new_len) + new_len;  | 
|---|
 | 8808 | +	pad  = ALIGN(tot, sizeof(u64)) - tot;  | 
|---|
 | 8809 | +  | 
|---|
 | 8810 | +	text_poke_event = (struct perf_text_poke_event){  | 
|---|
 | 8811 | +		.old_bytes    = old_bytes,  | 
|---|
 | 8812 | +		.new_bytes    = new_bytes,  | 
|---|
 | 8813 | +		.pad          = pad,  | 
|---|
 | 8814 | +		.old_len      = old_len,  | 
|---|
 | 8815 | +		.new_len      = new_len,  | 
|---|
 | 8816 | +		.event_id  = {  | 
|---|
 | 8817 | +			.header = {  | 
|---|
 | 8818 | +				.type = PERF_RECORD_TEXT_POKE,  | 
|---|
 | 8819 | +				.misc = PERF_RECORD_MISC_KERNEL,  | 
|---|
 | 8820 | +				.size = sizeof(text_poke_event.event_id) + tot + pad,  | 
|---|
 | 8821 | +			},  | 
|---|
 | 8822 | +			.addr = (unsigned long)addr,  | 
|---|
 | 8823 | +		},  | 
|---|
 | 8824 | +	};  | 
|---|
 | 8825 | +  | 
|---|
 | 8826 | +	perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);  | 
|---|
| 7789 | 8827 |  } | 
|---|
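Putting perf_event_text_poke() and its output callback together, the record body after the perf_event_header is: addr (u64), old_len and new_len (u16 each), the old and new instruction bytes, then zero padding so the variable part (4 + old_len + new_len bytes) ends on an 8-byte boundary, with any sample_id fields following. A hedged consumer-side sketch of that layout is shown below; `parse_text_poke` is a hypothetical helper, not a tools/perf API, and it assumes the caller passes a pointer just past the perf_event_header.

```c
/*
 * Sketch of reading a PERF_RECORD_TEXT_POKE body, matching what the
 * output path above writes: addr, old_len, new_len, old bytes, new
 * bytes, zero padding. 'payload' points just past the record header.
 */
#include <stdint.h>
#include <string.h>

static void parse_text_poke(const uint8_t *payload)
{
	uint64_t addr;
	uint16_t old_len, new_len;

	memcpy(&addr,    payload,      sizeof(addr));
	memcpy(&old_len, payload + 8,  sizeof(old_len));
	memcpy(&new_len, payload + 10, sizeof(new_len));

	const uint8_t *old_bytes = payload + 12;
	const uint8_t *new_bytes = old_bytes + old_len;

	/* 4 + old_len + new_len is zero-padded up to a multiple of 8 */
	(void)addr; (void)new_bytes;	/* a real consumer would decode these */
}
```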
| 7790 | 8828 |   | 
|---|
| 7791 | 8829 |  void perf_event_itrace_started(struct perf_event *event) | 
|---|
| .. | .. | 
|---|
| 7818 | 8856 |  	rec.tid	= perf_event_tid(event, current); | 
|---|
| 7819 | 8857 |   | 
|---|
| 7820 | 8858 |  	perf_event_header__init_id(&rec.header, &sample, event); | 
|---|
| 7821 |  | -	ret = perf_output_begin(&handle, event, rec.header.size);  | 
|---|
 | 8859 | +	ret = perf_output_begin(&handle, &sample, event, rec.header.size);  | 
|---|
| 7822 | 8860 |   | 
|---|
| 7823 | 8861 |  	if (ret) | 
|---|
| 7824 | 8862 |  		return; | 
|---|
| .. | .. | 
|---|
| 8386 | 9424 |  	if (event->hw.state & PERF_HES_STOPPED) | 
|---|
| 8387 | 9425 |  		return 0; | 
|---|
| 8388 | 9426 |  	/* | 
|---|
| 8389 |  | -	 * All tracepoints are from kernel-space.  | 
|---|
 | 9427 | +	 * If exclude_kernel, only trace user-space tracepoints (uprobes)  | 
|---|
| 8390 | 9428 |  	 */ | 
|---|
| 8391 |  | -	if (event->attr.exclude_kernel)  | 
|---|
 | 9429 | +	if (event->attr.exclude_kernel && !user_mode(regs))  | 
|---|
| 8392 | 9430 |  		return 0; | 
|---|
| 8393 | 9431 |   | 
|---|
| 8394 | 9432 |  	if (!perf_tp_filter_match(event, data)) | 
|---|
| .. | .. | 
|---|
| 8514 | 9552 |   * | 
|---|
| 8515 | 9553 |   * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe | 
|---|
| 8516 | 9554 |   *                               if not set, create kprobe/uprobe | 
|---|
 | 9555 | + *  | 
|---|
 | 9556 | + * The following values specify a reference counter (or semaphore in the  | 
|---|
 | 9557 | + * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically  | 
|---|
 | 9558 | + * Defined Tracepoints (USDT). Currently, the upper 32 bits of config carry the offset.  | 
|---|
 | 9559 | + *  | 
|---|
 | 9560 | + * PERF_UPROBE_REF_CTR_OFFSET_BITS	# of bits in config used for the offset  | 
|---|
 | 9561 | + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT	# of bits to shift left  | 
|---|
| 8517 | 9562 |   */ | 
|---|
| 8518 | 9563 |  enum perf_probe_config { | 
|---|
| 8519 | 9564 |  	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */ | 
|---|
 | 9565 | +	PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,  | 
|---|
 | 9566 | +	PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,  | 
|---|
| 8520 | 9567 |  }; | 
|---|
| 8521 | 9568 |   | 
|---|
| 8522 | 9569 |  PMU_FORMAT_ATTR(retprobe, "config:0"); | 
|---|
 | 9570 | +#endif  | 
|---|
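The PERF_UPROBE_REF_CTR_OFFSET_* comment and enum above, together with the "config:0" format string (and "config:32-63", added below for the uprobe PMU), define how user space packs attr.config for these dynamic PMUs: bit 0 selects a return probe, and for uprobes the upper 32 bits carry the USDT reference-counter offset. A minimal sketch of that packing follows; `uprobe_config` is a made-up helper name, and opening the event additionally needs the dynamic PMU type plus uprobe_path/probe_offset, which are not shown.

```c
/*
 * Illustrative only: build the config word the perf_uprobe PMU expects.
 * Bit 0   -> PERF_PROBE_CONFIG_IS_RETPROBE ("config:0")
 * 32-63   -> reference counter offset      ("config:32-63")
 */
#include <stdint.h>

static uint64_t uprobe_config(int is_retprobe, uint32_t ref_ctr_offset)
{
	uint64_t config = 0;

	if (is_retprobe)
		config |= 1ULL << 0;
	config |= (uint64_t)ref_ctr_offset << 32;

	return config;
}
```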
| 8523 | 9571 |   | 
|---|
| 8524 |  | -static struct attribute *probe_attrs[] = {  | 
|---|
 | 9572 | +#ifdef CONFIG_KPROBE_EVENTS  | 
|---|
 | 9573 | +static struct attribute *kprobe_attrs[] = {  | 
|---|
| 8525 | 9574 |  	&format_attr_retprobe.attr, | 
|---|
| 8526 | 9575 |  	NULL, | 
|---|
| 8527 | 9576 |  }; | 
|---|
| 8528 | 9577 |   | 
|---|
| 8529 |  | -static struct attribute_group probe_format_group = {  | 
|---|
 | 9578 | +static struct attribute_group kprobe_format_group = {  | 
|---|
| 8530 | 9579 |  	.name = "format", | 
|---|
| 8531 |  | -	.attrs = probe_attrs,  | 
|---|
 | 9580 | +	.attrs = kprobe_attrs,  | 
|---|
| 8532 | 9581 |  }; | 
|---|
| 8533 | 9582 |   | 
|---|
| 8534 |  | -static const struct attribute_group *probe_attr_groups[] = {  | 
|---|
| 8535 |  | -	&probe_format_group,  | 
|---|
 | 9583 | +static const struct attribute_group *kprobe_attr_groups[] = {  | 
|---|
 | 9584 | +	&kprobe_format_group,  | 
|---|
| 8536 | 9585 |  	NULL, | 
|---|
| 8537 | 9586 |  }; | 
|---|
| 8538 |  | -#endif  | 
|---|
| 8539 | 9587 |   | 
|---|
| 8540 |  | -#ifdef CONFIG_KPROBE_EVENTS  | 
|---|
| 8541 | 9588 |  static int perf_kprobe_event_init(struct perf_event *event); | 
|---|
| 8542 | 9589 |  static struct pmu perf_kprobe = { | 
|---|
| 8543 | 9590 |  	.task_ctx_nr	= perf_sw_context, | 
|---|
| .. | .. | 
|---|
| 8547 | 9594 |  	.start		= perf_swevent_start, | 
|---|
| 8548 | 9595 |  	.stop		= perf_swevent_stop, | 
|---|
| 8549 | 9596 |  	.read		= perf_swevent_read, | 
|---|
| 8550 |  | -	.attr_groups	= probe_attr_groups,  | 
|---|
 | 9597 | +	.attr_groups	= kprobe_attr_groups,  | 
|---|
| 8551 | 9598 |  }; | 
|---|
| 8552 | 9599 |   | 
|---|
| 8553 | 9600 |  static int perf_kprobe_event_init(struct perf_event *event) | 
|---|
| .. | .. | 
|---|
| 8558 | 9605 |  	if (event->attr.type != perf_kprobe.type) | 
|---|
| 8559 | 9606 |  		return -ENOENT; | 
|---|
| 8560 | 9607 |   | 
|---|
| 8561 |  | -	if (!capable(CAP_SYS_ADMIN))  | 
|---|
 | 9608 | +	if (!perfmon_capable())  | 
|---|
| 8562 | 9609 |  		return -EACCES; | 
|---|
| 8563 | 9610 |   | 
|---|
| 8564 | 9611 |  	/* | 
|---|
| .. | .. | 
|---|
| 8579 | 9626 |  #endif /* CONFIG_KPROBE_EVENTS */ | 
|---|
| 8580 | 9627 |   | 
|---|
| 8581 | 9628 |  #ifdef CONFIG_UPROBE_EVENTS | 
|---|
 | 9629 | +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");  | 
|---|
 | 9630 | +  | 
|---|
 | 9631 | +static struct attribute *uprobe_attrs[] = {  | 
|---|
 | 9632 | +	&format_attr_retprobe.attr,  | 
|---|
 | 9633 | +	&format_attr_ref_ctr_offset.attr,  | 
|---|
 | 9634 | +	NULL,  | 
|---|
 | 9635 | +};  | 
|---|
 | 9636 | +  | 
|---|
 | 9637 | +static struct attribute_group uprobe_format_group = {  | 
|---|
 | 9638 | +	.name = "format",  | 
|---|
 | 9639 | +	.attrs = uprobe_attrs,  | 
|---|
 | 9640 | +};  | 
|---|
 | 9641 | +  | 
|---|
 | 9642 | +static const struct attribute_group *uprobe_attr_groups[] = {  | 
|---|
 | 9643 | +	&uprobe_format_group,  | 
|---|
 | 9644 | +	NULL,  | 
|---|
 | 9645 | +};  | 
|---|
 | 9646 | +  | 
|---|
| 8582 | 9647 |  static int perf_uprobe_event_init(struct perf_event *event); | 
|---|
| 8583 | 9648 |  static struct pmu perf_uprobe = { | 
|---|
| 8584 | 9649 |  	.task_ctx_nr	= perf_sw_context, | 
|---|
| .. | .. | 
|---|
| 8588 | 9653 |  	.start		= perf_swevent_start, | 
|---|
| 8589 | 9654 |  	.stop		= perf_swevent_stop, | 
|---|
| 8590 | 9655 |  	.read		= perf_swevent_read, | 
|---|
| 8591 |  | -	.attr_groups	= probe_attr_groups,  | 
|---|
 | 9656 | +	.attr_groups	= uprobe_attr_groups,  | 
|---|
| 8592 | 9657 |  }; | 
|---|
| 8593 | 9658 |   | 
|---|
| 8594 | 9659 |  static int perf_uprobe_event_init(struct perf_event *event) | 
|---|
| 8595 | 9660 |  { | 
|---|
| 8596 | 9661 |  	int err; | 
|---|
 | 9662 | +	unsigned long ref_ctr_offset;  | 
|---|
| 8597 | 9663 |  	bool is_retprobe; | 
|---|
| 8598 | 9664 |   | 
|---|
| 8599 | 9665 |  	if (event->attr.type != perf_uprobe.type) | 
|---|
| 8600 | 9666 |  		return -ENOENT; | 
|---|
| 8601 | 9667 |   | 
|---|
| 8602 |  | -	if (!capable(CAP_SYS_ADMIN))  | 
|---|
 | 9668 | +	if (!perfmon_capable())  | 
|---|
| 8603 | 9669 |  		return -EACCES; | 
|---|
| 8604 | 9670 |   | 
|---|
| 8605 | 9671 |  	/* | 
|---|
| .. | .. | 
|---|
| 8609 | 9675 |  		return -EOPNOTSUPP; | 
|---|
| 8610 | 9676 |   | 
|---|
| 8611 | 9677 |  	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; | 
|---|
| 8612 |  | -	err = perf_uprobe_init(event, is_retprobe);  | 
|---|
 | 9678 | +	ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;  | 
|---|
 | 9679 | +	err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);  | 
|---|
| 8613 | 9680 |  	if (err) | 
|---|
| 8614 | 9681 |  		return err; | 
|---|
| 8615 | 9682 |   | 
|---|
| .. | .. | 
|---|
| 8647 | 9714 |  	int ret = 0; | 
|---|
| 8648 | 9715 |   | 
|---|
| 8649 | 9716 |  	ctx.regs = perf_arch_bpf_user_pt_regs(regs); | 
|---|
| 8650 |  | -	preempt_disable();  | 
|---|
| 8651 | 9717 |  	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) | 
|---|
| 8652 | 9718 |  		goto out; | 
|---|
| 8653 | 9719 |  	rcu_read_lock(); | 
|---|
| .. | .. | 
|---|
| 8655 | 9721 |  	rcu_read_unlock(); | 
|---|
| 8656 | 9722 |  out: | 
|---|
| 8657 | 9723 |  	__this_cpu_dec(bpf_prog_active); | 
|---|
| 8658 |  | -	preempt_enable();  | 
|---|
| 8659 | 9724 |  	if (!ret) | 
|---|
| 8660 | 9725 |  		return; | 
|---|
| 8661 | 9726 |   | 
|---|
| .. | .. | 
|---|
| 8676 | 9741 |  	prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); | 
|---|
| 8677 | 9742 |  	if (IS_ERR(prog)) | 
|---|
| 8678 | 9743 |  		return PTR_ERR(prog); | 
|---|
 | 9744 | +  | 
|---|
 | 9745 | +	if (event->attr.precise_ip &&  | 
|---|
 | 9746 | +	    prog->call_get_stack &&  | 
|---|
 | 9747 | +	    (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||  | 
|---|
 | 9748 | +	     event->attr.exclude_callchain_kernel ||  | 
|---|
 | 9749 | +	     event->attr.exclude_callchain_user)) {  | 
|---|
 | 9750 | +		/*  | 
|---|
 | 9751 | +		 * On perf_event with precise_ip, calling bpf_get_stack()  | 
|---|
 | 9752 | +		 * may trigger unwinder warnings and occasional crashes.  | 
|---|
 | 9753 | +		 * bpf_get_[stack|stackid] works around this issue by using  | 
|---|
 | 9754 | +		 * callchain attached to perf_sample_data. If the  | 
|---|
 | 9755 | +		 * perf_event does not full (kernel and user) callchain  | 
|---|
 | 9756 | +		 * attached to perf_sample_data, do not allow attaching BPF  | 
|---|
 | 9757 | +		 * program that calls bpf_get_[stack|stackid].  | 
|---|
 | 9758 | +		 */  | 
|---|
 | 9759 | +		bpf_prog_put(prog);  | 
|---|
 | 9760 | +		return -EPROTO;  | 
|---|
 | 9761 | +	}  | 
|---|
| 8679 | 9762 |   | 
|---|
| 8680 | 9763 |  	event->prog = prog; | 
|---|
| 8681 | 9764 |  	event->orig_overflow_handler = READ_ONCE(event->overflow_handler); | 
|---|
| .. | .. | 
|---|
| 8875 | 9958 |  /* | 
|---|
| 8876 | 9959 |   * Scan through mm's vmas and see if one of them matches the | 
|---|
| 8877 | 9960 |   * @filter; if so, adjust filter's address range. | 
|---|
| 8878 |  | - * Called with mm::mmap_sem down for reading.  | 
|---|
 | 9961 | + * Called with mm::mmap_lock down for reading.  | 
|---|
| 8879 | 9962 |   */ | 
|---|
| 8880 | 9963 |  static void perf_addr_filter_apply(struct perf_addr_filter *filter, | 
|---|
| 8881 | 9964 |  				   struct mm_struct *mm, | 
|---|
| .. | .. | 
|---|
| 8917 | 10000 |  		if (!mm) | 
|---|
| 8918 | 10001 |  			goto restart; | 
|---|
| 8919 | 10002 |   | 
|---|
| 8920 |  | -		down_read(&mm->mmap_sem);  | 
|---|
 | 10003 | +		mmap_read_lock(mm);  | 
|---|
| 8921 | 10004 |  	} | 
|---|
| 8922 | 10005 |   | 
|---|
| 8923 | 10006 |  	raw_spin_lock_irqsave(&ifh->lock, flags); | 
|---|
| .. | .. | 
|---|
| 8943 | 10026 |  	raw_spin_unlock_irqrestore(&ifh->lock, flags); | 
|---|
| 8944 | 10027 |   | 
|---|
| 8945 | 10028 |  	if (ifh->nr_file_filters) { | 
|---|
| 8946 |  | -		up_read(&mm->mmap_sem);  | 
|---|
 | 10029 | +		mmap_read_unlock(mm);  | 
|---|
| 8947 | 10030 |   | 
|---|
| 8948 | 10031 |  		mmput(mm); | 
|---|
| 8949 | 10032 |  	} | 
|---|
| .. | .. | 
|---|
| 9050 | 10133 |  		case IF_SRC_KERNELADDR: | 
|---|
| 9051 | 10134 |  		case IF_SRC_KERNEL: | 
|---|
| 9052 | 10135 |  			kernel = 1; | 
|---|
 | 10136 | +			fallthrough;  | 
|---|
| 9053 | 10137 |   | 
|---|
| 9054 | 10138 |  		case IF_SRC_FILEADDR: | 
|---|
| 9055 | 10139 |  		case IF_SRC_FILE: | 
|---|
| .. | .. | 
|---|
| 9136 | 10220 |  			} | 
|---|
| 9137 | 10221 |   | 
|---|
| 9138 | 10222 |  			/* ready to consume more filters */ | 
|---|
 | 10223 | +			kfree(filename);  | 
|---|
 | 10224 | +			filename = NULL;  | 
|---|
| 9139 | 10225 |  			state = IF_STATE_ACTION; | 
|---|
| 9140 | 10226 |  			filter = NULL; | 
|---|
 | 10227 | +			kernel = 0;  | 
|---|
| 9141 | 10228 |  		} | 
|---|
| 9142 | 10229 |  	} | 
|---|
| 9143 | 10230 |   | 
|---|
| .. | .. | 
|---|
| 9285 | 10372 |  		period = max_t(u64, 10000, hwc->sample_period); | 
|---|
| 9286 | 10373 |  	} | 
|---|
| 9287 | 10374 |  	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), | 
|---|
| 9288 |  | -		      HRTIMER_MODE_REL_PINNED);  | 
|---|
 | 10375 | +		      HRTIMER_MODE_REL_PINNED_HARD);  | 
|---|
| 9289 | 10376 |  } | 
|---|
| 9290 | 10377 |   | 
|---|
| 9291 | 10378 |  static void perf_swevent_cancel_hrtimer(struct perf_event *event) | 
|---|
| .. | .. | 
|---|
| 9307 | 10394 |  	if (!is_sampling_event(event)) | 
|---|
| 9308 | 10395 |  		return; | 
|---|
| 9309 | 10396 |   | 
|---|
| 9310 |  | -	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);  | 
|---|
 | 10397 | +	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);  | 
|---|
| 9311 | 10398 |  	hwc->hrtimer.function = perf_swevent_hrtimer; | 
|---|
| 9312 | 10399 |   | 
|---|
| 9313 | 10400 |  	/* | 
|---|
| .. | .. | 
|---|
| 9696 | 10783 |  	if (ret) | 
|---|
| 9697 | 10784 |  		goto del_dev; | 
|---|
| 9698 | 10785 |   | 
|---|
 | 10786 | +	if (pmu->attr_update)  | 
|---|
 | 10787 | +		ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);  | 
|---|
 | 10788 | +  | 
|---|
 | 10789 | +	if (ret)  | 
|---|
 | 10790 | +		goto del_dev;  | 
|---|
 | 10791 | +  | 
|---|
| 9699 | 10792 |  out: | 
|---|
| 9700 | 10793 |  	return ret; | 
|---|
| 9701 | 10794 |   | 
|---|
| .. | .. | 
|---|
| 9712 | 10805 |   | 
|---|
| 9713 | 10806 |  int perf_pmu_register(struct pmu *pmu, const char *name, int type) | 
|---|
| 9714 | 10807 |  { | 
|---|
| 9715 |  | -	int cpu, ret;  | 
|---|
 | 10808 | +	int cpu, ret, max = PERF_TYPE_MAX;  | 
|---|
| 9716 | 10809 |   | 
|---|
| 9717 | 10810 |  	mutex_lock(&pmus_lock); | 
|---|
| 9718 | 10811 |  	ret = -ENOMEM; | 
|---|
| .. | .. | 
|---|
| 9725 | 10818 |  		goto skip_type; | 
|---|
| 9726 | 10819 |  	pmu->name = name; | 
|---|
| 9727 | 10820 |   | 
|---|
| 9728 |  | -	if (type < 0) {  | 
|---|
| 9729 |  | -		type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);  | 
|---|
| 9730 |  | -		if (type < 0) {  | 
|---|
| 9731 |  | -			ret = type;  | 
|---|
 | 10821 | +	if (type != PERF_TYPE_SOFTWARE) {  | 
|---|
 | 10822 | +		if (type >= 0)  | 
|---|
 | 10823 | +			max = type;  | 
|---|
 | 10824 | +  | 
|---|
 | 10825 | +		ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);  | 
|---|
 | 10826 | +		if (ret < 0)  | 
|---|
| 9732 | 10827 |  			goto free_pdc; | 
|---|
| 9733 |  | -		}  | 
|---|
 | 10828 | +  | 
|---|
 | 10829 | +		WARN_ON(type >= 0 && ret != type);  | 
|---|
 | 10830 | +  | 
|---|
 | 10831 | +		type = ret;  | 
|---|
| 9734 | 10832 |  	} | 
|---|
| 9735 | 10833 |  	pmu->type = type; | 
|---|
| 9736 | 10834 |   | 
|---|
| .. | .. | 
|---|
| 9776 | 10874 |  		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); | 
|---|
| 9777 | 10875 |   | 
|---|
| 9778 | 10876 |  		__perf_mux_hrtimer_init(cpuctx, cpu); | 
|---|
 | 10877 | +  | 
|---|
 | 10878 | +		cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);  | 
|---|
 | 10879 | +		cpuctx->heap = cpuctx->heap_default;  | 
|---|
| 9779 | 10880 |  	} | 
|---|
| 9780 | 10881 |   | 
|---|
| 9781 | 10882 |  got_cpu_context: | 
|---|
| .. | .. | 
|---|
| 9807 | 10908 |  	if (!pmu->event_idx) | 
|---|
| 9808 | 10909 |  		pmu->event_idx = perf_event_idx_default; | 
|---|
| 9809 | 10910 |   | 
|---|
| 9810 |  | -	list_add_rcu(&pmu->entry, &pmus);  | 
|---|
 | 10911 | +	/*  | 
|---|
 | 10912 | +	 * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,  | 
|---|
 | 10913 | +	 * since these cannot be in the IDR. This way the linear search  | 
|---|
 | 10914 | +	 * is fast when a valid software event is supplied.  | 
|---|
 | 10915 | +	 */  | 
|---|
 | 10916 | +	if (type == PERF_TYPE_SOFTWARE || !name)  | 
|---|
 | 10917 | +		list_add_rcu(&pmu->entry, &pmus);  | 
|---|
 | 10918 | +	else  | 
|---|
 | 10919 | +		list_add_tail_rcu(&pmu->entry, &pmus);  | 
|---|
 | 10920 | +  | 
|---|
| 9811 | 10921 |  	atomic_set(&pmu->exclusive_cnt, 0); | 
|---|
| 9812 | 10922 |  	ret = 0; | 
|---|
| 9813 | 10923 |  unlock: | 
|---|
| .. | .. | 
|---|
| 9820 | 10930 |  	put_device(pmu->dev); | 
|---|
| 9821 | 10931 |   | 
|---|
| 9822 | 10932 |  free_idr: | 
|---|
| 9823 |  | -	if (pmu->type >= PERF_TYPE_MAX)  | 
|---|
 | 10933 | +	if (pmu->type != PERF_TYPE_SOFTWARE)  | 
|---|
| 9824 | 10934 |  		idr_remove(&pmu_idr, pmu->type); | 
|---|
| 9825 | 10935 |   | 
|---|
| 9826 | 10936 |  free_pdc: | 
|---|
| .. | .. | 
|---|
| 9842 | 10952 |  	synchronize_rcu(); | 
|---|
| 9843 | 10953 |   | 
|---|
| 9844 | 10954 |  	free_percpu(pmu->pmu_disable_count); | 
|---|
| 9845 |  | -	if (pmu->type >= PERF_TYPE_MAX)  | 
|---|
 | 10955 | +	if (pmu->type != PERF_TYPE_SOFTWARE)  | 
|---|
| 9846 | 10956 |  		idr_remove(&pmu_idr, pmu->type); | 
|---|
| 9847 | 10957 |  	if (pmu_bus_running) { | 
|---|
| 9848 | 10958 |  		if (pmu->nr_addr_filters) | 
|---|
| .. | .. | 
|---|
| 9854 | 10964 |  	mutex_unlock(&pmus_lock); | 
|---|
| 9855 | 10965 |  } | 
|---|
| 9856 | 10966 |  EXPORT_SYMBOL_GPL(perf_pmu_unregister); | 
|---|
 | 10967 | +  | 
|---|
 | 10968 | +static inline bool has_extended_regs(struct perf_event *event)  | 
|---|
 | 10969 | +{  | 
|---|
 | 10970 | +	return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||  | 
|---|
 | 10971 | +	       (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);  | 
|---|
 | 10972 | +}  | 
|---|
| 9857 | 10973 |   | 
|---|
| 9858 | 10974 |  static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 
|---|
| 9859 | 10975 |  { | 
|---|
| .. | .. | 
|---|
| 9885 | 11001 |  	if (ctx) | 
|---|
| 9886 | 11002 |  		perf_event_ctx_unlock(event->group_leader, ctx); | 
|---|
| 9887 | 11003 |   | 
|---|
 | 11004 | +	if (!ret) {  | 
|---|
 | 11005 | +		if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&  | 
|---|
 | 11006 | +		    has_extended_regs(event))  | 
|---|
 | 11007 | +			ret = -EOPNOTSUPP;  | 
|---|
 | 11008 | +  | 
|---|
 | 11009 | +		if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&  | 
|---|
 | 11010 | +		    event_has_any_exclude_flag(event))  | 
|---|
 | 11011 | +			ret = -EINVAL;  | 
|---|
 | 11012 | +  | 
|---|
 | 11013 | +		if (ret && event->destroy)  | 
|---|
 | 11014 | +			event->destroy(event);  | 
|---|
 | 11015 | +	}  | 
|---|
 | 11016 | +  | 
|---|
| 9888 | 11017 |  	if (ret) | 
|---|
| 9889 | 11018 |  		module_put(pmu->module); | 
|---|
| 9890 | 11019 |   | 
|---|
| .. | .. | 
|---|
| 9893 | 11022 |   | 
|---|
| 9894 | 11023 |  static struct pmu *perf_init_event(struct perf_event *event) | 
|---|
| 9895 | 11024 |  { | 
|---|
 | 11025 | +	int idx, type, ret;  | 
|---|
| 9896 | 11026 |  	struct pmu *pmu; | 
|---|
| 9897 |  | -	int idx;  | 
|---|
| 9898 |  | -	int ret;  | 
|---|
| 9899 | 11027 |   | 
|---|
| 9900 | 11028 |  	idx = srcu_read_lock(&pmus_srcu); | 
|---|
| 9901 | 11029 |   | 
|---|
| .. | .. | 
|---|
| 9907 | 11035 |  			goto unlock; | 
|---|
| 9908 | 11036 |  	} | 
|---|
| 9909 | 11037 |   | 
|---|
 | 11038 | +	/*  | 
|---|
 | 11039 | +	 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE  | 
|---|
 | 11040 | +	 * are often aliases for PERF_TYPE_RAW.  | 
|---|
 | 11041 | +	 */  | 
|---|
 | 11042 | +	type = event->attr.type;  | 
|---|
 | 11043 | +	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)  | 
|---|
 | 11044 | +		type = PERF_TYPE_RAW;  | 
|---|
 | 11045 | +  | 
|---|
 | 11046 | +again:  | 
|---|
| 9910 | 11047 |  	rcu_read_lock(); | 
|---|
| 9911 |  | -	pmu = idr_find(&pmu_idr, event->attr.type);  | 
|---|
 | 11048 | +	pmu = idr_find(&pmu_idr, type);  | 
|---|
| 9912 | 11049 |  	rcu_read_unlock(); | 
|---|
| 9913 | 11050 |  	if (pmu) { | 
|---|
| 9914 | 11051 |  		ret = perf_try_init_event(pmu, event); | 
|---|
 | 11052 | +		if (ret == -ENOENT && event->attr.type != type) {  | 
|---|
 | 11053 | +			type = event->attr.type;  | 
|---|
 | 11054 | +			goto again;  | 
|---|
 | 11055 | +		}  | 
|---|
 | 11056 | +  | 
|---|
| 9915 | 11057 |  		if (ret) | 
|---|
| 9916 | 11058 |  			pmu = ERR_PTR(ret); | 
|---|
 | 11059 | +  | 
|---|
| 9917 | 11060 |  		goto unlock; | 
|---|
| 9918 | 11061 |  	} | 
|---|
| 9919 | 11062 |   | 
|---|
| 9920 |  | -	list_for_each_entry_rcu(pmu, &pmus, entry) {  | 
|---|
 | 11063 | +	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {  | 
|---|
| 9921 | 11064 |  		ret = perf_try_init_event(pmu, event); | 
|---|
| 9922 | 11065 |  		if (!ret) | 
|---|
| 9923 | 11066 |  			goto unlock; | 
|---|
| .. | .. | 
|---|
| 9993 | 11136 |  	if (event->parent) | 
|---|
| 9994 | 11137 |  		return; | 
|---|
| 9995 | 11138 |   | 
|---|
| 9996 |  | -	if (event->attach_state & PERF_ATTACH_TASK)  | 
|---|
 | 11139 | +	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))  | 
|---|
| 9997 | 11140 |  		inc = true; | 
|---|
| 9998 | 11141 |  	if (event->attr.mmap || event->attr.mmap_data) | 
|---|
| 9999 | 11142 |  		atomic_inc(&nr_mmap_events); | 
|---|
| .. | .. | 
|---|
| 10001 | 11144 |  		atomic_inc(&nr_comm_events); | 
|---|
| 10002 | 11145 |  	if (event->attr.namespaces) | 
|---|
| 10003 | 11146 |  		atomic_inc(&nr_namespaces_events); | 
|---|
 | 11147 | +	if (event->attr.cgroup)  | 
|---|
 | 11148 | +		atomic_inc(&nr_cgroup_events);  | 
|---|
| 10004 | 11149 |  	if (event->attr.task) | 
|---|
| 10005 | 11150 |  		atomic_inc(&nr_task_events); | 
|---|
| 10006 | 11151 |  	if (event->attr.freq) | 
|---|
| .. | .. | 
|---|
| 10013 | 11158 |  		inc = true; | 
|---|
| 10014 | 11159 |  	if (is_cgroup_event(event)) | 
|---|
| 10015 | 11160 |  		inc = true; | 
|---|
 | 11161 | +	if (event->attr.ksymbol)  | 
|---|
 | 11162 | +		atomic_inc(&nr_ksymbol_events);  | 
|---|
 | 11163 | +	if (event->attr.bpf_event)  | 
|---|
 | 11164 | +		atomic_inc(&nr_bpf_events);  | 
|---|
 | 11165 | +	if (event->attr.text_poke)  | 
|---|
 | 11166 | +		atomic_inc(&nr_text_poke_events);  | 
|---|
| 10016 | 11167 |   | 
|---|
| 10017 | 11168 |  	if (inc) { | 
|---|
| 10018 | 11169 |  		/* | 
|---|
| .. | .. | 
|---|
| 10031 | 11182 |  			 * call the perf scheduling hooks before proceeding to | 
|---|
| 10032 | 11183 |  			 * install events that need them. | 
|---|
| 10033 | 11184 |  			 */ | 
|---|
| 10034 |  | -			synchronize_sched();  | 
|---|
 | 11185 | +			synchronize_rcu();  | 
|---|
| 10035 | 11186 |  		} | 
|---|
| 10036 | 11187 |  		/* | 
|---|
| 10037 | 11188 |  		 * Now that we have waited for the sync_sched(), allow further | 
|---|
| .. | .. | 
|---|
| 10120 | 11271 |  		 * and we cannot use the ctx information because we need the | 
|---|
| 10121 | 11272 |  		 * pmu before we get a ctx. | 
|---|
| 10122 | 11273 |  		 */ | 
|---|
| 10123 |  | -		get_task_struct(task);  | 
|---|
| 10124 |  | -		event->hw.target = task;  | 
|---|
 | 11274 | +		event->hw.target = get_task_struct(task);  | 
|---|
| 10125 | 11275 |  	} | 
|---|
| 10126 | 11276 |   | 
|---|
| 10127 | 11277 |  	event->clock = &local_clock; | 
|---|
| .. | .. | 
|---|
| 10133 | 11283 |  		context = parent_event->overflow_handler_context; | 
|---|
| 10134 | 11284 |  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) | 
|---|
| 10135 | 11285 |  		if (overflow_handler == bpf_overflow_handler) { | 
|---|
| 10136 |  | -			struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);  | 
|---|
 | 11286 | +			struct bpf_prog *prog = parent_event->prog;  | 
|---|
| 10137 | 11287 |   | 
|---|
| 10138 |  | -			if (IS_ERR(prog)) {  | 
|---|
| 10139 |  | -				err = PTR_ERR(prog);  | 
|---|
| 10140 |  | -				goto err_ns;  | 
|---|
| 10141 |  | -			}  | 
|---|
 | 11288 | +			bpf_prog_inc(prog);  | 
|---|
| 10142 | 11289 |  			event->prog = prog; | 
|---|
| 10143 | 11290 |  			event->orig_overflow_handler = | 
|---|
| 10144 | 11291 |  				parent_event->orig_overflow_handler; | 
|---|
| .. | .. | 
|---|
| 10179 | 11326 |  	if (!has_branch_stack(event)) | 
|---|
| 10180 | 11327 |  		event->attr.branch_sample_type = 0; | 
|---|
| 10181 | 11328 |   | 
|---|
| 10182 |  | -	if (cgroup_fd != -1) {  | 
|---|
| 10183 |  | -		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);  | 
|---|
| 10184 |  | -		if (err)  | 
|---|
| 10185 |  | -			goto err_ns;  | 
|---|
| 10186 |  | -	}  | 
|---|
| 10187 |  | -  | 
|---|
| 10188 | 11329 |  	pmu = perf_init_event(event); | 
|---|
| 10189 | 11330 |  	if (IS_ERR(pmu)) { | 
|---|
| 10190 | 11331 |  		err = PTR_ERR(pmu); | 
|---|
| 10191 | 11332 |  		goto err_ns; | 
|---|
 | 11333 | +	}  | 
|---|
 | 11334 | +  | 
|---|
 | 11335 | +	/*  | 
|---|
 | 11336 | +	 * Disallow uncore-cgroup events; they don't make sense, as the cgroup will  | 
|---|
 | 11337 | +	 * be different on other CPUs in the uncore mask.  | 
|---|
 | 11338 | +	 */  | 
|---|
 | 11339 | +	if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {  | 
|---|
 | 11340 | +		err = -EINVAL;  | 
|---|
 | 11341 | +		goto err_pmu;  | 
|---|
 | 11342 | +	}  | 
|---|
 | 11343 | +  | 
|---|
 | 11344 | +	if (event->attr.aux_output &&  | 
|---|
 | 11345 | +	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {  | 
|---|
 | 11346 | +		err = -EOPNOTSUPP;  | 
|---|
 | 11347 | +		goto err_pmu;  | 
|---|
 | 11348 | +	}  | 
|---|
 | 11349 | +  | 
|---|
 | 11350 | +	if (cgroup_fd != -1) {  | 
|---|
 | 11351 | +		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);  | 
|---|
 | 11352 | +		if (err)  | 
|---|
 | 11353 | +			goto err_pmu;  | 
|---|
| 10192 | 11354 |  	} | 
|---|
| 10193 | 11355 |   | 
|---|
| 10194 | 11356 |  	err = exclusive_event_init(event); | 
|---|
| .. | .. | 
|---|
| 10251 | 11413 |  	exclusive_event_destroy(event); | 
|---|
| 10252 | 11414 |   | 
|---|
| 10253 | 11415 |  err_pmu: | 
|---|
 | 11416 | +	if (is_cgroup_event(event))  | 
|---|
 | 11417 | +		perf_detach_cgroup(event);  | 
|---|
| 10254 | 11418 |  	if (event->destroy) | 
|---|
| 10255 | 11419 |  		event->destroy(event); | 
|---|
| 10256 | 11420 |  	module_put(pmu->module); | 
|---|
| 10257 | 11421 |  err_ns: | 
|---|
| 10258 |  | -	if (is_cgroup_event(event))  | 
|---|
| 10259 |  | -		perf_detach_cgroup(event);  | 
|---|
| 10260 | 11422 |  	if (event->ns) | 
|---|
| 10261 | 11423 |  		put_pid_ns(event->ns); | 
|---|
| 10262 | 11424 |  	if (event->hw.target) | 
|---|
| .. | .. | 
|---|
| 10272 | 11434 |  	u32 size; | 
|---|
| 10273 | 11435 |  	int ret; | 
|---|
| 10274 | 11436 |   | 
|---|
| 10275 |  | -	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))  | 
|---|
| 10276 |  | -		return -EFAULT;  | 
|---|
| 10277 |  | -  | 
|---|
| 10278 |  | -	/*  | 
|---|
| 10279 |  | -	 * zero the full structure, so that a short copy will be nice.  | 
|---|
| 10280 |  | -	 */  | 
|---|
 | 11437 | +	/* Zero the full structure, so that a short copy will be nice. */  | 
|---|
| 10281 | 11438 |  	memset(attr, 0, sizeof(*attr)); | 
|---|
| 10282 | 11439 |   | 
|---|
| 10283 | 11440 |  	ret = get_user(size, &uattr->size); | 
|---|
| 10284 | 11441 |  	if (ret) | 
|---|
| 10285 | 11442 |  		return ret; | 
|---|
| 10286 | 11443 |   | 
|---|
| 10287 |  | -	if (size > PAGE_SIZE)	/* silly large */  | 
|---|
| 10288 |  | -		goto err_size;  | 
|---|
| 10289 |  | -  | 
|---|
| 10290 |  | -	if (!size)		/* abi compat */  | 
|---|
 | 11444 | +	/* ABI compatibility quirk: */  | 
|---|
 | 11445 | +	if (!size)  | 
|---|
| 10291 | 11446 |  		size = PERF_ATTR_SIZE_VER0; | 
|---|
| 10292 |  | -  | 
|---|
| 10293 |  | -	if (size < PERF_ATTR_SIZE_VER0)  | 
|---|
 | 11447 | +	if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)  | 
|---|
| 10294 | 11448 |  		goto err_size; | 
|---|
| 10295 | 11449 |   | 
|---|
| 10296 |  | -	/*  | 
|---|
| 10297 |  | -	 * If we're handed a bigger struct than we know of,  | 
|---|
| 10298 |  | -	 * ensure all the unknown bits are 0 - i.e. new  | 
|---|
| 10299 |  | -	 * user-space does not rely on any kernel feature  | 
|---|
| 10300 |  | -	 * extensions we dont know about yet.  | 
|---|
| 10301 |  | -	 */  | 
|---|
| 10302 |  | -	if (size > sizeof(*attr)) {  | 
|---|
| 10303 |  | -		unsigned char __user *addr;  | 
|---|
| 10304 |  | -		unsigned char __user *end;  | 
|---|
| 10305 |  | -		unsigned char val;  | 
|---|
| 10306 |  | -  | 
|---|
| 10307 |  | -		addr = (void __user *)uattr + sizeof(*attr);  | 
|---|
| 10308 |  | -		end  = (void __user *)uattr + size;  | 
|---|
| 10309 |  | -  | 
|---|
| 10310 |  | -		for (; addr < end; addr++) {  | 
|---|
| 10311 |  | -			ret = get_user(val, addr);  | 
|---|
| 10312 |  | -			if (ret)  | 
|---|
| 10313 |  | -				return ret;  | 
|---|
| 10314 |  | -			if (val)  | 
|---|
| 10315 |  | -				goto err_size;  | 
|---|
| 10316 |  | -		}  | 
|---|
| 10317 |  | -		size = sizeof(*attr);  | 
|---|
 | 11450 | +	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);  | 
|---|
 | 11451 | +	if (ret) {  | 
|---|
 | 11452 | +		if (ret == -E2BIG)  | 
|---|
 | 11453 | +			goto err_size;  | 
|---|
 | 11454 | +		return ret;  | 
|---|
| 10318 | 11455 |  	} | 
|---|
| 10319 |  | -  | 
|---|
| 10320 |  | -	ret = copy_from_user(attr, uattr, size);  | 
|---|
| 10321 |  | -	if (ret)  | 
|---|
| 10322 |  | -		return -EFAULT;  | 
|---|
| 10323 | 11456 |   | 
|---|
| 10324 | 11457 |  	attr->size = size; | 
|---|
| 10325 | 11458 |   | 
|---|
| 10326 |  | -	if (attr->__reserved_1)  | 
|---|
 | 11459 | +	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)  | 
|---|
| 10327 | 11460 |  		return -EINVAL; | 
|---|
| 10328 | 11461 |   | 
|---|
| 10329 | 11462 |  	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) | 
|---|
| .. | .. | 
|---|
| 10394 | 11527 |   | 
|---|
| 10395 | 11528 |  	if (attr->sample_type & PERF_SAMPLE_REGS_INTR) | 
|---|
| 10396 | 11529 |  		ret = perf_reg_validate(attr->sample_regs_intr); | 
|---|
 | 11530 | +  | 
|---|
 | 11531 | +#ifndef CONFIG_CGROUP_PERF  | 
|---|
 | 11532 | +	if (attr->sample_type & PERF_SAMPLE_CGROUP)  | 
|---|
 | 11533 | +		return -EINVAL;  | 
|---|
 | 11534 | +#endif  | 
|---|
 | 11535 | +  | 
|---|
| 10397 | 11536 |  out: | 
|---|
| 10398 | 11537 |  	return ret; | 
|---|
| 10399 | 11538 |   | 
|---|
| .. | .. | 
|---|
| 10403 | 11542 |  	goto out; | 
|---|
| 10404 | 11543 |  } | 
|---|
| 10405 | 11544 |   | 
|---|
 | 11545 | +static void mutex_lock_double(struct mutex *a, struct mutex *b)  | 
|---|
 | 11546 | +{  | 
|---|
 | 11547 | +	if (b < a)  | 
|---|
 | 11548 | +		swap(a, b);  | 
|---|
 | 11549 | +  | 
|---|
 | 11550 | +	mutex_lock(a);  | 
|---|
 | 11551 | +	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);  | 
|---|
 | 11552 | +}  | 
|---|
 | 11553 | +  | 
|---|
| 10406 | 11554 |  static int | 
|---|
| 10407 | 11555 |  perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 
|---|
| 10408 | 11556 |  { | 
|---|
| 10409 |  | -	struct ring_buffer *rb = NULL;  | 
|---|
 | 11557 | +	struct perf_buffer *rb = NULL;  | 
|---|
| 10410 | 11558 |  	int ret = -EINVAL; | 
|---|
| 10411 | 11559 |   | 
|---|
| 10412 |  | -	if (!output_event)  | 
|---|
 | 11560 | +	if (!output_event) {  | 
|---|
 | 11561 | +		mutex_lock(&event->mmap_mutex);  | 
|---|
| 10413 | 11562 |  		goto set; | 
|---|
 | 11563 | +	}  | 
|---|
| 10414 | 11564 |   | 
|---|
| 10415 | 11565 |  	/* don't allow circular references */ | 
|---|
| 10416 | 11566 |  	if (event == output_event) | 
|---|
| .. | .. | 
|---|
| 10448 | 11598 |  	    event->pmu != output_event->pmu) | 
|---|
| 10449 | 11599 |  		goto out; | 
|---|
| 10450 | 11600 |   | 
|---|
 | 11601 | +	/*  | 
|---|
 | 11602 | +	 * Hold both mmap_mutexes to serialize against perf_mmap_close().  Since  | 
|---|
 | 11603 | +	 * output_event is already on rb->event_list, and the list iteration  | 
|---|
 | 11604 | +	 * restarts after every removal, it is guaranteed this new event is  | 
|---|
 | 11605 | +	 * observed *OR* if output_event is already removed, it's guaranteed we  | 
|---|
 | 11606 | +	 * observe !rb->mmap_count.  | 
|---|
 | 11607 | +	 */  | 
|---|
 | 11608 | +	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);  | 
|---|
| 10451 | 11609 |  set: | 
|---|
| 10452 |  | -	mutex_lock(&event->mmap_mutex);  | 
|---|
| 10453 | 11610 |  	/* Can't redirect output if we've got an active mmap() */ | 
|---|
| 10454 | 11611 |  	if (atomic_read(&event->mmap_count)) | 
|---|
| 10455 | 11612 |  		goto unlock; | 
|---|
| .. | .. | 
|---|
| 10459 | 11616 |  		rb = ring_buffer_get(output_event); | 
|---|
| 10460 | 11617 |  		if (!rb) | 
|---|
| 10461 | 11618 |  			goto unlock; | 
|---|
 | 11619 | +  | 
|---|
 | 11620 | +		/* did we race against perf_mmap_close() */  | 
|---|
 | 11621 | +		if (!atomic_read(&rb->mmap_count)) {  | 
|---|
 | 11622 | +			ring_buffer_put(rb);  | 
|---|
 | 11623 | +			goto unlock;  | 
|---|
 | 11624 | +		}  | 
|---|
| 10462 | 11625 |  	} | 
|---|
| 10463 | 11626 |   | 
|---|
| 10464 | 11627 |  	ring_buffer_attach(event, rb); | 
|---|
| .. | .. | 
|---|
| 10466 | 11629 |  	ret = 0; | 
|---|
| 10467 | 11630 |  unlock: | 
|---|
| 10468 | 11631 |  	mutex_unlock(&event->mmap_mutex); | 
|---|
 | 11632 | +	if (output_event)  | 
|---|
 | 11633 | +		mutex_unlock(&output_event->mmap_mutex);  | 
|---|
| 10469 | 11634 |   | 
|---|
| 10470 | 11635 |  out: | 
|---|
| 10471 | 11636 |  	return ret; | 
|---|
| 10472 |  | -}  | 
|---|
| 10473 |  | -  | 
|---|
| 10474 |  | -static void mutex_lock_double(struct mutex *a, struct mutex *b)  | 
|---|
| 10475 |  | -{  | 
|---|
| 10476 |  | -	if (b < a)  | 
|---|
| 10477 |  | -		swap(a, b);  | 
|---|
| 10478 |  | -  | 
|---|
| 10479 |  | -	mutex_lock(a);  | 
|---|
| 10480 |  | -	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);  | 
|---|
| 10481 | 11637 |  } | 
|---|
| 10482 | 11638 |   | 
|---|
| 10483 | 11639 |  static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) | 
|---|
| .. | .. | 
|---|
| 10500 | 11656 |  		break; | 
|---|
| 10501 | 11657 |   | 
|---|
| 10502 | 11658 |  	case CLOCK_BOOTTIME: | 
|---|
| 10503 |  | -		event->clock = &ktime_get_boot_ns;  | 
|---|
 | 11659 | +		event->clock = &ktime_get_boottime_ns;  | 
|---|
| 10504 | 11660 |  		break; | 
|---|
| 10505 | 11661 |   | 
|---|
| 10506 | 11662 |  	case CLOCK_TAI: | 
|---|
| 10507 |  | -		event->clock = &ktime_get_tai_ns;  | 
|---|
 | 11663 | +		event->clock = &ktime_get_clocktai_ns;  | 
|---|
| 10508 | 11664 |  		break; | 
|---|
| 10509 | 11665 |   | 
|---|
| 10510 | 11666 |  	default: | 
|---|
| .. | .. | 
|---|
| 10530 | 11686 |  again: | 
|---|
| 10531 | 11687 |  	rcu_read_lock(); | 
|---|
| 10532 | 11688 |  	gctx = READ_ONCE(group_leader->ctx); | 
|---|
| 10533 |  | -	if (!atomic_inc_not_zero(&gctx->refcount)) {  | 
|---|
 | 11689 | +	if (!refcount_inc_not_zero(&gctx->refcount)) {  | 
|---|
| 10534 | 11690 |  		rcu_read_unlock(); | 
|---|
| 10535 | 11691 |  		goto again; | 
|---|
| 10536 | 11692 |  	} | 
|---|
| .. | .. | 
|---|
| 10563 | 11719 |  	struct perf_event *group_leader = NULL, *output_event = NULL; | 
|---|
| 10564 | 11720 |  	struct perf_event *event, *sibling; | 
|---|
| 10565 | 11721 |  	struct perf_event_attr attr; | 
|---|
| 10566 |  | -	struct perf_event_context *ctx, *uninitialized_var(gctx);  | 
|---|
 | 11722 | +	struct perf_event_context *ctx, *gctx;  | 
|---|
| 10567 | 11723 |  	struct file *event_file = NULL; | 
|---|
| 10568 | 11724 |  	struct fd group = {NULL, 0}; | 
|---|
| 10569 | 11725 |  	struct task_struct *task = NULL; | 
|---|
| .. | .. | 
|---|
| 10577 | 11733 |  	/* for future expandability... */ | 
|---|
| 10578 | 11734 |  	if (flags & ~PERF_FLAG_ALL) | 
|---|
| 10579 | 11735 |  		return -EINVAL; | 
|---|
| 10580 |  | -  | 
|---|
| 10581 |  | -	if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))  | 
|---|
| 10582 |  | -		return -EACCES;  | 
|---|
| 10583 | 11736 |   | 
|---|
| 10584 | 11737 |  	/* Do we allow access to perf_event_open(2) ? */ | 
|---|
| 10585 | 11738 |  	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); | 
|---|
| .. | .. | 
|---|
| 10597 | 11750 |  	} | 
|---|
| 10598 | 11751 |   | 
|---|
| 10599 | 11752 |  	if (attr.namespaces) { | 
|---|
| 10600 |  | -		if (!capable(CAP_SYS_ADMIN))  | 
|---|
 | 11753 | +		if (!perfmon_capable())  | 
|---|
| 10601 | 11754 |  			return -EACCES; | 
|---|
| 10602 | 11755 |  	} | 
|---|
| 10603 | 11756 |   | 
|---|
| .. | .. | 
|---|
| 10612 | 11765 |  	/* Only privileged users can get physical addresses */ | 
|---|
| 10613 | 11766 |  	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { | 
|---|
| 10614 | 11767 |  		err = perf_allow_kernel(&attr); | 
|---|
 | 11768 | +		if (err)  | 
|---|
 | 11769 | +			return err;  | 
|---|
 | 11770 | +	}  | 
|---|
 | 11771 | +  | 
|---|
 | 11772 | +	/* REGS_INTR can leak data, lockdown must prevent this */  | 
|---|
 | 11773 | +	if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {  | 
|---|
 | 11774 | +		err = security_locked_down(LOCKDOWN_PERF);  | 
|---|
| 10615 | 11775 |  		if (err) | 
|---|
| 10616 | 11776 |  			return err; | 
|---|
| 10617 | 11777 |  	} | 
|---|
| .. | .. | 
|---|
| 10657 | 11817 |  		goto err_task; | 
|---|
| 10658 | 11818 |  	} | 
|---|
| 10659 | 11819 |   | 
|---|
| 10660 |  | -	if (task) {  | 
|---|
| 10661 |  | -		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);  | 
|---|
| 10662 |  | -		if (err)  | 
|---|
| 10663 |  | -			goto err_task;  | 
|---|
| 10664 |  | -  | 
|---|
| 10665 |  | -		/*  | 
|---|
| 10666 |  | -		 * Reuse ptrace permission checks for now.  | 
|---|
| 10667 |  | -		 *  | 
|---|
| 10668 |  | -		 * We must hold cred_guard_mutex across this and any potential  | 
|---|
| 10669 |  | -		 * perf_install_in_context() call for this new event to  | 
|---|
| 10670 |  | -		 * serialize against exec() altering our credentials (and the  | 
|---|
| 10671 |  | -		 * perf_event_exit_task() that could imply).  | 
|---|
| 10672 |  | -		 */  | 
|---|
| 10673 |  | -		err = -EACCES;  | 
|---|
| 10674 |  | -		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))  | 
|---|
| 10675 |  | -			goto err_cred;  | 
|---|
| 10676 |  | -	}  | 
|---|
| 10677 |  | -  | 
|---|
| 10678 | 11820 |  	if (flags & PERF_FLAG_PID_CGROUP) | 
|---|
| 10679 | 11821 |  		cgroup_fd = pid; | 
|---|
| 10680 | 11822 |   | 
|---|
| .. | .. | 
|---|
| 10682 | 11824 |  				 NULL, NULL, cgroup_fd); | 
|---|
| 10683 | 11825 |  	if (IS_ERR(event)) { | 
|---|
| 10684 | 11826 |  		err = PTR_ERR(event); | 
|---|
| 10685 |  | -		goto err_cred;  | 
|---|
 | 11827 | +		goto err_task;  | 
|---|
| 10686 | 11828 |  	} | 
|---|
| 10687 | 11829 |   | 
|---|
| 10688 | 11830 |  	if (is_sampling_event(event)) { | 
|---|
| .. | .. | 
|---|
| 10776 | 11918 |  		 * Do not allow to attach to a group in a different task | 
|---|
| 10777 | 11919 |  		 * or CPU context. If we're moving SW events, we'll fix | 
|---|
| 10778 | 11920 |  		 * this up later, so allow that. | 
|---|
 | 11921 | +		 *  | 
|---|
 | 11922 | +		 * Racy, not holding group_leader->ctx->mutex, see comment with  | 
|---|
 | 11923 | +		 * perf_event_ctx_lock().  | 
|---|
| 10779 | 11924 |  		 */ | 
|---|
| 10780 | 11925 |  		if (!move_group && group_leader->ctx != ctx) | 
|---|
| 10781 | 11926 |  			goto err_context; | 
|---|
| .. | .. | 
|---|
| 10799 | 11944 |  		err = PTR_ERR(event_file); | 
|---|
| 10800 | 11945 |  		event_file = NULL; | 
|---|
| 10801 | 11946 |  		goto err_context; | 
|---|
 | 11947 | +	}  | 
|---|
 | 11948 | +  | 
|---|
 | 11949 | +	if (task) {  | 
|---|
 | 11950 | +		err = down_read_interruptible(&task->signal->exec_update_lock);  | 
|---|
 | 11951 | +		if (err)  | 
|---|
 | 11952 | +			goto err_file;  | 
|---|
 | 11953 | +  | 
|---|
 | 11954 | +		/*  | 
|---|
 | 11955 | +		 * Preserve ptrace permission check for backwards compatibility.  | 
|---|
 | 11956 | +		 *  | 
|---|
 | 11957 | +		 * We must hold exec_update_lock across this and any potential  | 
|---|
 | 11958 | +		 * perf_install_in_context() call for this new event to  | 
|---|
 | 11959 | +		 * serialize against exec() altering our credentials (and the  | 
|---|
 | 11960 | +		 * perf_event_exit_task() that could imply).  | 
|---|
 | 11961 | +		 */  | 
|---|
 | 11962 | +		err = -EACCES;  | 
|---|
 | 11963 | +		if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))  | 
|---|
 | 11964 | +			goto err_cred;  | 
|---|
| 10802 | 11965 |  	} | 
|---|
| 10803 | 11966 |   | 
|---|
| 10804 | 11967 |  	if (move_group) { | 
|---|
| .. | .. | 
|---|
| 10825 | 11988 |  			} else { | 
|---|
| 10826 | 11989 |  				perf_event_ctx_unlock(group_leader, gctx); | 
|---|
| 10827 | 11990 |  				move_group = 0; | 
|---|
 | 11991 | +				goto not_move_group;  | 
|---|
| 10828 | 11992 |  			} | 
|---|
| 10829 | 11993 |  		} | 
|---|
| 10830 | 11994 |   | 
|---|
| .. | .. | 
|---|
| 10841 | 12005 |  		} | 
|---|
| 10842 | 12006 |  	} else { | 
|---|
| 10843 | 12007 |  		mutex_lock(&ctx->mutex); | 
|---|
 | 12008 | +  | 
|---|
 | 12009 | +		/*  | 
|---|
 | 12010 | +		 * Now that we hold ctx->mutex, (re)validate group_leader->ctx == ctx,  | 
|---|
 | 12011 | +		 * see the group_leader && !move_group test earlier.  | 
|---|
 | 12012 | +		 */  | 
|---|
 | 12013 | +		if (group_leader && group_leader->ctx != ctx) {  | 
|---|
 | 12014 | +			err = -EINVAL;  | 
|---|
 | 12015 | +			goto err_locked;  | 
|---|
 | 12016 | +		}  | 
|---|
| 10844 | 12017 |  	} | 
|---|
 | 12018 | +not_move_group:  | 
|---|
| 10845 | 12019 |   | 
|---|
| 10846 | 12020 |  	if (ctx->task == TASK_TOMBSTONE) { | 
|---|
| 10847 | 12021 |  		err = -ESRCH; | 
|---|
| .. | .. | 
|---|
| 10869 | 12043 |  		} | 
|---|
| 10870 | 12044 |  	} | 
|---|
| 10871 | 12045 |   | 
|---|
 | 12046 | +	if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {  | 
|---|
 | 12047 | +		err = -EINVAL;  | 
|---|
 | 12048 | +		goto err_locked;  | 
|---|
 | 12049 | +	}  | 
|---|
| 10872 | 12050 |   | 
|---|
| 10873 | 12051 |  	/* | 
|---|
| 10874 | 12052 |  	 * Must be under the same ctx::mutex as perf_install_in_context(), | 
|---|
| .. | .. | 
|---|
| 10950 | 12128 |  	mutex_unlock(&ctx->mutex); | 
|---|
| 10951 | 12129 |   | 
|---|
| 10952 | 12130 |  	if (task) { | 
|---|
| 10953 |  | -		mutex_unlock(&task->signal->cred_guard_mutex);  | 
|---|
 | 12131 | +		up_read(&task->signal->exec_update_lock);  | 
|---|
| 10954 | 12132 |  		put_task_struct(task); | 
|---|
| 10955 | 12133 |  	} | 
|---|
| 10956 | 12134 |   | 
|---|
| .. | .. | 
|---|
| 10972 | 12150 |  	if (move_group) | 
|---|
| 10973 | 12151 |  		perf_event_ctx_unlock(group_leader, gctx); | 
|---|
| 10974 | 12152 |  	mutex_unlock(&ctx->mutex); | 
|---|
| 10975 |  | -/* err_file: */  | 
|---|
 | 12153 | +err_cred:  | 
|---|
 | 12154 | +	if (task)  | 
|---|
 | 12155 | +		up_read(&task->signal->exec_update_lock);  | 
|---|
 | 12156 | +err_file:  | 
|---|
| 10976 | 12157 |  	fput(event_file); | 
|---|
| 10977 | 12158 |  err_context: | 
|---|
| 10978 | 12159 |  	perf_unpin_context(ctx); | 
|---|
| .. | .. | 
|---|
| 10984 | 12165 |  	 */ | 
|---|
| 10985 | 12166 |  	if (!event_file) | 
|---|
| 10986 | 12167 |  		free_event(event); | 
|---|
| 10987 |  | -err_cred:  | 
|---|
| 10988 |  | -	if (task)  | 
|---|
| 10989 |  | -		mutex_unlock(&task->signal->cred_guard_mutex);  | 
|---|
| 10990 | 12168 |  err_task: | 
|---|
| 10991 | 12169 |  	if (task) | 
|---|
| 10992 | 12170 |  		put_task_struct(task); | 
|---|
| .. | .. | 
|---|
| 11015 | 12193 |  	int err; | 
|---|
| 11016 | 12194 |   | 
|---|
| 11017 | 12195 |  	/* | 
|---|
| 11018 |  | -	 * Get the target context (task or percpu):  | 
|---|
 | 12196 | +	 * Grouping is not supported for kernel events, neither is 'AUX',  | 
|---|
 | 12197 | +	 * make sure the caller's intentions are adjusted.  | 
|---|
| 11019 | 12198 |  	 */ | 
|---|
 | 12199 | +	if (attr->aux_output)  | 
|---|
 | 12200 | +		return ERR_PTR(-EINVAL);  | 
|---|
| 11020 | 12201 |   | 
|---|
| 11021 | 12202 |  	event = perf_event_alloc(attr, cpu, task, NULL, NULL, | 
|---|
| 11022 | 12203 |  				 overflow_handler, context, -1); | 
|---|
| .. | .. | 
|---|
| 11028 | 12209 |  	/* Mark owner so we could distinguish it from user events. */ | 
|---|
| 11029 | 12210 |  	event->owner = TASK_TOMBSTONE; | 
|---|
| 11030 | 12211 |   | 
|---|
 | 12212 | +	/*  | 
|---|
 | 12213 | +	 * Get the target context (task or percpu):  | 
|---|
 | 12214 | +	 */  | 
|---|
| 11031 | 12215 |  	ctx = find_get_context(event->pmu, task, event); | 
|---|
| 11032 | 12216 |  	if (IS_ERR(ctx)) { | 
|---|
| 11033 | 12217 |  		err = PTR_ERR(ctx); | 
|---|
| .. | .. | 
|---|
| 11285 | 12469 |  /* | 
|---|
| 11286 | 12470 |   * When a child task exits, feed back event values to parent events. | 
|---|
| 11287 | 12471 |   * | 
|---|
| 11288 |  | - * Can be called with cred_guard_mutex held when called from  | 
|---|
| 11289 |  | - * install_exec_creds().  | 
|---|
 | 12472 | + * Can be called with exec_update_lock held when called from  | 
|---|
 | 12473 | + * setup_new_exec().  | 
|---|
| 11290 | 12474 |   */ | 
|---|
| 11291 | 12475 |  void perf_event_exit_task(struct task_struct *child) | 
|---|
| 11292 | 12476 |  { | 
|---|
| .. | .. | 
|---|
| 11390 | 12574 |  		 * | 
|---|
| 11391 | 12575 |  		 * Wait for all events to drop their context reference. | 
|---|
| 11392 | 12576 |  		 */ | 
|---|
| 11393 |  | -		wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);  | 
|---|
 | 12577 | +		wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);  | 
|---|
| 11394 | 12578 |  		put_ctx(ctx); /* must be last */ | 
|---|
| 11395 | 12579 |  	} | 
|---|
| 11396 | 12580 |  } | 
|---|
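The condition now reads the context's reference count through refcount_read(), matching ctx->refcount having become a refcount_t, while the wait itself uses the generic wait_var_event()/wake_up_var() machinery. A generic hedged sketch of that wait/wake pairing on a refcount (not the exact perf code):

```c
#include <linux/refcount.h>
#include <linux/wait_bit.h>	/* wait_var_event() / wake_up_var() */

struct example_ctx {
	refcount_t refcount;
	/* ... */
};

/* Waiter: block until only the caller's own reference remains. */
static void example_drain(struct example_ctx *ctx)
{
	wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
	/* now safe to tear down; drop the final reference afterwards */
}

/* Dropper: every put that does not free the object wakes potential waiters. */
static void example_put(struct example_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		/* free ctx */
	} else {
		wake_up_var(&ctx->refcount);
	}
}
```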
| .. | .. | 
|---|
| 11405 | 12589 |   | 
|---|
| 11406 | 12590 |  struct file *perf_event_get(unsigned int fd) | 
|---|
| 11407 | 12591 |  { | 
|---|
| 11408 |  | -	struct file *file;  | 
|---|
| 11409 |  | -  | 
|---|
| 11410 |  | -	file = fget_raw(fd);  | 
|---|
 | 12592 | +	struct file *file = fget(fd);  | 
|---|
| 11411 | 12593 |  	if (!file) | 
|---|
| 11412 | 12594 |  		return ERR_PTR(-EBADF); | 
|---|
| 11413 | 12595 |   | 
|---|
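With plain fget(), perf_event_get() now refuses O_PATH descriptors as well as non-perf files. A hedged sketch of a consumer, relying on the perf file's private_data pointing at the struct perf_event; the field use and the caller are illustrative, not taken from an in-tree user:

```c
#include <linux/perf_event.h>
#include <linux/file.h>
#include <linux/err.h>

/* Illustrative consumer of perf_event_get(). */
static u64 example_read_event_id(unsigned int fd)
{
	struct file *file = perf_event_get(fd);
	struct perf_event *event;
	u64 id;

	if (IS_ERR(file))
		return 0;

	event = file->private_data;	/* installed by perf_event_open() */
	id = event->id;			/* valid while the file reference pins the event */

	fput(file);			/* drop the reference when done */
	return id;
}
```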
| .. | .. | 
|---|
| 11477 | 12659 |  	    !child_ctx->task_ctx_data) { | 
|---|
| 11478 | 12660 |  		struct pmu *pmu = child_event->pmu; | 
|---|
| 11479 | 12661 |   | 
|---|
| 11480 |  | -		child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,  | 
|---|
| 11481 |  | -						   GFP_KERNEL);  | 
|---|
 | 12662 | +		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);  | 
|---|
| 11482 | 12663 |  		if (!child_ctx->task_ctx_data) { | 
|---|
| 11483 | 12664 |  			free_event(child_event); | 
|---|
| 11484 | 12665 |  			return ERR_PTR(-ENOMEM); | 
|---|
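alloc_task_ctx_data() replaces the open-coded kzalloc() of pmu->task_ctx_size. The helper's likely shape, sketched here under the assumption that the PMU now exposes a per-PMU task_ctx_cache kmem_cache rather than a raw size:

```c
#include <linux/slab.h>
#include <linux/perf_event.h>

/* Hedged sketch of the allocation helpers, assuming struct pmu carries
 * a task_ctx_cache used for per-task PMU context data. */
static void *alloc_task_ctx_data(struct pmu *pmu)
{
	return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
}

static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
	kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}
```

A dedicated cache keeps the allocation size a PMU implementation detail and lets repeated fork()/exit cycles reuse slab objects instead of hitting the generic kmalloc pools.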
| .. | .. | 
|---|
| 11583 | 12764 |  					    child, leader, child_ctx); | 
|---|
| 11584 | 12765 |  		if (IS_ERR(child_ctr)) | 
|---|
| 11585 | 12766 |  			return PTR_ERR(child_ctr); | 
|---|
 | 12767 | +  | 
|---|
 | 12768 | +		if (sub->aux_event == parent_event && child_ctr &&  | 
|---|
 | 12769 | +		    !perf_get_aux_event(child_ctr, leader))  | 
|---|
 | 12770 | +			return -EINVAL;  | 
|---|
| 11586 | 12771 |  	} | 
|---|
| 11587 | 12772 |  	return 0; | 
|---|
| 11588 | 12773 |  } | 
|---|
| .. | .. | 
|---|
| 11778 | 12963 |  	} | 
|---|
| 11779 | 12964 |  } | 
|---|
| 11780 | 12965 |   | 
|---|
| 11781 |  | -void perf_swevent_init_cpu(unsigned int cpu)  | 
|---|
 | 12966 | +static void perf_swevent_init_cpu(unsigned int cpu)  | 
|---|
| 11782 | 12967 |  { | 
|---|
| 11783 | 12968 |  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 
|---|
| 11784 | 12969 |   | 
|---|
| .. | .. | 
|---|
| 11975 | 13160 |  	kfree(jc); | 
|---|
| 11976 | 13161 |  } | 
|---|
| 11977 | 13162 |   | 
|---|
 | 13163 | +static int perf_cgroup_css_online(struct cgroup_subsys_state *css)  | 
|---|
 | 13164 | +{  | 
|---|
 | 13165 | +	perf_event_cgroup(css->cgroup);  | 
|---|
 | 13166 | +	return 0;  | 
|---|
 | 13167 | +}  | 
|---|
 | 13168 | +  | 
|---|
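The new css_online callback lets perf emit cgroup tracking data as cgroups come online, which pairs with cgroup-scoped counting from userspace. A hedged sketch of opening such an event, assuming the same perf_event_open() wrapper as in the earlier sketches; the cgroup path is illustrative:

```c
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <unistd.h>

/* Assumes the perf_event_open() wrapper shown in the first sketch. */
static int open_cgroup_cycles(const char *cgrp_path, int cpu)
{
	/* The cgroup directory fd stands in for "pid" when
	 * PERF_FLAG_PID_CGROUP is set; a real CPU must be given. */
	int cgrp_fd = open(cgrp_path, O_RDONLY | O_DIRECTORY);
	struct perf_event_attr attr = {
		.size   = sizeof(attr),
		.type   = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
	};

	if (cgrp_fd < 0)
		return -1;

	return perf_event_open(&attr, cgrp_fd, cpu, -1, PERF_FLAG_PID_CGROUP);
}
```

Usage would be along the lines of open_cgroup_cycles("/sys/fs/cgroup/my_workload", 0); one such event is needed per CPU of interest.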
| 11978 | 13169 |  static int __perf_cgroup_move(void *info) | 
|---|
| 11979 | 13170 |  { | 
|---|
| 11980 | 13171 |  	struct task_struct *task = info; | 
|---|
| .. | .. | 
|---|
| 11996 | 13187 |  struct cgroup_subsys perf_event_cgrp_subsys = { | 
|---|
| 11997 | 13188 |  	.css_alloc	= perf_cgroup_css_alloc, | 
|---|
| 11998 | 13189 |  	.css_free	= perf_cgroup_css_free, | 
|---|
 | 13190 | +	.css_online	= perf_cgroup_css_online,  | 
|---|
| 11999 | 13191 |  	.attach		= perf_cgroup_attach, | 
|---|
| 12000 | 13192 |  	/* | 
|---|
| 12001 | 13193 |  	 * Implicitly enable on dfl hierarchy so that perf events can | 
|---|