2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
--- kernel/kernel/events/core.c
+++ kernel/kernel/events/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
 * Performance events core code:
 *
@@ -5,8 +6,6 @@
 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
 */
 
 #include <linux/fs.h>
@@ -29,6 +28,7 @@
 #include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
@@ -50,6 +50,7 @@
 #include <linux/sched/mm.h>
 #include <linux/proc_ns.h>
 #include <linux/mount.h>
+#include <linux/min_heap.h>
 
 #include "internal.h"
 
@@ -265,7 +266,7 @@
 if (!event->parent) {
 /*
 * If this is a !child event, we must hold ctx::mutex to
- * stabilize the the event->ctx relation. See
+ * stabilize the event->ctx relation. See
 * perf_event_ctx_lock().
 */
 lockdep_assert_held(&ctx->mutex);
@@ -391,6 +392,10 @@
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -403,13 +408,8 @@
 * 0 - disallow raw tracepoint access for unpriv
 * 1 - disallow cpu events for unpriv
 * 2 - disallow kernel profiling for unpriv
- * 3 - disallow all unpriv perf event use
 */
-#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
-int sysctl_perf_event_paranoid __read_mostly = 3;
-#else
 int sysctl_perf_event_paranoid __read_mostly = 2;
-#endif
 
 /* Minimum for 512 kiB + 1 user control page */
 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -444,8 +444,7 @@
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret;
 int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -469,8 +468,7 @@
 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
 {
 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -761,7 +759,7 @@
 /*
 * Do not update time when cgroup is not active
 */
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 __update_cgrp_time(event->cgrp);
 }
 
@@ -901,6 +899,47 @@
 rcu_read_unlock();
 }
 
+static int perf_cgroup_ensure_storage(struct perf_event *event,
+ struct cgroup_subsys_state *css)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event **storage;
+ int cpu, heap_size, ret = 0;
+
+ /*
+ * Allow storage to have sufficent space for an iterator for each
+ * possibly nested cgroup plus an iterator for events with no cgroup.
+ */
+ for (heap_size = 1; css; css = css->parent)
+ heap_size++;
+
+ for_each_possible_cpu(cpu) {
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ if (heap_size <= cpuctx->heap_size)
+ continue;
+
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!storage) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
+ if (cpuctx->heap_size < heap_size) {
+ swap(cpuctx->heap, storage);
+ if (storage == cpuctx->heap_default)
+ storage = NULL;
+ cpuctx->heap_size = heap_size;
+ }
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
+
+ kfree(storage);
+ }
+
+ return ret;
+}
+
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 struct perf_event_attr *attr,
 struct perf_event *group_leader)
@@ -919,6 +958,10 @@
 ret = PTR_ERR(css);
 goto out;
 }
+
+ ret = perf_cgroup_ensure_storage(event, css);
+ if (ret)
+ goto out;
 
 cgrp = container_of(css, struct perf_cgroup, css);
 event->cgrp = cgrp;
....@@ -945,25 +988,19 @@
945988 event->shadow_ctx_time = now - t->timestamp;
946989 }
947990
948
-/*
949
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
950
- * cleared when last cgroup event is removed.
951
- */
952991 static inline void
953
-list_update_cgroup_event(struct perf_event *event,
954
- struct perf_event_context *ctx, bool add)
992
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
955993 {
956994 struct perf_cpu_context *cpuctx;
957
- struct list_head *cpuctx_entry;
958995
959996 if (!is_cgroup_event(event))
960997 return;
961998
962999 /*
9631000 * Because cgroup events are always per-cpu events,
964
- * this will always be called from the right CPU.
1001
+ * @ctx == &cpuctx->ctx.
9651002 */
966
- cpuctx = __get_cpu_context(ctx);
1003
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
9671004
9681005 /*
9691006 * Since setting cpuctx->cgrp is conditional on the current @cgrp
....@@ -971,27 +1008,41 @@
9711008 * because if the first would mismatch, the second would not try again
9721009 * and we would leave cpuctx->cgrp unset.
9731010 */
974
- if (add && !cpuctx->cgrp) {
1011
+ if (ctx->is_active && !cpuctx->cgrp) {
9751012 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
9761013
9771014 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
9781015 cpuctx->cgrp = cgrp;
9791016 }
9801017
981
- if (add && ctx->nr_cgroups++)
982
- return;
983
- else if (!add && --ctx->nr_cgroups)
1018
+ if (ctx->nr_cgroups++)
9841019 return;
9851020
986
- /* no cgroup running */
987
- if (!add)
1021
+ list_add(&cpuctx->cgrp_cpuctx_entry,
1022
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1023
+}
1024
+
1025
+static inline void
1026
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1027
+{
1028
+ struct perf_cpu_context *cpuctx;
1029
+
1030
+ if (!is_cgroup_event(event))
1031
+ return;
1032
+
1033
+ /*
1034
+ * Because cgroup events are always per-cpu events,
1035
+ * @ctx == &cpuctx->ctx.
1036
+ */
1037
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1038
+
1039
+ if (--ctx->nr_cgroups)
1040
+ return;
1041
+
1042
+ if (ctx->is_active && cpuctx->cgrp)
9881043 cpuctx->cgrp = NULL;
9891044
990
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
991
- if (add)
992
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
993
- else
994
- list_del(cpuctx_entry);
1045
+ list_del(&cpuctx->cgrp_cpuctx_entry);
9951046 }
9961047
9971048 #else /* !CONFIG_CGROUP_PERF */
....@@ -1041,7 +1092,7 @@
10411092 {
10421093 }
10431094
1044
-void
1095
+static inline void
10451096 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
10461097 {
10471098 }
....@@ -1057,11 +1108,14 @@
10571108 }
10581109
10591110 static inline void
1060
-list_update_cgroup_event(struct perf_event *event,
1061
- struct perf_event_context *ctx, bool add)
1111
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
10621112 {
10631113 }
10641114
1115
+static inline void
1116
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1117
+{
1118
+}
10651119 #endif
10661120
10671121 /*
....@@ -1113,7 +1167,7 @@
11131167 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
11141168
11151169 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1116
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1170
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
11171171 timer->function = perf_mux_hrtimer_handler;
11181172 }
11191173
....@@ -1131,7 +1185,7 @@
11311185 if (!cpuctx->hrtimer_active) {
11321186 cpuctx->hrtimer_active = 1;
11331187 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1134
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1188
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
11351189 }
11361190 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
11371191
....@@ -1182,7 +1236,21 @@
11821236
11831237 static void get_ctx(struct perf_event_context *ctx)
11841238 {
1185
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1239
+ refcount_inc(&ctx->refcount);
1240
+}
1241
+
1242
+static void *alloc_task_ctx_data(struct pmu *pmu)
1243
+{
1244
+ if (pmu->task_ctx_cache)
1245
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1246
+
1247
+ return NULL;
1248
+}
1249
+
1250
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1251
+{
1252
+ if (pmu->task_ctx_cache && task_ctx_data)
1253
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
11861254 }
11871255
11881256 static void free_ctx(struct rcu_head *head)
....@@ -1190,13 +1258,13 @@
11901258 struct perf_event_context *ctx;
11911259
11921260 ctx = container_of(head, struct perf_event_context, rcu_head);
1193
- kfree(ctx->task_ctx_data);
1261
+ free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
11941262 kfree(ctx);
11951263 }
11961264
11971265 static void put_ctx(struct perf_event_context *ctx)
11981266 {
1199
- if (atomic_dec_and_test(&ctx->refcount)) {
1267
+ if (refcount_dec_and_test(&ctx->refcount)) {
12001268 if (ctx->parent_ctx)
12011269 put_ctx(ctx->parent_ctx);
12021270 if (ctx->task && ctx->task != TASK_TOMBSTONE)
....@@ -1232,7 +1300,7 @@
12321300 * life-time rules separate them. That is an exiting task cannot fork, and a
12331301 * spawning task cannot (yet) exit.
12341302 *
1235
- * But remember that that these are parent<->child context relations, and
1303
+ * But remember that these are parent<->child context relations, and
12361304 * migration does not affect children, therefore these two orderings should not
12371305 * interact.
12381306 *
....@@ -1258,13 +1326,13 @@
12581326 * function.
12591327 *
12601328 * Lock order:
1261
- * cred_guard_mutex
1329
+ * exec_update_lock
12621330 * task_struct::perf_event_mutex
12631331 * perf_event_context::mutex
12641332 * perf_event::child_mutex;
12651333 * perf_event_context::lock
12661334 * perf_event::mmap_mutex
1267
- * mmap_sem
1335
+ * mmap_lock
12681336 * perf_addr_filters_head::lock
12691337 *
12701338 * cpu_hotplug_lock
....@@ -1279,7 +1347,7 @@
12791347 again:
12801348 rcu_read_lock();
12811349 ctx = READ_ONCE(event->ctx);
1282
- if (!atomic_inc_not_zero(&ctx->refcount)) {
1350
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
12831351 rcu_read_unlock();
12841352 goto again;
12851353 }
....@@ -1371,7 +1439,7 @@
13711439 /*
13721440 * Get the perf_event_context for a task and lock it.
13731441 *
1374
- * This has to cope with with the fact that until it is locked,
1442
+ * This has to cope with the fact that until it is locked,
13751443 * the context could get moved to another task.
13761444 */
13771445 static struct perf_event_context *
....@@ -1412,7 +1480,7 @@
14121480 }
14131481
14141482 if (ctx->task == TASK_TOMBSTONE ||
1415
- !atomic_inc_not_zero(&ctx->refcount)) {
1483
+ !refcount_inc_not_zero(&ctx->refcount)) {
14161484 raw_spin_unlock(&ctx->lock);
14171485 ctx = NULL;
14181486 } else {
....@@ -1540,6 +1608,30 @@
15401608 if (left->cpu > right->cpu)
15411609 return false;
15421610
1611
+#ifdef CONFIG_CGROUP_PERF
1612
+ if (left->cgrp != right->cgrp) {
1613
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
1614
+ /*
1615
+ * Left has no cgroup but right does, no cgroups come
1616
+ * first.
1617
+ */
1618
+ return true;
1619
+ }
1620
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
1621
+ /*
1622
+ * Right has no cgroup but left does, no cgroups come
1623
+ * first.
1624
+ */
1625
+ return false;
1626
+ }
1627
+ /* Two dissimilar cgroups, order by id. */
1628
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1629
+ return true;
1630
+
1631
+ return false;
1632
+ }
1633
+#endif
1634
+
15431635 if (left->group_index < right->group_index)
15441636 return true;
15451637 if (left->group_index > right->group_index)
....@@ -1619,25 +1711,48 @@
16191711 }
16201712
16211713 /*
1622
- * Get the leftmost event in the @cpu subtree.
1714
+ * Get the leftmost event in the cpu/cgroup subtree.
16231715 */
16241716 static struct perf_event *
1625
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1717
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1718
+ struct cgroup *cgrp)
16261719 {
16271720 struct perf_event *node_event = NULL, *match = NULL;
16281721 struct rb_node *node = groups->tree.rb_node;
1722
+#ifdef CONFIG_CGROUP_PERF
1723
+ u64 node_cgrp_id, cgrp_id = 0;
1724
+
1725
+ if (cgrp)
1726
+ cgrp_id = cgrp->kn->id;
1727
+#endif
16291728
16301729 while (node) {
16311730 node_event = container_of(node, struct perf_event, group_node);
16321731
16331732 if (cpu < node_event->cpu) {
16341733 node = node->rb_left;
1635
- } else if (cpu > node_event->cpu) {
1636
- node = node->rb_right;
1637
- } else {
1638
- match = node_event;
1639
- node = node->rb_left;
1734
+ continue;
16401735 }
1736
+ if (cpu > node_event->cpu) {
1737
+ node = node->rb_right;
1738
+ continue;
1739
+ }
1740
+#ifdef CONFIG_CGROUP_PERF
1741
+ node_cgrp_id = 0;
1742
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
1743
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1744
+
1745
+ if (cgrp_id < node_cgrp_id) {
1746
+ node = node->rb_left;
1747
+ continue;
1748
+ }
1749
+ if (cgrp_id > node_cgrp_id) {
1750
+ node = node->rb_right;
1751
+ continue;
1752
+ }
1753
+#endif
1754
+ match = node_event;
1755
+ node = node->rb_left;
16411756 }
16421757
16431758 return match;
....@@ -1650,12 +1765,26 @@
16501765 perf_event_groups_next(struct perf_event *event)
16511766 {
16521767 struct perf_event *next;
1768
+#ifdef CONFIG_CGROUP_PERF
1769
+ u64 curr_cgrp_id = 0;
1770
+ u64 next_cgrp_id = 0;
1771
+#endif
16531772
16541773 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1655
- if (next && next->cpu == event->cpu)
1656
- return next;
1774
+ if (next == NULL || next->cpu != event->cpu)
1775
+ return NULL;
16571776
1658
- return NULL;
1777
+#ifdef CONFIG_CGROUP_PERF
1778
+ if (event->cgrp && event->cgrp->css.cgroup)
1779
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1780
+
1781
+ if (next->cgrp && next->cgrp->css.cgroup)
1782
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1783
+
1784
+ if (curr_cgrp_id != next_cgrp_id)
1785
+ return NULL;
1786
+#endif
1787
+ return next;
16591788 }
16601789
16611790 /*
....@@ -1691,12 +1820,13 @@
16911820 add_event_to_groups(event, ctx);
16921821 }
16931822
1694
- list_update_cgroup_event(event, ctx, true);
1695
-
16961823 list_add_rcu(&event->event_entry, &ctx->event_list);
16971824 ctx->nr_events++;
16981825 if (event->attr.inherit_stat)
16991826 ctx->nr_stat++;
1827
+
1828
+ if (event->state > PERF_EVENT_STATE_OFF)
1829
+ perf_cgroup_event_enable(event, ctx);
17001830
17011831 ctx->generation++;
17021832 }
....@@ -1762,6 +1892,9 @@
17621892
17631893 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
17641894 size += sizeof(data->phys_addr);
1895
+
1896
+ if (sample_type & PERF_SAMPLE_CGROUP)
1897
+ size += sizeof(data->cgroup);
17651898
17661899 event->header_size = size;
17671900 }
....@@ -1873,8 +2006,6 @@
18732006
18742007 event->attach_state &= ~PERF_ATTACH_CONTEXT;
18752008
1876
- list_update_cgroup_event(event, ctx, false);
1877
-
18782009 ctx->nr_events--;
18792010 if (event->attr.inherit_stat)
18802011 ctx->nr_stat--;
....@@ -1891,14 +2022,136 @@
18912022 * of error state is by explicit re-enabling
18922023 * of the event
18932024 */
1894
- if (event->state > PERF_EVENT_STATE_OFF)
2025
+ if (event->state > PERF_EVENT_STATE_OFF) {
2026
+ perf_cgroup_event_disable(event, ctx);
18952027 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2028
+ }
18962029
18972030 ctx->generation++;
18982031 }
18992032
2033
+static int
2034
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2035
+{
2036
+ if (!has_aux(aux_event))
2037
+ return 0;
2038
+
2039
+ if (!event->pmu->aux_output_match)
2040
+ return 0;
2041
+
2042
+ return event->pmu->aux_output_match(aux_event);
2043
+}
2044
+
2045
+static void put_event(struct perf_event *event);
2046
+static void event_sched_out(struct perf_event *event,
2047
+ struct perf_cpu_context *cpuctx,
2048
+ struct perf_event_context *ctx);
2049
+
2050
+static void perf_put_aux_event(struct perf_event *event)
2051
+{
2052
+ struct perf_event_context *ctx = event->ctx;
2053
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2054
+ struct perf_event *iter;
2055
+
2056
+ /*
2057
+ * If event uses aux_event tear down the link
2058
+ */
2059
+ if (event->aux_event) {
2060
+ iter = event->aux_event;
2061
+ event->aux_event = NULL;
2062
+ put_event(iter);
2063
+ return;
2064
+ }
2065
+
2066
+ /*
2067
+ * If the event is an aux_event, tear down all links to
2068
+ * it from other events.
2069
+ */
2070
+ for_each_sibling_event(iter, event->group_leader) {
2071
+ if (iter->aux_event != event)
2072
+ continue;
2073
+
2074
+ iter->aux_event = NULL;
2075
+ put_event(event);
2076
+
2077
+ /*
2078
+ * If it's ACTIVE, schedule it out and put it into ERROR
2079
+ * state so that we don't try to schedule it again. Note
2080
+ * that perf_event_enable() will clear the ERROR status.
2081
+ */
2082
+ event_sched_out(iter, cpuctx, ctx);
2083
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2084
+ }
2085
+}
2086
+
2087
+static bool perf_need_aux_event(struct perf_event *event)
2088
+{
2089
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2090
+}
2091
+
2092
+static int perf_get_aux_event(struct perf_event *event,
2093
+ struct perf_event *group_leader)
2094
+{
2095
+ /*
2096
+ * Our group leader must be an aux event if we want to be
2097
+ * an aux_output. This way, the aux event will precede its
2098
+ * aux_output events in the group, and therefore will always
2099
+ * schedule first.
2100
+ */
2101
+ if (!group_leader)
2102
+ return 0;
2103
+
2104
+ /*
2105
+ * aux_output and aux_sample_size are mutually exclusive.
2106
+ */
2107
+ if (event->attr.aux_output && event->attr.aux_sample_size)
2108
+ return 0;
2109
+
2110
+ if (event->attr.aux_output &&
2111
+ !perf_aux_output_match(event, group_leader))
2112
+ return 0;
2113
+
2114
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2115
+ return 0;
2116
+
2117
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
2118
+ return 0;
2119
+
2120
+ /*
2121
+ * Link aux_outputs to their aux event; this is undone in
2122
+ * perf_group_detach() by perf_put_aux_event(). When the
2123
+ * group in torn down, the aux_output events loose their
2124
+ * link to the aux_event and can't schedule any more.
2125
+ */
2126
+ event->aux_event = group_leader;
2127
+
2128
+ return 1;
2129
+}
2130
+
2131
+static inline struct list_head *get_event_list(struct perf_event *event)
2132
+{
2133
+ struct perf_event_context *ctx = event->ctx;
2134
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2135
+}
2136
+
2137
+/*
2138
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2139
+ * cannot exist on their own, schedule them out and move them into the ERROR
2140
+ * state. Also see _perf_event_enable(), it will not be able to recover
2141
+ * this ERROR state.
2142
+ */
2143
+static inline void perf_remove_sibling_event(struct perf_event *event)
2144
+{
2145
+ struct perf_event_context *ctx = event->ctx;
2146
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2147
+
2148
+ event_sched_out(event, cpuctx, ctx);
2149
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2150
+}
2151
+
19002152 static void perf_group_detach(struct perf_event *event)
19012153 {
2154
+ struct perf_event *leader = event->group_leader;
19022155 struct perf_event *sibling, *tmp;
19032156 struct perf_event_context *ctx = event->ctx;
19042157
....@@ -1912,10 +2165,12 @@
19122165
19132166 event->attach_state &= ~PERF_ATTACH_GROUP;
19142167
2168
+ perf_put_aux_event(event);
2169
+
19152170 /*
19162171 * If this is a sibling, remove it from its group.
19172172 */
1918
- if (event->group_leader != event) {
2173
+ if (leader != event) {
19192174 list_del_init(&event->sibling_list);
19202175 event->group_leader->nr_siblings--;
19212176 goto out;
....@@ -1928,6 +2183,9 @@
19282183 */
19292184 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
19302185
2186
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2187
+ perf_remove_sibling_event(sibling);
2188
+
19312189 sibling->group_leader = sibling;
19322190 list_del_init(&sibling->sibling_list);
19332191
....@@ -1937,22 +2195,18 @@
19372195 if (!RB_EMPTY_NODE(&event->group_node)) {
19382196 add_event_to_groups(sibling, event->ctx);
19392197
1940
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1941
- struct list_head *list = sibling->attr.pinned ?
1942
- &ctx->pinned_active : &ctx->flexible_active;
1943
-
1944
- list_add_tail(&sibling->active_list, list);
1945
- }
2198
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2199
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
19462200 }
19472201
19482202 WARN_ON_ONCE(sibling->ctx != event->ctx);
19492203 }
19502204
19512205 out:
1952
- perf_event__header_size(event->group_leader);
1953
-
1954
- for_each_sibling_event(tmp, event->group_leader)
2206
+ for_each_sibling_event(tmp, leader)
19552207 perf_event__header_size(tmp);
2208
+
2209
+ perf_event__header_size(leader);
19562210 }
19572211
19582212 static bool is_orphaned_event(struct perf_event *event)
....@@ -2021,6 +2275,7 @@
20212275
20222276 if (READ_ONCE(event->pending_disable) >= 0) {
20232277 WRITE_ONCE(event->pending_disable, -1);
2278
+ perf_cgroup_event_disable(event, ctx);
20242279 state = PERF_EVENT_STATE_OFF;
20252280 }
20262281 perf_event_set_state(event, state);
....@@ -2058,9 +2313,6 @@
20582313 event_sched_out(event, cpuctx, ctx);
20592314
20602315 perf_pmu_enable(ctx->pmu);
2061
-
2062
- if (group_event->attr.exclusive)
2063
- cpuctx->exclusive = 0;
20642316 }
20652317
20662318 #define DETACH_GROUP 0x01UL
....@@ -2091,6 +2343,7 @@
20912343
20922344 if (!ctx->nr_events && ctx->is_active) {
20932345 ctx->is_active = 0;
2346
+ ctx->rotate_necessary = 0;
20942347 if (ctx->task) {
20952348 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
20962349 cpuctx->task_ctx = NULL;
....@@ -2157,6 +2410,7 @@
21572410 event_sched_out(event, cpuctx, ctx);
21582411
21592412 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2413
+ perf_cgroup_event_disable(event, ctx);
21602414 }
21612415
21622416 /*
....@@ -2164,7 +2418,7 @@
21642418 *
21652419 * If event->ctx is a cloned context, callers must make sure that
21662420 * every task struct that event->ctx->task could possibly point to
2167
- * remains valid. This condition is satisifed when called through
2421
+ * remains valid. This condition is satisfied when called through
21682422 * perf_event_for_each_child or perf_event_for_each because they
21692423 * hold the top-level event's child_mutex, so any descendant that
21702424 * goes to exit will block in perf_event_exit_event().
....@@ -2238,7 +2492,7 @@
22382492 * But this is a bit hairy.
22392493 *
22402494 * So instead, we have an explicit cgroup call to remain
2241
- * within the time time source all along. We believe it
2495
+ * within the time source all along. We believe it
22422496 * is cleaner and simpler to understand.
22432497 */
22442498 if (is_cgroup_event(event))
....@@ -2258,6 +2512,8 @@
22582512 struct perf_event_context *ctx)
22592513 {
22602514 int ret = 0;
2515
+
2516
+ WARN_ON_ONCE(event->ctx != ctx);
22612517
22622518 lockdep_assert_held(&ctx->lock);
22632519
....@@ -2325,11 +2581,8 @@
23252581
23262582 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
23272583
2328
- if (event_sched_in(group_event, cpuctx, ctx)) {
2329
- pmu->cancel_txn(pmu);
2330
- perf_mux_hrtimer_restart(cpuctx);
2331
- return -EAGAIN;
2332
- }
2584
+ if (event_sched_in(group_event, cpuctx, ctx))
2585
+ goto error;
23332586
23342587 /*
23352588 * Schedule in siblings as one group (if any):
....@@ -2358,10 +2611,8 @@
23582611 }
23592612 event_sched_out(group_event, cpuctx, ctx);
23602613
2614
+error:
23612615 pmu->cancel_txn(pmu);
2362
-
2363
- perf_mux_hrtimer_restart(cpuctx);
2364
-
23652616 return -EAGAIN;
23662617 }
23672618
....@@ -2387,7 +2638,7 @@
23872638 * If this group is exclusive and there are already
23882639 * events on the CPU, it can't go on.
23892640 */
2390
- if (event->attr.exclusive && cpuctx->active_oncpu)
2641
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
23912642 return 0;
23922643 /*
23932644 * Otherwise, try to add it if all previous groups were able
....@@ -2488,6 +2739,16 @@
24882739 perf_pmu_enable(cpuctx->ctx.pmu);
24892740 }
24902741
2742
+void perf_pmu_resched(struct pmu *pmu)
2743
+{
2744
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2745
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
2746
+
2747
+ perf_ctx_lock(cpuctx, task_ctx);
2748
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2749
+ perf_ctx_unlock(cpuctx, task_ctx);
2750
+}
2751
+
24912752 /*
24922753 * Cross CPU call to install and enable a performance event
24932754 *
....@@ -2528,7 +2789,7 @@
25282789 }
25292790
25302791 #ifdef CONFIG_CGROUP_PERF
2531
- if (is_cgroup_event(event)) {
2792
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
25322793 /*
25332794 * If the current cgroup doesn't match the event's
25342795 * cgroup, we should not try to schedule it.
....@@ -2580,6 +2841,25 @@
25802841 * will be 'complete'. See perf_iterate_sb_cpu().
25812842 */
25822843 smp_store_release(&event->ctx, ctx);
2844
+
2845
+ /*
2846
+ * perf_event_attr::disabled events will not run and can be initialized
2847
+ * without IPI. Except when this is the first event for the context, in
2848
+ * that case we need the magic of the IPI to set ctx->is_active.
2849
+ *
2850
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
2851
+ * event will issue the IPI and reprogram the hardware.
2852
+ */
2853
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2854
+ raw_spin_lock_irq(&ctx->lock);
2855
+ if (ctx->task == TASK_TOMBSTONE) {
2856
+ raw_spin_unlock_irq(&ctx->lock);
2857
+ return;
2858
+ }
2859
+ add_event_to_ctx(event, ctx);
2860
+ raw_spin_unlock_irq(&ctx->lock);
2861
+ return;
2862
+ }
25832863
25842864 if (!task) {
25852865 cpu_function_call(cpu, __perf_install_in_context, event);
....@@ -2669,6 +2949,7 @@
26692949 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
26702950
26712951 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2952
+ perf_cgroup_event_enable(event, ctx);
26722953
26732954 if (!ctx->is_active)
26742955 return;
....@@ -2710,6 +2991,7 @@
27102991 raw_spin_lock_irq(&ctx->lock);
27112992 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
27122993 event->state < PERF_EVENT_STATE_ERROR) {
2994
+out:
27132995 raw_spin_unlock_irq(&ctx->lock);
27142996 return;
27152997 }
....@@ -2721,8 +3003,16 @@
27213003 * has gone back into error state, as distinct from the task having
27223004 * been scheduled away before the cross-call arrived.
27233005 */
2724
- if (event->state == PERF_EVENT_STATE_ERROR)
3006
+ if (event->state == PERF_EVENT_STATE_ERROR) {
3007
+ /*
3008
+ * Detached SIBLING events cannot leave ERROR state.
3009
+ */
3010
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
3011
+ event->group_leader == event)
3012
+ goto out;
3013
+
27253014 event->state = PERF_EVENT_STATE_OFF;
3015
+ }
27263016 raw_spin_unlock_irq(&ctx->lock);
27273017
27283018 event_function_call(event, __perf_event_enable, NULL);
....@@ -2826,7 +3116,7 @@
28263116 * pre-existing mappings, called once when new filters arrive via SET_FILTER
28273117 * ioctl;
28283118 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2829
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
3119
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
28303120 * for reading;
28313121 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
28323122 * of exec.
....@@ -2966,6 +3256,13 @@
29663256 if (is_active & EVENT_FLEXIBLE) {
29673257 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
29683258 group_sched_out(event, cpuctx, ctx);
3259
+
3260
+ /*
3261
+ * Since we cleared EVENT_FLEXIBLE, also clear
3262
+ * rotate_necessary, is will be reset by
3263
+ * ctx_flexible_sched_in() when needed.
3264
+ */
3265
+ ctx->rotate_necessary = 0;
29693266 }
29703267 perf_pmu_enable(ctx->pmu);
29713268 }
....@@ -3080,10 +3377,12 @@
30803377 struct perf_event_context *parent, *next_parent;
30813378 struct perf_cpu_context *cpuctx;
30823379 int do_switch = 1;
3380
+ struct pmu *pmu;
30833381
30843382 if (likely(!ctx))
30853383 return;
30863384
3385
+ pmu = ctx->pmu;
30873386 cpuctx = __get_cpu_context(ctx);
30883387 if (!cpuctx->task_ctx)
30893388 return;
....@@ -3113,10 +3412,27 @@
31133412 raw_spin_lock(&ctx->lock);
31143413 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
31153414 if (context_equiv(ctx, next_ctx)) {
3415
+
31163416 WRITE_ONCE(ctx->task, next);
31173417 WRITE_ONCE(next_ctx->task, task);
31183418
3119
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3419
+ perf_pmu_disable(pmu);
3420
+
3421
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3422
+ pmu->sched_task(ctx, false);
3423
+
3424
+ /*
3425
+ * PMU specific parts of task perf context can require
3426
+ * additional synchronization. As an example of such
3427
+ * synchronization see implementation details of Intel
3428
+ * LBR call stack data profiling;
3429
+ */
3430
+ if (pmu->swap_task_ctx)
3431
+ pmu->swap_task_ctx(ctx, next_ctx);
3432
+ else
3433
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3434
+
3435
+ perf_pmu_enable(pmu);
31203436
31213437 /*
31223438 * RCU_INIT_POINTER here is safe because we've not
....@@ -3140,7 +3456,13 @@
31403456
31413457 if (do_switch) {
31423458 raw_spin_lock(&ctx->lock);
3459
+ perf_pmu_disable(pmu);
3460
+
3461
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3462
+ pmu->sched_task(ctx, false);
31433463 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3464
+
3465
+ perf_pmu_enable(pmu);
31443466 raw_spin_unlock(&ctx->lock);
31453467 }
31463468 }
....@@ -3176,29 +3498,39 @@
31763498 * PEBS requires this to provide PID/TID information. This requires we flush
31773499 * all queued PEBS records before we context switch to a new task.
31783500 */
3501
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3502
+{
3503
+ struct pmu *pmu;
3504
+
3505
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3506
+
3507
+ if (WARN_ON_ONCE(!pmu->sched_task))
3508
+ return;
3509
+
3510
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3511
+ perf_pmu_disable(pmu);
3512
+
3513
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
3514
+
3515
+ perf_pmu_enable(pmu);
3516
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3517
+}
3518
+
31793519 static void perf_pmu_sched_task(struct task_struct *prev,
31803520 struct task_struct *next,
31813521 bool sched_in)
31823522 {
31833523 struct perf_cpu_context *cpuctx;
3184
- struct pmu *pmu;
31853524
31863525 if (prev == next)
31873526 return;
31883527
31893528 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3190
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3191
-
3192
- if (WARN_ON_ONCE(!pmu->sched_task))
3529
+ /* will be handled in perf_event_context_sched_in/out */
3530
+ if (cpuctx->task_ctx)
31933531 continue;
31943532
3195
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3196
- perf_pmu_disable(pmu);
3197
-
3198
- pmu->sched_task(cpuctx->task_ctx, sched_in);
3199
-
3200
- perf_pmu_enable(pmu);
3201
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3533
+ __perf_pmu_sched_task(cpuctx, sched_in);
32023534 }
32033535 }
32043536
....@@ -3251,83 +3583,149 @@
32513583 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
32523584 }
32533585
3254
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3255
- int (*func)(struct perf_event *, void *), void *data)
3586
+static bool perf_less_group_idx(const void *l, const void *r)
32563587 {
3257
- struct perf_event **evt, *evt1, *evt2;
3588
+ const struct perf_event *le = *(const struct perf_event **)l;
3589
+ const struct perf_event *re = *(const struct perf_event **)r;
3590
+
3591
+ return le->group_index < re->group_index;
3592
+}
3593
+
3594
+static void swap_ptr(void *l, void *r)
3595
+{
3596
+ void **lp = l, **rp = r;
3597
+
3598
+ swap(*lp, *rp);
3599
+}
3600
+
3601
+static const struct min_heap_callbacks perf_min_heap = {
3602
+ .elem_size = sizeof(struct perf_event *),
3603
+ .less = perf_less_group_idx,
3604
+ .swp = swap_ptr,
3605
+};
3606
+
3607
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
3608
+{
3609
+ struct perf_event **itrs = heap->data;
3610
+
3611
+ if (event) {
3612
+ itrs[heap->nr] = event;
3613
+ heap->nr++;
3614
+ }
3615
+}
3616
+
3617
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3618
+ struct perf_event_groups *groups, int cpu,
3619
+ int (*func)(struct perf_event *, void *),
3620
+ void *data)
3621
+{
3622
+#ifdef CONFIG_CGROUP_PERF
3623
+ struct cgroup_subsys_state *css = NULL;
3624
+#endif
3625
+ /* Space for per CPU and/or any CPU event iterators. */
3626
+ struct perf_event *itrs[2];
3627
+ struct min_heap event_heap;
3628
+ struct perf_event **evt;
32583629 int ret;
32593630
3260
- evt1 = perf_event_groups_first(groups, -1);
3261
- evt2 = perf_event_groups_first(groups, cpu);
3631
+ if (cpuctx) {
3632
+ event_heap = (struct min_heap){
3633
+ .data = cpuctx->heap,
3634
+ .nr = 0,
3635
+ .size = cpuctx->heap_size,
3636
+ };
32623637
3263
- while (evt1 || evt2) {
3264
- if (evt1 && evt2) {
3265
- if (evt1->group_index < evt2->group_index)
3266
- evt = &evt1;
3267
- else
3268
- evt = &evt2;
3269
- } else if (evt1) {
3270
- evt = &evt1;
3271
- } else {
3272
- evt = &evt2;
3273
- }
3638
+ lockdep_assert_held(&cpuctx->ctx.lock);
32743639
3640
+#ifdef CONFIG_CGROUP_PERF
3641
+ if (cpuctx->cgrp)
3642
+ css = &cpuctx->cgrp->css;
3643
+#endif
3644
+ } else {
3645
+ event_heap = (struct min_heap){
3646
+ .data = itrs,
3647
+ .nr = 0,
3648
+ .size = ARRAY_SIZE(itrs),
3649
+ };
3650
+ /* Events not within a CPU context may be on any CPU. */
3651
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3652
+ }
3653
+ evt = event_heap.data;
3654
+
3655
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3656
+
3657
+#ifdef CONFIG_CGROUP_PERF
3658
+ for (; css; css = css->parent)
3659
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3660
+#endif
3661
+
3662
+ min_heapify_all(&event_heap, &perf_min_heap);
3663
+
3664
+ while (event_heap.nr) {
32753665 ret = func(*evt, data);
32763666 if (ret)
32773667 return ret;
32783668
32793669 *evt = perf_event_groups_next(*evt);
3280
- }
3281
-
3282
- return 0;
3283
-}
3284
-
3285
-struct sched_in_data {
3286
- struct perf_event_context *ctx;
3287
- struct perf_cpu_context *cpuctx;
3288
- int can_add_hw;
3289
-};
3290
-
3291
-static int pinned_sched_in(struct perf_event *event, void *data)
3292
-{
3293
- struct sched_in_data *sid = data;
3294
-
3295
- if (event->state <= PERF_EVENT_STATE_OFF)
3296
- return 0;
3297
-
3298
- if (!event_filter_match(event))
3299
- return 0;
3300
-
3301
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3302
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3303
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3304
- }
3305
-
3306
- /*
3307
- * If this pinned group hasn't been scheduled,
3308
- * put it in error state.
3309
- */
3310
- if (event->state == PERF_EVENT_STATE_INACTIVE)
3311
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3312
-
3313
- return 0;
3314
-}
3315
-
3316
-static int flexible_sched_in(struct perf_event *event, void *data)
3317
-{
3318
- struct sched_in_data *sid = data;
3319
-
3320
- if (event->state <= PERF_EVENT_STATE_OFF)
3321
- return 0;
3322
-
3323
- if (!event_filter_match(event))
3324
- return 0;
3325
-
3326
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3327
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3328
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3670
+ if (*evt)
3671
+ min_heapify(&event_heap, 0, &perf_min_heap);
33293672 else
3330
- sid->can_add_hw = 0;
3673
+ min_heap_pop(&event_heap, &perf_min_heap);
3674
+ }
3675
+
3676
+ return 0;
3677
+}
3678
+
3679
+static inline bool event_update_userpage(struct perf_event *event)
3680
+{
3681
+ if (likely(!atomic_read(&event->mmap_count)))
3682
+ return false;
3683
+
3684
+ perf_event_update_time(event);
3685
+ perf_set_shadow_time(event, event->ctx);
3686
+ perf_event_update_userpage(event);
3687
+
3688
+ return true;
3689
+}
3690
+
3691
+static inline void group_update_userpage(struct perf_event *group_event)
3692
+{
3693
+ struct perf_event *event;
3694
+
3695
+ if (!event_update_userpage(group_event))
3696
+ return;
3697
+
3698
+ for_each_sibling_event(event, group_event)
3699
+ event_update_userpage(event);
3700
+}
3701
+
3702
+static int merge_sched_in(struct perf_event *event, void *data)
3703
+{
3704
+ struct perf_event_context *ctx = event->ctx;
3705
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3706
+ int *can_add_hw = data;
3707
+
3708
+ if (event->state <= PERF_EVENT_STATE_OFF)
3709
+ return 0;
3710
+
3711
+ if (!event_filter_match(event))
3712
+ return 0;
3713
+
3714
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3715
+ if (!group_sched_in(event, cpuctx, ctx))
3716
+ list_add_tail(&event->active_list, get_event_list(event));
3717
+ }
3718
+
3719
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
3720
+ *can_add_hw = 0;
3721
+ if (event->attr.pinned) {
3722
+ perf_cgroup_event_disable(event, ctx);
3723
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3724
+ } else {
3725
+ ctx->rotate_necessary = 1;
3726
+ perf_mux_hrtimer_restart(cpuctx);
3727
+ group_update_userpage(event);
3728
+ }
33313729 }
33323730
33333731 return 0;
....@@ -3337,30 +3735,28 @@
33373735 ctx_pinned_sched_in(struct perf_event_context *ctx,
33383736 struct perf_cpu_context *cpuctx)
33393737 {
3340
- struct sched_in_data sid = {
3341
- .ctx = ctx,
3342
- .cpuctx = cpuctx,
3343
- .can_add_hw = 1,
3344
- };
3738
+ int can_add_hw = 1;
33453739
3346
- visit_groups_merge(&ctx->pinned_groups,
3740
+ if (ctx != &cpuctx->ctx)
3741
+ cpuctx = NULL;
3742
+
3743
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
33473744 smp_processor_id(),
3348
- pinned_sched_in, &sid);
3745
+ merge_sched_in, &can_add_hw);
33493746 }
33503747
33513748 static void
33523749 ctx_flexible_sched_in(struct perf_event_context *ctx,
33533750 struct perf_cpu_context *cpuctx)
33543751 {
3355
- struct sched_in_data sid = {
3356
- .ctx = ctx,
3357
- .cpuctx = cpuctx,
3358
- .can_add_hw = 1,
3359
- };
3752
+ int can_add_hw = 1;
33603753
3361
- visit_groups_merge(&ctx->flexible_groups,
3754
+ if (ctx != &cpuctx->ctx)
3755
+ cpuctx = NULL;
3756
+
3757
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
33623758 smp_processor_id(),
3363
- flexible_sched_in, &sid);
3759
+ merge_sched_in, &can_add_hw);
33643760 }
33653761
33663762 static void
....@@ -3419,10 +3815,14 @@
34193815 struct task_struct *task)
34203816 {
34213817 struct perf_cpu_context *cpuctx;
3818
+ struct pmu *pmu = ctx->pmu;
34223819
34233820 cpuctx = __get_cpu_context(ctx);
3424
- if (cpuctx->task_ctx == ctx)
3821
+ if (cpuctx->task_ctx == ctx) {
3822
+ if (cpuctx->sched_cb_usage)
3823
+ __perf_pmu_sched_task(cpuctx, true);
34253824 return;
3825
+ }
34263826
34273827 perf_ctx_lock(cpuctx, ctx);
34283828 /*
....@@ -3432,7 +3832,7 @@
34323832 if (!ctx->nr_events)
34333833 goto unlock;
34343834
3435
- perf_pmu_disable(ctx->pmu);
3835
+ perf_pmu_disable(pmu);
34363836 /*
34373837 * We want to keep the following priority order:
34383838 * cpu pinned (that don't need to move), task pinned,
....@@ -3444,7 +3844,11 @@
34443844 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
34453845 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
34463846 perf_event_sched_in(cpuctx, ctx, task);
3447
- perf_pmu_enable(ctx->pmu);
3847
+
3848
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3849
+ pmu->sched_task(cpuctx->task_ctx, true);
3850
+
3851
+ perf_pmu_enable(pmu);
34483852
34493853 unlock:
34503854 perf_ctx_unlock(cpuctx, ctx);
....@@ -3685,34 +4089,45 @@
36854089 perf_event_groups_insert(&ctx->flexible_groups, event);
36864090 }
36874091
4092
+/* pick an event from the flexible_groups to rotate */
36884093 static inline struct perf_event *
3689
-ctx_first_active(struct perf_event_context *ctx)
4094
+ctx_event_to_rotate(struct perf_event_context *ctx)
36904095 {
3691
- return list_first_entry_or_null(&ctx->flexible_active,
3692
- struct perf_event, active_list);
4096
+ struct perf_event *event;
4097
+
4098
+ /* pick the first active flexible event */
4099
+ event = list_first_entry_or_null(&ctx->flexible_active,
4100
+ struct perf_event, active_list);
4101
+
4102
+ /* if no active flexible event, pick the first event */
4103
+ if (!event) {
4104
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4105
+ typeof(*event), group_node);
4106
+ }
4107
+
4108
+ /*
4109
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4110
+ * finds there are unschedulable events, it will set it again.
4111
+ */
4112
+ ctx->rotate_necessary = 0;
4113
+
4114
+ return event;
36934115 }
36944116
36954117 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
36964118 {
36974119 struct perf_event *cpu_event = NULL, *task_event = NULL;
3698
- bool cpu_rotate = false, task_rotate = false;
3699
- struct perf_event_context *ctx = NULL;
4120
+ struct perf_event_context *task_ctx = NULL;
4121
+ int cpu_rotate, task_rotate;
37004122
37014123 /*
37024124 * Since we run this from IRQ context, nobody can install new
37034125 * events, thus the event count values are stable.
37044126 */
37054127
3706
- if (cpuctx->ctx.nr_events) {
3707
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3708
- cpu_rotate = true;
3709
- }
3710
-
3711
- ctx = cpuctx->task_ctx;
3712
- if (ctx && ctx->nr_events) {
3713
- if (ctx->nr_events != ctx->nr_active)
3714
- task_rotate = true;
3715
- }
4128
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
4129
+ task_ctx = cpuctx->task_ctx;
4130
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
37164131
37174132 if (!(cpu_rotate || task_rotate))
37184133 return false;
....@@ -3721,25 +4136,25 @@
37214136 perf_pmu_disable(cpuctx->ctx.pmu);
37224137
37234138 if (task_rotate)
3724
- task_event = ctx_first_active(ctx);
4139
+ task_event = ctx_event_to_rotate(task_ctx);
37254140 if (cpu_rotate)
3726
- cpu_event = ctx_first_active(&cpuctx->ctx);
4141
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
37274142
37284143 /*
37294144 * As per the order given at ctx_resched() first 'pop' task flexible
37304145 * and then, if needed CPU flexible.
37314146 */
3732
- if (task_event || (ctx && cpu_event))
3733
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
4147
+ if (task_event || (task_ctx && cpu_event))
4148
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
37344149 if (cpu_event)
37354150 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
37364151
37374152 if (task_event)
3738
- rotate_ctx(ctx, task_event);
4153
+ rotate_ctx(task_ctx, task_event);
37394154 if (cpu_event)
37404155 rotate_ctx(&cpuctx->ctx, cpu_event);
37414156
3742
- perf_event_sched_in(cpuctx, ctx, current);
4157
+ perf_event_sched_in(cpuctx, task_ctx, current);
37434158
37444159 perf_pmu_enable(cpuctx->ctx.pmu);
37454160 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
....@@ -3983,6 +4398,7 @@
39834398
39844399 return ret;
39854400 }
4401
+EXPORT_SYMBOL_GPL(perf_event_read_local);
39864402
39874403 static int perf_event_read(struct perf_event *event, bool group)
39884404 {
....@@ -4074,7 +4490,7 @@
40744490 INIT_LIST_HEAD(&ctx->event_list);
40754491 INIT_LIST_HEAD(&ctx->pinned_active);
40764492 INIT_LIST_HEAD(&ctx->flexible_active);
4077
- atomic_set(&ctx->refcount, 1);
4493
+ refcount_set(&ctx->refcount, 1);
40784494 }
40794495
40804496 static struct perf_event_context *
....@@ -4087,10 +4503,8 @@
40874503 return NULL;
40884504
40894505 __perf_event_init_context(ctx);
4090
- if (task) {
4091
- ctx->task = task;
4092
- get_task_struct(task);
4093
- }
4506
+ if (task)
4507
+ ctx->task = get_task_struct(task);
40944508 ctx->pmu = pmu;
40954509
40964510 return ctx;
....@@ -4152,7 +4566,7 @@
41524566 goto errout;
41534567
41544568 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4155
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4569
+ task_ctx_data = alloc_task_ctx_data(pmu);
41564570 if (!task_ctx_data) {
41574571 err = -ENOMEM;
41584572 goto errout;
....@@ -4210,11 +4624,11 @@
42104624 }
42114625 }
42124626
4213
- kfree(task_ctx_data);
4627
+ free_task_ctx_data(pmu, task_ctx_data);
42144628 return ctx;
42154629
42164630 errout:
4217
- kfree(task_ctx_data);
4631
+ free_task_ctx_data(pmu, task_ctx_data);
42184632 return ERR_PTR(err);
42194633 }
42204634
....@@ -4233,7 +4647,7 @@
42334647 }
42344648
42354649 static void ring_buffer_attach(struct perf_event *event,
4236
- struct ring_buffer *rb);
4650
+ struct perf_buffer *rb);
42374651
42384652 static void detach_sb_event(struct perf_event *event)
42394653 {
....@@ -4256,8 +4670,9 @@
42564670
42574671 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
42584672 attr->comm || attr->comm_exec ||
4259
- attr->task ||
4260
- attr->context_switch)
4673
+ attr->task || attr->ksymbol ||
4674
+ attr->context_switch || attr->text_poke ||
4675
+ attr->bpf_event)
42614676 return true;
42624677 return false;
42634678 }
....@@ -4306,7 +4721,7 @@
43064721 if (event->parent)
43074722 return;
43084723
4309
- if (event->attach_state & PERF_ATTACH_TASK)
4724
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
43104725 dec = true;
43114726 if (event->attr.mmap || event->attr.mmap_data)
43124727 atomic_dec(&nr_mmap_events);
....@@ -4314,6 +4729,8 @@
43144729 atomic_dec(&nr_comm_events);
43154730 if (event->attr.namespaces)
43164731 atomic_dec(&nr_namespaces_events);
4732
+ if (event->attr.cgroup)
4733
+ atomic_dec(&nr_cgroup_events);
43174734 if (event->attr.task)
43184735 atomic_dec(&nr_task_events);
43194736 if (event->attr.freq)
....@@ -4326,6 +4743,12 @@
43264743 dec = true;
43274744 if (has_branch_stack(event))
43284745 dec = true;
4746
+ if (event->attr.ksymbol)
4747
+ atomic_dec(&nr_ksymbol_events);
4748
+ if (event->attr.bpf_event)
4749
+ atomic_dec(&nr_bpf_events);
4750
+ if (event->attr.text_poke)
4751
+ atomic_dec(&nr_text_poke_events);
43294752
43304753 if (dec) {
43314754 if (!atomic_add_unless(&perf_sched_count, -1, 1))
....@@ -4909,7 +5332,7 @@
49095332 static __poll_t perf_poll(struct file *file, poll_table *wait)
49105333 {
49115334 struct perf_event *event = file->private_data;
4912
- struct ring_buffer *rb;
5335
+ struct perf_buffer *rb;
49135336 __poll_t events = EPOLLHUP;
49145337
49155338 poll_wait(file, &event->waitq, wait);
....@@ -4935,6 +5358,24 @@
49355358 local64_set(&event->count, 0);
49365359 perf_event_update_userpage(event);
49375360 }
5361
+
5362
+/* Assume it's not an event with inherit set. */
5363
+u64 perf_event_pause(struct perf_event *event, bool reset)
5364
+{
5365
+ struct perf_event_context *ctx;
5366
+ u64 count;
5367
+
5368
+ ctx = perf_event_ctx_lock(event);
5369
+ WARN_ON_ONCE(event->attr.inherit);
5370
+ _perf_event_disable(event);
5371
+ count = local64_read(&event->count);
5372
+ if (reset)
5373
+ local64_set(&event->count, 0);
5374
+ perf_event_ctx_unlock(event, ctx);
5375
+
5376
+ return count;
5377
+}
5378
+EXPORT_SYMBOL_GPL(perf_event_pause);
49385379
49395380 /*
49405381 * Holding the top-level event's child_mutex means that any
....@@ -5013,15 +5454,10 @@
50135454 return event->pmu->check_period(event, value);
50145455 }
50155456
5016
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
5457
+static int _perf_event_period(struct perf_event *event, u64 value)
50175458 {
5018
- u64 value;
5019
-
50205459 if (!is_sampling_event(event))
50215460 return -EINVAL;
5022
-
5023
- if (copy_from_user(&value, arg, sizeof(value)))
5024
- return -EFAULT;
50255461
50265462 if (!value)
50275463 return -EINVAL;
....@@ -5039,6 +5475,19 @@
50395475
50405476 return 0;
50415477 }
5478
+
5479
+int perf_event_period(struct perf_event *event, u64 value)
5480
+{
5481
+ struct perf_event_context *ctx;
5482
+ int ret;
5483
+
5484
+ ctx = perf_event_ctx_lock(event);
5485
+ ret = _perf_event_period(event, value);
5486
+ perf_event_ctx_unlock(event, ctx);
5487
+
5488
+ return ret;
5489
+}
5490
+EXPORT_SYMBOL_GPL(perf_event_period);
50425491
50435492 static const struct file_operations perf_fops;
50445493
....@@ -5083,8 +5532,14 @@
50835532 return _perf_event_refresh(event, arg);
50845533
50855534 case PERF_EVENT_IOC_PERIOD:
5086
- return perf_event_period(event, (u64 __user *)arg);
5535
+ {
5536
+ u64 value;
50875537
5538
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5539
+ return -EFAULT;
5540
+
5541
+ return _perf_event_period(event, value);
5542
+ }
50885543 case PERF_EVENT_IOC_ID:
50895544 {
50905545 u64 id = primary_event_id(event);
....@@ -5119,7 +5574,7 @@
51195574 return perf_event_set_bpf_prog(event, arg);
51205575
51215576 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5122
- struct ring_buffer *rb;
5577
+ struct perf_buffer *rb;
51235578
51245579 rcu_read_lock();
51255580 rb = rcu_dereference(event->rb);
....@@ -5255,7 +5710,7 @@
52555710 static void perf_event_init_userpage(struct perf_event *event)
52565711 {
52575712 struct perf_event_mmap_page *userpg;
5258
- struct ring_buffer *rb;
5713
+ struct perf_buffer *rb;
52595714
52605715 rcu_read_lock();
52615716 rb = rcu_dereference(event->rb);
....@@ -5287,7 +5742,7 @@
52875742 void perf_event_update_userpage(struct perf_event *event)
52885743 {
52895744 struct perf_event_mmap_page *userpg;
5290
- struct ring_buffer *rb;
5745
+ struct perf_buffer *rb;
52915746 u64 enabled, running, now;
52925747
52935748 rcu_read_lock();
....@@ -5338,7 +5793,7 @@
53385793 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
53395794 {
53405795 struct perf_event *event = vmf->vma->vm_file->private_data;
5341
- struct ring_buffer *rb;
5796
+ struct perf_buffer *rb;
53425797 vm_fault_t ret = VM_FAULT_SIGBUS;
53435798
53445799 if (vmf->flags & FAULT_FLAG_MKWRITE) {
....@@ -5371,10 +5826,12 @@
53715826 }
53725827
53735828 static void ring_buffer_attach(struct perf_event *event,
5374
- struct ring_buffer *rb)
5829
+ struct perf_buffer *rb)
53755830 {
5376
- struct ring_buffer *old_rb = NULL;
5831
+ struct perf_buffer *old_rb = NULL;
53775832 unsigned long flags;
5833
+
5834
+ WARN_ON_ONCE(event->parent);
53785835
53795836 if (event->rb) {
53805837 /*
....@@ -5431,7 +5888,10 @@
54315888
54325889 static void ring_buffer_wakeup(struct perf_event *event)
54335890 {
5434
- struct ring_buffer *rb;
5891
+ struct perf_buffer *rb;
5892
+
5893
+ if (event->parent)
5894
+ event = event->parent;
54355895
54365896 rcu_read_lock();
54375897 rb = rcu_dereference(event->rb);
....@@ -5442,14 +5902,17 @@
54425902 rcu_read_unlock();
54435903 }
54445904
5445
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
5905
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
54465906 {
5447
- struct ring_buffer *rb;
5907
+ struct perf_buffer *rb;
5908
+
5909
+ if (event->parent)
5910
+ event = event->parent;
54485911
54495912 rcu_read_lock();
54505913 rb = rcu_dereference(event->rb);
54515914 if (rb) {
5452
- if (!atomic_inc_not_zero(&rb->refcount))
5915
+ if (!refcount_inc_not_zero(&rb->refcount))
54535916 rb = NULL;
54545917 }
54555918 rcu_read_unlock();
....@@ -5457,9 +5920,9 @@
54575920 return rb;
54585921 }
54595922
5460
-void ring_buffer_put(struct ring_buffer *rb)
5923
+void ring_buffer_put(struct perf_buffer *rb)
54615924 {
5462
- if (!atomic_dec_and_test(&rb->refcount))
5925
+ if (!refcount_dec_and_test(&rb->refcount))
54635926 return;
54645927
54655928 WARN_ON_ONCE(!list_empty(&rb->event_list));
....@@ -5494,7 +5957,7 @@
54945957 static void perf_mmap_close(struct vm_area_struct *vma)
54955958 {
54965959 struct perf_event *event = vma->vm_file->private_data;
5497
- struct ring_buffer *rb = ring_buffer_get(event);
5960
+ struct perf_buffer *rb = ring_buffer_get(event);
54985961 struct user_struct *mmap_user = rb->mmap_user;
54995962 int mmap_locked = rb->mmap_locked;
55005963 unsigned long size = perf_data_size(rb);
....@@ -5519,12 +5982,12 @@
55195982 perf_pmu_output_stop(event);
55205983
55215984 /* now it's safe to free the pages */
5522
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5523
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5985
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5986
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
55245987
55255988 /* this has to be the last one */
55265989 rb_free_aux(rb);
5527
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5990
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
55285991
55295992 mutex_unlock(&event->mmap_mutex);
55305993 }
....@@ -5593,8 +6056,9 @@
55936056 * undo the VM accounting.
55946057 */
55956058
5596
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5597
- vma->vm_mm->pinned_vm -= mmap_locked;
6059
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6060
+ &mmap_user->locked_vm);
6061
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
55986062 free_uid(mmap_user);
55996063
56006064 out_put:
....@@ -5603,7 +6067,7 @@
56036067
56046068 static const struct vm_operations_struct perf_mmap_vmops = {
56056069 .open = perf_mmap_open,
5606
- .close = perf_mmap_close, /* non mergable */
6070
+ .close = perf_mmap_close, /* non mergeable */
56076071 .fault = perf_mmap_fault,
56086072 .page_mkwrite = perf_mmap_fault,
56096073 };
....@@ -5613,8 +6077,8 @@
56136077 struct perf_event *event = file->private_data;
56146078 unsigned long user_locked, user_lock_limit;
56156079 struct user_struct *user = current_user();
6080
+ struct perf_buffer *rb = NULL;
56166081 unsigned long locked, lock_limit;
5617
- struct ring_buffer *rb = NULL;
56186082 unsigned long vma_size;
56196083 unsigned long nr_pages;
56206084 long user_extra = 0, extra = 0;
....@@ -5711,17 +6175,17 @@
57116175 again:
57126176 mutex_lock(&event->mmap_mutex);
57136177 if (event->rb) {
5714
- if (event->rb->nr_pages != nr_pages) {
6178
+ if (data_page_nr(event->rb) != nr_pages) {
57156179 ret = -EINVAL;
57166180 goto unlock;
57176181 }
57186182
57196183 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
57206184 /*
5721
- * Raced against perf_mmap_close() through
5722
- * perf_event_set_output(). Try again, hope for better
5723
- * luck.
6185
+ * Raced against perf_mmap_close(); remove the
6186
+ * event and try again.
57246187 */
6188
+ ring_buffer_attach(event, NULL);
57256189 mutex_unlock(&event->mmap_mutex);
57266190 goto again;
57276191 }
....@@ -5749,12 +6213,18 @@
57496213 user_locked = user_lock_limit;
57506214 user_locked += user_extra;
57516215
5752
- if (user_locked > user_lock_limit)
6216
+ if (user_locked > user_lock_limit) {
6217
+ /*
6218
+ * charge locked_vm until it hits user_lock_limit;
6219
+ * charge the rest from pinned_vm
6220
+ */
57536221 extra = user_locked - user_lock_limit;
6222
+ user_extra -= extra;
6223
+ }
57546224
57556225 lock_limit = rlimit(RLIMIT_MEMLOCK);
57566226 lock_limit >>= PAGE_SHIFT;
5757
- locked = vma->vm_mm->pinned_vm + extra;
6227
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
57586228
57596229 if ((locked > lock_limit) && perf_is_paranoid() &&
57606230 !capable(CAP_IPC_LOCK)) {
....@@ -5783,6 +6253,8 @@
57836253
57846254 ring_buffer_attach(event, rb);
57856255
6256
+ perf_event_update_time(event);
6257
+ perf_set_shadow_time(event, event->ctx);
57866258 perf_event_init_userpage(event);
57876259 perf_event_update_userpage(event);
57886260 } else {
....@@ -5795,7 +6267,7 @@
57956267 unlock:
57966268 if (!ret) {
57976269 atomic_long_add(user_extra, &user->locked_vm);
5798
- vma->vm_mm->pinned_vm += extra;
6270
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
57996271
58006272 atomic_inc(&event->mmap_count);
58016273 } else if (rb) {
....@@ -5932,18 +6404,25 @@
59326404 * Later on, we might change it to a list if there is
59336405 * another virtualization implementation supporting the callbacks.
59346406 */
5935
-struct perf_guest_info_callbacks *perf_guest_cbs;
6407
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
59366408
59376409 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59386410 {
5939
- perf_guest_cbs = cbs;
6411
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6412
+ return -EBUSY;
6413
+
6414
+ rcu_assign_pointer(perf_guest_cbs, cbs);
59406415 return 0;
59416416 }
59426417 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
59436418
59446419 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59456420 {
5946
- perf_guest_cbs = NULL;
6421
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6422
+ return -EINVAL;
6423
+
6424
+ rcu_assign_pointer(perf_guest_cbs, NULL);
6425
+ synchronize_rcu();
59476426 return 0;
59486427 }
59496428 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
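Since the callbacks pointer is now annotated __rcu, published with rcu_assign_pointer() and torn down behind a synchronize_rcu(), readers are expected to sample it under RCU instead of dereferencing the global directly. A minimal sketch of that reader pattern (hedged: the wrapper function is hypothetical and only the existing is_in_guest()/get_guest_ip() callback members are assumed):

/*
 * Hypothetical reader-side helper, not part of the patch above; it would
 * sit next to the registration code and relies only on the existing
 * perf_guest_info_callbacks members.
 */
static unsigned long guest_instruction_pointer(void)
{
	struct perf_guest_info_callbacks *cbs;
	unsigned long ip = 0;

	rcu_read_lock();
	cbs = rcu_dereference(perf_guest_cbs);
	if (cbs && cbs->is_in_guest())
		ip = cbs->get_guest_ip();
	rcu_read_unlock();

	return ip;
}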
....@@ -5965,14 +6444,13 @@
59656444 }
59666445
59676446 static void perf_sample_regs_user(struct perf_regs *regs_user,
5968
- struct pt_regs *regs,
5969
- struct pt_regs *regs_user_copy)
6447
+ struct pt_regs *regs)
59706448 {
59716449 if (user_mode(regs)) {
59726450 regs_user->abi = perf_reg_abi(current);
59736451 regs_user->regs = regs;
59746452 } else if (!(current->flags & PF_KTHREAD)) {
5975
- perf_get_regs_user(regs_user, regs, regs_user_copy);
6453
+ perf_get_regs_user(regs_user, regs);
59766454 } else {
59776455 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
59786456 regs_user->regs = NULL;
....@@ -5991,7 +6469,7 @@
59916469 * Get remaining task size from user stack pointer.
59926470 *
59936471 * It'd be better to take stack vma map and limit this more
5994
- * precisly, but there's no way to get it safely under interrupt,
6472
+ * precisely, but there's no way to get it safely under interrupt,
59956473 * so using TASK_SIZE as limit.
59966474 */
59976475 static u64 perf_ustack_task_size(struct pt_regs *regs)
....@@ -6073,10 +6551,9 @@
60736551
60746552 /* Data. */
60756553 sp = perf_user_stack_pointer(regs);
6076
- fs = get_fs();
6077
- set_fs(USER_DS);
6554
+ fs = force_uaccess_begin();
60786555 rem = __output_copy_user(handle, (void *) sp, dump_size);
6079
- set_fs(fs);
6556
+ force_uaccess_end(fs);
60806557 dyn_size = dump_size - rem;
60816558
60826559 perf_output_skip(handle, rem);
....@@ -6084,6 +6561,122 @@
60846561 /* Dynamic size. */
60856562 perf_output_put(handle, dyn_size);
60866563 }
6564
+}
6565
+
6566
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6567
+ struct perf_sample_data *data,
6568
+ size_t size)
6569
+{
6570
+ struct perf_event *sampler = event->aux_event;
6571
+ struct perf_buffer *rb;
6572
+
6573
+ data->aux_size = 0;
6574
+
6575
+ if (!sampler)
6576
+ goto out;
6577
+
6578
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6579
+ goto out;
6580
+
6581
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6582
+ goto out;
6583
+
6584
+ rb = ring_buffer_get(sampler);
6585
+ if (!rb)
6586
+ goto out;
6587
+
6588
+ /*
6589
+ * If this is an NMI hit inside sampling code, don't take
6590
+ * the sample. See also perf_aux_sample_output().
6591
+ */
6592
+ if (READ_ONCE(rb->aux_in_sampling)) {
6593
+ data->aux_size = 0;
6594
+ } else {
6595
+ size = min_t(size_t, size, perf_aux_size(rb));
6596
+ data->aux_size = ALIGN(size, sizeof(u64));
6597
+ }
6598
+ ring_buffer_put(rb);
6599
+
6600
+out:
6601
+ return data->aux_size;
6602
+}
6603
+
6604
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6605
+ struct perf_event *event,
6606
+ struct perf_output_handle *handle,
6607
+ unsigned long size)
6608
+{
6609
+ unsigned long flags;
6610
+ long ret;
6611
+
6612
+ /*
6613
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6614
+ * paths. If we start calling them in NMI context, they may race with
6615
+ * the IRQ ones: for example, re-starting an event that has just
6616
+ * been stopped. That is why we're using a separate callback that
6617
+ * doesn't change the event state.
6618
+ *
6619
+ * IRQs need to be disabled to prevent IPIs from racing with us.
6620
+ */
6621
+ local_irq_save(flags);
6622
+ /*
6623
+ * Guard against NMI hits inside the critical section;
6624
+ * see also perf_prepare_sample_aux().
6625
+ */
6626
+ WRITE_ONCE(rb->aux_in_sampling, 1);
6627
+ barrier();
6628
+
6629
+ ret = event->pmu->snapshot_aux(event, handle, size);
6630
+
6631
+ barrier();
6632
+ WRITE_ONCE(rb->aux_in_sampling, 0);
6633
+ local_irq_restore(flags);
6634
+
6635
+ return ret;
6636
+}
6637
+
6638
+static void perf_aux_sample_output(struct perf_event *event,
6639
+ struct perf_output_handle *handle,
6640
+ struct perf_sample_data *data)
6641
+{
6642
+ struct perf_event *sampler = event->aux_event;
6643
+ struct perf_buffer *rb;
6644
+ unsigned long pad;
6645
+ long size;
6646
+
6647
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
6648
+ return;
6649
+
6650
+ rb = ring_buffer_get(sampler);
6651
+ if (!rb)
6652
+ return;
6653
+
6654
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6655
+
6656
+ /*
6657
+ * An error here means that perf_output_copy() failed (returned a
6658
+ * non-zero surplus that it didn't copy), which in its current
6659
+ * enlightened implementation is not possible. If that changes, we'd
6660
+ * like to know.
6661
+ */
6662
+ if (WARN_ON_ONCE(size < 0))
6663
+ goto out_put;
6664
+
6665
+ /*
6666
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
6667
+ * perf_prepare_sample_aux(), so should not be more than that.
6668
+ */
6669
+ pad = data->aux_size - size;
6670
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
6671
+ pad = 8;
6672
+
6673
+ if (pad) {
6674
+ u64 zero = 0;
6675
+ perf_output_copy(handle, &zero, pad);
6676
+ }
6677
+
6678
+out_put:
6679
+ ring_buffer_put(rb);
60876680 }
60886681
60896682 static void __perf_event_header__init_id(struct perf_event_header *header,
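Taken together, perf_prepare_sample_aux(), perf_pmu_snapshot_aux() and perf_aux_sample_output() let a sampling event embed a snapshot of a grouped AUX event's buffer in its own records (PERF_SAMPLE_AUX), with rb->aux_in_sampling guarding against NMI re-entry. From userspace the feature is driven entirely by perf_event_attr; a hedged sketch (the AUX-capable PMU type is an assumption, e.g. read from sysfs, and error handling is elided):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: open an AUX-producing leader (a tracing PMU whose dynamic
 * type was read from sysfs) plus a cycles sampler grouped under it that
 * embeds up to 4 KiB of AUX data per sample.
 */
static int open_aux_sampler(int aux_pmu_type)
{
	struct perf_event_attr leader, sampler;
	int leader_fd;

	memset(&leader, 0, sizeof(leader));
	leader.size = sizeof(leader);
	leader.type = aux_pmu_type;		/* assumption: AUX-capable PMU */
	leader.exclude_kernel = 1;

	memset(&sampler, 0, sizeof(sampler));
	sampler.size = sizeof(sampler);
	sampler.type = PERF_TYPE_HARDWARE;
	sampler.config = PERF_COUNT_HW_CPU_CYCLES;
	sampler.sample_period = 100000;
	sampler.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_AUX;
	sampler.aux_sample_size = 4096;		/* clamped by the kernel side above */
	sampler.exclude_kernel = 1;

	leader_fd = syscall(__NR_perf_event_open, &leader, 0, -1, -1, 0);
	if (leader_fd < 0)
		return -1;
	/* grouping under the AUX event is what enables the aux_event pairing */
	return syscall(__NR_perf_event_open, &sampler, 0, -1, leader_fd, 0);
}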
....@@ -6255,6 +6848,11 @@
62556848 perf_output_read_one(handle, event, enabled, running);
62566849 }
62576850
6851
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
6852
+{
6853
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6854
+}
6855
+
62586856 void perf_output_sample(struct perf_output_handle *handle,
62596857 struct perf_event_header *header,
62606858 struct perf_sample_data *data,
....@@ -6343,6 +6941,8 @@
63436941 * sizeof(struct perf_branch_entry);
63446942
63456943 perf_output_put(handle, data->br_stack->nr);
6944
+ if (perf_sample_save_hw_index(event))
6945
+ perf_output_put(handle, data->br_stack->hw_idx);
63466946 perf_output_copy(handle, data->br_stack->entries, size);
63476947 } else {
63486948 /*
....@@ -6405,11 +7005,21 @@
64057005 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
64067006 perf_output_put(handle, data->phys_addr);
64077007
7008
+ if (sample_type & PERF_SAMPLE_CGROUP)
7009
+ perf_output_put(handle, data->cgroup);
7010
+
7011
+ if (sample_type & PERF_SAMPLE_AUX) {
7012
+ perf_output_put(handle, data->aux_size);
7013
+
7014
+ if (data->aux_size)
7015
+ perf_aux_sample_output(event, handle, data);
7016
+ }
7017
+
64087018 if (!event->attr.watermark) {
64097019 int wakeup_events = event->attr.wakeup_events;
64107020
64117021 if (wakeup_events) {
6412
- struct ring_buffer *rb = handle->rb;
7022
+ struct perf_buffer *rb = handle->rb;
64137023 int events = local_inc_return(&rb->events);
64147024
64157025 if (events >= wakeup_events) {
....@@ -6437,14 +7047,14 @@
64377047 * Walking the pages tables for user address.
64387048 * Interrupts are disabled, so it prevents any tear down
64397049 * of the page tables.
6440
- * Try IRQ-safe __get_user_pages_fast first.
7050
+ * Try IRQ-safe get_user_page_fast_only first.
64417051 * If failed, leave phys_addr as 0.
64427052 */
64437053 if (current->mm != NULL) {
64447054 struct page *p;
64457055
64467056 pagefault_disable();
6447
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
7057
+ if (get_user_page_fast_only(virt, 0, &p)) {
64487058 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
64497059 put_page(p);
64507060 }
....@@ -6532,6 +7142,9 @@
65327142 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
65337143 int size = sizeof(u64); /* nr */
65347144 if (data->br_stack) {
7145
+ if (perf_sample_save_hw_index(event))
7146
+ size += sizeof(u64);
7147
+
65357148 size += data->br_stack->nr
65367149 * sizeof(struct perf_branch_entry);
65377150 }
....@@ -6539,8 +7152,7 @@
65397152 }
65407153
65417154 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6542
- perf_sample_regs_user(&data->regs_user, regs,
6543
- &data->regs_user_copy);
7155
+ perf_sample_regs_user(&data->regs_user, regs);
65447156
65457157 if (sample_type & PERF_SAMPLE_REGS_USER) {
65467158 /* regs dump ABI info */
....@@ -6556,7 +7168,7 @@
65567168
65577169 if (sample_type & PERF_SAMPLE_STACK_USER) {
65587170 /*
6559
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
7171
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
65607172 * processed as the last one or have additional check added
65617173 * in case new sample type is added, because we could eat
65627174 * up the rest of the sample size.
....@@ -6596,25 +7208,67 @@
65967208
65977209 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
65987210 data->phys_addr = perf_virt_to_phys(data->addr);
7211
+
7212
+#ifdef CONFIG_CGROUP_PERF
7213
+ if (sample_type & PERF_SAMPLE_CGROUP) {
7214
+ struct cgroup *cgrp;
7215
+
7216
+ /* protected by RCU */
7217
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7218
+ data->cgroup = cgroup_id(cgrp);
7219
+ }
7220
+#endif
7221
+
7222
+ if (sample_type & PERF_SAMPLE_AUX) {
7223
+ u64 size;
7224
+
7225
+ header->size += sizeof(u64); /* size */
7226
+
7227
+ /*
7228
+ * Given the 16bit nature of header::size, an AUX sample can
7229
+ * easily overflow it, what with all the preceding sample bits.
7230
+ * Make sure this doesn't happen by using up to U16_MAX bytes
7231
+ * per sample in total (rounded down to 8 byte boundary).
7232
+ */
7233
+ size = min_t(size_t, U16_MAX - header->size,
7234
+ event->attr.aux_sample_size);
7235
+ size = rounddown(size, 8);
7236
+ size = perf_prepare_sample_aux(event, data, size);
7237
+
7238
+ WARN_ON_ONCE(size + header->size > U16_MAX);
7239
+ header->size += size;
7240
+ }
7241
+ /*
7242
+ * If you're adding more sample types here, you likely need to do
7243
+ * something about the overflowing header::size, like repurpose the
7244
+ * lowest 3 bits of size, which should always be zero at the moment.
7245
+ * This raises a more important question: do we really need 512k sized
7246
+ * samples, and why? Good argumentation is in order for whatever you
7247
+ * do here next.
7248
+ */
7249
+ WARN_ON_ONCE(header->size & 7);
65997250 }
66007251
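For a concrete feel of the PERF_SAMPLE_AUX clamp above: with, say, 64 bytes of sample data already accounted in header->size and an attr.aux_sample_size of 128 KiB, the AUX payload is limited to U16_MAX - 64 = 65471 bytes and then rounded down to 65464 so the record stays u64-aligned. A tiny standalone restatement of that arithmetic (numbers are made up for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int header_size = 64;			/* sample bytes so far (example) */
	unsigned long aux_sample_size = 128 * 1024;	/* from perf_event_attr */
	unsigned long size = 0xffff - header_size;	/* room left in a 16-bit size */

	if (size > aux_sample_size)
		size = aux_sample_size;
	size -= size % 8;				/* rounddown(size, 8) */
	printf("aux payload clamped to %lu bytes\n", size);	/* prints 65464 */
	return 0;
}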
6601
-static __always_inline void
7252
+static __always_inline int
66027253 __perf_event_output(struct perf_event *event,
66037254 struct perf_sample_data *data,
66047255 struct pt_regs *regs,
66057256 int (*output_begin)(struct perf_output_handle *,
7257
+ struct perf_sample_data *,
66067258 struct perf_event *,
66077259 unsigned int))
66087260 {
66097261 struct perf_output_handle handle;
66107262 struct perf_event_header header;
7263
+ int err;
66117264
66127265 /* protect the callchain buffers */
66137266 rcu_read_lock();
66147267
66157268 perf_prepare_sample(&header, data, event, regs);
66167269
6617
- if (output_begin(&handle, event, header.size))
7270
+ err = output_begin(&handle, data, event, header.size);
7271
+ if (err)
66187272 goto exit;
66197273
66207274 perf_output_sample(&handle, &header, data, event);
....@@ -6623,6 +7277,7 @@
66237277
66247278 exit:
66257279 rcu_read_unlock();
7280
+ return err;
66267281 }
66277282
66287283 void
....@@ -6641,12 +7296,12 @@
66417296 __perf_event_output(event, data, regs, perf_output_begin_backward);
66427297 }
66437298
6644
-void
7299
+int
66457300 perf_event_output(struct perf_event *event,
66467301 struct perf_sample_data *data,
66477302 struct pt_regs *regs)
66487303 {
6649
- __perf_event_output(event, data, regs, perf_output_begin);
7304
+ return __perf_event_output(event, data, regs, perf_output_begin);
66507305 }
66517306
66527307 /*
....@@ -6678,7 +7333,7 @@
66787333 int ret;
66797334
66807335 perf_event_header__init_id(&read_event.header, &sample, event);
6681
- ret = perf_output_begin(&handle, event, read_event.header.size);
7336
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
66827337 if (ret)
66837338 return;
66847339
....@@ -6823,7 +7478,7 @@
68237478 }
68247479
68257480 struct remote_output {
6826
- struct ring_buffer *rb;
7481
+ struct perf_buffer *rb;
68277482 int err;
68287483 };
68297484
....@@ -6831,7 +7486,7 @@
68317486 {
68327487 struct perf_event *parent = event->parent;
68337488 struct remote_output *ro = data;
6834
- struct ring_buffer *rb = ro->rb;
7489
+ struct perf_buffer *rb = ro->rb;
68357490 struct stop_event_data sd = {
68367491 .event = event,
68377492 };
....@@ -6947,7 +7602,7 @@
69477602
69487603 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
69497604
6950
- ret = perf_output_begin(&handle, event,
7605
+ ret = perf_output_begin(&handle, &sample, event,
69517606 task_event->event_id.header.size);
69527607 if (ret)
69537608 goto out;
....@@ -7050,7 +7705,7 @@
70507705 return;
70517706
70527707 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7053
- ret = perf_output_begin(&handle, event,
7708
+ ret = perf_output_begin(&handle, &sample, event,
70547709 comm_event->event_id.header.size);
70557710
70567711 if (ret)
....@@ -7150,7 +7805,7 @@
71507805
71517806 perf_event_header__init_id(&namespaces_event->event_id.header,
71527807 &sample, event);
7153
- ret = perf_output_begin(&handle, event,
7808
+ ret = perf_output_begin(&handle, &sample, event,
71547809 namespaces_event->event_id.header.size);
71557810 if (ret)
71567811 goto out;
....@@ -7175,7 +7830,7 @@
71757830 {
71767831 struct path ns_path;
71777832 struct inode *ns_inode;
7178
- void *error;
7833
+ int error;
71797834
71807835 error = ns_get_path(&ns_path, task, ns_ops);
71817836 if (!error) {
....@@ -7245,6 +7900,105 @@
72457900 }
72467901
72477902 /*
7903
+ * cgroup tracking
7904
+ */
7905
+#ifdef CONFIG_CGROUP_PERF
7906
+
7907
+struct perf_cgroup_event {
7908
+ char *path;
7909
+ int path_size;
7910
+ struct {
7911
+ struct perf_event_header header;
7912
+ u64 id;
7913
+ char path[];
7914
+ } event_id;
7915
+};
7916
+
7917
+static int perf_event_cgroup_match(struct perf_event *event)
7918
+{
7919
+ return event->attr.cgroup;
7920
+}
7921
+
7922
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
7923
+{
7924
+ struct perf_cgroup_event *cgroup_event = data;
7925
+ struct perf_output_handle handle;
7926
+ struct perf_sample_data sample;
7927
+ u16 header_size = cgroup_event->event_id.header.size;
7928
+ int ret;
7929
+
7930
+ if (!perf_event_cgroup_match(event))
7931
+ return;
7932
+
7933
+ perf_event_header__init_id(&cgroup_event->event_id.header,
7934
+ &sample, event);
7935
+ ret = perf_output_begin(&handle, &sample, event,
7936
+ cgroup_event->event_id.header.size);
7937
+ if (ret)
7938
+ goto out;
7939
+
7940
+ perf_output_put(&handle, cgroup_event->event_id);
7941
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
7942
+
7943
+ perf_event__output_id_sample(event, &handle, &sample);
7944
+
7945
+ perf_output_end(&handle);
7946
+out:
7947
+ cgroup_event->event_id.header.size = header_size;
7948
+}
7949
+
7950
+static void perf_event_cgroup(struct cgroup *cgrp)
7951
+{
7952
+ struct perf_cgroup_event cgroup_event;
7953
+ char path_enomem[16] = "//enomem";
7954
+ char *pathname;
7955
+ size_t size;
7956
+
7957
+ if (!atomic_read(&nr_cgroup_events))
7958
+ return;
7959
+
7960
+ cgroup_event = (struct perf_cgroup_event){
7961
+ .event_id = {
7962
+ .header = {
7963
+ .type = PERF_RECORD_CGROUP,
7964
+ .misc = 0,
7965
+ .size = sizeof(cgroup_event.event_id),
7966
+ },
7967
+ .id = cgroup_id(cgrp),
7968
+ },
7969
+ };
7970
+
7971
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
7972
+ if (pathname == NULL) {
7973
+ cgroup_event.path = path_enomem;
7974
+ } else {
7975
+ /* just to be sure to have enough space for alignment */
7976
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
7977
+ cgroup_event.path = pathname;
7978
+ }
7979
+
7980
+ /*
7981
+ * Since our buffer works in 8 byte units we need to align our string
7982
+ * size to a multiple of 8. However, we must guarantee the tail end is
7983
+ * zero'd out to avoid leaking random bits to userspace.
7984
+ */
7985
+ size = strlen(cgroup_event.path) + 1;
7986
+ while (!IS_ALIGNED(size, sizeof(u64)))
7987
+ cgroup_event.path[size++] = '\0';
7988
+
7989
+ cgroup_event.event_id.header.size += size;
7990
+ cgroup_event.path_size = size;
7991
+
7992
+ perf_iterate_sb(perf_event_cgroup_output,
7993
+ &cgroup_event,
7994
+ NULL);
7995
+
7996
+ kfree(pathname);
7997
+}
7998
+
7999
+#endif
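The PERF_RECORD_CGROUP side-band record generated above pairs with the PERF_SAMPLE_CGROUP sample field added earlier in perf_prepare_sample()/perf_output_sample(), so a profiler can map the u64 cgroup id in each sample back to a path. A hedged userspace sketch of enabling both (error handling elided; the pid/cpu choice is just one common configuration):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: request cgroup ids in samples plus PERF_RECORD_CGROUP
 * side-band records announcing id -> path mappings as cgroups appear.
 */
static int open_cgroup_aware_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CGROUP;
	attr.cgroup = 1;	/* emit PERF_RECORD_CGROUP side-band events */

	/* system-wide on CPU 0; requires the usual perf privileges */
	return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
}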
8000
+
8001
+/*
72488002 * mmap tracking
72498003 */
72508004
....@@ -7304,7 +8058,7 @@
73048058 }
73058059
73068060 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7307
- ret = perf_output_begin(&handle, event,
8061
+ ret = perf_output_begin(&handle, &sample, event,
73088062 mmap_event->event_id.header.size);
73098063 if (ret)
73108064 goto out;
....@@ -7364,7 +8118,7 @@
73648118 flags |= MAP_EXECUTABLE;
73658119 if (vma->vm_flags & VM_LOCKED)
73668120 flags |= MAP_LOCKED;
7367
- if (vma->vm_flags & VM_HUGETLB)
8121
+ if (is_vm_hugetlb_page(vma))
73688122 flags |= MAP_HUGETLB;
73698123
73708124 if (file) {
....@@ -7614,7 +8368,7 @@
76148368 int ret;
76158369
76168370 perf_event_header__init_id(&rec.header, &sample, event);
7617
- ret = perf_output_begin(&handle, event, rec.header.size);
8371
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
76188372
76198373 if (ret)
76208374 return;
....@@ -7648,7 +8402,7 @@
76488402
76498403 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
76508404
7651
- ret = perf_output_begin(&handle, event,
8405
+ ret = perf_output_begin(&handle, &sample, event,
76528406 lost_samples_event.header.size);
76538407 if (ret)
76548408 return;
....@@ -7703,7 +8457,7 @@
77038457
77048458 perf_event_header__init_id(&se->event_id.header, &sample, event);
77058459
7706
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
8460
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
77078461 if (ret)
77088462 return;
77098463
....@@ -7778,7 +8532,7 @@
77788532
77798533 perf_event_header__init_id(&throttle_event.header, &sample, event);
77808534
7781
- ret = perf_output_begin(&handle, event,
8535
+ ret = perf_output_begin(&handle, &sample, event,
77828536 throttle_event.header.size);
77838537 if (ret)
77848538 return;
....@@ -7786,6 +8540,290 @@
77868540 perf_output_put(&handle, throttle_event);
77878541 perf_event__output_id_sample(event, &handle, &sample);
77888542 perf_output_end(&handle);
8543
+}
8544
+
8545
+/*
8546
+ * ksymbol register/unregister tracking
8547
+ */
8548
+
8549
+struct perf_ksymbol_event {
8550
+ const char *name;
8551
+ int name_len;
8552
+ struct {
8553
+ struct perf_event_header header;
8554
+ u64 addr;
8555
+ u32 len;
8556
+ u16 ksym_type;
8557
+ u16 flags;
8558
+ } event_id;
8559
+};
8560
+
8561
+static int perf_event_ksymbol_match(struct perf_event *event)
8562
+{
8563
+ return event->attr.ksymbol;
8564
+}
8565
+
8566
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8567
+{
8568
+ struct perf_ksymbol_event *ksymbol_event = data;
8569
+ struct perf_output_handle handle;
8570
+ struct perf_sample_data sample;
8571
+ int ret;
8572
+
8573
+ if (!perf_event_ksymbol_match(event))
8574
+ return;
8575
+
8576
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
8577
+ &sample, event);
8578
+ ret = perf_output_begin(&handle, &sample, event,
8579
+ ksymbol_event->event_id.header.size);
8580
+ if (ret)
8581
+ return;
8582
+
8583
+ perf_output_put(&handle, ksymbol_event->event_id);
8584
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8585
+ perf_event__output_id_sample(event, &handle, &sample);
8586
+
8587
+ perf_output_end(&handle);
8588
+}
8589
+
8590
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8591
+ const char *sym)
8592
+{
8593
+ struct perf_ksymbol_event ksymbol_event;
8594
+ char name[KSYM_NAME_LEN];
8595
+ u16 flags = 0;
8596
+ int name_len;
8597
+
8598
+ if (!atomic_read(&nr_ksymbol_events))
8599
+ return;
8600
+
8601
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8602
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8603
+ goto err;
8604
+
8605
+ strlcpy(name, sym, KSYM_NAME_LEN);
8606
+ name_len = strlen(name) + 1;
8607
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
8608
+ name[name_len++] = '\0';
8609
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8610
+
8611
+ if (unregister)
8612
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8613
+
8614
+ ksymbol_event = (struct perf_ksymbol_event){
8615
+ .name = name,
8616
+ .name_len = name_len,
8617
+ .event_id = {
8618
+ .header = {
8619
+ .type = PERF_RECORD_KSYMBOL,
8620
+ .size = sizeof(ksymbol_event.event_id) +
8621
+ name_len,
8622
+ },
8623
+ .addr = addr,
8624
+ .len = len,
8625
+ .ksym_type = ksym_type,
8626
+ .flags = flags,
8627
+ },
8628
+ };
8629
+
8630
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8631
+ return;
8632
+err:
8633
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8634
+}
8635
+
8636
+/*
8637
+ * bpf program load/unload tracking
8638
+ */
8639
+
8640
+struct perf_bpf_event {
8641
+ struct bpf_prog *prog;
8642
+ struct {
8643
+ struct perf_event_header header;
8644
+ u16 type;
8645
+ u16 flags;
8646
+ u32 id;
8647
+ u8 tag[BPF_TAG_SIZE];
8648
+ } event_id;
8649
+};
8650
+
8651
+static int perf_event_bpf_match(struct perf_event *event)
8652
+{
8653
+ return event->attr.bpf_event;
8654
+}
8655
+
8656
+static void perf_event_bpf_output(struct perf_event *event, void *data)
8657
+{
8658
+ struct perf_bpf_event *bpf_event = data;
8659
+ struct perf_output_handle handle;
8660
+ struct perf_sample_data sample;
8661
+ int ret;
8662
+
8663
+ if (!perf_event_bpf_match(event))
8664
+ return;
8665
+
8666
+ perf_event_header__init_id(&bpf_event->event_id.header,
8667
+ &sample, event);
8668
+ ret = perf_output_begin(&handle, &sample, event,
8669
+ bpf_event->event_id.header.size);
8670
+ if (ret)
8671
+ return;
8672
+
8673
+ perf_output_put(&handle, bpf_event->event_id);
8674
+ perf_event__output_id_sample(event, &handle, &sample);
8675
+
8676
+ perf_output_end(&handle);
8677
+}
8678
+
8679
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8680
+ enum perf_bpf_event_type type)
8681
+{
8682
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8683
+ int i;
8684
+
8685
+ if (prog->aux->func_cnt == 0) {
8686
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8687
+ (u64)(unsigned long)prog->bpf_func,
8688
+ prog->jited_len, unregister,
8689
+ prog->aux->ksym.name);
8690
+ } else {
8691
+ for (i = 0; i < prog->aux->func_cnt; i++) {
8692
+ struct bpf_prog *subprog = prog->aux->func[i];
8693
+
8694
+ perf_event_ksymbol(
8695
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
8696
+ (u64)(unsigned long)subprog->bpf_func,
8697
+ subprog->jited_len, unregister,
8698
+ subprog->aux->ksym.name);
8699
+ }
8700
+ }
8701
+}
8702
+
8703
+void perf_event_bpf_event(struct bpf_prog *prog,
8704
+ enum perf_bpf_event_type type,
8705
+ u16 flags)
8706
+{
8707
+ struct perf_bpf_event bpf_event;
8708
+
8709
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
8710
+ type >= PERF_BPF_EVENT_MAX)
8711
+ return;
8712
+
8713
+ switch (type) {
8714
+ case PERF_BPF_EVENT_PROG_LOAD:
8715
+ case PERF_BPF_EVENT_PROG_UNLOAD:
8716
+ if (atomic_read(&nr_ksymbol_events))
8717
+ perf_event_bpf_emit_ksymbols(prog, type);
8718
+ break;
8719
+ default:
8720
+ break;
8721
+ }
8722
+
8723
+ if (!atomic_read(&nr_bpf_events))
8724
+ return;
8725
+
8726
+ bpf_event = (struct perf_bpf_event){
8727
+ .prog = prog,
8728
+ .event_id = {
8729
+ .header = {
8730
+ .type = PERF_RECORD_BPF_EVENT,
8731
+ .size = sizeof(bpf_event.event_id),
8732
+ },
8733
+ .type = type,
8734
+ .flags = flags,
8735
+ .id = prog->aux->id,
8736
+ },
8737
+ };
8738
+
8739
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8740
+
8741
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8742
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8743
+}
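Both the ksymbol and the BPF records above are opt-in via attribute bits; a profiler that wants to symbolize JITed BPF programs would typically set both, since PERF_RECORD_BPF_EVENT only carries the program id/tag while the matching PERF_RECORD_KSYMBOL records carry the address ranges and names. A hedged userspace sketch:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: subscribe to kernel-symbol registration and BPF program
 * load/unload side-band records so samples inside JITed BPF code can be
 * symbolized later. A dummy software event is enough to carry side-band.
 */
static int open_bpf_symbolizable_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;	/* side-band only, no counting */
	attr.ksymbol = 1;	/* PERF_RECORD_KSYMBOL for each (sub)program */
	attr.bpf_event = 1;	/* PERF_RECORD_BPF_EVENT with id + tag */
	attr.sample_id_all = 1;

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}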
8744
+
8745
+struct perf_text_poke_event {
8746
+ const void *old_bytes;
8747
+ const void *new_bytes;
8748
+ size_t pad;
8749
+ u16 old_len;
8750
+ u16 new_len;
8751
+
8752
+ struct {
8753
+ struct perf_event_header header;
8754
+
8755
+ u64 addr;
8756
+ } event_id;
8757
+};
8758
+
8759
+static int perf_event_text_poke_match(struct perf_event *event)
8760
+{
8761
+ return event->attr.text_poke;
8762
+}
8763
+
8764
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
8765
+{
8766
+ struct perf_text_poke_event *text_poke_event = data;
8767
+ struct perf_output_handle handle;
8768
+ struct perf_sample_data sample;
8769
+ u64 padding = 0;
8770
+ int ret;
8771
+
8772
+ if (!perf_event_text_poke_match(event))
8773
+ return;
8774
+
8775
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8776
+
8777
+ ret = perf_output_begin(&handle, &sample, event,
8778
+ text_poke_event->event_id.header.size);
8779
+ if (ret)
8780
+ return;
8781
+
8782
+ perf_output_put(&handle, text_poke_event->event_id);
8783
+ perf_output_put(&handle, text_poke_event->old_len);
8784
+ perf_output_put(&handle, text_poke_event->new_len);
8785
+
8786
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8787
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8788
+
8789
+ if (text_poke_event->pad)
8790
+ __output_copy(&handle, &padding, text_poke_event->pad);
8791
+
8792
+ perf_event__output_id_sample(event, &handle, &sample);
8793
+
8794
+ perf_output_end(&handle);
8795
+}
8796
+
8797
+void perf_event_text_poke(const void *addr, const void *old_bytes,
8798
+ size_t old_len, const void *new_bytes, size_t new_len)
8799
+{
8800
+ struct perf_text_poke_event text_poke_event;
8801
+ size_t tot, pad;
8802
+
8803
+ if (!atomic_read(&nr_text_poke_events))
8804
+ return;
8805
+
8806
+ tot = sizeof(text_poke_event.old_len) + old_len;
8807
+ tot += sizeof(text_poke_event.new_len) + new_len;
8808
+ pad = ALIGN(tot, sizeof(u64)) - tot;
8809
+
8810
+ text_poke_event = (struct perf_text_poke_event){
8811
+ .old_bytes = old_bytes,
8812
+ .new_bytes = new_bytes,
8813
+ .pad = pad,
8814
+ .old_len = old_len,
8815
+ .new_len = new_len,
8816
+ .event_id = {
8817
+ .header = {
8818
+ .type = PERF_RECORD_TEXT_POKE,
8819
+ .misc = PERF_RECORD_MISC_KERNEL,
8820
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
8821
+ },
8822
+ .addr = (unsigned long)addr,
8823
+ },
8824
+ };
8825
+
8826
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
77898827 }
77908828
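The TEXT_POKE record body emitted above is { u64 addr; u16 old_len; u16 new_len; old bytes; new bytes; pad }, with the pad computed so the variable part ends on a u64 boundary; patching a 5-byte call site, for instance, gives tot = (2 + 5) + (2 + 5) = 14 and pad = ALIGN(14, 8) - 14 = 2. A hedged consumer-side sketch of walking that layout (the parser is hypothetical; real records arrive through the mmap'd ring buffer, and any sample_id fields follow after the padding):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical parser for the payload that follows the perf_event_header
 * of a PERF_RECORD_TEXT_POKE record: addr, old_len, new_len, old bytes,
 * new bytes, then zero padding up to the next u64 boundary.
 */
static void parse_text_poke(const unsigned char *payload)
{
	uint64_t addr;
	uint16_t old_len, new_len;
	const unsigned char *bytes;

	memcpy(&addr, payload, sizeof(addr));
	memcpy(&old_len, payload + 8, sizeof(old_len));
	memcpy(&new_len, payload + 10, sizeof(new_len));
	bytes = payload + 12;	/* old bytes, then new bytes immediately after */

	printf("poke at %#llx: %u old bytes, %u new bytes\n",
	       (unsigned long long)addr, old_len, new_len);
	(void)bytes;
}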
77918829 void perf_event_itrace_started(struct perf_event *event)
....@@ -7818,7 +8856,7 @@
78188856 rec.tid = perf_event_tid(event, current);
78198857
78208858 perf_event_header__init_id(&rec.header, &sample, event);
7821
- ret = perf_output_begin(&handle, event, rec.header.size);
8859
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
78228860
78238861 if (ret)
78248862 return;
....@@ -8386,9 +9424,9 @@
83869424 if (event->hw.state & PERF_HES_STOPPED)
83879425 return 0;
83889426 /*
8389
- * All tracepoints are from kernel-space.
9427
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
83909428 */
8391
- if (event->attr.exclude_kernel)
9429
+ if (event->attr.exclude_kernel && !user_mode(regs))
83929430 return 0;
83939431
83949432 if (!perf_tp_filter_match(event, data))
....@@ -8514,30 +9552,39 @@
85149552 *
85159553 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
85169554 * if not set, create kprobe/uprobe
9555
+ *
9556
+ * The following values specify a reference counter (or semaphore in the
9557
+ * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
9558
+ * Defined Tracepoints (USDT). Currently, 32 bits of config carry the offset.
9559
+ *
9560
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset
9561
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
85179562 */
85189563 enum perf_probe_config {
85199564 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
9565
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9566
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
85209567 };
85219568
85229569 PMU_FORMAT_ATTR(retprobe, "config:0");
9570
+#endif
85239571
8524
-static struct attribute *probe_attrs[] = {
9572
+#ifdef CONFIG_KPROBE_EVENTS
9573
+static struct attribute *kprobe_attrs[] = {
85259574 &format_attr_retprobe.attr,
85269575 NULL,
85279576 };
85289577
8529
-static struct attribute_group probe_format_group = {
9578
+static struct attribute_group kprobe_format_group = {
85309579 .name = "format",
8531
- .attrs = probe_attrs,
9580
+ .attrs = kprobe_attrs,
85329581 };
85339582
8534
-static const struct attribute_group *probe_attr_groups[] = {
8535
- &probe_format_group,
9583
+static const struct attribute_group *kprobe_attr_groups[] = {
9584
+ &kprobe_format_group,
85369585 NULL,
85379586 };
8538
-#endif
85399587
8540
-#ifdef CONFIG_KPROBE_EVENTS
85419588 static int perf_kprobe_event_init(struct perf_event *event);
85429589 static struct pmu perf_kprobe = {
85439590 .task_ctx_nr = perf_sw_context,
....@@ -8547,7 +9594,7 @@
85479594 .start = perf_swevent_start,
85489595 .stop = perf_swevent_stop,
85499596 .read = perf_swevent_read,
8550
- .attr_groups = probe_attr_groups,
9597
+ .attr_groups = kprobe_attr_groups,
85519598 };
85529599
85539600 static int perf_kprobe_event_init(struct perf_event *event)
....@@ -8558,7 +9605,7 @@
85589605 if (event->attr.type != perf_kprobe.type)
85599606 return -ENOENT;
85609607
8561
- if (!capable(CAP_SYS_ADMIN))
9608
+ if (!perfmon_capable())
85629609 return -EACCES;
85639610
85649611 /*
....@@ -8579,6 +9626,24 @@
85799626 #endif /* CONFIG_KPROBE_EVENTS */
85809627
85819628 #ifdef CONFIG_UPROBE_EVENTS
9629
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9630
+
9631
+static struct attribute *uprobe_attrs[] = {
9632
+ &format_attr_retprobe.attr,
9633
+ &format_attr_ref_ctr_offset.attr,
9634
+ NULL,
9635
+};
9636
+
9637
+static struct attribute_group uprobe_format_group = {
9638
+ .name = "format",
9639
+ .attrs = uprobe_attrs,
9640
+};
9641
+
9642
+static const struct attribute_group *uprobe_attr_groups[] = {
9643
+ &uprobe_format_group,
9644
+ NULL,
9645
+};
9646
+
85829647 static int perf_uprobe_event_init(struct perf_event *event);
85839648 static struct pmu perf_uprobe = {
85849649 .task_ctx_nr = perf_sw_context,
....@@ -8588,18 +9653,19 @@
85889653 .start = perf_swevent_start,
85899654 .stop = perf_swevent_stop,
85909655 .read = perf_swevent_read,
8591
- .attr_groups = probe_attr_groups,
9656
+ .attr_groups = uprobe_attr_groups,
85929657 };
85939658
85949659 static int perf_uprobe_event_init(struct perf_event *event)
85959660 {
85969661 int err;
9662
+ unsigned long ref_ctr_offset;
85979663 bool is_retprobe;
85989664
85999665 if (event->attr.type != perf_uprobe.type)
86009666 return -ENOENT;
86019667
8602
- if (!capable(CAP_SYS_ADMIN))
9668
+ if (!perfmon_capable())
86039669 return -EACCES;
86049670
86059671 /*
....@@ -8609,7 +9675,8 @@
86099675 return -EOPNOTSUPP;
86109676
86119677 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8612
- err = perf_uprobe_init(event, is_retprobe);
9678
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9679
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
86139680 if (err)
86149681 return err;
86159682
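The decode above is the mirror image of what userspace encodes: bit 0 of attr.config selects a uretprobe, while bits 32-63 (per the ref_ctr_offset format attribute) carry the USDT reference-counter offset. A hedged sketch of building such an event against the dynamic "uprobe" PMU (the sysfs type lookup is assumed and not shown):

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Sketch only: pmu_type would normally be read from
 * /sys/bus/event_source/devices/uprobe/type (helper not shown).
 */
static int open_usdt_uprobe(int pmu_type, const char *binary_path,
			    uint64_t probe_offset, uint64_t ref_ctr_offset,
			    int is_retprobe)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;
	/* config:0 = retprobe, config:32-63 = reference counter offset */
	attr.config = (ref_ctr_offset << 32) | (is_retprobe ? 1 : 0);
	attr.uprobe_path = (uint64_t)(unsigned long)binary_path;	/* config1 */
	attr.probe_offset = probe_offset;				/* config2 */

	return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
}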
....@@ -8647,7 +9714,6 @@
86479714 int ret = 0;
86489715
86499716 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8650
- preempt_disable();
86519717 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
86529718 goto out;
86539719 rcu_read_lock();
....@@ -8655,7 +9721,6 @@
86559721 rcu_read_unlock();
86569722 out:
86579723 __this_cpu_dec(bpf_prog_active);
8658
- preempt_enable();
86599724 if (!ret)
86609725 return;
86619726
....@@ -8676,6 +9741,24 @@
86769741 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
86779742 if (IS_ERR(prog))
86789743 return PTR_ERR(prog);
9744
+
9745
+ if (event->attr.precise_ip &&
9746
+ prog->call_get_stack &&
9747
+ (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9748
+ event->attr.exclude_callchain_kernel ||
9749
+ event->attr.exclude_callchain_user)) {
9750
+ /*
9751
+ * On perf_event with precise_ip, calling bpf_get_stack()
9752
+ * may trigger unwinder warnings and occasional crashes.
9753
+ * bpf_get_[stack|stackid] works around this issue by using
9754
+ * callchain attached to perf_sample_data. If the
9755
+ * perf_event does not have a full (kernel and user) callchain
9756
+ * attached to perf_sample_data, do not allow attaching BPF
9757
+ * program that calls bpf_get_[stack|stackid].
9758
+ */
9759
+ bpf_prog_put(prog);
9760
+ return -EPROTO;
9761
+ }
86799762
86809763 event->prog = prog;
86819764 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
....@@ -8875,7 +9958,7 @@
88759958 /*
88769959 * Scan through mm's vmas and see if one of them matches the
88779960 * @filter; if so, adjust filter's address range.
8878
- * Called with mm::mmap_sem down for reading.
9961
+ * Called with mm::mmap_lock down for reading.
88799962 */
88809963 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
88819964 struct mm_struct *mm,
....@@ -8917,7 +10000,7 @@
891710000 if (!mm)
891810001 goto restart;
891910002
8920
- down_read(&mm->mmap_sem);
10003
+ mmap_read_lock(mm);
892110004 }
892210005
892310006 raw_spin_lock_irqsave(&ifh->lock, flags);
....@@ -8943,7 +10026,7 @@
894310026 raw_spin_unlock_irqrestore(&ifh->lock, flags);
894410027
894510028 if (ifh->nr_file_filters) {
8946
- up_read(&mm->mmap_sem);
10029
+ mmap_read_unlock(mm);
894710030
894810031 mmput(mm);
894910032 }
....@@ -9050,6 +10133,7 @@
905010133 case IF_SRC_KERNELADDR:
905110134 case IF_SRC_KERNEL:
905210135 kernel = 1;
10136
+ fallthrough;
905310137
905410138 case IF_SRC_FILEADDR:
905510139 case IF_SRC_FILE:
....@@ -9136,8 +10220,11 @@
913610220 }
913710221
913810222 /* ready to consume more filters */
10223
+ kfree(filename);
10224
+ filename = NULL;
913910225 state = IF_STATE_ACTION;
914010226 filter = NULL;
10227
+ kernel = 0;
914110228 }
914210229 }
914310230
....@@ -9285,7 +10372,7 @@
928510372 period = max_t(u64, 10000, hwc->sample_period);
928610373 }
928710374 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9288
- HRTIMER_MODE_REL_PINNED);
10375
+ HRTIMER_MODE_REL_PINNED_HARD);
928910376 }
929010377
929110378 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
....@@ -9307,7 +10394,7 @@
930710394 if (!is_sampling_event(event))
930810395 return;
930910396
9310
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10397
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
931110398 hwc->hrtimer.function = perf_swevent_hrtimer;
931210399
931310400 /*
....@@ -9696,6 +10783,12 @@
969610783 if (ret)
969710784 goto del_dev;
969810785
10786
+ if (pmu->attr_update)
10787
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
10788
+
10789
+ if (ret)
10790
+ goto del_dev;
10791
+
969910792 out:
970010793 return ret;
970110794
....@@ -9712,7 +10805,7 @@
971210805
971310806 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
971410807 {
9715
- int cpu, ret;
10808
+ int cpu, ret, max = PERF_TYPE_MAX;
971610809
971710810 mutex_lock(&pmus_lock);
971810811 ret = -ENOMEM;
....@@ -9725,12 +10818,17 @@
972510818 goto skip_type;
972610819 pmu->name = name;
972710820
9728
- if (type < 0) {
9729
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9730
- if (type < 0) {
9731
- ret = type;
10821
+ if (type != PERF_TYPE_SOFTWARE) {
10822
+ if (type >= 0)
10823
+ max = type;
10824
+
10825
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10826
+ if (ret < 0)
973210827 goto free_pdc;
9733
- }
10828
+
10829
+ WARN_ON(type >= 0 && ret != type);
10830
+
10831
+ type = ret;
973410832 }
973510833 pmu->type = type;
973610834
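With the idr_alloc() rework above, a driver can still pass a fixed type or a negative one to receive a dynamically allocated id; only PERF_TYPE_SOFTWARE stays out of the IDR. A hedged driver-side sketch of the registration call (the pmu callbacks are elided, so this is a shape illustration rather than a working PMU):

#include <linux/init.h>
#include <linux/perf_event.h>

/*
 * Hypothetical driver sketch: passing -1 asks perf_pmu_register() to
 * allocate a dynamic type id from the IDR; passing a fixed PERF_TYPE_*
 * value (other than SOFTWARE) reserves exactly that id.
 */
static struct pmu my_pmu = {
	.task_ctx_nr = perf_invalid_context,
	/* .event_init, .add, .del, .start, .stop, .read ... elided */
};

static int __init my_pmu_init(void)
{
	return perf_pmu_register(&my_pmu, "my_pmu", -1);
}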
....@@ -9776,6 +10874,9 @@
977610874 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
977710875
977810876 __perf_mux_hrtimer_init(cpuctx, cpu);
10877
+
10878
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10879
+ cpuctx->heap = cpuctx->heap_default;
977910880 }
978010881
978110882 got_cpu_context:
....@@ -9807,7 +10908,16 @@
980710908 if (!pmu->event_idx)
980810909 pmu->event_idx = perf_event_idx_default;
980910910
9810
- list_add_rcu(&pmu->entry, &pmus);
10911
+ /*
10912
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
10913
+ * since these cannot be in the IDR. This way the linear search
10914
+ * is fast when a valid software event is provided.
10915
+ */
10916
+ if (type == PERF_TYPE_SOFTWARE || !name)
10917
+ list_add_rcu(&pmu->entry, &pmus);
10918
+ else
10919
+ list_add_tail_rcu(&pmu->entry, &pmus);
10920
+
981110921 atomic_set(&pmu->exclusive_cnt, 0);
981210922 ret = 0;
981310923 unlock:
....@@ -9820,7 +10930,7 @@
982010930 put_device(pmu->dev);
982110931
982210932 free_idr:
9823
- if (pmu->type >= PERF_TYPE_MAX)
10933
+ if (pmu->type != PERF_TYPE_SOFTWARE)
982410934 idr_remove(&pmu_idr, pmu->type);
982510935
982610936 free_pdc:
....@@ -9842,7 +10952,7 @@
984210952 synchronize_rcu();
984310953
984410954 free_percpu(pmu->pmu_disable_count);
9845
- if (pmu->type >= PERF_TYPE_MAX)
10955
+ if (pmu->type != PERF_TYPE_SOFTWARE)
984610956 idr_remove(&pmu_idr, pmu->type);
984710957 if (pmu_bus_running) {
984810958 if (pmu->nr_addr_filters)
....@@ -9854,6 +10964,12 @@
985410964 mutex_unlock(&pmus_lock);
985510965 }
985610966 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10967
+
10968
+static inline bool has_extended_regs(struct perf_event *event)
10969
+{
10970
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10971
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10972
+}
985710973
985810974 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
985910975 {
....@@ -9885,6 +11001,19 @@
988511001 if (ctx)
988611002 perf_event_ctx_unlock(event->group_leader, ctx);
988711003
11004
+ if (!ret) {
11005
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11006
+ has_extended_regs(event))
11007
+ ret = -EOPNOTSUPP;
11008
+
11009
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11010
+ event_has_any_exclude_flag(event))
11011
+ ret = -EINVAL;
11012
+
11013
+ if (ret && event->destroy)
11014
+ event->destroy(event);
11015
+ }
11016
+
988811017 if (ret)
988911018 module_put(pmu->module);
989011019
....@@ -9893,9 +11022,8 @@
989311022
989411023 static struct pmu *perf_init_event(struct perf_event *event)
989511024 {
11025
+ int idx, type, ret;
989611026 struct pmu *pmu;
9897
- int idx;
9898
- int ret;
989911027
990011028 idx = srcu_read_lock(&pmus_srcu);
990111029
....@@ -9907,17 +11035,32 @@
990711035 goto unlock;
990811036 }
990911037
11038
+ /*
11039
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11040
+ * are often aliases for PERF_TYPE_RAW.
11041
+ */
11042
+ type = event->attr.type;
11043
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
11044
+ type = PERF_TYPE_RAW;
11045
+
11046
+again:
991011047 rcu_read_lock();
9911
- pmu = idr_find(&pmu_idr, event->attr.type);
11048
+ pmu = idr_find(&pmu_idr, type);
991211049 rcu_read_unlock();
991311050 if (pmu) {
991411051 ret = perf_try_init_event(pmu, event);
11052
+ if (ret == -ENOENT && event->attr.type != type) {
11053
+ type = event->attr.type;
11054
+ goto again;
11055
+ }
11056
+
991511057 if (ret)
991611058 pmu = ERR_PTR(ret);
11059
+
991711060 goto unlock;
991811061 }
991911062
9920
- list_for_each_entry_rcu(pmu, &pmus, entry) {
11063
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
992111064 ret = perf_try_init_event(pmu, event);
992211065 if (!ret)
992311066 goto unlock;
....@@ -9993,7 +11136,7 @@
999311136 if (event->parent)
999411137 return;
999511138
9996
- if (event->attach_state & PERF_ATTACH_TASK)
11139
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
999711140 inc = true;
999811141 if (event->attr.mmap || event->attr.mmap_data)
999911142 atomic_inc(&nr_mmap_events);
....@@ -10001,6 +11144,8 @@
1000111144 atomic_inc(&nr_comm_events);
1000211145 if (event->attr.namespaces)
1000311146 atomic_inc(&nr_namespaces_events);
11147
+ if (event->attr.cgroup)
11148
+ atomic_inc(&nr_cgroup_events);
1000411149 if (event->attr.task)
1000511150 atomic_inc(&nr_task_events);
1000611151 if (event->attr.freq)
....@@ -10013,6 +11158,12 @@
1001311158 inc = true;
1001411159 if (is_cgroup_event(event))
1001511160 inc = true;
11161
+ if (event->attr.ksymbol)
11162
+ atomic_inc(&nr_ksymbol_events);
11163
+ if (event->attr.bpf_event)
11164
+ atomic_inc(&nr_bpf_events);
11165
+ if (event->attr.text_poke)
11166
+ atomic_inc(&nr_text_poke_events);
1001611167
1001711168 if (inc) {
1001811169 /*
....@@ -10031,7 +11182,7 @@
1003111182 * call the perf scheduling hooks before proceeding to
1003211183 * install events that need them.
1003311184 */
10034
- synchronize_sched();
11185
+ synchronize_rcu();
1003511186 }
1003611187 /*
1003711188 * Now that we have waited for the sync_sched(), allow further
....@@ -10120,8 +11271,7 @@
1012011271 * and we cannot use the ctx information because we need the
1012111272 * pmu before we get a ctx.
1012211273 */
10123
- get_task_struct(task);
10124
- event->hw.target = task;
11274
+ event->hw.target = get_task_struct(task);
1012511275 }
1012611276
1012711277 event->clock = &local_clock;
....@@ -10133,12 +11283,9 @@
1013311283 context = parent_event->overflow_handler_context;
1013411284 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
1013511285 if (overflow_handler == bpf_overflow_handler) {
10136
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
11286
+ struct bpf_prog *prog = parent_event->prog;
1013711287
10138
- if (IS_ERR(prog)) {
10139
- err = PTR_ERR(prog);
10140
- goto err_ns;
10141
- }
11288
+ bpf_prog_inc(prog);
1014211289 event->prog = prog;
1014311290 event->orig_overflow_handler =
1014411291 parent_event->orig_overflow_handler;
....@@ -10179,16 +11326,31 @@
1017911326 if (!has_branch_stack(event))
1018011327 event->attr.branch_sample_type = 0;
1018111328
10182
- if (cgroup_fd != -1) {
10183
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10184
- if (err)
10185
- goto err_ns;
10186
- }
10187
-
1018811329 pmu = perf_init_event(event);
1018911330 if (IS_ERR(pmu)) {
1019011331 err = PTR_ERR(pmu);
1019111332 goto err_ns;
11333
+ }
11334
+
11335
+ /*
11336
+ * Disallow uncore-cgroup events; they don't make sense as the cgroup will
11337
+ * be different on other CPUs in the uncore mask.
11338
+ */
11339
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11340
+ err = -EINVAL;
11341
+ goto err_pmu;
11342
+ }
11343
+
11344
+ if (event->attr.aux_output &&
11345
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11346
+ err = -EOPNOTSUPP;
11347
+ goto err_pmu;
11348
+ }
11349
+
11350
+ if (cgroup_fd != -1) {
11351
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11352
+ if (err)
11353
+ goto err_pmu;
1019211354 }
1019311355
1019411356 err = exclusive_event_init(event);
....@@ -10251,12 +11413,12 @@
1025111413 exclusive_event_destroy(event);
1025211414
1025311415 err_pmu:
11416
+ if (is_cgroup_event(event))
11417
+ perf_detach_cgroup(event);
1025411418 if (event->destroy)
1025511419 event->destroy(event);
1025611420 module_put(pmu->module);
1025711421 err_ns:
10258
- if (is_cgroup_event(event))
10259
- perf_detach_cgroup(event);
1026011422 if (event->ns)
1026111423 put_pid_ns(event->ns);
1026211424 if (event->hw.target)
....@@ -10272,58 +11434,29 @@
1027211434 u32 size;
1027311435 int ret;
1027411436
10275
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
10276
- return -EFAULT;
10277
-
10278
- /*
10279
- * zero the full structure, so that a short copy will be nice.
10280
- */
11437
+ /* Zero the full structure, so that a short copy will be nice. */
1028111438 memset(attr, 0, sizeof(*attr));
1028211439
1028311440 ret = get_user(size, &uattr->size);
1028411441 if (ret)
1028511442 return ret;
1028611443
10287
- if (size > PAGE_SIZE) /* silly large */
10288
- goto err_size;
10289
-
10290
- if (!size) /* abi compat */
11444
+ /* ABI compatibility quirk: */
11445
+ if (!size)
1029111446 size = PERF_ATTR_SIZE_VER0;
10292
-
10293
- if (size < PERF_ATTR_SIZE_VER0)
11447
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
1029411448 goto err_size;
1029511449
10296
- /*
10297
- * If we're handed a bigger struct than we know of,
10298
- * ensure all the unknown bits are 0 - i.e. new
10299
- * user-space does not rely on any kernel feature
10300
- * extensions we dont know about yet.
10301
- */
10302
- if (size > sizeof(*attr)) {
10303
- unsigned char __user *addr;
10304
- unsigned char __user *end;
10305
- unsigned char val;
10306
-
10307
- addr = (void __user *)uattr + sizeof(*attr);
10308
- end = (void __user *)uattr + size;
10309
-
10310
- for (; addr < end; addr++) {
10311
- ret = get_user(val, addr);
10312
- if (ret)
10313
- return ret;
10314
- if (val)
10315
- goto err_size;
10316
- }
10317
- size = sizeof(*attr);
11450
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11451
+ if (ret) {
11452
+ if (ret == -E2BIG)
11453
+ goto err_size;
11454
+ return ret;
1031811455 }
10319
-
10320
- ret = copy_from_user(attr, uattr, size);
10321
- if (ret)
10322
- return -EFAULT;
1032311456
1032411457 attr->size = size;
1032511458
10326
- if (attr->__reserved_1)
11459
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
1032711460 return -EINVAL;
1032811461
1032911462 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
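The rewrite above replaces the open-coded "unknown trailing bytes must be zero" loop with copy_struct_from_user(), which implements exactly that convention for size-versioned uapi structs. A hedged sketch of the same pattern in a hypothetical ioctl-style handler (struct and function names are made up):

#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/*
 * Hypothetical extensible uapi struct: userspace passes its own sizeof()
 * so older and newer binaries interoperate, like perf_event_attr::size.
 * New fields are only ever appended.
 */
struct my_uapi_args {
	__u32 size;
	__u32 flags;
	__u64 addr;
};

static int copy_my_args(struct my_uapi_args *karg,
			const void __user *uarg, u32 usize)
{
	memset(karg, 0, sizeof(*karg));
	/*
	 * Succeeds if the user struct is shorter (missing fields stay zero)
	 * or longer with all unknown trailing bytes zero; returns -E2BIG
	 * when userspace set bits this kernel does not understand.
	 */
	return copy_struct_from_user(karg, sizeof(*karg), uarg, usize);
}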
....@@ -10394,6 +11527,12 @@
1039411527
1039511528 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
1039611529 ret = perf_reg_validate(attr->sample_regs_intr);
11530
+
11531
+#ifndef CONFIG_CGROUP_PERF
11532
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
11533
+ return -EINVAL;
11534
+#endif
11535
+
1039711536 out:
1039811537 return ret;
1039911538
....@@ -10403,14 +11542,25 @@
1040311542 goto out;
1040411543 }
1040511544
11545
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
11546
+{
11547
+ if (b < a)
11548
+ swap(a, b);
11549
+
11550
+ mutex_lock(a);
11551
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11552
+}
11553
+
1040611554 static int
1040711555 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
1040811556 {
10409
- struct ring_buffer *rb = NULL;
11557
+ struct perf_buffer *rb = NULL;
1041011558 int ret = -EINVAL;
1041111559
10412
- if (!output_event)
11560
+ if (!output_event) {
11561
+ mutex_lock(&event->mmap_mutex);
1041311562 goto set;
11563
+ }
1041411564
1041511565 /* don't allow circular references */
1041611566 if (event == output_event)
....@@ -10448,8 +11598,15 @@
1044811598 event->pmu != output_event->pmu)
1044911599 goto out;
1045011600
11601
+ /*
11602
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
11603
+ * output_event is already on rb->event_list, and the list iteration
11604
+ * restarts after every removal, it is guaranteed this new event is
11605
+ * observed *OR* if output_event is already removed, it's guaranteed we
11606
+ * observe !rb->mmap_count.
11607
+ */
11608
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
1045111609 set:
10452
- mutex_lock(&event->mmap_mutex);
1045311610 /* Can't redirect output if we've got an active mmap() */
1045411611 if (atomic_read(&event->mmap_count))
1045511612 goto unlock;
....@@ -10459,6 +11616,12 @@
1045911616 rb = ring_buffer_get(output_event);
1046011617 if (!rb)
1046111618 goto unlock;
11619
+
11620
+ /* did we race against perf_mmap_close() */
11621
+ if (!atomic_read(&rb->mmap_count)) {
11622
+ ring_buffer_put(rb);
11623
+ goto unlock;
11624
+ }
1046211625 }
1046311626
1046411627 ring_buffer_attach(event, rb);
....@@ -10466,18 +11629,11 @@
1046611629 ret = 0;
1046711630 unlock:
1046811631 mutex_unlock(&event->mmap_mutex);
11632
+ if (output_event)
11633
+ mutex_unlock(&output_event->mmap_mutex);
1046911634
1047011635 out:
1047111636 return ret;
10472
-}
10473
-
10474
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
10475
-{
10476
- if (b < a)
10477
- swap(a, b);
10478
-
10479
- mutex_lock(a);
10480
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
1048111637 }
1048211638
1048311639 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
....@@ -10500,11 +11656,11 @@
1050011656 break;
1050111657
1050211658 case CLOCK_BOOTTIME:
10503
- event->clock = &ktime_get_boot_ns;
11659
+ event->clock = &ktime_get_boottime_ns;
1050411660 break;
1050511661
1050611662 case CLOCK_TAI:
10507
- event->clock = &ktime_get_tai_ns;
11663
+ event->clock = &ktime_get_clocktai_ns;
1050811664 break;
1050911665
1051011666 default:
....@@ -10530,7 +11686,7 @@
1053011686 again:
1053111687 rcu_read_lock();
1053211688 gctx = READ_ONCE(group_leader->ctx);
10533
- if (!atomic_inc_not_zero(&gctx->refcount)) {
11689
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
1053411690 rcu_read_unlock();
1053511691 goto again;
1053611692 }
....@@ -10563,7 +11719,7 @@
1056311719 struct perf_event *group_leader = NULL, *output_event = NULL;
1056411720 struct perf_event *event, *sibling;
1056511721 struct perf_event_attr attr;
10566
- struct perf_event_context *ctx, *uninitialized_var(gctx);
11722
+ struct perf_event_context *ctx, *gctx;
1056711723 struct file *event_file = NULL;
1056811724 struct fd group = {NULL, 0};
1056911725 struct task_struct *task = NULL;
....@@ -10577,9 +11733,6 @@
1057711733 /* for future expandability... */
1057811734 if (flags & ~PERF_FLAG_ALL)
1057911735 return -EINVAL;
10580
-
10581
- if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
10582
- return -EACCES;
1058311736
1058411737 /* Do we allow access to perf_event_open(2) ? */
1058511738 err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
....@@ -10597,7 +11750,7 @@
1059711750 }
1059811751
1059911752 if (attr.namespaces) {
10600
- if (!capable(CAP_SYS_ADMIN))
11753
+ if (!perfmon_capable())
1060111754 return -EACCES;
1060211755 }
1060311756
....@@ -10612,6 +11765,13 @@
1061211765 /* Only privileged users can get physical addresses */
1061311766 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
1061411767 err = perf_allow_kernel(&attr);
11768
+ if (err)
11769
+ return err;
11770
+ }
11771
+
11772
+ /* REGS_INTR can leak data, lockdown must prevent this */
11773
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
11774
+ err = security_locked_down(LOCKDOWN_PERF);
1061511775 if (err)
1061611776 return err;
1061711777 }
....@@ -10657,24 +11817,6 @@
1065711817 goto err_task;
1065811818 }
1065911819
10660
- if (task) {
10661
- err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10662
- if (err)
10663
- goto err_task;
10664
-
10665
- /*
10666
- * Reuse ptrace permission checks for now.
10667
- *
10668
- * We must hold cred_guard_mutex across this and any potential
10669
- * perf_install_in_context() call for this new event to
10670
- * serialize against exec() altering our credentials (and the
10671
- * perf_event_exit_task() that could imply).
10672
- */
10673
- err = -EACCES;
10674
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10675
- goto err_cred;
10676
- }
10677
-
1067811820 if (flags & PERF_FLAG_PID_CGROUP)
1067911821 cgroup_fd = pid;
1068011822
....@@ -10682,7 +11824,7 @@
1068211824 NULL, NULL, cgroup_fd);
1068311825 if (IS_ERR(event)) {
1068411826 err = PTR_ERR(event);
10685
- goto err_cred;
11827
+ goto err_task;
1068611828 }
1068711829
1068811830 if (is_sampling_event(event)) {
....@@ -10776,6 +11918,9 @@
1077611918 * Do not allow to attach to a group in a different task
1077711919 * or CPU context. If we're moving SW events, we'll fix
1077811920 * this up later, so allow that.
11921
+ *
11922
+ * Racy, not holding group_leader->ctx->mutex, see comment with
11923
+ * perf_event_ctx_lock().
1077911924 */
1078011925 if (!move_group && group_leader->ctx != ctx)
1078111926 goto err_context;
....@@ -10799,6 +11944,24 @@
1079911944 err = PTR_ERR(event_file);
1080011945 event_file = NULL;
1080111946 goto err_context;
11947
+ }
11948
+
11949
+ if (task) {
11950
+ err = down_read_interruptible(&task->signal->exec_update_lock);
11951
+ if (err)
11952
+ goto err_file;
11953
+
11954
+ /*
11955
+ * Preserve ptrace permission check for backwards compatibility.
11956
+ *
11957
+ * We must hold exec_update_lock across this and any potential
11958
+ * perf_install_in_context() call for this new event to
11959
+ * serialize against exec() altering our credentials (and the
11960
+ * perf_event_exit_task() that could imply).
11961
+ */
11962
+ err = -EACCES;
11963
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
11964
+ goto err_cred;
1080211965 }
1080311966
1080411967 if (move_group) {
....@@ -10825,6 +11988,7 @@
1082511988 } else {
1082611989 perf_event_ctx_unlock(group_leader, gctx);
1082711990 move_group = 0;
11991
+ goto not_move_group;
1082811992 }
1082911993 }
1083011994
....@@ -10841,7 +12005,17 @@
1084112005 }
1084212006 } else {
1084312007 mutex_lock(&ctx->mutex);
12008
+
12009
+ /*
12010
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
12011
+ * see the group_leader && !move_group test earlier.
12012
+ */
12013
+ if (group_leader && group_leader->ctx != ctx) {
12014
+ err = -EINVAL;
12015
+ goto err_locked;
12016
+ }
1084412017 }
12018
+not_move_group:
1084512019
1084612020 if (ctx->task == TASK_TOMBSTONE) {
1084712021 err = -ESRCH;
....@@ -10869,6 +12043,10 @@
1086912043 }
1087012044 }
1087112045
12046
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12047
+ err = -EINVAL;
12048
+ goto err_locked;
12049
+ }
1087212050
1087312051 /*
1087412052 * Must be under the same ctx::mutex as perf_install_in_context(),
....@@ -10950,7 +12128,7 @@
1095012128 mutex_unlock(&ctx->mutex);
1095112129
1095212130 if (task) {
10953
- mutex_unlock(&task->signal->cred_guard_mutex);
12131
+ up_read(&task->signal->exec_update_lock);
1095412132 put_task_struct(task);
1095512133 }
1095612134
....@@ -10972,7 +12150,10 @@
1097212150 if (move_group)
1097312151 perf_event_ctx_unlock(group_leader, gctx);
1097412152 mutex_unlock(&ctx->mutex);
10975
-/* err_file: */
12153
+err_cred:
12154
+ if (task)
12155
+ up_read(&task->signal->exec_update_lock);
12156
+err_file:
1097612157 fput(event_file);
1097712158 err_context:
1097812159 perf_unpin_context(ctx);
....@@ -10984,9 +12165,6 @@
1098412165 */
1098512166 if (!event_file)
1098612167 free_event(event);
10987
-err_cred:
10988
- if (task)
10989
- mutex_unlock(&task->signal->cred_guard_mutex);
1099012168 err_task:
1099112169 if (task)
1099212170 put_task_struct(task);
....@@ -11015,8 +12193,11 @@
1101512193 int err;
1101612194
1101712195 /*
11018
- * Get the target context (task or percpu):
12196
+ * Grouping is not supported for kernel events, and neither is 'AUX';
12197
+ * make sure the caller's intentions are adjusted.
1101912198 */
12199
+ if (attr->aux_output)
12200
+ return ERR_PTR(-EINVAL);
1102012201
1102112202 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
1102212203 overflow_handler, context, -1);
....@@ -11028,6 +12209,9 @@
1102812209 /* Mark owner so we could distinguish it from user events. */
1102912210 event->owner = TASK_TOMBSTONE;
1103012211
12212
+ /*
12213
+ * Get the target context (task or percpu):
12214
+ */
1103112215 ctx = find_get_context(event->pmu, task, event);
1103212216 if (IS_ERR(ctx)) {
1103312217 err = PTR_ERR(ctx);
....@@ -11285,8 +12469,8 @@
1128512469 /*
1128612470 * When a child task exits, feed back event values to parent events.
1128712471 *
11288
- * Can be called with cred_guard_mutex held when called from
11289
- * install_exec_creds().
12472
+ * Can be called with exec_update_lock held when called from
12473
+ * setup_new_exec().
1129012474 */
1129112475 void perf_event_exit_task(struct task_struct *child)
1129212476 {
....@@ -11390,7 +12574,7 @@
1139012574 *
1139112575 * Wait for all events to drop their context reference.
1139212576 */
11393
- wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
12577
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
1139412578 put_ctx(ctx); /* must be last */
1139512579 }
1139612580 }
....@@ -11405,9 +12589,7 @@
1140512589
1140612590 struct file *perf_event_get(unsigned int fd)
1140712591 {
11408
- struct file *file;
11409
-
11410
- file = fget_raw(fd);
12592
+ struct file *file = fget(fd);
1141112593 if (!file)
1141212594 return ERR_PTR(-EBADF);
1141312595
....@@ -11477,8 +12659,7 @@
1147712659 !child_ctx->task_ctx_data) {
1147812660 struct pmu *pmu = child_event->pmu;
1147912661
11480
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11481
- GFP_KERNEL);
12662
+ child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
1148212663 if (!child_ctx->task_ctx_data) {
1148312664 free_event(child_event);
1148412665 return ERR_PTR(-ENOMEM);
....@@ -11583,6 +12764,10 @@
1158312764 child, leader, child_ctx);
1158412765 if (IS_ERR(child_ctr))
1158512766 return PTR_ERR(child_ctr);
12767
+
12768
+ if (sub->aux_event == parent_event && child_ctr &&
12769
+ !perf_get_aux_event(child_ctr, leader))
12770
+ return -EINVAL;
1158612771 }
1158712772 return 0;
1158812773 }
....@@ -11778,7 +12963,7 @@
1177812963 }
1177912964 }
1178012965
11781
-void perf_swevent_init_cpu(unsigned int cpu)
12966
+static void perf_swevent_init_cpu(unsigned int cpu)
1178212967 {
1178312968 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
1178412969
....@@ -11975,6 +13160,12 @@
1197513160 kfree(jc);
1197613161 }
1197713162
13163
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13164
+{
13165
+ perf_event_cgroup(css->cgroup);
13166
+ return 0;
13167
+}
13168
+
1197813169 static int __perf_cgroup_move(void *info)
1197913170 {
1198013171 struct task_struct *task = info;
....@@ -11996,6 +13187,7 @@
1199613187 struct cgroup_subsys perf_event_cgrp_subsys = {
1199713188 .css_alloc = perf_cgroup_css_alloc,
1199813189 .css_free = perf_cgroup_css_free,
13190
+ .css_online = perf_cgroup_css_online,
1199913191 .attach = perf_cgroup_attach,
1200013192 /*
1200113193 * Implicitly enable on dfl hierarchy so that perf events can