2024-01-31 f70575805708cabdedea7498aaa3f710fde4d920
kernel/kernel/events/core.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Performance events core code:
34 *
....@@ -5,8 +6,6 @@
56 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
67 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
78 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8
- *
9
- * For licensing details see kernel-base/COPYING
109 */
1110
1211 #include <linux/fs.h>
....@@ -29,6 +28,7 @@
2928 #include <linux/export.h>
3029 #include <linux/vmalloc.h>
3130 #include <linux/hardirq.h>
31
+#include <linux/hugetlb.h>
3232 #include <linux/rculist.h>
3333 #include <linux/uaccess.h>
3434 #include <linux/syscalls.h>
....@@ -50,6 +50,7 @@
5050 #include <linux/sched/mm.h>
5151 #include <linux/proc_ns.h>
5252 #include <linux/mount.h>
53
+#include <linux/min_heap.h>
5354
5455 #include "internal.h"
5556
....@@ -265,7 +266,7 @@
265266 if (!event->parent) {
266267 /*
267268 * If this is a !child event, we must hold ctx::mutex to
268
- * stabilize the the event->ctx relation. See
269
+ * stabilize the event->ctx relation. See
269270 * perf_event_ctx_lock().
270271 */
271272 lockdep_assert_held(&ctx->mutex);
....@@ -391,6 +392,10 @@
391392 static atomic_t nr_task_events __read_mostly;
392393 static atomic_t nr_freq_events __read_mostly;
393394 static atomic_t nr_switch_events __read_mostly;
395
+static atomic_t nr_ksymbol_events __read_mostly;
396
+static atomic_t nr_bpf_events __read_mostly;
397
+static atomic_t nr_cgroup_events __read_mostly;
398
+static atomic_t nr_text_poke_events __read_mostly;
394399
395400 static LIST_HEAD(pmus);
396401 static DEFINE_MUTEX(pmus_lock);
....@@ -403,13 +408,8 @@
403408 * 0 - disallow raw tracepoint access for unpriv
404409 * 1 - disallow cpu events for unpriv
405410 * 2 - disallow kernel profiling for unpriv
406
- * 3 - disallow all unpriv perf event use
407411 */
408
-#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
409
-int sysctl_perf_event_paranoid __read_mostly = 3;
410
-#else
411412 int sysctl_perf_event_paranoid __read_mostly = 2;
412
-#endif
413413
414414 /* Minimum for 512 kiB + 1 user control page */
415415 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
....@@ -444,8 +444,7 @@
444444 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
445445
446446 int perf_proc_update_handler(struct ctl_table *table, int write,
447
- void __user *buffer, size_t *lenp,
448
- loff_t *ppos)
447
+ void *buffer, size_t *lenp, loff_t *ppos)
449448 {
450449 int ret;
451450 int perf_cpu = sysctl_perf_cpu_time_max_percent;
....@@ -469,8 +468,7 @@
469468 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
470469
471470 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
472
- void __user *buffer, size_t *lenp,
473
- loff_t *ppos)
471
+ void *buffer, size_t *lenp, loff_t *ppos)
474472 {
475473 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
476474
....@@ -761,7 +759,7 @@
761759 /*
762760 * Do not update time when cgroup is not active
763761 */
764
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
762
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
765763 __update_cgrp_time(event->cgrp);
766764 }
767765
....@@ -901,6 +899,47 @@
901899 rcu_read_unlock();
902900 }
903901
902
+static int perf_cgroup_ensure_storage(struct perf_event *event,
903
+ struct cgroup_subsys_state *css)
904
+{
905
+ struct perf_cpu_context *cpuctx;
906
+ struct perf_event **storage;
907
+ int cpu, heap_size, ret = 0;
908
+
909
+ /*
910
+ * Allow storage to have sufficent space for an iterator for each
911
+ * possibly nested cgroup plus an iterator for events with no cgroup.
912
+ */
913
+ for (heap_size = 1; css; css = css->parent)
914
+ heap_size++;
915
+
916
+ for_each_possible_cpu(cpu) {
917
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
918
+ if (heap_size <= cpuctx->heap_size)
919
+ continue;
920
+
921
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
922
+ GFP_KERNEL, cpu_to_node(cpu));
923
+ if (!storage) {
924
+ ret = -ENOMEM;
925
+ break;
926
+ }
927
+
928
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
929
+ if (cpuctx->heap_size < heap_size) {
930
+ swap(cpuctx->heap, storage);
931
+ if (storage == cpuctx->heap_default)
932
+ storage = NULL;
933
+ cpuctx->heap_size = heap_size;
934
+ }
935
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
936
+
937
+ kfree(storage);
938
+ }
939
+
940
+ return ret;
941
+}
942
+
904943 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
905944 struct perf_event_attr *attr,
906945 struct perf_event *group_leader)
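
A note on the pattern used by perf_cgroup_ensure_storage() above: the iterator array is sized to the cgroup nesting depth plus one slot for events with no cgroup, and it is grown by allocating outside the lock, swapping pointers under the context lock, and freeing whichever buffer is left over after unlocking (the kernel version additionally avoids freeing the static default buffer). Below is a minimal userspace sketch of that grow-and-swap idiom in plain C with pthreads; the struct and function names are illustrative, not the kernel's.

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct ctx {
	pthread_mutex_t lock;
	void **heap;        /* current iterator storage */
	int heap_size;      /* capacity in elements */
};

/* Grow c->heap to at least want_size elements. The allocation happens
 * outside the lock, the swap happens under it, and the leftover buffer
 * (either the old one or our unused new one) is freed after unlocking. */
static int ensure_storage(struct ctx *c, int want_size)
{
	void **storage;

	if (want_size <= c->heap_size)
		return 0;

	storage = calloc(want_size, sizeof(*storage));
	if (!storage)
		return -1;

	pthread_mutex_lock(&c->lock);
	if (c->heap_size < want_size) {
		void **tmp = c->heap;

		c->heap = storage;
		c->heap_size = want_size;
		storage = tmp;          /* old buffer, freed below */
	}
	pthread_mutex_unlock(&c->lock);

	free(storage);                  /* old buffer, or unused new one */
	return 0;
}

int main(void)
{
	struct ctx c = { .lock = PTHREAD_MUTEX_INITIALIZER };

	ensure_storage(&c, 4);
	ensure_storage(&c, 2);          /* no-op: already big enough */
	printf("heap_size = %d\n", c.heap_size);
	free(c.heap);
	return 0;
}
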
....@@ -919,6 +958,10 @@
919958 ret = PTR_ERR(css);
920959 goto out;
921960 }
961
+
962
+ ret = perf_cgroup_ensure_storage(event, css);
963
+ if (ret)
964
+ goto out;
922965
923966 cgrp = container_of(css, struct perf_cgroup, css);
924967 event->cgrp = cgrp;
....@@ -945,25 +988,19 @@
945988 event->shadow_ctx_time = now - t->timestamp;
946989 }
947990
948
-/*
949
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
950
- * cleared when last cgroup event is removed.
951
- */
952991 static inline void
953
-list_update_cgroup_event(struct perf_event *event,
954
- struct perf_event_context *ctx, bool add)
992
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
955993 {
956994 struct perf_cpu_context *cpuctx;
957
- struct list_head *cpuctx_entry;
958995
959996 if (!is_cgroup_event(event))
960997 return;
961998
962999 /*
9631000 * Because cgroup events are always per-cpu events,
964
- * this will always be called from the right CPU.
1001
+ * @ctx == &cpuctx->ctx.
9651002 */
966
- cpuctx = __get_cpu_context(ctx);
1003
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
9671004
9681005 /*
9691006 * Since setting cpuctx->cgrp is conditional on the current @cgrp
....@@ -971,27 +1008,41 @@
9711008 * because if the first would mismatch, the second would not try again
9721009 * and we would leave cpuctx->cgrp unset.
9731010 */
974
- if (add && !cpuctx->cgrp) {
1011
+ if (ctx->is_active && !cpuctx->cgrp) {
9751012 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
9761013
9771014 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
9781015 cpuctx->cgrp = cgrp;
9791016 }
9801017
981
- if (add && ctx->nr_cgroups++)
982
- return;
983
- else if (!add && --ctx->nr_cgroups)
1018
+ if (ctx->nr_cgroups++)
9841019 return;
9851020
986
- /* no cgroup running */
987
- if (!add)
1021
+ list_add(&cpuctx->cgrp_cpuctx_entry,
1022
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1023
+}
1024
+
1025
+static inline void
1026
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1027
+{
1028
+ struct perf_cpu_context *cpuctx;
1029
+
1030
+ if (!is_cgroup_event(event))
1031
+ return;
1032
+
1033
+ /*
1034
+ * Because cgroup events are always per-cpu events,
1035
+ * @ctx == &cpuctx->ctx.
1036
+ */
1037
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1038
+
1039
+ if (--ctx->nr_cgroups)
1040
+ return;
1041
+
1042
+ if (ctx->is_active && cpuctx->cgrp)
9881043 cpuctx->cgrp = NULL;
9891044
990
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
991
- if (add)
992
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
993
- else
994
- list_del(cpuctx_entry);
1045
+ list_del(&cpuctx->cgrp_cpuctx_entry);
9951046 }
9961047
9971048 #else /* !CONFIG_CGROUP_PERF */
....@@ -1041,7 +1092,7 @@
10411092 {
10421093 }
10431094
1044
-void
1095
+static inline void
10451096 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
10461097 {
10471098 }
....@@ -1057,11 +1108,14 @@
10571108 }
10581109
10591110 static inline void
1060
-list_update_cgroup_event(struct perf_event *event,
1061
- struct perf_event_context *ctx, bool add)
1111
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
10621112 {
10631113 }
10641114
1115
+static inline void
1116
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1117
+{
1118
+}
10651119 #endif
10661120
10671121 /*
....@@ -1131,11 +1185,16 @@
11311185 if (!cpuctx->hrtimer_active) {
11321186 cpuctx->hrtimer_active = 1;
11331187 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1134
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1188
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
11351189 }
11361190 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
11371191
11381192 return 0;
1193
+}
1194
+
1195
+static int perf_mux_hrtimer_restart_ipi(void *arg)
1196
+{
1197
+ return perf_mux_hrtimer_restart(arg);
11391198 }
11401199
11411200 void perf_pmu_disable(struct pmu *pmu)
....@@ -1182,7 +1241,21 @@
11821241
11831242 static void get_ctx(struct perf_event_context *ctx)
11841243 {
1185
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1244
+ refcount_inc(&ctx->refcount);
1245
+}
1246
+
1247
+static void *alloc_task_ctx_data(struct pmu *pmu)
1248
+{
1249
+ if (pmu->task_ctx_cache)
1250
+ return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1251
+
1252
+ return NULL;
1253
+}
1254
+
1255
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1256
+{
1257
+ if (pmu->task_ctx_cache && task_ctx_data)
1258
+ kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
11861259 }
11871260
11881261 static void free_ctx(struct rcu_head *head)
....@@ -1190,13 +1263,13 @@
11901263 struct perf_event_context *ctx;
11911264
11921265 ctx = container_of(head, struct perf_event_context, rcu_head);
1193
- kfree(ctx->task_ctx_data);
1266
+ free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
11941267 kfree(ctx);
11951268 }
11961269
11971270 static void put_ctx(struct perf_event_context *ctx)
11981271 {
1199
- if (atomic_dec_and_test(&ctx->refcount)) {
1272
+ if (refcount_dec_and_test(&ctx->refcount)) {
12001273 if (ctx->parent_ctx)
12011274 put_ctx(ctx->parent_ctx);
12021275 if (ctx->task && ctx->task != TASK_TOMBSTONE)
....@@ -1232,7 +1305,7 @@
12321305 * life-time rules separate them. That is an exiting task cannot fork, and a
12331306 * spawning task cannot (yet) exit.
12341307 *
1235
- * But remember that that these are parent<->child context relations, and
1308
+ * But remember that these are parent<->child context relations, and
12361309 * migration does not affect children, therefore these two orderings should not
12371310 * interact.
12381311 *
....@@ -1258,13 +1331,13 @@
12581331 * function.
12591332 *
12601333 * Lock order:
1261
- * cred_guard_mutex
1334
+ * exec_update_lock
12621335 * task_struct::perf_event_mutex
12631336 * perf_event_context::mutex
12641337 * perf_event::child_mutex;
12651338 * perf_event_context::lock
12661339 * perf_event::mmap_mutex
1267
- * mmap_sem
1340
+ * mmap_lock
12681341 * perf_addr_filters_head::lock
12691342 *
12701343 * cpu_hotplug_lock
....@@ -1279,7 +1352,7 @@
12791352 again:
12801353 rcu_read_lock();
12811354 ctx = READ_ONCE(event->ctx);
1282
- if (!atomic_inc_not_zero(&ctx->refcount)) {
1355
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
12831356 rcu_read_unlock();
12841357 goto again;
12851358 }
....@@ -1371,7 +1444,7 @@
13711444 /*
13721445 * Get the perf_event_context for a task and lock it.
13731446 *
1374
- * This has to cope with with the fact that until it is locked,
1447
+ * This has to cope with the fact that until it is locked,
13751448 * the context could get moved to another task.
13761449 */
13771450 static struct perf_event_context *
....@@ -1412,7 +1485,7 @@
14121485 }
14131486
14141487 if (ctx->task == TASK_TOMBSTONE ||
1415
- !atomic_inc_not_zero(&ctx->refcount)) {
1488
+ !refcount_inc_not_zero(&ctx->refcount)) {
14161489 raw_spin_unlock(&ctx->lock);
14171490 ctx = NULL;
14181491 } else {
....@@ -1540,6 +1613,30 @@
15401613 if (left->cpu > right->cpu)
15411614 return false;
15421615
1616
+#ifdef CONFIG_CGROUP_PERF
1617
+ if (left->cgrp != right->cgrp) {
1618
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
1619
+ /*
1620
+ * Left has no cgroup but right does, no cgroups come
1621
+ * first.
1622
+ */
1623
+ return true;
1624
+ }
1625
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
1626
+ /*
1627
+ * Right has no cgroup but left does, no cgroups come
1628
+ * first.
1629
+ */
1630
+ return false;
1631
+ }
1632
+ /* Two dissimilar cgroups, order by id. */
1633
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1634
+ return true;
1635
+
1636
+ return false;
1637
+ }
1638
+#endif
1639
+
15431640 if (left->group_index < right->group_index)
15441641 return true;
15451642 if (left->group_index > right->group_index)
....@@ -1619,25 +1716,48 @@
16191716 }
16201717
16211718 /*
1622
- * Get the leftmost event in the @cpu subtree.
1719
+ * Get the leftmost event in the cpu/cgroup subtree.
16231720 */
16241721 static struct perf_event *
1625
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1722
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1723
+ struct cgroup *cgrp)
16261724 {
16271725 struct perf_event *node_event = NULL, *match = NULL;
16281726 struct rb_node *node = groups->tree.rb_node;
1727
+#ifdef CONFIG_CGROUP_PERF
1728
+ u64 node_cgrp_id, cgrp_id = 0;
1729
+
1730
+ if (cgrp)
1731
+ cgrp_id = cgrp->kn->id;
1732
+#endif
16291733
16301734 while (node) {
16311735 node_event = container_of(node, struct perf_event, group_node);
16321736
16331737 if (cpu < node_event->cpu) {
16341738 node = node->rb_left;
1635
- } else if (cpu > node_event->cpu) {
1636
- node = node->rb_right;
1637
- } else {
1638
- match = node_event;
1639
- node = node->rb_left;
1739
+ continue;
16401740 }
1741
+ if (cpu > node_event->cpu) {
1742
+ node = node->rb_right;
1743
+ continue;
1744
+ }
1745
+#ifdef CONFIG_CGROUP_PERF
1746
+ node_cgrp_id = 0;
1747
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
1748
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1749
+
1750
+ if (cgrp_id < node_cgrp_id) {
1751
+ node = node->rb_left;
1752
+ continue;
1753
+ }
1754
+ if (cgrp_id > node_cgrp_id) {
1755
+ node = node->rb_right;
1756
+ continue;
1757
+ }
1758
+#endif
1759
+ match = node_event;
1760
+ node = node->rb_left;
16411761 }
16421762
16431763 return match;
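
The reworked perf_event_groups_first() above searches the red-black tree by a composite key, (cpu, cgroup id), and keeps walking left after each match so it lands on the leftmost event with that key. Here is a simplified userspace sketch of that leftmost-match search over a hand-built binary search tree; the types are illustrative, not the kernel rbtree API, and cgroup id 0 stands in for "no cgroup".

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct node {
	int cpu;
	uint64_t cgrp_id;      /* 0 means "no cgroup"; they sort first */
	struct node *left, *right;
};

/* Compare the search key (cpu, cgrp_id) against node n. */
static int key_cmp(int cpu, uint64_t cgrp_id, const struct node *n)
{
	if (cpu != n->cpu)
		return cpu < n->cpu ? -1 : 1;
	if (cgrp_id != n->cgrp_id)
		return cgrp_id < n->cgrp_id ? -1 : 1;
	return 0;
}

/* Return the leftmost node matching (cpu, cgrp_id), or NULL. */
static struct node *first_match(struct node *root, int cpu, uint64_t cgrp_id)
{
	struct node *match = NULL;

	while (root) {
		int cmp = key_cmp(cpu, cgrp_id, root);

		if (cmp < 0) {
			root = root->left;
		} else if (cmp > 0) {
			root = root->right;
		} else {
			match = root;          /* remember it, keep going left */
			root = root->left;
		}
	}
	return match;
}

int main(void)
{
	struct node dup  = { .cpu = 1, .cgrp_id = 7 };
	struct node low  = { .cpu = 0, .cgrp_id = 0, .right = &dup };
	struct node high = { .cpu = 1, .cgrp_id = 9 };
	struct node root = { .cpu = 1, .cgrp_id = 7, .left = &low, .right = &high };

	struct node *n = first_match(&root, 1, 7);

	printf("leftmost (1,7) is %s\n", n == &dup ? "the duplicate" : "the root");
	return 0;
}
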
....@@ -1650,12 +1770,26 @@
16501770 perf_event_groups_next(struct perf_event *event)
16511771 {
16521772 struct perf_event *next;
1773
+#ifdef CONFIG_CGROUP_PERF
1774
+ u64 curr_cgrp_id = 0;
1775
+ u64 next_cgrp_id = 0;
1776
+#endif
16531777
16541778 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1655
- if (next && next->cpu == event->cpu)
1656
- return next;
1779
+ if (next == NULL || next->cpu != event->cpu)
1780
+ return NULL;
16571781
1658
- return NULL;
1782
+#ifdef CONFIG_CGROUP_PERF
1783
+ if (event->cgrp && event->cgrp->css.cgroup)
1784
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1785
+
1786
+ if (next->cgrp && next->cgrp->css.cgroup)
1787
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1788
+
1789
+ if (curr_cgrp_id != next_cgrp_id)
1790
+ return NULL;
1791
+#endif
1792
+ return next;
16591793 }
16601794
16611795 /*
....@@ -1691,12 +1825,13 @@
16911825 add_event_to_groups(event, ctx);
16921826 }
16931827
1694
- list_update_cgroup_event(event, ctx, true);
1695
-
16961828 list_add_rcu(&event->event_entry, &ctx->event_list);
16971829 ctx->nr_events++;
16981830 if (event->attr.inherit_stat)
16991831 ctx->nr_stat++;
1832
+
1833
+ if (event->state > PERF_EVENT_STATE_OFF)
1834
+ perf_cgroup_event_enable(event, ctx);
17001835
17011836 ctx->generation++;
17021837 }
....@@ -1762,6 +1897,9 @@
17621897
17631898 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
17641899 size += sizeof(data->phys_addr);
1900
+
1901
+ if (sample_type & PERF_SAMPLE_CGROUP)
1902
+ size += sizeof(data->cgroup);
17651903
17661904 event->header_size = size;
17671905 }
....@@ -1873,8 +2011,6 @@
18732011
18742012 event->attach_state &= ~PERF_ATTACH_CONTEXT;
18752013
1876
- list_update_cgroup_event(event, ctx, false);
1877
-
18782014 ctx->nr_events--;
18792015 if (event->attr.inherit_stat)
18802016 ctx->nr_stat--;
....@@ -1891,14 +2027,136 @@
18912027 * of error state is by explicit re-enabling
18922028 * of the event
18932029 */
1894
- if (event->state > PERF_EVENT_STATE_OFF)
2030
+ if (event->state > PERF_EVENT_STATE_OFF) {
2031
+ perf_cgroup_event_disable(event, ctx);
18952032 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2033
+ }
18962034
18972035 ctx->generation++;
18982036 }
18992037
2038
+static int
2039
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2040
+{
2041
+ if (!has_aux(aux_event))
2042
+ return 0;
2043
+
2044
+ if (!event->pmu->aux_output_match)
2045
+ return 0;
2046
+
2047
+ return event->pmu->aux_output_match(aux_event);
2048
+}
2049
+
2050
+static void put_event(struct perf_event *event);
2051
+static void event_sched_out(struct perf_event *event,
2052
+ struct perf_cpu_context *cpuctx,
2053
+ struct perf_event_context *ctx);
2054
+
2055
+static void perf_put_aux_event(struct perf_event *event)
2056
+{
2057
+ struct perf_event_context *ctx = event->ctx;
2058
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2059
+ struct perf_event *iter;
2060
+
2061
+ /*
2062
+ * If event uses aux_event tear down the link
2063
+ */
2064
+ if (event->aux_event) {
2065
+ iter = event->aux_event;
2066
+ event->aux_event = NULL;
2067
+ put_event(iter);
2068
+ return;
2069
+ }
2070
+
2071
+ /*
2072
+ * If the event is an aux_event, tear down all links to
2073
+ * it from other events.
2074
+ */
2075
+ for_each_sibling_event(iter, event->group_leader) {
2076
+ if (iter->aux_event != event)
2077
+ continue;
2078
+
2079
+ iter->aux_event = NULL;
2080
+ put_event(event);
2081
+
2082
+ /*
2083
+ * If it's ACTIVE, schedule it out and put it into ERROR
2084
+ * state so that we don't try to schedule it again. Note
2085
+ * that perf_event_enable() will clear the ERROR status.
2086
+ */
2087
+ event_sched_out(iter, cpuctx, ctx);
2088
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2089
+ }
2090
+}
2091
+
2092
+static bool perf_need_aux_event(struct perf_event *event)
2093
+{
2094
+ return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2095
+}
2096
+
2097
+static int perf_get_aux_event(struct perf_event *event,
2098
+ struct perf_event *group_leader)
2099
+{
2100
+ /*
2101
+ * Our group leader must be an aux event if we want to be
2102
+ * an aux_output. This way, the aux event will precede its
2103
+ * aux_output events in the group, and therefore will always
2104
+ * schedule first.
2105
+ */
2106
+ if (!group_leader)
2107
+ return 0;
2108
+
2109
+ /*
2110
+ * aux_output and aux_sample_size are mutually exclusive.
2111
+ */
2112
+ if (event->attr.aux_output && event->attr.aux_sample_size)
2113
+ return 0;
2114
+
2115
+ if (event->attr.aux_output &&
2116
+ !perf_aux_output_match(event, group_leader))
2117
+ return 0;
2118
+
2119
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2120
+ return 0;
2121
+
2122
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
2123
+ return 0;
2124
+
2125
+ /*
2126
+ * Link aux_outputs to their aux event; this is undone in
2127
+ * perf_group_detach() by perf_put_aux_event(). When the
2128
+ * group in torn down, the aux_output events loose their
2129
+ * link to the aux_event and can't schedule any more.
2130
+ */
2131
+ event->aux_event = group_leader;
2132
+
2133
+ return 1;
2134
+}
2135
+
2136
+static inline struct list_head *get_event_list(struct perf_event *event)
2137
+{
2138
+ struct perf_event_context *ctx = event->ctx;
2139
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2140
+}
2141
+
2142
+/*
2143
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2144
+ * cannot exist on their own, schedule them out and move them into the ERROR
2145
+ * state. Also see _perf_event_enable(), it will not be able to recover
2146
+ * this ERROR state.
2147
+ */
2148
+static inline void perf_remove_sibling_event(struct perf_event *event)
2149
+{
2150
+ struct perf_event_context *ctx = event->ctx;
2151
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2152
+
2153
+ event_sched_out(event, cpuctx, ctx);
2154
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2155
+}
2156
+
19002157 static void perf_group_detach(struct perf_event *event)
19012158 {
2159
+ struct perf_event *leader = event->group_leader;
19022160 struct perf_event *sibling, *tmp;
19032161 struct perf_event_context *ctx = event->ctx;
19042162
....@@ -1912,10 +2170,12 @@
19122170
19132171 event->attach_state &= ~PERF_ATTACH_GROUP;
19142172
2173
+ perf_put_aux_event(event);
2174
+
19152175 /*
19162176 * If this is a sibling, remove it from its group.
19172177 */
1918
- if (event->group_leader != event) {
2178
+ if (leader != event) {
19192179 list_del_init(&event->sibling_list);
19202180 event->group_leader->nr_siblings--;
19212181 goto out;
....@@ -1928,6 +2188,9 @@
19282188 */
19292189 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
19302190
2191
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2192
+ perf_remove_sibling_event(sibling);
2193
+
19312194 sibling->group_leader = sibling;
19322195 list_del_init(&sibling->sibling_list);
19332196
....@@ -1937,22 +2200,18 @@
19372200 if (!RB_EMPTY_NODE(&event->group_node)) {
19382201 add_event_to_groups(sibling, event->ctx);
19392202
1940
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1941
- struct list_head *list = sibling->attr.pinned ?
1942
- &ctx->pinned_active : &ctx->flexible_active;
1943
-
1944
- list_add_tail(&sibling->active_list, list);
1945
- }
2203
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2204
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
19462205 }
19472206
19482207 WARN_ON_ONCE(sibling->ctx != event->ctx);
19492208 }
19502209
19512210 out:
1952
- perf_event__header_size(event->group_leader);
1953
-
1954
- for_each_sibling_event(tmp, event->group_leader)
2211
+ for_each_sibling_event(tmp, leader)
19552212 perf_event__header_size(tmp);
2213
+
2214
+ perf_event__header_size(leader);
19562215 }
19572216
19582217 static bool is_orphaned_event(struct perf_event *event)
....@@ -2021,6 +2280,7 @@
20212280
20222281 if (READ_ONCE(event->pending_disable) >= 0) {
20232282 WRITE_ONCE(event->pending_disable, -1);
2283
+ perf_cgroup_event_disable(event, ctx);
20242284 state = PERF_EVENT_STATE_OFF;
20252285 }
20262286 perf_event_set_state(event, state);
....@@ -2058,9 +2318,6 @@
20582318 event_sched_out(event, cpuctx, ctx);
20592319
20602320 perf_pmu_enable(ctx->pmu);
2061
-
2062
- if (group_event->attr.exclusive)
2063
- cpuctx->exclusive = 0;
20642321 }
20652322
20662323 #define DETACH_GROUP 0x01UL
....@@ -2091,6 +2348,7 @@
20912348
20922349 if (!ctx->nr_events && ctx->is_active) {
20932350 ctx->is_active = 0;
2351
+ ctx->rotate_necessary = 0;
20942352 if (ctx->task) {
20952353 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
20962354 cpuctx->task_ctx = NULL;
....@@ -2157,6 +2415,7 @@
21572415 event_sched_out(event, cpuctx, ctx);
21582416
21592417 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2418
+ perf_cgroup_event_disable(event, ctx);
21602419 }
21612420
21622421 /*
....@@ -2164,7 +2423,7 @@
21642423 *
21652424 * If event->ctx is a cloned context, callers must make sure that
21662425 * every task struct that event->ctx->task could possibly point to
2167
- * remains valid. This condition is satisifed when called through
2426
+ * remains valid. This condition is satisfied when called through
21682427 * perf_event_for_each_child or perf_event_for_each because they
21692428 * hold the top-level event's child_mutex, so any descendant that
21702429 * goes to exit will block in perf_event_exit_event().
....@@ -2238,7 +2497,7 @@
22382497 * But this is a bit hairy.
22392498 *
22402499 * So instead, we have an explicit cgroup call to remain
2241
- * within the time time source all along. We believe it
2500
+ * within the time source all along. We believe it
22422501 * is cleaner and simpler to understand.
22432502 */
22442503 if (is_cgroup_event(event))
....@@ -2258,6 +2517,8 @@
22582517 struct perf_event_context *ctx)
22592518 {
22602519 int ret = 0;
2520
+
2521
+ WARN_ON_ONCE(event->ctx != ctx);
22612522
22622523 lockdep_assert_held(&ctx->lock);
22632524
....@@ -2325,11 +2586,8 @@
23252586
23262587 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
23272588
2328
- if (event_sched_in(group_event, cpuctx, ctx)) {
2329
- pmu->cancel_txn(pmu);
2330
- perf_mux_hrtimer_restart(cpuctx);
2331
- return -EAGAIN;
2332
- }
2589
+ if (event_sched_in(group_event, cpuctx, ctx))
2590
+ goto error;
23332591
23342592 /*
23352593 * Schedule in siblings as one group (if any):
....@@ -2358,10 +2616,8 @@
23582616 }
23592617 event_sched_out(group_event, cpuctx, ctx);
23602618
2619
+error:
23612620 pmu->cancel_txn(pmu);
2362
-
2363
- perf_mux_hrtimer_restart(cpuctx);
2364
-
23652621 return -EAGAIN;
23662622 }
23672623
....@@ -2387,7 +2643,7 @@
23872643 * If this group is exclusive and there are already
23882644 * events on the CPU, it can't go on.
23892645 */
2390
- if (event->attr.exclusive && cpuctx->active_oncpu)
2646
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
23912647 return 0;
23922648 /*
23932649 * Otherwise, try to add it if all previous groups were able
....@@ -2488,6 +2744,16 @@
24882744 perf_pmu_enable(cpuctx->ctx.pmu);
24892745 }
24902746
2747
+void perf_pmu_resched(struct pmu *pmu)
2748
+{
2749
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2750
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
2751
+
2752
+ perf_ctx_lock(cpuctx, task_ctx);
2753
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2754
+ perf_ctx_unlock(cpuctx, task_ctx);
2755
+}
2756
+
24912757 /*
24922758 * Cross CPU call to install and enable a performance event
24932759 *
....@@ -2528,7 +2794,7 @@
25282794 }
25292795
25302796 #ifdef CONFIG_CGROUP_PERF
2531
- if (is_cgroup_event(event)) {
2797
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
25322798 /*
25332799 * If the current cgroup doesn't match the event's
25342800 * cgroup, we should not try to schedule it.
....@@ -2580,6 +2846,25 @@
25802846 * will be 'complete'. See perf_iterate_sb_cpu().
25812847 */
25822848 smp_store_release(&event->ctx, ctx);
2849
+
2850
+ /*
2851
+ * perf_event_attr::disabled events will not run and can be initialized
2852
+ * without IPI. Except when this is the first event for the context, in
2853
+ * that case we need the magic of the IPI to set ctx->is_active.
2854
+ *
2855
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
2856
+ * event will issue the IPI and reprogram the hardware.
2857
+ */
2858
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2859
+ raw_spin_lock_irq(&ctx->lock);
2860
+ if (ctx->task == TASK_TOMBSTONE) {
2861
+ raw_spin_unlock_irq(&ctx->lock);
2862
+ return;
2863
+ }
2864
+ add_event_to_ctx(event, ctx);
2865
+ raw_spin_unlock_irq(&ctx->lock);
2866
+ return;
2867
+ }
25832868
25842869 if (!task) {
25852870 cpu_function_call(cpu, __perf_install_in_context, event);
....@@ -2669,6 +2954,7 @@
26692954 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
26702955
26712956 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2957
+ perf_cgroup_event_enable(event, ctx);
26722958
26732959 if (!ctx->is_active)
26742960 return;
....@@ -2710,6 +2996,7 @@
27102996 raw_spin_lock_irq(&ctx->lock);
27112997 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
27122998 event->state < PERF_EVENT_STATE_ERROR) {
2999
+out:
27133000 raw_spin_unlock_irq(&ctx->lock);
27143001 return;
27153002 }
....@@ -2721,8 +3008,16 @@
27213008 * has gone back into error state, as distinct from the task having
27223009 * been scheduled away before the cross-call arrived.
27233010 */
2724
- if (event->state == PERF_EVENT_STATE_ERROR)
3011
+ if (event->state == PERF_EVENT_STATE_ERROR) {
3012
+ /*
3013
+ * Detached SIBLING events cannot leave ERROR state.
3014
+ */
3015
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
3016
+ event->group_leader == event)
3017
+ goto out;
3018
+
27253019 event->state = PERF_EVENT_STATE_OFF;
3020
+ }
27263021 raw_spin_unlock_irq(&ctx->lock);
27273022
27283023 event_function_call(event, __perf_event_enable, NULL);
....@@ -2826,7 +3121,7 @@
28263121 * pre-existing mappings, called once when new filters arrive via SET_FILTER
28273122 * ioctl;
28283123 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2829
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
3124
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
28303125 * for reading;
28313126 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
28323127 * of exec.
....@@ -2966,6 +3261,13 @@
29663261 if (is_active & EVENT_FLEXIBLE) {
29673262 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
29683263 group_sched_out(event, cpuctx, ctx);
3264
+
3265
+ /*
3266
+ * Since we cleared EVENT_FLEXIBLE, also clear
3267
+ * rotate_necessary; it will be reset by
3268
+ * ctx_flexible_sched_in() when needed.
3269
+ */
3270
+ ctx->rotate_necessary = 0;
29693271 }
29703272 perf_pmu_enable(ctx->pmu);
29713273 }
....@@ -3080,10 +3382,12 @@
30803382 struct perf_event_context *parent, *next_parent;
30813383 struct perf_cpu_context *cpuctx;
30823384 int do_switch = 1;
3385
+ struct pmu *pmu;
30833386
30843387 if (likely(!ctx))
30853388 return;
30863389
3390
+ pmu = ctx->pmu;
30873391 cpuctx = __get_cpu_context(ctx);
30883392 if (!cpuctx->task_ctx)
30893393 return;
....@@ -3113,10 +3417,27 @@
31133417 raw_spin_lock(&ctx->lock);
31143418 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
31153419 if (context_equiv(ctx, next_ctx)) {
3420
+
31163421 WRITE_ONCE(ctx->task, next);
31173422 WRITE_ONCE(next_ctx->task, task);
31183423
3119
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3424
+ perf_pmu_disable(pmu);
3425
+
3426
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3427
+ pmu->sched_task(ctx, false);
3428
+
3429
+ /*
3430
+ * PMU specific parts of task perf context can require
3431
+ * additional synchronization. As an example of such
3432
+ * synchronization see implementation details of Intel
3433
+ * LBR call stack data profiling;
3434
+ */
3435
+ if (pmu->swap_task_ctx)
3436
+ pmu->swap_task_ctx(ctx, next_ctx);
3437
+ else
3438
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3439
+
3440
+ perf_pmu_enable(pmu);
31203441
31213442 /*
31223443 * RCU_INIT_POINTER here is safe because we've not
....@@ -3140,7 +3461,13 @@
31403461
31413462 if (do_switch) {
31423463 raw_spin_lock(&ctx->lock);
3464
+ perf_pmu_disable(pmu);
3465
+
3466
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3467
+ pmu->sched_task(ctx, false);
31433468 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3469
+
3470
+ perf_pmu_enable(pmu);
31443471 raw_spin_unlock(&ctx->lock);
31453472 }
31463473 }
....@@ -3176,29 +3503,39 @@
31763503 * PEBS requires this to provide PID/TID information. This requires we flush
31773504 * all queued PEBS records before we context switch to a new task.
31783505 */
3506
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3507
+{
3508
+ struct pmu *pmu;
3509
+
3510
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3511
+
3512
+ if (WARN_ON_ONCE(!pmu->sched_task))
3513
+ return;
3514
+
3515
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3516
+ perf_pmu_disable(pmu);
3517
+
3518
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
3519
+
3520
+ perf_pmu_enable(pmu);
3521
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3522
+}
3523
+
31793524 static void perf_pmu_sched_task(struct task_struct *prev,
31803525 struct task_struct *next,
31813526 bool sched_in)
31823527 {
31833528 struct perf_cpu_context *cpuctx;
3184
- struct pmu *pmu;
31853529
31863530 if (prev == next)
31873531 return;
31883532
31893533 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3190
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3191
-
3192
- if (WARN_ON_ONCE(!pmu->sched_task))
3534
+ /* will be handled in perf_event_context_sched_in/out */
3535
+ if (cpuctx->task_ctx)
31933536 continue;
31943537
3195
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3196
- perf_pmu_disable(pmu);
3197
-
3198
- pmu->sched_task(cpuctx->task_ctx, sched_in);
3199
-
3200
- perf_pmu_enable(pmu);
3201
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3538
+ __perf_pmu_sched_task(cpuctx, sched_in);
32023539 }
32033540 }
32043541
....@@ -3251,83 +3588,149 @@
32513588 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
32523589 }
32533590
3254
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3255
- int (*func)(struct perf_event *, void *), void *data)
3591
+static bool perf_less_group_idx(const void *l, const void *r)
32563592 {
3257
- struct perf_event **evt, *evt1, *evt2;
3593
+ const struct perf_event *le = *(const struct perf_event **)l;
3594
+ const struct perf_event *re = *(const struct perf_event **)r;
3595
+
3596
+ return le->group_index < re->group_index;
3597
+}
3598
+
3599
+static void swap_ptr(void *l, void *r)
3600
+{
3601
+ void **lp = l, **rp = r;
3602
+
3603
+ swap(*lp, *rp);
3604
+}
3605
+
3606
+static const struct min_heap_callbacks perf_min_heap = {
3607
+ .elem_size = sizeof(struct perf_event *),
3608
+ .less = perf_less_group_idx,
3609
+ .swp = swap_ptr,
3610
+};
3611
+
3612
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
3613
+{
3614
+ struct perf_event **itrs = heap->data;
3615
+
3616
+ if (event) {
3617
+ itrs[heap->nr] = event;
3618
+ heap->nr++;
3619
+ }
3620
+}
3621
+
3622
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3623
+ struct perf_event_groups *groups, int cpu,
3624
+ int (*func)(struct perf_event *, void *),
3625
+ void *data)
3626
+{
3627
+#ifdef CONFIG_CGROUP_PERF
3628
+ struct cgroup_subsys_state *css = NULL;
3629
+#endif
3630
+ /* Space for per CPU and/or any CPU event iterators. */
3631
+ struct perf_event *itrs[2];
3632
+ struct min_heap event_heap;
3633
+ struct perf_event **evt;
32583634 int ret;
32593635
3260
- evt1 = perf_event_groups_first(groups, -1);
3261
- evt2 = perf_event_groups_first(groups, cpu);
3636
+ if (cpuctx) {
3637
+ event_heap = (struct min_heap){
3638
+ .data = cpuctx->heap,
3639
+ .nr = 0,
3640
+ .size = cpuctx->heap_size,
3641
+ };
32623642
3263
- while (evt1 || evt2) {
3264
- if (evt1 && evt2) {
3265
- if (evt1->group_index < evt2->group_index)
3266
- evt = &evt1;
3267
- else
3268
- evt = &evt2;
3269
- } else if (evt1) {
3270
- evt = &evt1;
3271
- } else {
3272
- evt = &evt2;
3273
- }
3643
+ lockdep_assert_held(&cpuctx->ctx.lock);
32743644
3645
+#ifdef CONFIG_CGROUP_PERF
3646
+ if (cpuctx->cgrp)
3647
+ css = &cpuctx->cgrp->css;
3648
+#endif
3649
+ } else {
3650
+ event_heap = (struct min_heap){
3651
+ .data = itrs,
3652
+ .nr = 0,
3653
+ .size = ARRAY_SIZE(itrs),
3654
+ };
3655
+ /* Events not within a CPU context may be on any CPU. */
3656
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3657
+ }
3658
+ evt = event_heap.data;
3659
+
3660
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3661
+
3662
+#ifdef CONFIG_CGROUP_PERF
3663
+ for (; css; css = css->parent)
3664
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3665
+#endif
3666
+
3667
+ min_heapify_all(&event_heap, &perf_min_heap);
3668
+
3669
+ while (event_heap.nr) {
32753670 ret = func(*evt, data);
32763671 if (ret)
32773672 return ret;
32783673
32793674 *evt = perf_event_groups_next(*evt);
3280
- }
3281
-
3282
- return 0;
3283
-}
3284
-
3285
-struct sched_in_data {
3286
- struct perf_event_context *ctx;
3287
- struct perf_cpu_context *cpuctx;
3288
- int can_add_hw;
3289
-};
3290
-
3291
-static int pinned_sched_in(struct perf_event *event, void *data)
3292
-{
3293
- struct sched_in_data *sid = data;
3294
-
3295
- if (event->state <= PERF_EVENT_STATE_OFF)
3296
- return 0;
3297
-
3298
- if (!event_filter_match(event))
3299
- return 0;
3300
-
3301
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3302
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3303
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3304
- }
3305
-
3306
- /*
3307
- * If this pinned group hasn't been scheduled,
3308
- * put it in error state.
3309
- */
3310
- if (event->state == PERF_EVENT_STATE_INACTIVE)
3311
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3312
-
3313
- return 0;
3314
-}
3315
-
3316
-static int flexible_sched_in(struct perf_event *event, void *data)
3317
-{
3318
- struct sched_in_data *sid = data;
3319
-
3320
- if (event->state <= PERF_EVENT_STATE_OFF)
3321
- return 0;
3322
-
3323
- if (!event_filter_match(event))
3324
- return 0;
3325
-
3326
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3327
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3328
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3675
+ if (*evt)
3676
+ min_heapify(&event_heap, 0, &perf_min_heap);
33293677 else
3330
- sid->can_add_hw = 0;
3678
+ min_heap_pop(&event_heap, &perf_min_heap);
3679
+ }
3680
+
3681
+ return 0;
3682
+}
3683
+
3684
+static inline bool event_update_userpage(struct perf_event *event)
3685
+{
3686
+ if (likely(!atomic_read(&event->mmap_count)))
3687
+ return false;
3688
+
3689
+ perf_event_update_time(event);
3690
+ perf_set_shadow_time(event, event->ctx);
3691
+ perf_event_update_userpage(event);
3692
+
3693
+ return true;
3694
+}
3695
+
3696
+static inline void group_update_userpage(struct perf_event *group_event)
3697
+{
3698
+ struct perf_event *event;
3699
+
3700
+ if (!event_update_userpage(group_event))
3701
+ return;
3702
+
3703
+ for_each_sibling_event(event, group_event)
3704
+ event_update_userpage(event);
3705
+}
3706
+
3707
+static int merge_sched_in(struct perf_event *event, void *data)
3708
+{
3709
+ struct perf_event_context *ctx = event->ctx;
3710
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3711
+ int *can_add_hw = data;
3712
+
3713
+ if (event->state <= PERF_EVENT_STATE_OFF)
3714
+ return 0;
3715
+
3716
+ if (!event_filter_match(event))
3717
+ return 0;
3718
+
3719
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3720
+ if (!group_sched_in(event, cpuctx, ctx))
3721
+ list_add_tail(&event->active_list, get_event_list(event));
3722
+ }
3723
+
3724
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
3725
+ *can_add_hw = 0;
3726
+ if (event->attr.pinned) {
3727
+ perf_cgroup_event_disable(event, ctx);
3728
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3729
+ } else {
3730
+ ctx->rotate_necessary = 1;
3731
+ perf_mux_hrtimer_restart(cpuctx);
3732
+ group_update_userpage(event);
3733
+ }
33313734 }
33323735
33333736 return 0;
....@@ -3337,30 +3740,28 @@
33373740 ctx_pinned_sched_in(struct perf_event_context *ctx,
33383741 struct perf_cpu_context *cpuctx)
33393742 {
3340
- struct sched_in_data sid = {
3341
- .ctx = ctx,
3342
- .cpuctx = cpuctx,
3343
- .can_add_hw = 1,
3344
- };
3743
+ int can_add_hw = 1;
33453744
3346
- visit_groups_merge(&ctx->pinned_groups,
3745
+ if (ctx != &cpuctx->ctx)
3746
+ cpuctx = NULL;
3747
+
3748
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
33473749 smp_processor_id(),
3348
- pinned_sched_in, &sid);
3750
+ merge_sched_in, &can_add_hw);
33493751 }
33503752
33513753 static void
33523754 ctx_flexible_sched_in(struct perf_event_context *ctx,
33533755 struct perf_cpu_context *cpuctx)
33543756 {
3355
- struct sched_in_data sid = {
3356
- .ctx = ctx,
3357
- .cpuctx = cpuctx,
3358
- .can_add_hw = 1,
3359
- };
3757
+ int can_add_hw = 1;
33603758
3361
- visit_groups_merge(&ctx->flexible_groups,
3759
+ if (ctx != &cpuctx->ctx)
3760
+ cpuctx = NULL;
3761
+
3762
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
33623763 smp_processor_id(),
3363
- flexible_sched_in, &sid);
3764
+ merge_sched_in, &can_add_hw);
33643765 }
33653766
33663767 static void
....@@ -3419,10 +3820,14 @@
34193820 struct task_struct *task)
34203821 {
34213822 struct perf_cpu_context *cpuctx;
3823
+ struct pmu *pmu = ctx->pmu;
34223824
34233825 cpuctx = __get_cpu_context(ctx);
3424
- if (cpuctx->task_ctx == ctx)
3826
+ if (cpuctx->task_ctx == ctx) {
3827
+ if (cpuctx->sched_cb_usage)
3828
+ __perf_pmu_sched_task(cpuctx, true);
34253829 return;
3830
+ }
34263831
34273832 perf_ctx_lock(cpuctx, ctx);
34283833 /*
....@@ -3432,7 +3837,7 @@
34323837 if (!ctx->nr_events)
34333838 goto unlock;
34343839
3435
- perf_pmu_disable(ctx->pmu);
3840
+ perf_pmu_disable(pmu);
34363841 /*
34373842 * We want to keep the following priority order:
34383843 * cpu pinned (that don't need to move), task pinned,
....@@ -3444,7 +3849,11 @@
34443849 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
34453850 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
34463851 perf_event_sched_in(cpuctx, ctx, task);
3447
- perf_pmu_enable(ctx->pmu);
3852
+
3853
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
3854
+ pmu->sched_task(cpuctx->task_ctx, true);
3855
+
3856
+ perf_pmu_enable(pmu);
34483857
34493858 unlock:
34503859 perf_ctx_unlock(cpuctx, ctx);
....@@ -3685,34 +4094,45 @@
36854094 perf_event_groups_insert(&ctx->flexible_groups, event);
36864095 }
36874096
4097
+/* pick an event from the flexible_groups to rotate */
36884098 static inline struct perf_event *
3689
-ctx_first_active(struct perf_event_context *ctx)
4099
+ctx_event_to_rotate(struct perf_event_context *ctx)
36904100 {
3691
- return list_first_entry_or_null(&ctx->flexible_active,
3692
- struct perf_event, active_list);
4101
+ struct perf_event *event;
4102
+
4103
+ /* pick the first active flexible event */
4104
+ event = list_first_entry_or_null(&ctx->flexible_active,
4105
+ struct perf_event, active_list);
4106
+
4107
+ /* if no active flexible event, pick the first event */
4108
+ if (!event) {
4109
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4110
+ typeof(*event), group_node);
4111
+ }
4112
+
4113
+ /*
4114
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4115
+ * finds there are unschedulable events, it will set it again.
4116
+ */
4117
+ ctx->rotate_necessary = 0;
4118
+
4119
+ return event;
36934120 }
36944121
36954122 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
36964123 {
36974124 struct perf_event *cpu_event = NULL, *task_event = NULL;
3698
- bool cpu_rotate = false, task_rotate = false;
3699
- struct perf_event_context *ctx = NULL;
4125
+ struct perf_event_context *task_ctx = NULL;
4126
+ int cpu_rotate, task_rotate;
37004127
37014128 /*
37024129 * Since we run this from IRQ context, nobody can install new
37034130 * events, thus the event count values are stable.
37044131 */
37054132
3706
- if (cpuctx->ctx.nr_events) {
3707
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3708
- cpu_rotate = true;
3709
- }
3710
-
3711
- ctx = cpuctx->task_ctx;
3712
- if (ctx && ctx->nr_events) {
3713
- if (ctx->nr_events != ctx->nr_active)
3714
- task_rotate = true;
3715
- }
4133
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
4134
+ task_ctx = cpuctx->task_ctx;
4135
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
37164136
37174137 if (!(cpu_rotate || task_rotate))
37184138 return false;
....@@ -3721,25 +4141,25 @@
37214141 perf_pmu_disable(cpuctx->ctx.pmu);
37224142
37234143 if (task_rotate)
3724
- task_event = ctx_first_active(ctx);
4144
+ task_event = ctx_event_to_rotate(task_ctx);
37254145 if (cpu_rotate)
3726
- cpu_event = ctx_first_active(&cpuctx->ctx);
4146
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
37274147
37284148 /*
37294149 * As per the order given at ctx_resched() first 'pop' task flexible
37304150 * and then, if needed CPU flexible.
37314151 */
3732
- if (task_event || (ctx && cpu_event))
3733
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
4152
+ if (task_event || (task_ctx && cpu_event))
4153
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
37344154 if (cpu_event)
37354155 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
37364156
37374157 if (task_event)
3738
- rotate_ctx(ctx, task_event);
4158
+ rotate_ctx(task_ctx, task_event);
37394159 if (cpu_event)
37404160 rotate_ctx(&cpuctx->ctx, cpu_event);
37414161
3742
- perf_event_sched_in(cpuctx, ctx, current);
4162
+ perf_event_sched_in(cpuctx, task_ctx, current);
37434163
37444164 perf_pmu_enable(cpuctx->ctx.pmu);
37454165 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
....@@ -3983,6 +4403,7 @@
39834403
39844404 return ret;
39854405 }
4406
+EXPORT_SYMBOL_GPL(perf_event_read_local);
39864407
39874408 static int perf_event_read(struct perf_event *event, bool group)
39884409 {
....@@ -4074,7 +4495,7 @@
40744495 INIT_LIST_HEAD(&ctx->event_list);
40754496 INIT_LIST_HEAD(&ctx->pinned_active);
40764497 INIT_LIST_HEAD(&ctx->flexible_active);
4077
- atomic_set(&ctx->refcount, 1);
4498
+ refcount_set(&ctx->refcount, 1);
40784499 }
40794500
40804501 static struct perf_event_context *
....@@ -4087,10 +4508,8 @@
40874508 return NULL;
40884509
40894510 __perf_event_init_context(ctx);
4090
- if (task) {
4091
- ctx->task = task;
4092
- get_task_struct(task);
4093
- }
4511
+ if (task)
4512
+ ctx->task = get_task_struct(task);
40944513 ctx->pmu = pmu;
40954514
40964515 return ctx;
....@@ -4152,7 +4571,7 @@
41524571 goto errout;
41534572
41544573 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4155
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4574
+ task_ctx_data = alloc_task_ctx_data(pmu);
41564575 if (!task_ctx_data) {
41574576 err = -ENOMEM;
41584577 goto errout;
....@@ -4210,11 +4629,11 @@
42104629 }
42114630 }
42124631
4213
- kfree(task_ctx_data);
4632
+ free_task_ctx_data(pmu, task_ctx_data);
42144633 return ctx;
42154634
42164635 errout:
4217
- kfree(task_ctx_data);
4636
+ free_task_ctx_data(pmu, task_ctx_data);
42184637 return ERR_PTR(err);
42194638 }
42204639
....@@ -4233,7 +4652,7 @@
42334652 }
42344653
42354654 static void ring_buffer_attach(struct perf_event *event,
4236
- struct ring_buffer *rb);
4655
+ struct perf_buffer *rb);
42374656
42384657 static void detach_sb_event(struct perf_event *event)
42394658 {
....@@ -4256,8 +4675,9 @@
42564675
42574676 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
42584677 attr->comm || attr->comm_exec ||
4259
- attr->task ||
4260
- attr->context_switch)
4678
+ attr->task || attr->ksymbol ||
4679
+ attr->context_switch || attr->text_poke ||
4680
+ attr->bpf_event)
42614681 return true;
42624682 return false;
42634683 }
....@@ -4306,7 +4726,7 @@
43064726 if (event->parent)
43074727 return;
43084728
4309
- if (event->attach_state & PERF_ATTACH_TASK)
4729
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
43104730 dec = true;
43114731 if (event->attr.mmap || event->attr.mmap_data)
43124732 atomic_dec(&nr_mmap_events);
....@@ -4314,6 +4734,8 @@
43144734 atomic_dec(&nr_comm_events);
43154735 if (event->attr.namespaces)
43164736 atomic_dec(&nr_namespaces_events);
4737
+ if (event->attr.cgroup)
4738
+ atomic_dec(&nr_cgroup_events);
43174739 if (event->attr.task)
43184740 atomic_dec(&nr_task_events);
43194741 if (event->attr.freq)
....@@ -4326,6 +4748,12 @@
43264748 dec = true;
43274749 if (has_branch_stack(event))
43284750 dec = true;
4751
+ if (event->attr.ksymbol)
4752
+ atomic_dec(&nr_ksymbol_events);
4753
+ if (event->attr.bpf_event)
4754
+ atomic_dec(&nr_bpf_events);
4755
+ if (event->attr.text_poke)
4756
+ atomic_dec(&nr_text_poke_events);
43294757
43304758 if (dec) {
43314759 if (!atomic_add_unless(&perf_sched_count, -1, 1))
....@@ -4909,7 +5337,7 @@
49095337 static __poll_t perf_poll(struct file *file, poll_table *wait)
49105338 {
49115339 struct perf_event *event = file->private_data;
4912
- struct ring_buffer *rb;
5340
+ struct perf_buffer *rb;
49135341 __poll_t events = EPOLLHUP;
49145342
49155343 poll_wait(file, &event->waitq, wait);
....@@ -4935,6 +5363,24 @@
49355363 local64_set(&event->count, 0);
49365364 perf_event_update_userpage(event);
49375365 }
5366
+
5367
+/* Assume it's not an event with inherit set. */
5368
+u64 perf_event_pause(struct perf_event *event, bool reset)
5369
+{
5370
+ struct perf_event_context *ctx;
5371
+ u64 count;
5372
+
5373
+ ctx = perf_event_ctx_lock(event);
5374
+ WARN_ON_ONCE(event->attr.inherit);
5375
+ _perf_event_disable(event);
5376
+ count = local64_read(&event->count);
5377
+ if (reset)
5378
+ local64_set(&event->count, 0);
5379
+ perf_event_ctx_unlock(event, ctx);
5380
+
5381
+ return count;
5382
+}
5383
+EXPORT_SYMBOL_GPL(perf_event_pause);
49385384
49395385 /*
49405386 * Holding the top-level event's child_mutex means that any
....@@ -5013,15 +5459,10 @@
50135459 return event->pmu->check_period(event, value);
50145460 }
50155461
5016
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
5462
+static int _perf_event_period(struct perf_event *event, u64 value)
50175463 {
5018
- u64 value;
5019
-
50205464 if (!is_sampling_event(event))
50215465 return -EINVAL;
5022
-
5023
- if (copy_from_user(&value, arg, sizeof(value)))
5024
- return -EFAULT;
50255466
50265467 if (!value)
50275468 return -EINVAL;
....@@ -5039,6 +5480,19 @@
50395480
50405481 return 0;
50415482 }
5483
+
5484
+int perf_event_period(struct perf_event *event, u64 value)
5485
+{
5486
+ struct perf_event_context *ctx;
5487
+ int ret;
5488
+
5489
+ ctx = perf_event_ctx_lock(event);
5490
+ ret = _perf_event_period(event, value);
5491
+ perf_event_ctx_unlock(event, ctx);
5492
+
5493
+ return ret;
5494
+}
5495
+EXPORT_SYMBOL_GPL(perf_event_period);
50425496
50435497 static const struct file_operations perf_fops;
50445498
....@@ -5083,8 +5537,14 @@
50835537 return _perf_event_refresh(event, arg);
50845538
50855539 case PERF_EVENT_IOC_PERIOD:
5086
- return perf_event_period(event, (u64 __user *)arg);
5540
+ {
5541
+ u64 value;
50875542
5543
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5544
+ return -EFAULT;
5545
+
5546
+ return _perf_event_period(event, value);
5547
+ }
50885548 case PERF_EVENT_IOC_ID:
50895549 {
50905550 u64 id = primary_event_id(event);
....@@ -5119,7 +5579,7 @@
51195579 return perf_event_set_bpf_prog(event, arg);
51205580
51215581 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5122
- struct ring_buffer *rb;
5582
+ struct perf_buffer *rb;
51235583
51245584 rcu_read_lock();
51255585 rb = rcu_dereference(event->rb);
....@@ -5255,7 +5715,7 @@
52555715 static void perf_event_init_userpage(struct perf_event *event)
52565716 {
52575717 struct perf_event_mmap_page *userpg;
5258
- struct ring_buffer *rb;
5718
+ struct perf_buffer *rb;
52595719
52605720 rcu_read_lock();
52615721 rb = rcu_dereference(event->rb);
....@@ -5287,7 +5747,7 @@
52875747 void perf_event_update_userpage(struct perf_event *event)
52885748 {
52895749 struct perf_event_mmap_page *userpg;
5290
- struct ring_buffer *rb;
5750
+ struct perf_buffer *rb;
52915751 u64 enabled, running, now;
52925752
52935753 rcu_read_lock();
....@@ -5338,7 +5798,7 @@
53385798 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
53395799 {
53405800 struct perf_event *event = vmf->vma->vm_file->private_data;
5341
- struct ring_buffer *rb;
5801
+ struct perf_buffer *rb;
53425802 vm_fault_t ret = VM_FAULT_SIGBUS;
53435803
53445804 if (vmf->flags & FAULT_FLAG_MKWRITE) {
....@@ -5371,10 +5831,12 @@
53715831 }
53725832
53735833 static void ring_buffer_attach(struct perf_event *event,
5374
- struct ring_buffer *rb)
5834
+ struct perf_buffer *rb)
53755835 {
5376
- struct ring_buffer *old_rb = NULL;
5836
+ struct perf_buffer *old_rb = NULL;
53775837 unsigned long flags;
5838
+
5839
+ WARN_ON_ONCE(event->parent);
53785840
53795841 if (event->rb) {
53805842 /*
....@@ -5431,7 +5893,10 @@
54315893
54325894 static void ring_buffer_wakeup(struct perf_event *event)
54335895 {
5434
- struct ring_buffer *rb;
5896
+ struct perf_buffer *rb;
5897
+
5898
+ if (event->parent)
5899
+ event = event->parent;
54355900
54365901 rcu_read_lock();
54375902 rb = rcu_dereference(event->rb);
....@@ -5442,14 +5907,17 @@
54425907 rcu_read_unlock();
54435908 }
54445909
5445
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
5910
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
54465911 {
5447
- struct ring_buffer *rb;
5912
+ struct perf_buffer *rb;
5913
+
5914
+ if (event->parent)
5915
+ event = event->parent;
54485916
54495917 rcu_read_lock();
54505918 rb = rcu_dereference(event->rb);
54515919 if (rb) {
5452
- if (!atomic_inc_not_zero(&rb->refcount))
5920
+ if (!refcount_inc_not_zero(&rb->refcount))
54535921 rb = NULL;
54545922 }
54555923 rcu_read_unlock();
....@@ -5457,9 +5925,9 @@
54575925 return rb;
54585926 }
54595927
5460
-void ring_buffer_put(struct ring_buffer *rb)
5928
+void ring_buffer_put(struct perf_buffer *rb)
54615929 {
5462
- if (!atomic_dec_and_test(&rb->refcount))
5930
+ if (!refcount_dec_and_test(&rb->refcount))
54635931 return;
54645932
54655933 WARN_ON_ONCE(!list_empty(&rb->event_list));
....@@ -5494,7 +5962,7 @@
54945962 static void perf_mmap_close(struct vm_area_struct *vma)
54955963 {
54965964 struct perf_event *event = vma->vm_file->private_data;
5497
- struct ring_buffer *rb = ring_buffer_get(event);
5965
+ struct perf_buffer *rb = ring_buffer_get(event);
54985966 struct user_struct *mmap_user = rb->mmap_user;
54995967 int mmap_locked = rb->mmap_locked;
55005968 unsigned long size = perf_data_size(rb);
....@@ -5519,12 +5987,12 @@
55195987 perf_pmu_output_stop(event);
55205988
55215989 /* now it's safe to free the pages */
5522
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5523
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5990
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5991
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
55245992
55255993 /* this has to be the last one */
55265994 rb_free_aux(rb);
5527
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5995
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
55285996
55295997 mutex_unlock(&event->mmap_mutex);
55305998 }
....@@ -5593,8 +6061,9 @@
55936061 * undo the VM accounting.
55946062 */
55956063
5596
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5597
- vma->vm_mm->pinned_vm -= mmap_locked;
6064
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6065
+ &mmap_user->locked_vm);
6066
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
55986067 free_uid(mmap_user);
55996068
56006069 out_put:
....@@ -5603,7 +6072,7 @@
56036072
56046073 static const struct vm_operations_struct perf_mmap_vmops = {
56056074 .open = perf_mmap_open,
5606
- .close = perf_mmap_close, /* non mergable */
6075
+ .close = perf_mmap_close, /* non mergeable */
56076076 .fault = perf_mmap_fault,
56086077 .page_mkwrite = perf_mmap_fault,
56096078 };
....@@ -5613,8 +6082,8 @@
56136082 struct perf_event *event = file->private_data;
56146083 unsigned long user_locked, user_lock_limit;
56156084 struct user_struct *user = current_user();
6085
+ struct perf_buffer *rb = NULL;
56166086 unsigned long locked, lock_limit;
5617
- struct ring_buffer *rb = NULL;
56186087 unsigned long vma_size;
56196088 unsigned long nr_pages;
56206089 long user_extra = 0, extra = 0;
....@@ -5711,17 +6180,17 @@
57116180 again:
57126181 mutex_lock(&event->mmap_mutex);
57136182 if (event->rb) {
5714
- if (event->rb->nr_pages != nr_pages) {
6183
+ if (data_page_nr(event->rb) != nr_pages) {
57156184 ret = -EINVAL;
57166185 goto unlock;
57176186 }
57186187
57196188 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
57206189 /*
5721
- * Raced against perf_mmap_close() through
5722
- * perf_event_set_output(). Try again, hope for better
5723
- * luck.
6190
+ * Raced against perf_mmap_close(); remove the
6191
+ * event and try again.
57246192 */
6193
+ ring_buffer_attach(event, NULL);
57256194 mutex_unlock(&event->mmap_mutex);
57266195 goto again;
57276196 }
....@@ -5749,12 +6218,18 @@
57496218 user_locked = user_lock_limit;
57506219 user_locked += user_extra;
57516220
5752
- if (user_locked > user_lock_limit)
6221
+ if (user_locked > user_lock_limit) {
6222
+ /*
6223
+ * charge locked_vm until it hits user_lock_limit;
6224
+ * charge the rest from pinned_vm
6225
+ */
57536226 extra = user_locked - user_lock_limit;
6227
+ user_extra -= extra;
6228
+ }
57546229
57556230 lock_limit = rlimit(RLIMIT_MEMLOCK);
57566231 lock_limit >>= PAGE_SHIFT;
5757
- locked = vma->vm_mm->pinned_vm + extra;
6232
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
57586233
57596234 if ((locked > lock_limit) && perf_is_paranoid() &&
57606235 !capable(CAP_IPC_LOCK)) {
....@@ -5783,6 +6258,8 @@
57836258
57846259 ring_buffer_attach(event, rb);
57856260
6261
+ perf_event_update_time(event);
6262
+ perf_set_shadow_time(event, event->ctx);
57866263 perf_event_init_userpage(event);
57876264 perf_event_update_userpage(event);
57886265 } else {
....@@ -5795,7 +6272,7 @@
57956272 unlock:
57966273 if (!ret) {
57976274 atomic_long_add(user_extra, &user->locked_vm);
5798
- vma->vm_mm->pinned_vm += extra;
6275
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
57996276
58006277 atomic_inc(&event->mmap_count);
58016278 } else if (rb) {
....@@ -5932,18 +6409,25 @@
59326409 * Later on, we might change it to a list if there is
59336410 * another virtualization implementation supporting the callbacks.
59346411 */
5935
-struct perf_guest_info_callbacks *perf_guest_cbs;
6412
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
59366413
59376414 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59386415 {
5939
- perf_guest_cbs = cbs;
6416
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6417
+ return -EBUSY;
6418
+
6419
+ rcu_assign_pointer(perf_guest_cbs, cbs);
59406420 return 0;
59416421 }
59426422 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
59436423
59446424 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
59456425 {
5946
- perf_guest_cbs = NULL;
6426
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6427
+ return -EINVAL;
6428
+
6429
+ rcu_assign_pointer(perf_guest_cbs, NULL);
6430
+ synchronize_rcu();
59476431 return 0;
59486432 }
59496433 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
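
The rework above turns perf_guest_cbs into an RCU-published pointer with strict single-owner semantics: a second registration fails with -EBUSY, only the currently installed callbacks may be unregistered, and readers are flushed with synchronize_rcu() afterwards. The sketch below reproduces only the ownership contract using C11 atomics; the grace-period wait has no userspace stand-in here, and register_guest_cbs/unregister_guest_cbs are illustrative names.

    /* Userspace sketch of the single-slot callback registry contract. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <errno.h>

    struct guest_cbs { int (*is_in_guest)(void); };

    static _Atomic(struct guest_cbs *) guest_cbs;

    static int register_guest_cbs(struct guest_cbs *cbs)
    {
        struct guest_cbs *expected = NULL;

        /* fails if somebody already registered */
        if (!atomic_compare_exchange_strong(&guest_cbs, &expected, cbs))
            return -EBUSY;
        return 0;
    }

    static int unregister_guest_cbs(struct guest_cbs *cbs)
    {
        struct guest_cbs *expected = cbs;

        /* only the currently registered callbacks may be torn down */
        if (!atomic_compare_exchange_strong(&guest_cbs, &expected, NULL))
            return -EINVAL;
        /* the kernel additionally waits for readers here (synchronize_rcu) */
        return 0;
    }

    int main(void)
    {
        struct guest_cbs a = { 0 }, b = { 0 };

        printf("%d\n", register_guest_cbs(&a));   /*  0       */
        printf("%d\n", register_guest_cbs(&b));   /* -EBUSY   */
        printf("%d\n", unregister_guest_cbs(&b)); /* -EINVAL  */
        printf("%d\n", unregister_guest_cbs(&a)); /*  0       */
        return 0;
    }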
....@@ -5965,14 +6449,13 @@
59656449 }
59666450
59676451 static void perf_sample_regs_user(struct perf_regs *regs_user,
5968
- struct pt_regs *regs,
5969
- struct pt_regs *regs_user_copy)
6452
+ struct pt_regs *regs)
59706453 {
59716454 if (user_mode(regs)) {
59726455 regs_user->abi = perf_reg_abi(current);
59736456 regs_user->regs = regs;
59746457 } else if (!(current->flags & PF_KTHREAD)) {
5975
- perf_get_regs_user(regs_user, regs, regs_user_copy);
6458
+ perf_get_regs_user(regs_user, regs);
59766459 } else {
59776460 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
59786461 regs_user->regs = NULL;
....@@ -5991,7 +6474,7 @@
59916474 * Get remaining task size from user stack pointer.
59926475 *
59936476 * It'd be better to take stack vma map and limit this more
5994
- * precisly, but there's no way to get it safely under interrupt,
6477
+ * precisely, but there's no way to get it safely under interrupt,
59956478 * so using TASK_SIZE as limit.
59966479 */
59976480 static u64 perf_ustack_task_size(struct pt_regs *regs)
....@@ -6073,10 +6556,9 @@
60736556
60746557 /* Data. */
60756558 sp = perf_user_stack_pointer(regs);
6076
- fs = get_fs();
6077
- set_fs(USER_DS);
6559
+ fs = force_uaccess_begin();
60786560 rem = __output_copy_user(handle, (void *) sp, dump_size);
6079
- set_fs(fs);
6561
+ force_uaccess_end(fs);
60806562 dyn_size = dump_size - rem;
60816563
60826564 perf_output_skip(handle, rem);
....@@ -6084,6 +6566,122 @@
60846566 /* Dynamic size. */
60856567 perf_output_put(handle, dyn_size);
60866568 }
6569
+}
6570
+
6571
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6572
+ struct perf_sample_data *data,
6573
+ size_t size)
6574
+{
6575
+ struct perf_event *sampler = event->aux_event;
6576
+ struct perf_buffer *rb;
6577
+
6578
+ data->aux_size = 0;
6579
+
6580
+ if (!sampler)
6581
+ goto out;
6582
+
6583
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6584
+ goto out;
6585
+
6586
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6587
+ goto out;
6588
+
6589
+ rb = ring_buffer_get(sampler);
6590
+ if (!rb)
6591
+ goto out;
6592
+
6593
+ /*
6594
+ * If this is an NMI hit inside sampling code, don't take
6595
+ * the sample. See also perf_aux_sample_output().
6596
+ */
6597
+ if (READ_ONCE(rb->aux_in_sampling)) {
6598
+ data->aux_size = 0;
6599
+ } else {
6600
+ size = min_t(size_t, size, perf_aux_size(rb));
6601
+ data->aux_size = ALIGN(size, sizeof(u64));
6602
+ }
6603
+ ring_buffer_put(rb);
6604
+
6605
+out:
6606
+ return data->aux_size;
6607
+}
6608
+
6609
+long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6610
+ struct perf_event *event,
6611
+ struct perf_output_handle *handle,
6612
+ unsigned long size)
6613
+{
6614
+ unsigned long flags;
6615
+ long ret;
6616
+
6617
+ /*
6618
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6619
+ * paths. If we start calling them in NMI context, they may race with
6620
+ * the IRQ ones, that is, for example, re-starting an event that's just
6621
+ * been stopped, which is why we're using a separate callback that
6622
+ * doesn't change the event state.
6623
+ *
6624
+ * IRQs need to be disabled to prevent IPIs from racing with us.
6625
+ */
6626
+ local_irq_save(flags);
6627
+ /*
6628
+ * Guard against NMI hits inside the critical section;
6629
+ * see also perf_prepare_sample_aux().
6630
+ */
6631
+ WRITE_ONCE(rb->aux_in_sampling, 1);
6632
+ barrier();
6633
+
6634
+ ret = event->pmu->snapshot_aux(event, handle, size);
6635
+
6636
+ barrier();
6637
+ WRITE_ONCE(rb->aux_in_sampling, 0);
6638
+ local_irq_restore(flags);
6639
+
6640
+ return ret;
6641
+}
6642
+
6643
+static void perf_aux_sample_output(struct perf_event *event,
6644
+ struct perf_output_handle *handle,
6645
+ struct perf_sample_data *data)
6646
+{
6647
+ struct perf_event *sampler = event->aux_event;
6648
+ struct perf_buffer *rb;
6649
+ unsigned long pad;
6650
+ long size;
6651
+
6652
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
6653
+ return;
6654
+
6655
+ rb = ring_buffer_get(sampler);
6656
+ if (!rb)
6657
+ return;
6658
+
6659
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6660
+
6661
+ /*
6662
+ * An error here means that perf_output_copy() failed (returned a
6663
+ * non-zero surplus that it didn't copy), which in its current
6664
+ * enlightened implementation is not possible. If that changes, we'd
6665
+ * like to know.
6666
+ */
6667
+ if (WARN_ON_ONCE(size < 0))
6668
+ goto out_put;
6669
+
6670
+ /*
6671
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
6672
+ * perf_prepare_sample_aux(), so should not be more than that.
6673
+ */
6674
+ pad = data->aux_size - size;
6675
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
6676
+ pad = 8;
6677
+
6678
+ if (pad) {
6679
+ u64 zero = 0;
6680
+ perf_output_copy(handle, &zero, pad);
6681
+ }
6682
+
6683
+out_put:
6684
+ ring_buffer_put(rb);
60876685 }
60886686
60896687 static void __perf_event_header__init_id(struct perf_event_header *header,
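
The perf_pmu_snapshot_aux()/perf_prepare_sample_aux() pair added above guards the AUX snapshot with rb->aux_in_sampling: the flag is raised with IRQs off around the PMU callback, and a sample taken from NMI context while it is set gets an AUX payload of size 0 instead of re-entering the snapshot path. A toy illustration of that flag pattern, with a nested call standing in for the NMI and purely illustrative names:

    #include <stdio.h>

    struct aux_buffer {
        volatile int aux_in_sampling;  /* WRITE_ONCE/READ_ONCE in the kernel */
        unsigned long aux_size;
    };

    static unsigned long prepare_aux_sample(struct aux_buffer *rb,
                                            unsigned long want)
    {
        /* "NMI" landed while a snapshot was in flight: skip the AUX payload */
        if (rb->aux_in_sampling)
            return 0;
        return want < rb->aux_size ? want : rb->aux_size;
    }

    static void snapshot_aux(struct aux_buffer *rb)
    {
        rb->aux_in_sampling = 1;
        /* ... PMU copies AUX data; a sample arriving now sees the flag ... */
        printf("nested sample gets %lu bytes\n", prepare_aux_sample(rb, 512));
        rb->aux_in_sampling = 0;
    }

    int main(void)
    {
        struct aux_buffer rb = { .aux_in_sampling = 0, .aux_size = 4096 };

        printf("normal sample gets %lu bytes\n", prepare_aux_sample(&rb, 512));
        snapshot_aux(&rb);
        return 0;
    }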
....@@ -6255,6 +6853,11 @@
62556853 perf_output_read_one(handle, event, enabled, running);
62566854 }
62576855
6856
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
6857
+{
6858
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6859
+}
6860
+
62586861 void perf_output_sample(struct perf_output_handle *handle,
62596862 struct perf_event_header *header,
62606863 struct perf_sample_data *data,
....@@ -6343,6 +6946,8 @@
63436946 * sizeof(struct perf_branch_entry);
63446947
63456948 perf_output_put(handle, data->br_stack->nr);
6949
+ if (perf_sample_save_hw_index(event))
6950
+ perf_output_put(handle, data->br_stack->hw_idx);
63466951 perf_output_copy(handle, data->br_stack->entries, size);
63476952 } else {
63486953 /*
....@@ -6405,11 +7010,21 @@
64057010 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
64067011 perf_output_put(handle, data->phys_addr);
64077012
7013
+ if (sample_type & PERF_SAMPLE_CGROUP)
7014
+ perf_output_put(handle, data->cgroup);
7015
+
7016
+ if (sample_type & PERF_SAMPLE_AUX) {
7017
+ perf_output_put(handle, data->aux_size);
7018
+
7019
+ if (data->aux_size)
7020
+ perf_aux_sample_output(event, handle, data);
7021
+ }
7022
+
64087023 if (!event->attr.watermark) {
64097024 int wakeup_events = event->attr.wakeup_events;
64107025
64117026 if (wakeup_events) {
6412
- struct ring_buffer *rb = handle->rb;
7027
+ struct perf_buffer *rb = handle->rb;
64137028 int events = local_inc_return(&rb->events);
64147029
64157030 if (events >= wakeup_events) {
....@@ -6437,14 +7052,14 @@
64377052 * Walking the pages tables for user address.
64387053 * Interrupts are disabled, so it prevents any tear down
64397054 * of the page tables.
6440
- * Try IRQ-safe __get_user_pages_fast first.
7055
+ * Try IRQ-safe get_user_page_fast_only first.
64417056 * If failed, leave phys_addr as 0.
64427057 */
64437058 if (current->mm != NULL) {
64447059 struct page *p;
64457060
64467061 pagefault_disable();
6447
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
7062
+ if (get_user_page_fast_only(virt, 0, &p)) {
64487063 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
64497064 put_page(p);
64507065 }
....@@ -6532,6 +7147,9 @@
65327147 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
65337148 int size = sizeof(u64); /* nr */
65347149 if (data->br_stack) {
7150
+ if (perf_sample_save_hw_index(event))
7151
+ size += sizeof(u64);
7152
+
65357153 size += data->br_stack->nr
65367154 * sizeof(struct perf_branch_entry);
65377155 }
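
With PERF_SAMPLE_BRANCH_HW_INDEX the branch-stack payload grows by one u64, which is why both the output hunk and the sizing hunk above add the optional hw_idx word. The sizing logic in plain C; branch_stack_size is an illustrative helper and the 24-byte entry size in main() is an assumption about sizeof(struct perf_branch_entry):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static size_t branch_stack_size(uint64_t nr, bool want_hw_index,
                                    size_t branch_entry_size)
    {
        size_t size = sizeof(uint64_t);        /* nr */

        if (want_hw_index)
            size += sizeof(uint64_t);          /* hw_idx */

        return (size_t)(size + nr * branch_entry_size);
    }

    int main(void)
    {
        /* 16 branches, hw_idx requested, assuming 24-byte entries */
        printf("%zu bytes\n", branch_stack_size(16, true, 24));  /* 400 */
        return 0;
    }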
....@@ -6539,8 +7157,7 @@
65397157 }
65407158
65417159 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6542
- perf_sample_regs_user(&data->regs_user, regs,
6543
- &data->regs_user_copy);
7160
+ perf_sample_regs_user(&data->regs_user, regs);
65447161
65457162 if (sample_type & PERF_SAMPLE_REGS_USER) {
65467163 /* regs dump ABI info */
....@@ -6556,7 +7173,7 @@
65567173
65577174 if (sample_type & PERF_SAMPLE_STACK_USER) {
65587175 /*
6559
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
7176
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
65607177 * processed as the last one or have additional check added
65617178 * in case new sample type is added, because we could eat
65627179 * up the rest of the sample size.
....@@ -6596,25 +7213,67 @@
65967213
65977214 if (sample_type & PERF_SAMPLE_PHYS_ADDR)
65987215 data->phys_addr = perf_virt_to_phys(data->addr);
7216
+
7217
+#ifdef CONFIG_CGROUP_PERF
7218
+ if (sample_type & PERF_SAMPLE_CGROUP) {
7219
+ struct cgroup *cgrp;
7220
+
7221
+ /* protected by RCU */
7222
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7223
+ data->cgroup = cgroup_id(cgrp);
7224
+ }
7225
+#endif
7226
+
7227
+ if (sample_type & PERF_SAMPLE_AUX) {
7228
+ u64 size;
7229
+
7230
+ header->size += sizeof(u64); /* size */
7231
+
7232
+ /*
7233
+ * Given the 16bit nature of header::size, an AUX sample can
7234
+ * easily overflow it, what with all the preceding sample bits.
7235
+ * Make sure this doesn't happen by using up to U16_MAX bytes
7236
+ * per sample in total (rounded down to 8 byte boundary).
7237
+ */
7238
+ size = min_t(size_t, U16_MAX - header->size,
7239
+ event->attr.aux_sample_size);
7240
+ size = rounddown(size, 8);
7241
+ size = perf_prepare_sample_aux(event, data, size);
7242
+
7243
+ WARN_ON_ONCE(size + header->size > U16_MAX);
7244
+ header->size += size;
7245
+ }
7246
+ /*
7247
+ * If you're adding more sample types here, you likely need to do
7248
+ * something about the overflowing header::size, like repurpose the
7249
+ * lowest 3 bits of size, which should be always zero at the moment.
7250
+ * This raises a more important question, do we really need 512k sized
7251
+ * samples and why, so good argumentation is in order for whatever you
7252
+ * do here next.
7253
+ */
7254
+ WARN_ON_ONCE(header->size & 7);
65997255 }
66007256
6601
-static __always_inline void
7257
+static __always_inline int
66027258 __perf_event_output(struct perf_event *event,
66037259 struct perf_sample_data *data,
66047260 struct pt_regs *regs,
66057261 int (*output_begin)(struct perf_output_handle *,
7262
+ struct perf_sample_data *,
66067263 struct perf_event *,
66077264 unsigned int))
66087265 {
66097266 struct perf_output_handle handle;
66107267 struct perf_event_header header;
7268
+ int err;
66117269
66127270 /* protect the callchain buffers */
66137271 rcu_read_lock();
66147272
66157273 perf_prepare_sample(&header, data, event, regs);
66167274
6617
- if (output_begin(&handle, event, header.size))
7275
+ err = output_begin(&handle, data, event, header.size);
7276
+ if (err)
66187277 goto exit;
66197278
66207279 perf_output_sample(&handle, &header, data, event);
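
The PERF_SAMPLE_AUX handling added in the hunk above must respect the 16-bit perf_event_header::size field: the requested aux_sample_size is clamped to whatever still fits under U16_MAX and rounded down to an 8-byte multiple, before perf_prepare_sample_aux() clamps it again against the actual AUX buffer. A sketch of just that clamping arithmetic; clamp_aux_size and the numbers in main() are illustrative:

    #include <stdio.h>
    #include <stdint.h>

    #define U16_MAX 0xffffU

    static uint64_t clamp_aux_size(uint16_t header_size,
                                   uint64_t aux_sample_size)
    {
        uint64_t size;

        header_size += sizeof(uint64_t);   /* the 'size' field itself */

        size = U16_MAX - header_size;      /* room left in the record */
        if (aux_sample_size < size)
            size = aux_sample_size;

        return size & ~7ULL;               /* rounddown(size, 8) */
    }

    int main(void)
    {
        /* a 300-byte header with a requested 64 KiB AUX snapshot */
        printf("%llu\n",
               (unsigned long long)clamp_aux_size(300, 65536)); /* 65224 */
        return 0;
    }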
....@@ -6623,6 +7282,7 @@
66237282
66247283 exit:
66257284 rcu_read_unlock();
7285
+ return err;
66267286 }
66277287
66287288 void
....@@ -6641,12 +7301,12 @@
66417301 __perf_event_output(event, data, regs, perf_output_begin_backward);
66427302 }
66437303
6644
-void
7304
+int
66457305 perf_event_output(struct perf_event *event,
66467306 struct perf_sample_data *data,
66477307 struct pt_regs *regs)
66487308 {
6649
- __perf_event_output(event, data, regs, perf_output_begin);
7309
+ return __perf_event_output(event, data, regs, perf_output_begin);
66507310 }
66517311
66527312 /*
....@@ -6678,7 +7338,7 @@
66787338 int ret;
66797339
66807340 perf_event_header__init_id(&read_event.header, &sample, event);
6681
- ret = perf_output_begin(&handle, event, read_event.header.size);
7341
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
66827342 if (ret)
66837343 return;
66847344
....@@ -6823,7 +7483,7 @@
68237483 }
68247484
68257485 struct remote_output {
6826
- struct ring_buffer *rb;
7486
+ struct perf_buffer *rb;
68277487 int err;
68287488 };
68297489
....@@ -6831,7 +7491,7 @@
68317491 {
68327492 struct perf_event *parent = event->parent;
68337493 struct remote_output *ro = data;
6834
- struct ring_buffer *rb = ro->rb;
7494
+ struct perf_buffer *rb = ro->rb;
68357495 struct stop_event_data sd = {
68367496 .event = event,
68377497 };
....@@ -6947,7 +7607,7 @@
69477607
69487608 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
69497609
6950
- ret = perf_output_begin(&handle, event,
7610
+ ret = perf_output_begin(&handle, &sample, event,
69517611 task_event->event_id.header.size);
69527612 if (ret)
69537613 goto out;
....@@ -7050,7 +7710,7 @@
70507710 return;
70517711
70527712 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7053
- ret = perf_output_begin(&handle, event,
7713
+ ret = perf_output_begin(&handle, &sample, event,
70547714 comm_event->event_id.header.size);
70557715
70567716 if (ret)
....@@ -7150,7 +7810,7 @@
71507810
71517811 perf_event_header__init_id(&namespaces_event->event_id.header,
71527812 &sample, event);
7153
- ret = perf_output_begin(&handle, event,
7813
+ ret = perf_output_begin(&handle, &sample, event,
71547814 namespaces_event->event_id.header.size);
71557815 if (ret)
71567816 goto out;
....@@ -7175,7 +7835,7 @@
71757835 {
71767836 struct path ns_path;
71777837 struct inode *ns_inode;
7178
- void *error;
7838
+ int error;
71797839
71807840 error = ns_get_path(&ns_path, task, ns_ops);
71817841 if (!error) {
....@@ -7245,6 +7905,105 @@
72457905 }
72467906
72477907 /*
7908
+ * cgroup tracking
7909
+ */
7910
+#ifdef CONFIG_CGROUP_PERF
7911
+
7912
+struct perf_cgroup_event {
7913
+ char *path;
7914
+ int path_size;
7915
+ struct {
7916
+ struct perf_event_header header;
7917
+ u64 id;
7918
+ char path[];
7919
+ } event_id;
7920
+};
7921
+
7922
+static int perf_event_cgroup_match(struct perf_event *event)
7923
+{
7924
+ return event->attr.cgroup;
7925
+}
7926
+
7927
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
7928
+{
7929
+ struct perf_cgroup_event *cgroup_event = data;
7930
+ struct perf_output_handle handle;
7931
+ struct perf_sample_data sample;
7932
+ u16 header_size = cgroup_event->event_id.header.size;
7933
+ int ret;
7934
+
7935
+ if (!perf_event_cgroup_match(event))
7936
+ return;
7937
+
7938
+ perf_event_header__init_id(&cgroup_event->event_id.header,
7939
+ &sample, event);
7940
+ ret = perf_output_begin(&handle, &sample, event,
7941
+ cgroup_event->event_id.header.size);
7942
+ if (ret)
7943
+ goto out;
7944
+
7945
+ perf_output_put(&handle, cgroup_event->event_id);
7946
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
7947
+
7948
+ perf_event__output_id_sample(event, &handle, &sample);
7949
+
7950
+ perf_output_end(&handle);
7951
+out:
7952
+ cgroup_event->event_id.header.size = header_size;
7953
+}
7954
+
7955
+static void perf_event_cgroup(struct cgroup *cgrp)
7956
+{
7957
+ struct perf_cgroup_event cgroup_event;
7958
+ char path_enomem[16] = "//enomem";
7959
+ char *pathname;
7960
+ size_t size;
7961
+
7962
+ if (!atomic_read(&nr_cgroup_events))
7963
+ return;
7964
+
7965
+ cgroup_event = (struct perf_cgroup_event){
7966
+ .event_id = {
7967
+ .header = {
7968
+ .type = PERF_RECORD_CGROUP,
7969
+ .misc = 0,
7970
+ .size = sizeof(cgroup_event.event_id),
7971
+ },
7972
+ .id = cgroup_id(cgrp),
7973
+ },
7974
+ };
7975
+
7976
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
7977
+ if (pathname == NULL) {
7978
+ cgroup_event.path = path_enomem;
7979
+ } else {
7980
+ /* just to be sure to have enough space for alignment */
7981
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
7982
+ cgroup_event.path = pathname;
7983
+ }
7984
+
7985
+ /*
7986
+ * Since our buffer works in 8 byte units we need to align our string
7987
+ * size to a multiple of 8. However, we must guarantee the tail end is
7988
+ * zero'd out to avoid leaking random bits to userspace.
7989
+ */
7990
+ size = strlen(cgroup_event.path) + 1;
7991
+ while (!IS_ALIGNED(size, sizeof(u64)))
7992
+ cgroup_event.path[size++] = '\0';
7993
+
7994
+ cgroup_event.event_id.header.size += size;
7995
+ cgroup_event.path_size = size;
7996
+
7997
+ perf_iterate_sb(perf_event_cgroup_output,
7998
+ &cgroup_event,
7999
+ NULL);
8000
+
8001
+ kfree(pathname);
8002
+}
8003
+
8004
+#endif
8005
+
8006
+/*
72488007 * mmap tracking
72498008 */
72508009
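
The PERF_RECORD_CGROUP emission above pads the cgroup path up to a multiple of 8 bytes and explicitly zeroes the tail so no uninitialized bytes reach userspace. The same idea as a self-contained snippet; pad_path is an illustrative helper, not a kernel function:

    #include <stdio.h>
    #include <string.h>

    static size_t pad_path(char *path)
    {
        size_t size = strlen(path) + 1;    /* include the terminating NUL */

        while (size % 8)                   /* !IS_ALIGNED(size, sizeof(u64)) */
            path[size++] = '\0';           /* zero the tail explicitly */

        return size;
    }

    int main(void)
    {
        char buf[32] = "/system.slice";    /* 13 chars + NUL = 14 -> 16 */

        printf("emitted size: %zu\n", pad_path(buf));
        return 0;
    }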
....@@ -7304,7 +8063,7 @@
73048063 }
73058064
73068065 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7307
- ret = perf_output_begin(&handle, event,
8066
+ ret = perf_output_begin(&handle, &sample, event,
73088067 mmap_event->event_id.header.size);
73098068 if (ret)
73108069 goto out;
....@@ -7364,7 +8123,7 @@
73648123 flags |= MAP_EXECUTABLE;
73658124 if (vma->vm_flags & VM_LOCKED)
73668125 flags |= MAP_LOCKED;
7367
- if (vma->vm_flags & VM_HUGETLB)
8126
+ if (is_vm_hugetlb_page(vma))
73688127 flags |= MAP_HUGETLB;
73698128
73708129 if (file) {
....@@ -7614,7 +8373,7 @@
76148373 int ret;
76158374
76168375 perf_event_header__init_id(&rec.header, &sample, event);
7617
- ret = perf_output_begin(&handle, event, rec.header.size);
8376
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
76188377
76198378 if (ret)
76208379 return;
....@@ -7648,7 +8407,7 @@
76488407
76498408 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
76508409
7651
- ret = perf_output_begin(&handle, event,
8410
+ ret = perf_output_begin(&handle, &sample, event,
76528411 lost_samples_event.header.size);
76538412 if (ret)
76548413 return;
....@@ -7703,7 +8462,7 @@
77038462
77048463 perf_event_header__init_id(&se->event_id.header, &sample, event);
77058464
7706
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
8465
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
77078466 if (ret)
77088467 return;
77098468
....@@ -7778,7 +8537,7 @@
77788537
77798538 perf_event_header__init_id(&throttle_event.header, &sample, event);
77808539
7781
- ret = perf_output_begin(&handle, event,
8540
+ ret = perf_output_begin(&handle, &sample, event,
77828541 throttle_event.header.size);
77838542 if (ret)
77848543 return;
....@@ -7786,6 +8545,290 @@
77868545 perf_output_put(&handle, throttle_event);
77878546 perf_event__output_id_sample(event, &handle, &sample);
77888547 perf_output_end(&handle);
8548
+}
8549
+
8550
+/*
8551
+ * ksymbol register/unregister tracking
8552
+ */
8553
+
8554
+struct perf_ksymbol_event {
8555
+ const char *name;
8556
+ int name_len;
8557
+ struct {
8558
+ struct perf_event_header header;
8559
+ u64 addr;
8560
+ u32 len;
8561
+ u16 ksym_type;
8562
+ u16 flags;
8563
+ } event_id;
8564
+};
8565
+
8566
+static int perf_event_ksymbol_match(struct perf_event *event)
8567
+{
8568
+ return event->attr.ksymbol;
8569
+}
8570
+
8571
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8572
+{
8573
+ struct perf_ksymbol_event *ksymbol_event = data;
8574
+ struct perf_output_handle handle;
8575
+ struct perf_sample_data sample;
8576
+ int ret;
8577
+
8578
+ if (!perf_event_ksymbol_match(event))
8579
+ return;
8580
+
8581
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
8582
+ &sample, event);
8583
+ ret = perf_output_begin(&handle, &sample, event,
8584
+ ksymbol_event->event_id.header.size);
8585
+ if (ret)
8586
+ return;
8587
+
8588
+ perf_output_put(&handle, ksymbol_event->event_id);
8589
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8590
+ perf_event__output_id_sample(event, &handle, &sample);
8591
+
8592
+ perf_output_end(&handle);
8593
+}
8594
+
8595
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8596
+ const char *sym)
8597
+{
8598
+ struct perf_ksymbol_event ksymbol_event;
8599
+ char name[KSYM_NAME_LEN];
8600
+ u16 flags = 0;
8601
+ int name_len;
8602
+
8603
+ if (!atomic_read(&nr_ksymbol_events))
8604
+ return;
8605
+
8606
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8607
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8608
+ goto err;
8609
+
8610
+ strlcpy(name, sym, KSYM_NAME_LEN);
8611
+ name_len = strlen(name) + 1;
8612
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
8613
+ name[name_len++] = '\0';
8614
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8615
+
8616
+ if (unregister)
8617
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8618
+
8619
+ ksymbol_event = (struct perf_ksymbol_event){
8620
+ .name = name,
8621
+ .name_len = name_len,
8622
+ .event_id = {
8623
+ .header = {
8624
+ .type = PERF_RECORD_KSYMBOL,
8625
+ .size = sizeof(ksymbol_event.event_id) +
8626
+ name_len,
8627
+ },
8628
+ .addr = addr,
8629
+ .len = len,
8630
+ .ksym_type = ksym_type,
8631
+ .flags = flags,
8632
+ },
8633
+ };
8634
+
8635
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8636
+ return;
8637
+err:
8638
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8639
+}
8640
+
8641
+/*
8642
+ * bpf program load/unload tracking
8643
+ */
8644
+
8645
+struct perf_bpf_event {
8646
+ struct bpf_prog *prog;
8647
+ struct {
8648
+ struct perf_event_header header;
8649
+ u16 type;
8650
+ u16 flags;
8651
+ u32 id;
8652
+ u8 tag[BPF_TAG_SIZE];
8653
+ } event_id;
8654
+};
8655
+
8656
+static int perf_event_bpf_match(struct perf_event *event)
8657
+{
8658
+ return event->attr.bpf_event;
8659
+}
8660
+
8661
+static void perf_event_bpf_output(struct perf_event *event, void *data)
8662
+{
8663
+ struct perf_bpf_event *bpf_event = data;
8664
+ struct perf_output_handle handle;
8665
+ struct perf_sample_data sample;
8666
+ int ret;
8667
+
8668
+ if (!perf_event_bpf_match(event))
8669
+ return;
8670
+
8671
+ perf_event_header__init_id(&bpf_event->event_id.header,
8672
+ &sample, event);
8673
+ ret = perf_output_begin(&handle, &sample, event,
8674
+ bpf_event->event_id.header.size);
8675
+ if (ret)
8676
+ return;
8677
+
8678
+ perf_output_put(&handle, bpf_event->event_id);
8679
+ perf_event__output_id_sample(event, &handle, &sample);
8680
+
8681
+ perf_output_end(&handle);
8682
+}
8683
+
8684
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8685
+ enum perf_bpf_event_type type)
8686
+{
8687
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8688
+ int i;
8689
+
8690
+ if (prog->aux->func_cnt == 0) {
8691
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8692
+ (u64)(unsigned long)prog->bpf_func,
8693
+ prog->jited_len, unregister,
8694
+ prog->aux->ksym.name);
8695
+ } else {
8696
+ for (i = 0; i < prog->aux->func_cnt; i++) {
8697
+ struct bpf_prog *subprog = prog->aux->func[i];
8698
+
8699
+ perf_event_ksymbol(
8700
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
8701
+ (u64)(unsigned long)subprog->bpf_func,
8702
+ subprog->jited_len, unregister,
8703
+ subprog->aux->ksym.name);
8704
+ }
8705
+ }
8706
+}
8707
+
8708
+void perf_event_bpf_event(struct bpf_prog *prog,
8709
+ enum perf_bpf_event_type type,
8710
+ u16 flags)
8711
+{
8712
+ struct perf_bpf_event bpf_event;
8713
+
8714
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
8715
+ type >= PERF_BPF_EVENT_MAX)
8716
+ return;
8717
+
8718
+ switch (type) {
8719
+ case PERF_BPF_EVENT_PROG_LOAD:
8720
+ case PERF_BPF_EVENT_PROG_UNLOAD:
8721
+ if (atomic_read(&nr_ksymbol_events))
8722
+ perf_event_bpf_emit_ksymbols(prog, type);
8723
+ break;
8724
+ default:
8725
+ break;
8726
+ }
8727
+
8728
+ if (!atomic_read(&nr_bpf_events))
8729
+ return;
8730
+
8731
+ bpf_event = (struct perf_bpf_event){
8732
+ .prog = prog,
8733
+ .event_id = {
8734
+ .header = {
8735
+ .type = PERF_RECORD_BPF_EVENT,
8736
+ .size = sizeof(bpf_event.event_id),
8737
+ },
8738
+ .type = type,
8739
+ .flags = flags,
8740
+ .id = prog->aux->id,
8741
+ },
8742
+ };
8743
+
8744
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8745
+
8746
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8747
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8748
+}
8749
+
8750
+struct perf_text_poke_event {
8751
+ const void *old_bytes;
8752
+ const void *new_bytes;
8753
+ size_t pad;
8754
+ u16 old_len;
8755
+ u16 new_len;
8756
+
8757
+ struct {
8758
+ struct perf_event_header header;
8759
+
8760
+ u64 addr;
8761
+ } event_id;
8762
+};
8763
+
8764
+static int perf_event_text_poke_match(struct perf_event *event)
8765
+{
8766
+ return event->attr.text_poke;
8767
+}
8768
+
8769
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
8770
+{
8771
+ struct perf_text_poke_event *text_poke_event = data;
8772
+ struct perf_output_handle handle;
8773
+ struct perf_sample_data sample;
8774
+ u64 padding = 0;
8775
+ int ret;
8776
+
8777
+ if (!perf_event_text_poke_match(event))
8778
+ return;
8779
+
8780
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8781
+
8782
+ ret = perf_output_begin(&handle, &sample, event,
8783
+ text_poke_event->event_id.header.size);
8784
+ if (ret)
8785
+ return;
8786
+
8787
+ perf_output_put(&handle, text_poke_event->event_id);
8788
+ perf_output_put(&handle, text_poke_event->old_len);
8789
+ perf_output_put(&handle, text_poke_event->new_len);
8790
+
8791
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8792
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8793
+
8794
+ if (text_poke_event->pad)
8795
+ __output_copy(&handle, &padding, text_poke_event->pad);
8796
+
8797
+ perf_event__output_id_sample(event, &handle, &sample);
8798
+
8799
+ perf_output_end(&handle);
8800
+}
8801
+
8802
+void perf_event_text_poke(const void *addr, const void *old_bytes,
8803
+ size_t old_len, const void *new_bytes, size_t new_len)
8804
+{
8805
+ struct perf_text_poke_event text_poke_event;
8806
+ size_t tot, pad;
8807
+
8808
+ if (!atomic_read(&nr_text_poke_events))
8809
+ return;
8810
+
8811
+ tot = sizeof(text_poke_event.old_len) + old_len;
8812
+ tot += sizeof(text_poke_event.new_len) + new_len;
8813
+ pad = ALIGN(tot, sizeof(u64)) - tot;
8814
+
8815
+ text_poke_event = (struct perf_text_poke_event){
8816
+ .old_bytes = old_bytes,
8817
+ .new_bytes = new_bytes,
8818
+ .pad = pad,
8819
+ .old_len = old_len,
8820
+ .new_len = new_len,
8821
+ .event_id = {
8822
+ .header = {
8823
+ .type = PERF_RECORD_TEXT_POKE,
8824
+ .misc = PERF_RECORD_MISC_KERNEL,
8825
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
8826
+ },
8827
+ .addr = (unsigned long)addr,
8828
+ },
8829
+ };
8830
+
8831
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
77898832 }
77908833
77918834 void perf_event_itrace_started(struct perf_event *event)
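
Of the records added in the hunk above, PERF_RECORD_TEXT_POKE is the one with a variable-sized tail: old_len, the old bytes, new_len, the new bytes, then zero padding up to the next u64 boundary, computed as ALIGN(tot, 8) - tot. A small check of that computation; text_poke_pad and the example lengths are illustrative:

    #include <stdio.h>
    #include <stdint.h>

    static size_t text_poke_pad(uint16_t old_len, uint16_t new_len)
    {
        size_t tot = sizeof(old_len) + old_len + sizeof(new_len) + new_len;

        /* pad = ALIGN(tot, sizeof(u64)) - tot */
        return ((tot + 7) & ~(size_t)7) - tot;
    }

    int main(void)
    {
        /* e.g. a 5-byte jump patched over a 1-byte breakpoint */
        uint16_t old_len = 1, new_len = 5;

        printf("payload %d+%d bytes -> %zu bytes of padding\n",
               old_len, new_len, text_poke_pad(old_len, new_len)); /* 6 */
        return 0;
    }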
....@@ -7818,7 +8861,7 @@
78188861 rec.tid = perf_event_tid(event, current);
78198862
78208863 perf_event_header__init_id(&rec.header, &sample, event);
7821
- ret = perf_output_begin(&handle, event, rec.header.size);
8864
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
78228865
78238866 if (ret)
78248867 return;
....@@ -7842,8 +8885,8 @@
78428885 hwc->interrupts = 1;
78438886 } else {
78448887 hwc->interrupts++;
7845
- if (unlikely(throttle
7846
- && hwc->interrupts >= max_samples_per_tick)) {
8888
+ if (unlikely(throttle &&
8889
+ hwc->interrupts > max_samples_per_tick)) {
78478890 __this_cpu_inc(perf_throttled_count);
78488891 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
78498892 hwc->interrupts = MAX_INTERRUPTS;
....@@ -8386,9 +9429,9 @@
83869429 if (event->hw.state & PERF_HES_STOPPED)
83879430 return 0;
83889431 /*
8389
- * All tracepoints are from kernel-space.
9432
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
83909433 */
8391
- if (event->attr.exclude_kernel)
9434
+ if (event->attr.exclude_kernel && !user_mode(regs))
83929435 return 0;
83939436
83949437 if (!perf_tp_filter_match(event, data))
....@@ -8514,30 +9557,39 @@
85149557 *
85159558 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
85169559 * if not set, create kprobe/uprobe
9560
+ *
9561
+ * The following values specify a reference counter (or semaphore in the
9562
+ * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
9563
+ * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
9564
+ *
9565
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
9566
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
85179567 */
85189568 enum perf_probe_config {
85199569 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
9570
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9571
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
85209572 };
85219573
85229574 PMU_FORMAT_ATTR(retprobe, "config:0");
9575
+#endif
85239576
8524
-static struct attribute *probe_attrs[] = {
9577
+#ifdef CONFIG_KPROBE_EVENTS
9578
+static struct attribute *kprobe_attrs[] = {
85259579 &format_attr_retprobe.attr,
85269580 NULL,
85279581 };
85289582
8529
-static struct attribute_group probe_format_group = {
9583
+static struct attribute_group kprobe_format_group = {
85309584 .name = "format",
8531
- .attrs = probe_attrs,
9585
+ .attrs = kprobe_attrs,
85329586 };
85339587
8534
-static const struct attribute_group *probe_attr_groups[] = {
8535
- &probe_format_group,
9588
+static const struct attribute_group *kprobe_attr_groups[] = {
9589
+ &kprobe_format_group,
85369590 NULL,
85379591 };
8538
-#endif
85399592
8540
-#ifdef CONFIG_KPROBE_EVENTS
85419593 static int perf_kprobe_event_init(struct perf_event *event);
85429594 static struct pmu perf_kprobe = {
85439595 .task_ctx_nr = perf_sw_context,
....@@ -8547,7 +9599,7 @@
85479599 .start = perf_swevent_start,
85489600 .stop = perf_swevent_stop,
85499601 .read = perf_swevent_read,
8550
- .attr_groups = probe_attr_groups,
9602
+ .attr_groups = kprobe_attr_groups,
85519603 };
85529604
85539605 static int perf_kprobe_event_init(struct perf_event *event)
....@@ -8558,7 +9610,7 @@
85589610 if (event->attr.type != perf_kprobe.type)
85599611 return -ENOENT;
85609612
8561
- if (!capable(CAP_SYS_ADMIN))
9613
+ if (!perfmon_capable())
85629614 return -EACCES;
85639615
85649616 /*
....@@ -8579,6 +9631,24 @@
85799631 #endif /* CONFIG_KPROBE_EVENTS */
85809632
85819633 #ifdef CONFIG_UPROBE_EVENTS
9634
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9635
+
9636
+static struct attribute *uprobe_attrs[] = {
9637
+ &format_attr_retprobe.attr,
9638
+ &format_attr_ref_ctr_offset.attr,
9639
+ NULL,
9640
+};
9641
+
9642
+static struct attribute_group uprobe_format_group = {
9643
+ .name = "format",
9644
+ .attrs = uprobe_attrs,
9645
+};
9646
+
9647
+static const struct attribute_group *uprobe_attr_groups[] = {
9648
+ &uprobe_format_group,
9649
+ NULL,
9650
+};
9651
+
85829652 static int perf_uprobe_event_init(struct perf_event *event);
85839653 static struct pmu perf_uprobe = {
85849654 .task_ctx_nr = perf_sw_context,
....@@ -8588,18 +9658,19 @@
85889658 .start = perf_swevent_start,
85899659 .stop = perf_swevent_stop,
85909660 .read = perf_swevent_read,
8591
- .attr_groups = probe_attr_groups,
9661
+ .attr_groups = uprobe_attr_groups,
85929662 };
85939663
85949664 static int perf_uprobe_event_init(struct perf_event *event)
85959665 {
85969666 int err;
9667
+ unsigned long ref_ctr_offset;
85979668 bool is_retprobe;
85989669
85999670 if (event->attr.type != perf_uprobe.type)
86009671 return -ENOENT;
86019672
8602
- if (!capable(CAP_SYS_ADMIN))
9673
+ if (!perfmon_capable())
86039674 return -EACCES;
86049675
86059676 /*
....@@ -8609,7 +9680,8 @@
86099680 return -EOPNOTSUPP;
86109681
86119682 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8612
- err = perf_uprobe_init(event, is_retprobe);
9683
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9684
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
86139685 if (err)
86149686 return err;
86159687
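
Taken together, the format attribute and the init path above define the perf_uprobe config layout: bit 0 selects a uretprobe and config:32-63 carries the USDT reference-counter offset, recovered in perf_uprobe_event_init() by shifting right by PERF_UPROBE_REF_CTR_OFFSET_SHIFT. A userspace sketch of packing and unpacking that layout; the macro and function names below are local stand-ins, not uapi definitions:

    #include <stdio.h>
    #include <stdint.h>

    #define PROBE_CONFIG_IS_RETPROBE    (1ULL << 0)
    #define UPROBE_REF_CTR_OFFSET_SHIFT 32

    static uint64_t make_uprobe_config(int is_retprobe, uint64_t ref_ctr_offset)
    {
        uint64_t config = ref_ctr_offset << UPROBE_REF_CTR_OFFSET_SHIFT;

        if (is_retprobe)
            config |= PROBE_CONFIG_IS_RETPROBE;
        return config;
    }

    int main(void)
    {
        uint64_t config = make_uprobe_config(1, 0x1234);

        /* what the kernel side recovers from attr.config */
        printf("retprobe=%llu ref_ctr_offset=%#llx\n",
               (unsigned long long)(config & PROBE_CONFIG_IS_RETPROBE),
               (unsigned long long)(config >> UPROBE_REF_CTR_OFFSET_SHIFT));
        return 0;
    }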
....@@ -8647,7 +9719,6 @@
86479719 int ret = 0;
86489720
86499721 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8650
- preempt_disable();
86519722 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
86529723 goto out;
86539724 rcu_read_lock();
....@@ -8655,7 +9726,6 @@
86559726 rcu_read_unlock();
86569727 out:
86579728 __this_cpu_dec(bpf_prog_active);
8658
- preempt_enable();
86599729 if (!ret)
86609730 return;
86619731
....@@ -8676,6 +9746,24 @@
86769746 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
86779747 if (IS_ERR(prog))
86789748 return PTR_ERR(prog);
9749
+
9750
+ if (event->attr.precise_ip &&
9751
+ prog->call_get_stack &&
9752
+ (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9753
+ event->attr.exclude_callchain_kernel ||
9754
+ event->attr.exclude_callchain_user)) {
9755
+ /*
9756
+ * On perf_event with precise_ip, calling bpf_get_stack()
9757
+ * may trigger unwinder warnings and occasional crashes.
9758
+ * bpf_get_[stack|stackid] works around this issue by using
9759
+ * callchain attached to perf_sample_data. If the
9760
+ * perf_event does not have a full (kernel and user) callchain
9761
+ * attached to perf_sample_data, do not allow attaching BPF
9762
+ * program that calls bpf_get_[stack|stackid].
9763
+ */
9764
+ bpf_prog_put(prog);
9765
+ return -EPROTO;
9766
+ }
86799767
86809768 event->prog = prog;
86819769 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
....@@ -8875,7 +9963,7 @@
88759963 /*
88769964 * Scan through mm's vmas and see if one of them matches the
88779965 * @filter; if so, adjust filter's address range.
8878
- * Called with mm::mmap_sem down for reading.
9966
+ * Called with mm::mmap_lock down for reading.
88799967 */
88809968 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
88819969 struct mm_struct *mm,
....@@ -8917,7 +10005,7 @@
891710005 if (!mm)
891810006 goto restart;
891910007
8920
- down_read(&mm->mmap_sem);
10008
+ mmap_read_lock(mm);
892110009 }
892210010
892310011 raw_spin_lock_irqsave(&ifh->lock, flags);
....@@ -8943,7 +10031,7 @@
894310031 raw_spin_unlock_irqrestore(&ifh->lock, flags);
894410032
894510033 if (ifh->nr_file_filters) {
8946
- up_read(&mm->mmap_sem);
10034
+ mmap_read_unlock(mm);
894710035
894810036 mmput(mm);
894910037 }
....@@ -9050,6 +10138,7 @@
905010138 case IF_SRC_KERNELADDR:
905110139 case IF_SRC_KERNEL:
905210140 kernel = 1;
10141
+ fallthrough;
905310142
905410143 case IF_SRC_FILEADDR:
905510144 case IF_SRC_FILE:
....@@ -9136,8 +10225,11 @@
913610225 }
913710226
913810227 /* ready to consume more filters */
10228
+ kfree(filename);
10229
+ filename = NULL;
913910230 state = IF_STATE_ACTION;
914010231 filter = NULL;
10232
+ kernel = 0;
914110233 }
914210234 }
914310235
....@@ -9285,7 +10377,7 @@
928510377 period = max_t(u64, 10000, hwc->sample_period);
928610378 }
928710379 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9288
- HRTIMER_MODE_REL_PINNED);
10380
+ HRTIMER_MODE_REL_PINNED_HARD);
928910381 }
929010382
929110383 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
....@@ -9640,8 +10732,7 @@
964010732 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
964110733 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
964210734
9643
- cpu_function_call(cpu,
9644
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10735
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
964510736 }
964610737 cpus_read_unlock();
964710738 mutex_unlock(&mux_interval_mutex);
....@@ -9678,13 +10769,15 @@
967810769
967910770 pmu->dev->groups = pmu->attr_groups;
968010771 device_initialize(pmu->dev);
9681
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
9682
- if (ret)
9683
- goto free_dev;
968410772
968510773 dev_set_drvdata(pmu->dev, pmu);
968610774 pmu->dev->bus = &pmu_bus;
968710775 pmu->dev->release = pmu_dev_release;
10776
+
10777
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
10778
+ if (ret)
10779
+ goto free_dev;
10780
+
968810781 ret = device_add(pmu->dev);
968910782 if (ret)
969010783 goto free_dev;
....@@ -9692,6 +10785,12 @@
969210785 /* For PMUs with address filters, throw in an extra attribute: */
969310786 if (pmu->nr_addr_filters)
969410787 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
10788
+
10789
+ if (ret)
10790
+ goto del_dev;
10791
+
10792
+ if (pmu->attr_update)
10793
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
969510794
969610795 if (ret)
969710796 goto del_dev;
....@@ -9712,7 +10811,7 @@
971210811
971310812 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
971410813 {
9715
- int cpu, ret;
10814
+ int cpu, ret, max = PERF_TYPE_MAX;
971610815
971710816 mutex_lock(&pmus_lock);
971810817 ret = -ENOMEM;
....@@ -9725,12 +10824,17 @@
972510824 goto skip_type;
972610825 pmu->name = name;
972710826
9728
- if (type < 0) {
9729
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9730
- if (type < 0) {
9731
- ret = type;
10827
+ if (type != PERF_TYPE_SOFTWARE) {
10828
+ if (type >= 0)
10829
+ max = type;
10830
+
10831
+ ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10832
+ if (ret < 0)
973210833 goto free_pdc;
9733
- }
10834
+
10835
+ WARN_ON(type >= 0 && ret != type);
10836
+
10837
+ type = ret;
973410838 }
973510839 pmu->type = type;
973610840
....@@ -9776,6 +10880,9 @@
977610880 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
977710881
977810882 __perf_mux_hrtimer_init(cpuctx, cpu);
10883
+
10884
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10885
+ cpuctx->heap = cpuctx->heap_default;
977910886 }
978010887
978110888 got_cpu_context:
....@@ -9807,7 +10914,16 @@
980710914 if (!pmu->event_idx)
980810915 pmu->event_idx = perf_event_idx_default;
980910916
9810
- list_add_rcu(&pmu->entry, &pmus);
10917
+ /*
10918
+ * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
10919
+ * since these cannot be in the IDR. This way the linear search
10920
+ * is fast, provided a valid software event is provided.
10921
+ */
10922
+ if (type == PERF_TYPE_SOFTWARE || !name)
10923
+ list_add_rcu(&pmu->entry, &pmus);
10924
+ else
10925
+ list_add_tail_rcu(&pmu->entry, &pmus);
10926
+
981110927 atomic_set(&pmu->exclusive_cnt, 0);
981210928 ret = 0;
981310929 unlock:
....@@ -9820,7 +10936,7 @@
982010936 put_device(pmu->dev);
982110937
982210938 free_idr:
9823
- if (pmu->type >= PERF_TYPE_MAX)
10939
+ if (pmu->type != PERF_TYPE_SOFTWARE)
982410940 idr_remove(&pmu_idr, pmu->type);
982510941
982610942 free_pdc:
....@@ -9842,7 +10958,7 @@
984210958 synchronize_rcu();
984310959
984410960 free_percpu(pmu->pmu_disable_count);
9845
- if (pmu->type >= PERF_TYPE_MAX)
10961
+ if (pmu->type != PERF_TYPE_SOFTWARE)
984610962 idr_remove(&pmu_idr, pmu->type);
984710963 if (pmu_bus_running) {
984810964 if (pmu->nr_addr_filters)
....@@ -9854,6 +10970,12 @@
985410970 mutex_unlock(&pmus_lock);
985510971 }
985610972 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10973
+
10974
+static inline bool has_extended_regs(struct perf_event *event)
10975
+{
10976
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10977
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10978
+}
985710979
985810980 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
985910981 {
....@@ -9885,6 +11007,19 @@
988511007 if (ctx)
988611008 perf_event_ctx_unlock(event->group_leader, ctx);
988711009
11010
+ if (!ret) {
11011
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11012
+ has_extended_regs(event))
11013
+ ret = -EOPNOTSUPP;
11014
+
11015
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11016
+ event_has_any_exclude_flag(event))
11017
+ ret = -EINVAL;
11018
+
11019
+ if (ret && event->destroy)
11020
+ event->destroy(event);
11021
+ }
11022
+
988811023 if (ret)
988911024 module_put(pmu->module);
989011025
....@@ -9893,9 +11028,8 @@
989311028
989411029 static struct pmu *perf_init_event(struct perf_event *event)
989511030 {
11031
+ int idx, type, ret;
989611032 struct pmu *pmu;
9897
- int idx;
9898
- int ret;
989911033
990011034 idx = srcu_read_lock(&pmus_srcu);
990111035
....@@ -9907,17 +11041,32 @@
990711041 goto unlock;
990811042 }
990911043
11044
+ /*
11045
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11046
+ * are often aliases for PERF_TYPE_RAW.
11047
+ */
11048
+ type = event->attr.type;
11049
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
11050
+ type = PERF_TYPE_RAW;
11051
+
11052
+again:
991011053 rcu_read_lock();
9911
- pmu = idr_find(&pmu_idr, event->attr.type);
11054
+ pmu = idr_find(&pmu_idr, type);
991211055 rcu_read_unlock();
991311056 if (pmu) {
991411057 ret = perf_try_init_event(pmu, event);
11058
+ if (ret == -ENOENT && event->attr.type != type) {
11059
+ type = event->attr.type;
11060
+ goto again;
11061
+ }
11062
+
991511063 if (ret)
991611064 pmu = ERR_PTR(ret);
11065
+
991711066 goto unlock;
991811067 }
991911068
9920
- list_for_each_entry_rcu(pmu, &pmus, entry) {
11069
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
992111070 ret = perf_try_init_event(pmu, event);
992211071 if (!ret)
992311072 goto unlock;
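
The perf_init_event() change above lets one PMU claim PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE as aliases of PERF_TYPE_RAW: the aliased type is looked up first, and only if that PMU rejects the event with -ENOENT is the lookup retried with the original type. A toy model of that fallback; the enum values and try_init() behaviour are invented for illustration:

    #include <stdio.h>
    #include <errno.h>

    enum { TYPE_HARDWARE = 0, TYPE_HW_CACHE = 3, TYPE_RAW = 4 };

    /* toy stand-in for idr_find() + perf_try_init_event() */
    static int try_init(int pmu_type, int event_type)
    {
        /* pretend the raw PMU only accepts plain hardware events */
        if (pmu_type == TYPE_RAW && event_type == TYPE_HW_CACHE)
            return -ENOENT;
        return 0;
    }

    static int init_event(int event_type)
    {
        int type = event_type;
        int ret;

        if (type == TYPE_HARDWARE || type == TYPE_HW_CACHE)
            type = TYPE_RAW;
    again:
        ret = try_init(type, event_type);
        if (ret == -ENOENT && event_type != type) {
            type = event_type;   /* fall back to the original type */
            goto again;
        }
        return ret;
    }

    int main(void)
    {
        printf("HARDWARE -> %d\n", init_event(TYPE_HARDWARE)); /* served by RAW */
        printf("HW_CACHE -> %d\n", init_event(TYPE_HW_CACHE)); /* falls back    */
        return 0;
    }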
....@@ -9993,7 +11142,7 @@
999311142 if (event->parent)
999411143 return;
999511144
9996
- if (event->attach_state & PERF_ATTACH_TASK)
11145
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
999711146 inc = true;
999811147 if (event->attr.mmap || event->attr.mmap_data)
999911148 atomic_inc(&nr_mmap_events);
....@@ -10001,6 +11150,8 @@
1000111150 atomic_inc(&nr_comm_events);
1000211151 if (event->attr.namespaces)
1000311152 atomic_inc(&nr_namespaces_events);
11153
+ if (event->attr.cgroup)
11154
+ atomic_inc(&nr_cgroup_events);
1000411155 if (event->attr.task)
1000511156 atomic_inc(&nr_task_events);
1000611157 if (event->attr.freq)
....@@ -10013,6 +11164,12 @@
1001311164 inc = true;
1001411165 if (is_cgroup_event(event))
1001511166 inc = true;
11167
+ if (event->attr.ksymbol)
11168
+ atomic_inc(&nr_ksymbol_events);
11169
+ if (event->attr.bpf_event)
11170
+ atomic_inc(&nr_bpf_events);
11171
+ if (event->attr.text_poke)
11172
+ atomic_inc(&nr_text_poke_events);
1001611173
1001711174 if (inc) {
1001811175 /*
....@@ -10031,7 +11188,7 @@
1003111188 * call the perf scheduling hooks before proceeding to
1003211189 * install events that need them.
1003311190 */
10034
- synchronize_sched();
11191
+ synchronize_rcu();
1003511192 }
1003611193 /*
1003711194 * Now that we have waited for the sync_sched(), allow further
....@@ -10120,8 +11277,7 @@
1012011277 * and we cannot use the ctx information because we need the
1012111278 * pmu before we get a ctx.
1012211279 */
10123
- get_task_struct(task);
10124
- event->hw.target = task;
11280
+ event->hw.target = get_task_struct(task);
1012511281 }
1012611282
1012711283 event->clock = &local_clock;
....@@ -10133,12 +11289,9 @@
1013311289 context = parent_event->overflow_handler_context;
1013411290 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
1013511291 if (overflow_handler == bpf_overflow_handler) {
10136
- struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
11292
+ struct bpf_prog *prog = parent_event->prog;
1013711293
10138
- if (IS_ERR(prog)) {
10139
- err = PTR_ERR(prog);
10140
- goto err_ns;
10141
- }
11294
+ bpf_prog_inc(prog);
1014211295 event->prog = prog;
1014311296 event->orig_overflow_handler =
1014411297 parent_event->orig_overflow_handler;
....@@ -10179,16 +11332,31 @@
1017911332 if (!has_branch_stack(event))
1018011333 event->attr.branch_sample_type = 0;
1018111334
10182
- if (cgroup_fd != -1) {
10183
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10184
- if (err)
10185
- goto err_ns;
10186
- }
10187
-
1018811335 pmu = perf_init_event(event);
1018911336 if (IS_ERR(pmu)) {
1019011337 err = PTR_ERR(pmu);
1019111338 goto err_ns;
11339
+ }
11340
+
11341
+ /*
11342
+ * Disallow uncore-cgroup events, they don't make sense as the cgroup will
11343
+ * be different on other CPUs in the uncore mask.
11344
+ */
11345
+ if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11346
+ err = -EINVAL;
11347
+ goto err_pmu;
11348
+ }
11349
+
11350
+ if (event->attr.aux_output &&
11351
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11352
+ err = -EOPNOTSUPP;
11353
+ goto err_pmu;
11354
+ }
11355
+
11356
+ if (cgroup_fd != -1) {
11357
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11358
+ if (err)
11359
+ goto err_pmu;
1019211360 }
1019311361
1019411362 err = exclusive_event_init(event);
....@@ -10251,12 +11419,12 @@
1025111419 exclusive_event_destroy(event);
1025211420
1025311421 err_pmu:
11422
+ if (is_cgroup_event(event))
11423
+ perf_detach_cgroup(event);
1025411424 if (event->destroy)
1025511425 event->destroy(event);
1025611426 module_put(pmu->module);
1025711427 err_ns:
10258
- if (is_cgroup_event(event))
10259
- perf_detach_cgroup(event);
1026011428 if (event->ns)
1026111429 put_pid_ns(event->ns);
1026211430 if (event->hw.target)
....@@ -10272,58 +11440,29 @@
1027211440 u32 size;
1027311441 int ret;
1027411442
10275
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
10276
- return -EFAULT;
10277
-
10278
- /*
10279
- * zero the full structure, so that a short copy will be nice.
10280
- */
11443
+ /* Zero the full structure, so that a short copy will be nice. */
1028111444 memset(attr, 0, sizeof(*attr));
1028211445
1028311446 ret = get_user(size, &uattr->size);
1028411447 if (ret)
1028511448 return ret;
1028611449
10287
- if (size > PAGE_SIZE) /* silly large */
10288
- goto err_size;
10289
-
10290
- if (!size) /* abi compat */
11450
+ /* ABI compatibility quirk: */
11451
+ if (!size)
1029111452 size = PERF_ATTR_SIZE_VER0;
10292
-
10293
- if (size < PERF_ATTR_SIZE_VER0)
11453
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
1029411454 goto err_size;
1029511455
10296
- /*
10297
- * If we're handed a bigger struct than we know of,
10298
- * ensure all the unknown bits are 0 - i.e. new
10299
- * user-space does not rely on any kernel feature
10300
- * extensions we dont know about yet.
10301
- */
10302
- if (size > sizeof(*attr)) {
10303
- unsigned char __user *addr;
10304
- unsigned char __user *end;
10305
- unsigned char val;
10306
-
10307
- addr = (void __user *)uattr + sizeof(*attr);
10308
- end = (void __user *)uattr + size;
10309
-
10310
- for (; addr < end; addr++) {
10311
- ret = get_user(val, addr);
10312
- if (ret)
10313
- return ret;
10314
- if (val)
10315
- goto err_size;
10316
- }
10317
- size = sizeof(*attr);
11456
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11457
+ if (ret) {
11458
+ if (ret == -E2BIG)
11459
+ goto err_size;
11460
+ return ret;
1031811461 }
10319
-
10320
- ret = copy_from_user(attr, uattr, size);
10321
- if (ret)
10322
- return -EFAULT;
1032311462
1032411463 attr->size = size;
1032511464
10326
- if (attr->__reserved_1)
11465
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
1032711466 return -EINVAL;
1032811467
1032911468 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
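
The rewrite of perf_copy_attr() above delegates the forward-compatibility dance to copy_struct_from_user(): copy the part both sides know about, and fail with -E2BIG if newer userspace passed a bigger struct whose unknown tail is not all zeroes. A userspace approximation of that contract; copy_struct is an illustrative stand-in that works on plain memory rather than user pointers:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <stddef.h>

    static int copy_struct(void *dst, size_t ksize, const void *src, size_t usize)
    {
        size_t size = ksize < usize ? ksize : usize;

        /* newer userspace must not rely on fields this kernel doesn't know */
        if (usize > ksize) {
            const unsigned char *tail = (const unsigned char *)src + ksize;
            size_t i;

            for (i = 0; i < usize - ksize; i++)
                if (tail[i])
                    return -E2BIG;
        }

        memset(dst, 0, ksize);    /* zero-fill the kernel-side tail */
        memcpy(dst, src, size);
        return 0;
    }

    int main(void)
    {
        struct { int a, b; } kattr;
        unsigned char uattr[12] = { 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 };

        printf("%d\n", copy_struct(&kattr, sizeof(kattr), uattr, sizeof(uattr)));
        uattr[11] = 0xff;         /* unknown trailing bit set -> rejected */
        printf("%d\n", copy_struct(&kattr, sizeof(kattr), uattr, sizeof(uattr)));
        return 0;
    }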
....@@ -10394,6 +11533,12 @@
1039411533
1039511534 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
1039611535 ret = perf_reg_validate(attr->sample_regs_intr);
11536
+
11537
+#ifndef CONFIG_CGROUP_PERF
11538
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
11539
+ return -EINVAL;
11540
+#endif
11541
+
1039711542 out:
1039811543 return ret;
1039911544
....@@ -10403,14 +11548,25 @@
1040311548 goto out;
1040411549 }
1040511550
11551
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
11552
+{
11553
+ if (b < a)
11554
+ swap(a, b);
11555
+
11556
+ mutex_lock(a);
11557
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11558
+}
11559
+
1040611560 static int
1040711561 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
1040811562 {
10409
- struct ring_buffer *rb = NULL;
11563
+ struct perf_buffer *rb = NULL;
1041011564 int ret = -EINVAL;
1041111565
10412
- if (!output_event)
11566
+ if (!output_event) {
11567
+ mutex_lock(&event->mmap_mutex);
1041311568 goto set;
11569
+ }
1041411570
1041511571 /* don't allow circular references */
1041611572 if (event == output_event)
....@@ -10425,7 +11581,7 @@
1042511581 /*
1042611582 * If its not a per-cpu rb, it must be the same task.
1042711583 */
10428
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11584
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
1042911585 goto out;
1043011586
1043111587 /*
....@@ -10448,8 +11604,15 @@
1044811604 event->pmu != output_event->pmu)
1044911605 goto out;
1045011606
11607
+ /*
11608
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
11609
+ * output_event is already on rb->event_list, and the list iteration
11610
+ * restarts after every removal, it is guaranteed this new event is
11611
+ * observed *OR* if output_event is already removed, it's guaranteed we
11612
+ * observe !rb->mmap_count.
11613
+ */
11614
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
1045111615 set:
10452
- mutex_lock(&event->mmap_mutex);
1045311616 /* Can't redirect output if we've got an active mmap() */
1045411617 if (atomic_read(&event->mmap_count))
1045511618 goto unlock;
....@@ -10459,6 +11622,12 @@
1045911622 rb = ring_buffer_get(output_event);
1046011623 if (!rb)
1046111624 goto unlock;
11625
+
11626
+ /* did we race against perf_mmap_close() */
11627
+ if (!atomic_read(&rb->mmap_count)) {
11628
+ ring_buffer_put(rb);
11629
+ goto unlock;
11630
+ }
1046211631 }
1046311632
1046411633 ring_buffer_attach(event, rb);
....@@ -10466,18 +11635,11 @@
1046611635 ret = 0;
1046711636 unlock:
1046811637 mutex_unlock(&event->mmap_mutex);
11638
+ if (output_event)
11639
+ mutex_unlock(&output_event->mmap_mutex);
1046911640
1047011641 out:
1047111642 return ret;
10472
-}
10473
-
10474
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
10475
-{
10476
- if (b < a)
10477
- swap(a, b);
10478
-
10479
- mutex_lock(a);
10480
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
1048111643 }
1048211644
1048311645 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
....@@ -10500,11 +11662,11 @@
1050011662 break;
1050111663
1050211664 case CLOCK_BOOTTIME:
10503
- event->clock = &ktime_get_boot_ns;
11665
+ event->clock = &ktime_get_boottime_ns;
1050411666 break;
1050511667
1050611668 case CLOCK_TAI:
10507
- event->clock = &ktime_get_tai_ns;
11669
+ event->clock = &ktime_get_clocktai_ns;
1050811670 break;
1050911671
1051011672 default:
....@@ -10530,7 +11692,7 @@
1053011692 again:
1053111693 rcu_read_lock();
1053211694 gctx = READ_ONCE(group_leader->ctx);
10533
- if (!atomic_inc_not_zero(&gctx->refcount)) {
11695
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
1053411696 rcu_read_unlock();
1053511697 goto again;
1053611698 }
....@@ -10563,7 +11725,7 @@
1056311725 struct perf_event *group_leader = NULL, *output_event = NULL;
1056411726 struct perf_event *event, *sibling;
1056511727 struct perf_event_attr attr;
10566
- struct perf_event_context *ctx, *uninitialized_var(gctx);
11728
+ struct perf_event_context *ctx, *gctx;
1056711729 struct file *event_file = NULL;
1056811730 struct fd group = {NULL, 0};
1056911731 struct task_struct *task = NULL;
....@@ -10578,15 +11740,12 @@
1057811740 if (flags & ~PERF_FLAG_ALL)
1057911741 return -EINVAL;
1058011742
10581
- if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
10582
- return -EACCES;
10583
-
10584
- /* Do we allow access to perf_event_open(2) ? */
10585
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
11743
+ err = perf_copy_attr(attr_uptr, &attr);
1058611744 if (err)
1058711745 return err;
1058811746
10589
- err = perf_copy_attr(attr_uptr, &attr);
11747
+ /* Do we allow access to perf_event_open(2) ? */
11748
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
1059011749 if (err)
1059111750 return err;
1059211751
....@@ -10597,7 +11756,7 @@
1059711756 }
1059811757
1059911758 if (attr.namespaces) {
10600
- if (!capable(CAP_SYS_ADMIN))
11759
+ if (!perfmon_capable())
1060111760 return -EACCES;
1060211761 }
1060311762
....@@ -10612,6 +11771,13 @@
1061211771 /* Only privileged users can get physical addresses */
1061311772 if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
1061411773 err = perf_allow_kernel(&attr);
11774
+ if (err)
11775
+ return err;
11776
+ }
11777
+
11778
+ /* REGS_INTR can leak data, lockdown must prevent this */
11779
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
11780
+ err = security_locked_down(LOCKDOWN_PERF);
1061511781 if (err)
1061611782 return err;
1061711783 }
....@@ -10657,24 +11823,6 @@
1065711823 goto err_task;
1065811824 }
1065911825
10660
- if (task) {
10661
- err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10662
- if (err)
10663
- goto err_task;
10664
-
10665
- /*
10666
- * Reuse ptrace permission checks for now.
10667
- *
10668
- * We must hold cred_guard_mutex across this and any potential
10669
- * perf_install_in_context() call for this new event to
10670
- * serialize against exec() altering our credentials (and the
10671
- * perf_event_exit_task() that could imply).
10672
- */
10673
- err = -EACCES;
10674
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10675
- goto err_cred;
10676
- }
10677
-
1067811826 if (flags & PERF_FLAG_PID_CGROUP)
1067911827 cgroup_fd = pid;
1068011828
....@@ -10682,7 +11830,7 @@
1068211830 NULL, NULL, cgroup_fd);
1068311831 if (IS_ERR(event)) {
1068411832 err = PTR_ERR(event);
10685
- goto err_cred;
11833
+ goto err_task;
1068611834 }
1068711835
1068811836 if (is_sampling_event(event)) {
....@@ -10776,6 +11924,9 @@
1077611924 * Do not allow to attach to a group in a different task
1077711925 * or CPU context. If we're moving SW events, we'll fix
1077811926 * this up later, so allow that.
11927
+ *
11928
+ * Racy, not holding group_leader->ctx->mutex, see comment with
11929
+ * perf_event_ctx_lock().
1077911930 */
1078011931 if (!move_group && group_leader->ctx != ctx)
1078111932 goto err_context;
....@@ -10799,6 +11950,24 @@
1079911950 err = PTR_ERR(event_file);
1080011951 event_file = NULL;
1080111952 goto err_context;
11953
+ }
11954
+
11955
+ if (task) {
11956
+ err = down_read_interruptible(&task->signal->exec_update_lock);
11957
+ if (err)
11958
+ goto err_file;
11959
+
11960
+ /*
11961
+ * Preserve ptrace permission check for backwards compatibility.
11962
+ *
11963
+ * We must hold exec_update_lock across this and any potential
11964
+ * perf_install_in_context() call for this new event to
11965
+ * serialize against exec() altering our credentials (and the
11966
+ * perf_event_exit_task() that could imply).
11967
+ */
11968
+ err = -EACCES;
11969
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
11970
+ goto err_cred;
1080211971 }
1080311972
1080411973 if (move_group) {
....@@ -10825,6 +11994,7 @@
1082511994 } else {
1082611995 perf_event_ctx_unlock(group_leader, gctx);
1082711996 move_group = 0;
11997
+ goto not_move_group;
1082811998 }
1082911999 }
1083012000
....@@ -10841,7 +12011,17 @@
1084112011 }
1084212012 } else {
1084312013 mutex_lock(&ctx->mutex);
12014
+
12015
+ /*
12016
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
12017
+ * see the group_leader && !move_group test earlier.
12018
+ */
12019
+ if (group_leader && group_leader->ctx != ctx) {
12020
+ err = -EINVAL;
12021
+ goto err_locked;
12022
+ }
1084412023 }
12024
+not_move_group:
1084512025
1084612026 if (ctx->task == TASK_TOMBSTONE) {
1084712027 err = -ESRCH;
....@@ -10869,6 +12049,10 @@
1086912049 }
1087012050 }
1087112051
12052
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12053
+ err = -EINVAL;
12054
+ goto err_locked;
12055
+ }
1087212056
1087312057 /*
1087412058 * Must be under the same ctx::mutex as perf_install_in_context(),
....@@ -10950,7 +12134,7 @@
1095012134 mutex_unlock(&ctx->mutex);
1095112135
1095212136 if (task) {
10953
- mutex_unlock(&task->signal->cred_guard_mutex);
12137
+ up_read(&task->signal->exec_update_lock);
1095412138 put_task_struct(task);
1095512139 }
1095612140
....@@ -10972,7 +12156,10 @@
1097212156 if (move_group)
1097312157 perf_event_ctx_unlock(group_leader, gctx);
1097412158 mutex_unlock(&ctx->mutex);
10975
-/* err_file: */
12159
+err_cred:
12160
+ if (task)
12161
+ up_read(&task->signal->exec_update_lock);
12162
+err_file:
1097612163 fput(event_file);
1097712164 err_context:
1097812165 perf_unpin_context(ctx);
....@@ -10984,9 +12171,6 @@
1098412171 */
1098512172 if (!event_file)
1098612173 free_event(event);
10987
-err_cred:
10988
- if (task)
10989
- mutex_unlock(&task->signal->cred_guard_mutex);
1099012174 err_task:
1099112175 if (task)
1099212176 put_task_struct(task);
....@@ -11015,8 +12199,11 @@
1101512199 int err;
1101612200
1101712201 /*
11018
- * Get the target context (task or percpu):
12202
+ * Grouping is not supported for kernel events, neither is 'AUX',
12203
+ * make sure the caller's intentions are adjusted.
1101912204 */
12205
+ if (attr->aux_output)
12206
+ return ERR_PTR(-EINVAL);
1102012207
1102112208 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
1102212209 overflow_handler, context, -1);
....@@ -11028,6 +12215,9 @@
1102812215 /* Mark owner so we could distinguish it from user events. */
1102912216 event->owner = TASK_TOMBSTONE;
1103012217
12218
+ /*
12219
+ * Get the target context (task or percpu):
12220
+ */
1103112221 ctx = find_get_context(event->pmu, task, event);
1103212222 if (IS_ERR(ctx)) {
1103312223 err = PTR_ERR(ctx);
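
The new check rejects attr->aux_output up front for kernel-side counters, since in-kernel users cannot form the AUX groups that flag requires, and the "Get the target context" comment simply moves below the owner assignment it describes. For orientation, a minimal in-kernel usage sketch of this API, not taken from any real module:

/* Sketch: a trivial module that counts CPU cycles on CPU 0 via
 * perf_event_create_kernel_counter(). Illustrative only. */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *cycles_event;

static int __init cycles_init(void)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size   = sizeof(attr),
                .pinned = 1,
                /* .aux_output must stay 0: rejected by the check above. */
        };

        /* CPU-bound counter: cpu = 0, task = NULL, no overflow handler. */
        cycles_event = perf_event_create_kernel_counter(&attr, 0, NULL,
                                                        NULL, NULL);
        if (IS_ERR(cycles_event))
                return PTR_ERR(cycles_event);
        return 0;
}

static void __exit cycles_exit(void)
{
        u64 enabled, running;

        pr_info("cycles on CPU0: %llu\n",
                (unsigned long long)perf_event_read_value(cycles_event,
                                                          &enabled, &running));
        perf_event_release_kernel(cycles_event);
}

module_init(cycles_init);
module_exit(cycles_exit);
MODULE_LICENSE("GPL");
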
....@@ -11285,8 +12475,8 @@
1128512475 /*
1128612476 * When a child task exits, feed back event values to parent events.
1128712477 *
11288
- * Can be called with cred_guard_mutex held when called from
11289
- * install_exec_creds().
12478
+ * Can be called with exec_update_lock held when called from
12479
+ * setup_new_exec().
1129012480 */
1129112481 void perf_event_exit_task(struct task_struct *child)
1129212482 {
....@@ -11390,7 +12580,7 @@
1139012580 *
1139112581 * Wait for all events to drop their context reference.
1139212582 */
11393
- wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
12583
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
1139412584 put_ctx(ctx); /* must be last */
1139512585 }
1139612586 }
....@@ -11405,9 +12595,7 @@
1140512595
1140612596 struct file *perf_event_get(unsigned int fd)
1140712597 {
11408
- struct file *file;
11409
-
11410
- file = fget_raw(fd);
12598
+ struct file *file = fget(fd);
1141112599 if (!file)
1141212600 return ERR_PTR(-EBADF);
1141312601
....@@ -11477,8 +12665,7 @@
1147712665 !child_ctx->task_ctx_data) {
1147812666 struct pmu *pmu = child_event->pmu;
1147912667
11480
- child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11481
- GFP_KERNEL);
12668
+ child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
1148212669 if (!child_ctx->task_ctx_data) {
1148312670 free_event(child_event);
1148412671 return ERR_PTR(-ENOMEM);
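
The open-coded kzalloc(pmu->task_ctx_size, GFP_KERNEL) is replaced by the alloc_task_ctx_data() helper, which in this series allocates PMU-specific task context data from a per-PMU kmem_cache instead of sizing the allocation by hand. The helper pair is defined earlier in this file; roughly, and only as a sketch of its shape, which may differ in detail:

/* Approximate shape of the helpers this hunk switches to. */
#include <linux/perf_event.h>
#include <linux/slab.h>

static void *alloc_task_ctx_data(struct pmu *pmu)
{
        if (pmu->task_ctx_cache)
                return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);

        return NULL;
}

static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
        kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}
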
....@@ -11583,6 +12770,10 @@
1158312770 child, leader, child_ctx);
1158412771 if (IS_ERR(child_ctr))
1158512772 return PTR_ERR(child_ctr);
12773
+
12774
+ if (sub->aux_event == parent_event && child_ctr &&
12775
+ !perf_get_aux_event(child_ctr, leader))
12776
+ return -EINVAL;
1158612777 }
1158712778 return 0;
1158812779 }
....@@ -11778,7 +12969,7 @@
1177812969 }
1177912970 }
1178012971
11781
-void perf_swevent_init_cpu(unsigned int cpu)
12972
+static void perf_swevent_init_cpu(unsigned int cpu)
1178212973 {
1178312974 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
1178412975
....@@ -11975,6 +13166,12 @@
1197513166 kfree(jc);
1197613167 }
1197713168
13169
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13170
+{
13171
+ perf_event_cgroup(css->cgroup);
13172
+ return 0;
13173
+}
13174
+
1197813175 static int __perf_cgroup_move(void *info)
1197913176 {
1198013177 struct task_struct *task = info;
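
The new css_online callback hands each freshly created cgroup to perf_event_cgroup(), which emits a side-band record so tools can map cgroup IDs seen in samples back to paths. Consumers opt in through the cgroup bits in perf_event_attr; a hedged userspace fragment showing just the attribute setup, assuming a kernel and headers that provide PERF_SAMPLE_CGROUP and attr.cgroup:

/* Fragment: request cgroup side-band records and per-sample cgroup IDs. */
#include <linux/perf_event.h>
#include <string.h>

static void init_cgroup_sampling_attr(struct perf_event_attr *attr)
{
        memset(attr, 0, sizeof(*attr));
        attr->size = sizeof(*attr);
        attr->type = PERF_TYPE_SOFTWARE;
        attr->config = PERF_COUNT_SW_CPU_CLOCK;
        attr->sample_period = 100000;
        attr->sample_type = PERF_SAMPLE_CGROUP; /* cgroup id in each sample */
        attr->cgroup = 1;                       /* cgroup side-band records */
}
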
....@@ -11996,6 +13193,7 @@
1199613193 struct cgroup_subsys perf_event_cgrp_subsys = {
1199713194 .css_alloc = perf_cgroup_css_alloc,
1199813195 .css_free = perf_cgroup_css_free,
13196
+ .css_online = perf_cgroup_css_online,
1199913197 .attach = perf_cgroup_attach,
1200013198 /*
1200113199 * Implicitly enable on dfl hierarchy so that perf events can