2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/arch/powerpc/perf/imc-pmu.c
....@@ -1,14 +1,10 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * In-Memory Collection (IMC) Performance Monitor counter support.
34 *
45 * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
56 * (C) 2017 Anju T Sudhakar, IBM Corporation.
67 * (C) 2017 Hemant K Shaw, IBM Corporation.
7
- *
8
- * This program is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU General Public License
10
- * as published by the Free Software Foundation; either version
11
- * 2 of the License, or later version.
128 */
139 #include <linux/perf_event.h>
1410 #include <linux/slab.h>
....@@ -17,6 +13,7 @@
1713 #include <asm/cputhreads.h>
1814 #include <asm/smp.h>
1915 #include <linux/string.h>
16
+#include <linux/spinlock.h>
2017
2118 /* Nest IMC data structures and variables */
2219
....@@ -28,13 +25,13 @@
2825 static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc);
2926 static struct imc_pmu **per_nest_pmu_arr;
3027 static cpumask_t nest_imc_cpumask;
31
-struct imc_pmu_ref *nest_imc_refc;
28
+static struct imc_pmu_ref *nest_imc_refc;
3229 static int nest_pmus;
3330
3431 /* Core IMC data structures and variables */
3532
3633 static cpumask_t core_imc_cpumask;
37
-struct imc_pmu_ref *core_imc_refc;
34
+static struct imc_pmu_ref *core_imc_refc;
3835 static struct imc_pmu *core_imc_pmu;
3936
4037 /* Thread IMC data structures and variables */
....@@ -43,12 +40,27 @@
4340 static struct imc_pmu *thread_imc_pmu;
4441 static int thread_imc_mem_size;
4542
46
-struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
43
+/* Trace IMC data structures */
44
+static DEFINE_PER_CPU(u64 *, trace_imc_mem);
45
+static struct imc_pmu_ref *trace_imc_refc;
46
+static int trace_imc_mem_size;
47
+
48
+/*
49
+ * Global data structure used to avoid races between thread,
50
+ * core and trace-imc
51
+ */
52
+static struct imc_pmu_ref imc_global_refc = {
53
+ .lock = __SPIN_LOCK_INITIALIZER(imc_global_refc.lock),
54
+ .id = 0,
55
+ .refc = 0,
56
+};
57
+
58
+static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
4759 {
4860 return container_of(event->pmu, struct imc_pmu, pmu);
4961 }
5062
51
-PMU_FORMAT_ATTR(event, "config:0-40");
63
+PMU_FORMAT_ATTR(event, "config:0-61");
5264 PMU_FORMAT_ATTR(offset, "config:0-31");
5365 PMU_FORMAT_ATTR(rvalue, "config:32");
5466 PMU_FORMAT_ATTR(mode, "config:33-40");
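
The imc_global_refc structure introduced above serializes the core, thread and trace IMC domains: the first event of a domain claims the global id, and events from other domains are refused until the count drops back to zero. A minimal sketch of that gating pattern (the helper name try_claim_imc_domain() is illustrative only, not part of this change):

static int try_claim_imc_domain(int domain)
{
	int ret = 0;

	spin_lock(&imc_global_refc.lock);
	if (imc_global_refc.id == 0 || imc_global_refc.id == domain) {
		/* No other domain active, or ours already is: take a reference */
		imc_global_refc.id = domain;
		imc_global_refc.refc++;
	} else {
		ret = -EBUSY;	/* another IMC mode is in use */
	}
	spin_unlock(&imc_global_refc.lock);

	return ret;
}
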
....@@ -63,6 +75,25 @@
6375 static struct attribute_group imc_format_group = {
6476 .name = "format",
6577 .attrs = imc_format_attrs,
78
+};
79
+
80
+/* Format attribute for imc trace-mode */
81
+PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19");
82
+PMU_FORMAT_ATTR(cpmc_event, "config:20-27");
83
+PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29");
84
+PMU_FORMAT_ATTR(cpmc_load, "config:30-61");
85
+static struct attribute *trace_imc_format_attrs[] = {
86
+ &format_attr_event.attr,
87
+ &format_attr_cpmc_reserved.attr,
88
+ &format_attr_cpmc_event.attr,
89
+ &format_attr_cpmc_samplesel.attr,
90
+ &format_attr_cpmc_load.attr,
91
+ NULL,
92
+};
93
+
94
+static struct attribute_group trace_imc_format_group = {
95
+ .name = "format",
96
+ .attrs = trace_imc_format_attrs,
6697 };
6798
6899 /* Get the cpumask printed to a buffer "buf" */
....@@ -342,7 +373,14 @@
342373 */
343374 nid = cpu_to_node(cpu);
344375 l_cpumask = cpumask_of_node(nid);
345
- target = cpumask_any_but(l_cpumask, cpu);
376
+ target = cpumask_last(l_cpumask);
377
+
378
+ /*
379
+ * If this (target) is the last cpu in the cpumask for this chip,
380
+ * check for any other possible online cpu in the chip.
381
+ */
382
+ if (unlikely(target == cpu))
383
+ target = cpumask_any_but(l_cpumask, cpu);
346384
347385 /*
348386 * Update the cpumask with the target cpu and
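
The migration-target selection above prefers cpumask_last() and falls back to cpumask_any_but() only when the last cpu in the node mask is the one going offline. Stated on its own (pick_migration_target() is a hypothetical name used purely for illustration):

static int pick_migration_target(const struct cpumask *mask, int outgoing)
{
	/* Prefer the highest-numbered cpu in the node mask */
	int target = cpumask_last(mask);

	/* If that happens to be the outgoing cpu, pick any other one */
	if (target == outgoing)
		target = cpumask_any_but(mask, outgoing);

	/* A value >= nr_cpu_ids means no other online cpu exists */
	return target;
}
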
....@@ -356,7 +394,7 @@
356394 get_hard_smp_processor_id(cpu));
357395 /*
358396 * If this is the last cpu in this chip then, skip the reference
359
- * count mutex lock and make the reference count on this chip zero.
397
+ * count lock and make the reference count on this chip zero.
360398 */
361399 ref = get_nest_pmu_ref(cpu);
362400 if (!ref)
....@@ -418,15 +456,15 @@
418456 /*
419457 * See if we need to disable the nest PMU.
420458 * If no events are currently in use, then we have to take a
421
- * mutex to ensure that we don't race with another task doing
459
+ * lock to ensure that we don't race with another task doing
422460 * enable or disable the nest counters.
423461 */
424462 ref = get_nest_pmu_ref(event->cpu);
425463 if (!ref)
426464 return;
427465
428
- /* Take the mutex lock for this node and then decrement the reference count */
429
- mutex_lock(&ref->lock);
466
+ /* Take the lock for this node and then decrement the reference count */
467
+ spin_lock(&ref->lock);
430468 if (ref->refc == 0) {
431469 /*
432470 * The scenario where this is true is, when perf session is
....@@ -438,7 +476,7 @@
438476 * an OPAL call to disable the engine in that node.
439477 *
440478 */
441
- mutex_unlock(&ref->lock);
479
+ spin_unlock(&ref->lock);
442480 return;
443481 }
444482 ref->refc--;
....@@ -446,7 +484,7 @@
446484 rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
447485 get_hard_smp_processor_id(event->cpu));
448486 if (rc) {
449
- mutex_unlock(&ref->lock);
487
+ spin_unlock(&ref->lock);
450488 pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id);
451489 return;
452490 }
....@@ -454,7 +492,7 @@
454492 WARN(1, "nest-imc: Invalid event reference count\n");
455493 ref->refc = 0;
456494 }
457
- mutex_unlock(&ref->lock);
495
+ spin_unlock(&ref->lock);
458496 }
459497
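
With the mutex replaced by a spinlock, the release path follows the same reference-count pattern in the nest, core and trace cases: decrement under the per-node/per-core lock and issue the OPAL stop call only when the last user goes away. A condensed sketch of that pattern (imc_refc_put() and the stop_engine callback are illustrative, not part of the patch):

static void imc_refc_put(struct imc_pmu_ref *ref, int (*stop_engine)(void))
{
	spin_lock(&ref->lock);
	if (ref->refc == 0) {
		/* Counters were never started (or already torn down) */
		spin_unlock(&ref->lock);
		return;
	}
	ref->refc--;
	if (ref->refc == 0) {
		/* Last user on this node/core: stop the engine via OPAL */
		if (stop_engine())
			pr_err("imc: unable to stop the counters\n");
	} else if (ref->refc < 0) {
		WARN(1, "imc: invalid event reference count\n");
		ref->refc = 0;
	}
	spin_unlock(&ref->lock);
}
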
460498 static int nest_imc_event_init(struct perf_event *event)
....@@ -471,15 +509,6 @@
471509
472510 /* Sampling not supported */
473511 if (event->hw.sample_period)
474
- return -EINVAL;
475
-
476
- /* unsupported modes and filters */
477
- if (event->attr.exclude_user ||
478
- event->attr.exclude_kernel ||
479
- event->attr.exclude_hv ||
480
- event->attr.exclude_idle ||
481
- event->attr.exclude_host ||
482
- event->attr.exclude_guest)
483512 return -EINVAL;
484513
485514 if (event->cpu < 0)
....@@ -522,26 +551,25 @@
522551
523552 /*
524553 * Get the imc_pmu_ref struct for this node.
525
- * Take the mutex lock and then increment the count of nest pmu events
526
- * inited.
554
+ * Take the lock and then increment the count of nest pmu events inited.
527555 */
528556 ref = get_nest_pmu_ref(event->cpu);
529557 if (!ref)
530558 return -EINVAL;
531559
532
- mutex_lock(&ref->lock);
560
+ spin_lock(&ref->lock);
533561 if (ref->refc == 0) {
534562 rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST,
535563 get_hard_smp_processor_id(event->cpu));
536564 if (rc) {
537
- mutex_unlock(&ref->lock);
565
+ spin_unlock(&ref->lock);
538566 pr_err("nest-imc: Unable to start the counters for node %d\n",
539567 node_id);
540568 return rc;
541569 }
542570 }
543571 ++ref->refc;
544
- mutex_unlock(&ref->lock);
572
+ spin_unlock(&ref->lock);
545573
546574 event->destroy = nest_imc_counters_release;
547575 return 0;
....@@ -559,6 +587,7 @@
559587 {
560588 int nid, rc = 0, core_id = (cpu / threads_per_core);
561589 struct imc_mem_info *mem_info;
590
+ struct page *page;
562591
563592 /*
564593 * alloc_pages_node() will allocate memory for core in the
....@@ -569,15 +598,15 @@
569598 mem_info->id = core_id;
570599
571600 /* We need only vbase for core counters */
572
- mem_info->vbase = page_address(alloc_pages_node(nid,
573
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
574
- __GFP_NOWARN, get_order(size)));
575
- if (!mem_info->vbase)
601
+ page = alloc_pages_node(nid,
602
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
603
+ __GFP_NOWARN, get_order(size));
604
+ if (!page)
576605 return -ENOMEM;
606
+ mem_info->vbase = page_address(page);
577607
578
- /* Init the mutex */
579608 core_imc_refc[core_id].id = core_id;
580
- mutex_init(&core_imc_refc[core_id].lock);
609
+ spin_lock_init(&core_imc_refc[core_id].lock);
581610
582611 rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE,
583612 __pa((void *)mem_info->vbase),
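
The allocation change above keeps the struct page returned by alloc_pages_node() and converts it with page_address() only after the NULL check, so a failed allocation is never passed to page_address(). As a standalone pattern (imc_alloc_node_pages() is an illustrative name, not an existing helper):

static void *imc_alloc_node_pages(int nid, size_t size)
{
	struct page *page;

	page = alloc_pages_node(nid,
				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
				__GFP_NOWARN, get_order(size));
	if (!page)
		return NULL;			/* allocation failed */

	return page_address(page);		/* page is known to be valid here */
}
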
....@@ -656,16 +685,18 @@
656685 return 0;
657686
658687 /* Find any online cpu in that core except the current "cpu" */
659
- ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
688
+ ncpu = cpumask_last(cpu_sibling_mask(cpu));
689
+
690
+ if (unlikely(ncpu == cpu))
691
+ ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
660692
661693 if (ncpu >= 0 && ncpu < nr_cpu_ids) {
662694 cpumask_set_cpu(ncpu, &core_imc_cpumask);
663695 perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu);
664696 } else {
665697 /*
666
- * If this is the last cpu in this core then, skip taking refernce
667
- * count mutex lock for this core and directly zero "refc" for
668
- * this core.
698
+ * If this is the last cpu in this core then skip taking reference
699
+ * count lock for this core and directly zero "refc" for this core.
669700 */
670701 opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
671702 get_hard_smp_processor_id(cpu));
....@@ -675,6 +706,16 @@
675706 return -EINVAL;
676707
677708 ref->refc = 0;
709
+ /*
710
+ * Reduce the global reference count if this is the
711
+ * last cpu in this core and a core-imc event is running
712
+ * on this cpu.
713
+ */
714
+ spin_lock(&imc_global_refc.lock);
715
+ if (imc_global_refc.id == IMC_DOMAIN_CORE)
716
+ imc_global_refc.refc--;
717
+
718
+ spin_unlock(&imc_global_refc.lock);
678719 }
679720 return 0;
680721 }
....@@ -687,6 +728,23 @@
687728 ppc_core_imc_cpu_offline);
688729 }
689730
731
+static void reset_global_refc(struct perf_event *event)
732
+{
733
+ spin_lock(&imc_global_refc.lock);
734
+ imc_global_refc.refc--;
735
+
736
+ /*
737
+ * If no other thread is running any
738
+ * event for this domain(thread/core/trace),
739
+ * set the global id to zero.
740
+ */
741
+ if (imc_global_refc.refc <= 0) {
742
+ imc_global_refc.refc = 0;
743
+ imc_global_refc.id = 0;
744
+ }
745
+ spin_unlock(&imc_global_refc.lock);
746
+}
747
+
690748 static void core_imc_counters_release(struct perf_event *event)
691749 {
692750 int rc, core_id;
....@@ -697,17 +755,17 @@
697755 /*
698756 * See if we need to disable the IMC PMU.
699757 * If no events are currently in use, then we have to take a
700
- * mutex to ensure that we don't race with another task doing
758
+ * lock to ensure that we don't race with another task doing
701759 * enable or disable the core counters.
702760 */
703761 core_id = event->cpu / threads_per_core;
704762
705
- /* Take the mutex lock and decrement the refernce count for this core */
763
+ /* Take the lock and decrement the reference count for this core */
706764 ref = &core_imc_refc[core_id];
707765 if (!ref)
708766 return;
709767
710
- mutex_lock(&ref->lock);
768
+ spin_lock(&ref->lock);
711769 if (ref->refc == 0) {
712770 /*
713771 * The scenario where this is true is, when perf session is
....@@ -719,7 +777,7 @@
719777 * an OPAL call to disable the engine in that core.
720778 *
721779 */
722
- mutex_unlock(&ref->lock);
780
+ spin_unlock(&ref->lock);
723781 return;
724782 }
725783 ref->refc--;
....@@ -727,7 +785,7 @@
727785 rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
728786 get_hard_smp_processor_id(event->cpu));
729787 if (rc) {
730
- mutex_unlock(&ref->lock);
788
+ spin_unlock(&ref->lock);
731789 pr_err("IMC: Unable to stop the counters for core %d\n", core_id);
732790 return;
733791 }
....@@ -735,7 +793,9 @@
735793 WARN(1, "core-imc: Invalid event reference count\n");
736794 ref->refc = 0;
737795 }
738
- mutex_unlock(&ref->lock);
796
+ spin_unlock(&ref->lock);
797
+
798
+ reset_global_refc(event);
739799 }
740800
741801 static int core_imc_event_init(struct perf_event *event)
....@@ -751,15 +811,6 @@
751811
752812 /* Sampling not supported */
753813 if (event->hw.sample_period)
754
- return -EINVAL;
755
-
756
- /* unsupported modes and filters */
757
- if (event->attr.exclude_user ||
758
- event->attr.exclude_kernel ||
759
- event->attr.exclude_hv ||
760
- event->attr.exclude_idle ||
761
- event->attr.exclude_host ||
762
- event->attr.exclude_guest)
763814 return -EINVAL;
764815
765816 if (event->cpu < 0)
....@@ -780,7 +831,6 @@
780831 if ((!pcmi->vbase))
781832 return -ENODEV;
782833
783
- /* Get the core_imc mutex for this core */
784834 ref = &core_imc_refc[core_id];
785835 if (!ref)
786836 return -EINVAL;
....@@ -788,22 +838,45 @@
788838 /*
789839 * Core pmu units are enabled only when it is used.
790840 * See if this is triggered for the first time.
791
- * If yes, take the mutex lock and enable the core counters.
841
+ * If yes, take the lock and enable the core counters.
792842 * If not, just increment the count in core_imc_refc struct.
793843 */
794
- mutex_lock(&ref->lock);
844
+ spin_lock(&ref->lock);
795845 if (ref->refc == 0) {
796846 rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
797847 get_hard_smp_processor_id(event->cpu));
798848 if (rc) {
799
- mutex_unlock(&ref->lock);
849
+ spin_unlock(&ref->lock);
800850 pr_err("core-imc: Unable to start the counters for core %d\n",
801851 core_id);
802852 return rc;
803853 }
804854 }
805855 ++ref->refc;
806
- mutex_unlock(&ref->lock);
856
+ spin_unlock(&ref->lock);
857
+
858
+ /*
859
+ * Since the system can run either in accumulation or trace-mode
860
+ * of IMC at a time, core-imc events are allowed only if no other
861
+ * trace/thread imc events are enabled/monitored.
862
+ *
863
+ * Take the global lock, and check the refc.id
864
+ * to know whether any other trace/thread imc
865
+ * events are running.
866
+ */
867
+ spin_lock(&imc_global_refc.lock);
868
+ if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
869
+ /*
870
+ * No other trace/thread imc events are running in
871
+ * the system, so set the refc.id to core-imc.
872
+ */
873
+ imc_global_refc.id = IMC_DOMAIN_CORE;
874
+ imc_global_refc.refc++;
875
+ } else {
876
+ spin_unlock(&imc_global_refc.lock);
877
+ return -EBUSY;
878
+ }
879
+ spin_unlock(&imc_global_refc.lock);
807880
808881 event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
809882 event->destroy = core_imc_counters_release;
....@@ -811,8 +884,11 @@
811884 }
812885
813886 /*
814
- * Allocates a page of memory for each of the online cpus, and write the
815
- * physical base address of that page to the LDBAR for that cpu.
887
+ * Allocates a page of memory for each of the online cpus, and loads
888
+ * LDBAR with 0.
889
+ * The physical base address of the page allocated for a cpu will be
890
+ * written to the LDBAR for that cpu, when the thread-imc event
891
+ * is added.
816892 *
817893 * LDBAR Register Layout:
818894 *
....@@ -830,26 +906,26 @@
830906 */
831907 static int thread_imc_mem_alloc(int cpu_id, int size)
832908 {
833
- u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id);
909
+ u64 *local_mem = per_cpu(thread_imc_mem, cpu_id);
834910 int nid = cpu_to_node(cpu_id);
835911
836912 if (!local_mem) {
913
+ struct page *page;
837914 /*
838915 * This case could happen only once at start, since we dont
839916 * free the memory in cpu offline path.
840917 */
841
- local_mem = page_address(alloc_pages_node(nid,
918
+ page = alloc_pages_node(nid,
842919 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
843
- __GFP_NOWARN, get_order(size)));
844
- if (!local_mem)
920
+ __GFP_NOWARN, get_order(size));
921
+ if (!page)
845922 return -ENOMEM;
923
+ local_mem = page_address(page);
846924
847925 per_cpu(thread_imc_mem, cpu_id) = local_mem;
848926 }
849927
850
- ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
851
-
852
- mtspr(SPRN_LDBAR, ldbar_value);
928
+ mtspr(SPRN_LDBAR, 0);
853929 return 0;
854930 }
855931
....@@ -860,7 +936,23 @@
860936
861937 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
862938 {
863
- mtspr(SPRN_LDBAR, 0);
939
+ /*
940
+ * Set bit 0 of LDBAR to zero.
941
+ *
942
+ * If bit 0 of LDBAR is unset, it will stop posting
943
+ * the counter data to memory.
944
+ * For thread-imc, bit 0 of LDBAR will be set to 1 in the
945
+ * event_add function. So reset this bit here, to stop the updates
946
+ * to memory in the cpu_offline path.
947
+ */
948
+ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
949
+
950
+ /* Reduce the refc if a thread-imc event is running on this cpu */
951
+ spin_lock(&imc_global_refc.lock);
952
+ if (imc_global_refc.id == IMC_DOMAIN_THREAD)
953
+ imc_global_refc.refc--;
954
+ spin_unlock(&imc_global_refc.lock);
955
+
864956 return 0;
865957 }
866958
....@@ -881,6 +973,9 @@
881973 if (event->attr.type != event->pmu->type)
882974 return -ENOENT;
883975
976
+ if (!perfmon_capable())
977
+ return -EACCES;
978
+
884979 /* Sampling not supported */
885980 if (event->hw.sample_period)
886981 return -EINVAL;
....@@ -896,7 +991,22 @@
896991 if (!target)
897992 return -EINVAL;
898993
994
+ spin_lock(&imc_global_refc.lock);
995
+ /*
996
+ * Check if any other trace/core imc events are running in the
997
+ * system, if not set the global id to thread-imc.
998
+ */
999
+ if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) {
1000
+ imc_global_refc.id = IMC_DOMAIN_THREAD;
1001
+ imc_global_refc.refc++;
1002
+ } else {
1003
+ spin_unlock(&imc_global_refc.lock);
1004
+ return -EBUSY;
1005
+ }
1006
+ spin_unlock(&imc_global_refc.lock);
1007
+
8991008 event->pmu->task_ctx_nr = perf_sw_context;
1009
+ event->destroy = reset_global_refc;
9001010 return 0;
9011011 }
9021012
....@@ -1000,6 +1110,7 @@
10001110 {
10011111 int core_id;
10021112 struct imc_pmu_ref *ref;
1113
+ u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id());
10031114
10041115 if (flags & PERF_EF_START)
10051116 imc_event_start(event, flags);
....@@ -1008,28 +1119,31 @@
10081119 return -EINVAL;
10091120
10101121 core_id = smp_processor_id() / threads_per_core;
1122
+ ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
1123
+ mtspr(SPRN_LDBAR, ldbar_value);
1124
+
10111125 /*
10121126 * imc pmus are enabled only when it is used.
10131127 * See if this is triggered for the first time.
1014
- * If yes, take the mutex lock and enable the counters.
1128
+ * If yes, take the lock and enable the counters.
10151129 * If not, just increment the count in ref count struct.
10161130 */
10171131 ref = &core_imc_refc[core_id];
10181132 if (!ref)
10191133 return -EINVAL;
10201134
1021
- mutex_lock(&ref->lock);
1135
+ spin_lock(&ref->lock);
10221136 if (ref->refc == 0) {
10231137 if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE,
10241138 get_hard_smp_processor_id(smp_processor_id()))) {
1025
- mutex_unlock(&ref->lock);
1139
+ spin_unlock(&ref->lock);
10261140 pr_err("thread-imc: Unable to start the counter\
10271141 for core %d\n", core_id);
10281142 return -EINVAL;
10291143 }
10301144 }
10311145 ++ref->refc;
1032
- mutex_unlock(&ref->lock);
1146
+ spin_unlock(&ref->lock);
10331147 return 0;
10341148 }
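
Taken together with the LDBAR layout comment earlier in the patch, the add/offline paths split LDBAR handling into two small operations: compose the enable bit plus the per-thread buffer address when a thread-imc event is added, and clear only the enable bit (bit 0 in IBM numbering, i.e. the most significant bit) when posting to memory should stop. A rough sketch, with illustrative helper names and the mask/enable macros used in the hunks above:

static inline void thread_imc_ldbar_enable(u64 *buf)
{
	/* Enable bit plus the per-thread buffer address, as done in event_add */
	u64 ldbar_value = ((u64)buf & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;

	mtspr(SPRN_LDBAR, ldbar_value);
}

static inline void thread_imc_ldbar_stop(void)
{
	/* Clear only bit 0 (the MSB); leave the buffer address untouched */
	mtspr(SPRN_LDBAR, mfspr(SPRN_LDBAR) & ~(1UL << 63));
}
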
10351149
....@@ -1039,21 +1153,19 @@
10391153 int core_id;
10401154 struct imc_pmu_ref *ref;
10411155
1042
- /*
1043
- * Take a snapshot and calculate the delta and update
1044
- * the event counter values.
1045
- */
1046
- imc_event_update(event);
1047
-
10481156 core_id = smp_processor_id() / threads_per_core;
10491157 ref = &core_imc_refc[core_id];
1158
+ if (!ref) {
1159
+ pr_debug("imc: Failed to get event reference count\n");
1160
+ return;
1161
+ }
10501162
1051
- mutex_lock(&ref->lock);
1163
+ spin_lock(&ref->lock);
10521164 ref->refc--;
10531165 if (ref->refc == 0) {
10541166 if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
10551167 get_hard_smp_processor_id(smp_processor_id()))) {
1056
- mutex_unlock(&ref->lock);
1168
+ spin_unlock(&ref->lock);
10571169 pr_err("thread-imc: Unable to stop the counters\
10581170 for core %d\n", core_id);
10591171 return;
....@@ -1061,7 +1173,293 @@
10611173 } else if (ref->refc < 0) {
10621174 ref->refc = 0;
10631175 }
1064
- mutex_unlock(&ref->lock);
1176
+ spin_unlock(&ref->lock);
1177
+
1178
+ /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */
1179
+ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
1180
+
1181
+ /*
1182
+ * Take a snapshot and calculate the delta and update
1183
+ * the event counter values.
1184
+ */
1185
+ imc_event_update(event);
1186
+}
1187
+
1188
+/*
1189
+ * Allocate a page of memory for each cpu, and load LDBAR with 0.
1190
+ */
1191
+static int trace_imc_mem_alloc(int cpu_id, int size)
1192
+{
1193
+ u64 *local_mem = per_cpu(trace_imc_mem, cpu_id);
1194
+ int phys_id = cpu_to_node(cpu_id), rc = 0;
1195
+ int core_id = (cpu_id / threads_per_core);
1196
+
1197
+ if (!local_mem) {
1198
+ struct page *page;
1199
+
1200
+ page = alloc_pages_node(phys_id,
1201
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE |
1202
+ __GFP_NOWARN, get_order(size));
1203
+ if (!page)
1204
+ return -ENOMEM;
1205
+ local_mem = page_address(page);
1206
+ per_cpu(trace_imc_mem, cpu_id) = local_mem;
1207
+
1208
+ /* Initialise the counters for trace mode */
1209
+ rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem),
1210
+ get_hard_smp_processor_id(cpu_id));
1211
+ if (rc) {
1212
+ pr_info("IMC:opal init failed for trace imc\n");
1213
+ return rc;
1214
+ }
1215
+ }
1216
+
1217
+ trace_imc_refc[core_id].id = core_id;
1218
+ spin_lock_init(&trace_imc_refc[core_id].lock);
1219
+
1220
+ mtspr(SPRN_LDBAR, 0);
1221
+ return 0;
1222
+}
1223
+
1224
+static int ppc_trace_imc_cpu_online(unsigned int cpu)
1225
+{
1226
+ return trace_imc_mem_alloc(cpu, trace_imc_mem_size);
1227
+}
1228
+
1229
+static int ppc_trace_imc_cpu_offline(unsigned int cpu)
1230
+{
1231
+ /*
1232
+ * No need to set bit 0 of LDBAR to zero, as
1233
+ * it is set to zero for imc trace-mode
1234
+ *
1235
+ * Reduce the refc if any trace-imc event is running
1236
+ * on this cpu.
1237
+ */
1238
+ spin_lock(&imc_global_refc.lock);
1239
+ if (imc_global_refc.id == IMC_DOMAIN_TRACE)
1240
+ imc_global_refc.refc--;
1241
+ spin_unlock(&imc_global_refc.lock);
1242
+
1243
+ return 0;
1244
+}
1245
+
1246
+static int trace_imc_cpu_init(void)
1247
+{
1248
+ return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
1249
+ "perf/powerpc/imc_trace:online",
1250
+ ppc_trace_imc_cpu_online,
1251
+ ppc_trace_imc_cpu_offline);
1252
+}
1253
+
1254
+static u64 get_trace_imc_event_base_addr(void)
1255
+{
1256
+ return (u64)per_cpu(trace_imc_mem, smp_processor_id());
1257
+}
1258
+
1259
+/*
1260
+ * Function to parse trace-imc data obtained
1261
+ * and to prepare the perf sample.
1262
+ */
1263
+static int trace_imc_prepare_sample(struct trace_imc_data *mem,
1264
+ struct perf_sample_data *data,
1265
+ u64 *prev_tb,
1266
+ struct perf_event_header *header,
1267
+ struct perf_event *event)
1268
+{
1269
+ /* Sanity checks for a valid record */
1270
+ if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb)
1271
+ *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1));
1272
+ else
1273
+ return -EINVAL;
1274
+
1275
+ if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) !=
1276
+ be64_to_cpu(READ_ONCE(mem->tb2)))
1277
+ return -EINVAL;
1278
+
1279
+ /* Prepare perf sample */
1280
+ data->ip = be64_to_cpu(READ_ONCE(mem->ip));
1281
+ data->period = event->hw.last_period;
1282
+
1283
+ header->type = PERF_RECORD_SAMPLE;
1284
+ header->size = sizeof(*header) + event->header_size;
1285
+ header->misc = 0;
1286
+
1287
+ if (cpu_has_feature(CPU_FTR_ARCH_31)) {
1288
+ switch (IMC_TRACE_RECORD_VAL_HVPR(be64_to_cpu(READ_ONCE(mem->val)))) {
1289
+ case 0:/* when MSR HV and PR not set in the trace-record */
1290
+ header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1291
+ break;
1292
+ case 1: /* MSR HV is 0 and PR is 1 */
1293
+ header->misc |= PERF_RECORD_MISC_GUEST_USER;
1294
+ break;
1295
+ case 2: /* MSR HV is 1 and PR is 0 */
1296
+ header->misc |= PERF_RECORD_MISC_KERNEL;
1297
+ break;
1298
+ case 3: /* MSR HV is 1 and PR is 1 */
1299
+ header->misc |= PERF_RECORD_MISC_USER;
1300
+ break;
1301
+ default:
1302
+ pr_info("IMC: Unable to set the flag based on MSR bits\n");
1303
+ break;
1304
+ }
1305
+ } else {
1306
+ if (is_kernel_addr(data->ip))
1307
+ header->misc |= PERF_RECORD_MISC_KERNEL;
1308
+ else
1309
+ header->misc |= PERF_RECORD_MISC_USER;
1310
+ }
1311
+ perf_event_header__init_id(header, data, event);
1312
+
1313
+ return 0;
1314
+}
1315
+
1316
+static void dump_trace_imc_data(struct perf_event *event)
1317
+{
1318
+ struct trace_imc_data *mem;
1319
+ int i, ret;
1320
+ u64 prev_tb = 0;
1321
+
1322
+ mem = (struct trace_imc_data *)get_trace_imc_event_base_addr();
1323
+ for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data));
1324
+ i++, mem++) {
1325
+ struct perf_sample_data data;
1326
+ struct perf_event_header header;
1327
+
1328
+ ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event);
1329
+ if (ret) /* Exit, if not a valid record */
1330
+ break;
1331
+ else {
1332
+ /* If this is a valid record, create the sample */
1333
+ struct perf_output_handle handle;
1334
+
1335
+ if (perf_output_begin(&handle, &data, event, header.size))
1336
+ return;
1337
+
1338
+ perf_output_sample(&handle, &header, &data, event);
1339
+ perf_output_end(&handle);
1340
+ }
1341
+ }
1342
+}
1343
+
1344
+static int trace_imc_event_add(struct perf_event *event, int flags)
1345
+{
1346
+ int core_id = smp_processor_id() / threads_per_core;
1347
+ struct imc_pmu_ref *ref = NULL;
1348
+ u64 local_mem, ldbar_value;
1349
+
1350
+ /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
1351
+ local_mem = get_trace_imc_event_base_addr();
1352
+ ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
1353
+
1354
+ /* trace-imc reference count */
1355
+ if (trace_imc_refc)
1356
+ ref = &trace_imc_refc[core_id];
1357
+ if (!ref) {
1358
+ pr_debug("imc: Failed to get the event reference count\n");
1359
+ return -EINVAL;
1360
+ }
1361
+
1362
+ mtspr(SPRN_LDBAR, ldbar_value);
1363
+ spin_lock(&ref->lock);
1364
+ if (ref->refc == 0) {
1365
+ if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
1366
+ get_hard_smp_processor_id(smp_processor_id()))) {
1367
+ spin_unlock(&ref->lock);
1368
+ pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
1369
+ return -EINVAL;
1370
+ }
1371
+ }
1372
+ ++ref->refc;
1373
+ spin_unlock(&ref->lock);
1374
+ return 0;
1375
+}
1376
+
1377
+static void trace_imc_event_read(struct perf_event *event)
1378
+{
1379
+ return;
1380
+}
1381
+
1382
+static void trace_imc_event_stop(struct perf_event *event, int flags)
1383
+{
1384
+ u64 local_mem = get_trace_imc_event_base_addr();
1385
+ dump_trace_imc_data(event);
1386
+ memset((void *)local_mem, 0, sizeof(u64));
1387
+}
1388
+
1389
+static void trace_imc_event_start(struct perf_event *event, int flags)
1390
+{
1391
+ return;
1392
+}
1393
+
1394
+static void trace_imc_event_del(struct perf_event *event, int flags)
1395
+{
1396
+ int core_id = smp_processor_id() / threads_per_core;
1397
+ struct imc_pmu_ref *ref = NULL;
1398
+
1399
+ if (trace_imc_refc)
1400
+ ref = &trace_imc_refc[core_id];
1401
+ if (!ref) {
1402
+ pr_debug("imc: Failed to get event reference count\n");
1403
+ return;
1404
+ }
1405
+
1406
+ spin_lock(&ref->lock);
1407
+ ref->refc--;
1408
+ if (ref->refc == 0) {
1409
+ if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE,
1410
+ get_hard_smp_processor_id(smp_processor_id()))) {
1411
+ spin_unlock(&ref->lock);
1412
+ pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id);
1413
+ return;
1414
+ }
1415
+ } else if (ref->refc < 0) {
1416
+ ref->refc = 0;
1417
+ }
1418
+ spin_unlock(&ref->lock);
1419
+
1420
+ trace_imc_event_stop(event, flags);
1421
+}
1422
+
1423
+static int trace_imc_event_init(struct perf_event *event)
1424
+{
1425
+ if (event->attr.type != event->pmu->type)
1426
+ return -ENOENT;
1427
+
1428
+ if (!perfmon_capable())
1429
+ return -EACCES;
1430
+
1431
+ /* Return if this is a counting event */
1432
+ if (event->attr.sample_period == 0)
1433
+ return -ENOENT;
1434
+
1435
+ /*
1436
+ * Take the global lock, and make sure
1437
+ * no other thread is running any core/thread imc
1438
+ * events
1439
+ */
1440
+ spin_lock(&imc_global_refc.lock);
1441
+ if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) {
1442
+ /*
1443
+ * No core/thread imc events are running in the
1444
+ * system, so set the refc.id to trace-imc.
1445
+ */
1446
+ imc_global_refc.id = IMC_DOMAIN_TRACE;
1447
+ imc_global_refc.refc++;
1448
+ } else {
1449
+ spin_unlock(&imc_global_refc.lock);
1450
+ return -EBUSY;
1451
+ }
1452
+ spin_unlock(&imc_global_refc.lock);
1453
+
1454
+ event->hw.idx = -1;
1455
+
1456
+ /*
1457
+ * There can only be a single PMU for perf_hw_context events which is assigned to
1458
+ * core PMU. Hence use "perf_sw_context" for trace_imc.
1459
+ */
1460
+ event->pmu->task_ctx_nr = perf_sw_context;
1461
+ event->destroy = reset_global_refc;
1462
+ return 0;
10651463 }
10661464
10671465 /* update_pmu_ops : Populate the appropriate operations for "pmu" */
....@@ -1074,6 +1472,7 @@
10741472 pmu->pmu.stop = imc_event_stop;
10751473 pmu->pmu.read = imc_event_update;
10761474 pmu->pmu.attr_groups = pmu->attr_groups;
1475
+ pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
10771476 pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group;
10781477
10791478 switch (pmu->domain) {
....@@ -1093,6 +1492,14 @@
10931492 pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn;
10941493 pmu->pmu.commit_txn = thread_imc_pmu_commit_txn;
10951494 break;
1495
+ case IMC_DOMAIN_TRACE:
1496
+ pmu->pmu.event_init = trace_imc_event_init;
1497
+ pmu->pmu.add = trace_imc_event_add;
1498
+ pmu->pmu.del = trace_imc_event_del;
1499
+ pmu->pmu.start = trace_imc_event_start;
1500
+ pmu->pmu.stop = trace_imc_event_stop;
1501
+ pmu->pmu.read = trace_imc_event_read;
1502
+ pmu->attr_groups[IMC_FORMAT_ATTR] = &trace_imc_format_group;
10961503 default:
10971504 break;
10981505 }
....@@ -1114,10 +1521,10 @@
11141521 i = 0;
11151522 for_each_node(nid) {
11161523 /*
1117
- * Mutex lock to avoid races while tracking the number of
1524
+ * Take the lock to avoid races while tracking the number of
11181525 * sessions using the chip's nest pmu units.
11191526 */
1120
- mutex_init(&nest_imc_refc[i].lock);
1527
+ spin_lock_init(&nest_imc_refc[i].lock);
11211528
11221529 /*
11231530 * Loop to init the "id" with the node_id. Variable "i" initialized to
....@@ -1163,10 +1570,10 @@
11631570 static void thread_imc_ldbar_disable(void *dummy)
11641571 {
11651572 /*
1166
- * By Zeroing LDBAR, we disable thread-imc
1167
- * updates.
1573
+ * By setting 0th bit of LDBAR to zero, we disable thread-imc
1574
+ * updates to memory.
11681575 */
1169
- mtspr(SPRN_LDBAR, 0);
1576
+ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
11701577 }
11711578
11721579 void thread_imc_disable(void)
....@@ -1183,6 +1590,18 @@
11831590 free_pages((u64)per_cpu(thread_imc_mem, i), order);
11841591
11851592 }
1593
+}
1594
+
1595
+static void cleanup_all_trace_imc_memory(void)
1596
+{
1597
+ int i, order = get_order(trace_imc_mem_size);
1598
+
1599
+ for_each_online_cpu(i) {
1600
+ if (per_cpu(trace_imc_mem, i))
1601
+ free_pages((u64)per_cpu(trace_imc_mem, i), order);
1602
+
1603
+ }
1604
+ kfree(trace_imc_refc);
11861605 }
11871606
11881607 /* Function to free the attr_groups which are dynamically allocated */
....@@ -1225,6 +1644,11 @@
12251644 if (pmu_ptr->domain == IMC_DOMAIN_THREAD) {
12261645 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE);
12271646 cleanup_all_thread_imc_memory();
1647
+ }
1648
+
1649
+ if (pmu_ptr->domain == IMC_DOMAIN_TRACE) {
1650
+ cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE);
1651
+ cleanup_all_trace_imc_memory();
12281652 }
12291653 }
12301654
....@@ -1308,6 +1732,27 @@
13081732
13091733 thread_imc_pmu = pmu_ptr;
13101734 break;
1735
+ case IMC_DOMAIN_TRACE:
1736
+ /* Update the pmu name */
1737
+ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc");
1738
+ if (!pmu_ptr->pmu.name)
1739
+ return -ENOMEM;
1740
+
1741
+ nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core);
1742
+ trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref),
1743
+ GFP_KERNEL);
1744
+ if (!trace_imc_refc)
1745
+ return -ENOMEM;
1746
+
1747
+ trace_imc_mem_size = pmu_ptr->counter_mem_size;
1748
+ for_each_online_cpu(cpu) {
1749
+ res = trace_imc_mem_alloc(cpu, trace_imc_mem_size);
1750
+ if (res) {
1751
+ cleanup_all_trace_imc_memory();
1752
+ goto err;
1753
+ }
1754
+ }
1755
+ break;
13111756 default:
13121757 return -EINVAL;
13131758 }
....@@ -1381,6 +1826,14 @@
13811826 }
13821827
13831828 break;
1829
+ case IMC_DOMAIN_TRACE:
1830
+ ret = trace_imc_cpu_init();
1831
+ if (ret) {
1832
+ cleanup_all_trace_imc_memory();
1833
+ goto err_free_mem;
1834
+ }
1835
+
1836
+ break;
13841837 default:
13851838 return -EINVAL; /* Unknown domain */
13861839 }
....@@ -1397,7 +1850,7 @@
13971850 if (ret)
13981851 goto err_free_cpuhp_mem;
13991852
1400
- pr_info("%s performance monitor hardware support registered\n",
1853
+ pr_debug("%s performance monitor hardware support registered\n",
14011854 pmu_ptr->pmu.name);
14021855
14031856 return 0;