.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later
1 | 2 | /*
2 | 3 |  * pSeries_lpar.c
3 | 4 |  * Copyright (C) 2001 Todd Inglett, IBM Corporation
4 | 5 |  *
5 | 6 |  * pSeries LPAR support.
6 | | - *
7 | | - * This program is free software; you can redistribute it and/or modify
8 | | - * it under the terms of the GNU General Public License as published by
9 | | - * the Free Software Foundation; either version 2 of the License, or
10 | | - * (at your option) any later version.
11 | | - *
12 | | - * This program is distributed in the hope that it will be useful,
13 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | | - * GNU General Public License for more details.
16 | | - *
17 | | - * You should have received a copy of the GNU General Public License
18 | | - * along with this program; if not, write to the Free Software
19 | | - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 | 7 |  */
21 | 8 |
22 | 9 | /* Enables debugging of low-level hash table routines - careful! */
.. | .. |
30 | 17 | #include <linux/jump_label.h>
31 | 18 | #include <linux/delay.h>
32 | 19 | #include <linux/stop_machine.h>
| 20 | +#include <linux/spinlock.h>
| 21 | +#include <linux/cpuhotplug.h>
| 22 | +#include <linux/workqueue.h>
| 23 | +#include <linux/proc_fs.h>
| 24 | +#include <linux/pgtable.h>
33 | 25 | #include <asm/processor.h>
34 | 26 | #include <asm/mmu.h>
35 | 27 | #include <asm/page.h>
36 | | -#include <asm/pgtable.h>
37 | 28 | #include <asm/machdep.h>
38 | 29 | #include <asm/mmu_context.h>
39 | 30 | #include <asm/iommu.h>
.. | .. |
49 | 40 | #include <asm/fadump.h>
50 | 41 | #include <asm/asm-prototypes.h>
51 | 42 | #include <asm/debugfs.h>
| 43 | +#include <asm/dtl.h>
52 | 44 |
53 | 45 | #include "pseries.h"
54 | 46 |
.. | .. |
65 | 57 | EXPORT_SYMBOL(plpar_hcall9);
66 | 58 | EXPORT_SYMBOL(plpar_hcall_norets);
67 | 59 |
| 60 | +/*
| 61 | + * H_BLOCK_REMOVE supported block size for this page size in segments whose
| 62 | + * base page size is that page size.
| 63 | + *
| 64 | + * The first index is the segment base page size, the second one is the actual
| 65 | + * page size.
| 66 | + */
| 67 | +static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
| 68 | +
| 69 | +/*
| 70 | + * Due to the involved complexity, and because the current hypervisor only
| 71 | + * returns this value or 0, we limit the supported H_BLOCK_REMOVE buffer
| 72 | + * size to blocks of 8 entries.
| 73 | + */
| 74 | +#define HBLKRM_SUPPORTED_BLOCK_SIZE 8
| 75 | +
| 76 | +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
| 77 | +static u8 dtl_mask = DTL_LOG_PREEMPT;
| 78 | +#else
| 79 | +static u8 dtl_mask;
| 80 | +#endif
| 81 | +
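For reference, the DTL_LOG_* values used for dtl_mask come from asm/lppaca.h; a minimal sketch of the relevant definitions, as recalled for this era of the tree (assumptions — verify against your copy of lppaca.h):

	/* Dispatch trace log event mask bits (from asm/lppaca.h, assumed) */
	#define DTL_LOG_CEDE	0x1
	#define DTL_LOG_PREEMPT	0x2
	#define DTL_LOG_FAULT	0x4
	#define DTL_LOG_ALL	(DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT)

With CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, only preemption events are logged by default; the vcpudispatch_stats code below temporarily widens the mask to DTL_LOG_ALL via set_global_dtl_mask().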
| 82 | +void alloc_dtl_buffers(unsigned long *time_limit)
| 83 | +{
| 84 | +	int cpu;
| 85 | +	struct paca_struct *pp;
| 86 | +	struct dtl_entry *dtl;
| 87 | +
| 88 | +	for_each_possible_cpu(cpu) {
| 89 | +		pp = paca_ptrs[cpu];
| 90 | +		if (pp->dispatch_log)
| 91 | +			continue;
| 92 | +		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
| 93 | +		if (!dtl) {
| 94 | +			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
| 95 | +				cpu);
| 96 | +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
| 97 | +			pr_warn("Stolen time statistics will be unreliable\n");
| 98 | +#endif
| 99 | +			break;
| 100 | +		}
| 101 | +
| 102 | +		pp->dtl_ridx = 0;
| 103 | +		pp->dispatch_log = dtl;
| 104 | +		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
| 105 | +		pp->dtl_curr = dtl;
| 106 | +
| 107 | +		if (time_limit && time_after(jiffies, *time_limit)) {
| 108 | +			cond_resched();
| 109 | +			*time_limit = jiffies + HZ;
| 110 | +		}
| 111 | +	}
| 112 | +}
| 113 | +
| 114 | +void register_dtl_buffer(int cpu)
| 115 | +{
| 116 | +	long ret;
| 117 | +	struct paca_struct *pp;
| 118 | +	struct dtl_entry *dtl;
| 119 | +	int hwcpu = get_hard_smp_processor_id(cpu);
| 120 | +
| 121 | +	pp = paca_ptrs[cpu];
| 122 | +	dtl = pp->dispatch_log;
| 123 | +	if (dtl && dtl_mask) {
| 124 | +		pp->dtl_ridx = 0;
| 125 | +		pp->dtl_curr = dtl;
| 126 | +		lppaca_of(cpu).dtl_idx = 0;
| 127 | +
| 128 | +		/* hypervisor reads buffer length from this field */
| 129 | +		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
| 130 | +		ret = register_dtl(hwcpu, __pa(dtl));
| 131 | +		if (ret)
| 132 | +			pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
| 133 | +			       cpu, hwcpu, ret);
| 134 | +
| 135 | +		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
| 136 | +	}
| 137 | +}
| 138 | +
| 139 | +#ifdef CONFIG_PPC_SPLPAR
| 140 | +struct dtl_worker {
| 141 | +	struct delayed_work work;
| 142 | +	int cpu;
| 143 | +};
| 144 | +
| 145 | +struct vcpu_dispatch_data {
| 146 | +	int last_disp_cpu;
| 147 | +
| 148 | +	int total_disp;
| 149 | +
| 150 | +	int same_cpu_disp;
| 151 | +	int same_chip_disp;
| 152 | +	int diff_chip_disp;
| 153 | +	int far_chip_disp;
| 154 | +
| 155 | +	int numa_home_disp;
| 156 | +	int numa_remote_disp;
| 157 | +	int numa_far_disp;
| 158 | +};
| 159 | +
| 160 | +/*
| 161 | + * This represents the number of cpus in the hypervisor. Since there is no
| 162 | + * architected way to discover the number of processors in the host, we
| 163 | + * provision for dealing with NR_CPUS. This is currently 2048 by default, and
| 164 | + * is sufficient for our purposes. This will need to be tweaked if
| 165 | + * CONFIG_NR_CPUS is changed.
| 166 | + */
| 167 | +#define NR_CPUS_H	NR_CPUS
| 168 | +
| 169 | +DEFINE_RWLOCK(dtl_access_lock);
| 170 | +static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
| 171 | +static DEFINE_PER_CPU(u64, dtl_entry_ridx);
| 172 | +static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
| 173 | +static enum cpuhp_state dtl_worker_state;
| 174 | +static DEFINE_MUTEX(dtl_enable_mutex);
| 175 | +static int vcpudispatch_stats_on __read_mostly;
| 176 | +static int vcpudispatch_stats_freq = 50;
| 177 | +static __be32 *vcpu_associativity, *pcpu_associativity;
| 178 | +
| 179 | +
| 180 | +static void free_dtl_buffers(unsigned long *time_limit)
| 181 | +{
| 182 | +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
| 183 | +	int cpu;
| 184 | +	struct paca_struct *pp;
| 185 | +
| 186 | +	for_each_possible_cpu(cpu) {
| 187 | +		pp = paca_ptrs[cpu];
| 188 | +		if (!pp->dispatch_log)
| 189 | +			continue;
| 190 | +		kmem_cache_free(dtl_cache, pp->dispatch_log);
| 191 | +		pp->dtl_ridx = 0;
| 192 | +		pp->dispatch_log = 0;
| 193 | +		pp->dispatch_log_end = 0;
| 194 | +		pp->dtl_curr = 0;
| 195 | +
| 196 | +		if (time_limit && time_after(jiffies, *time_limit)) {
| 197 | +			cond_resched();
| 198 | +			*time_limit = jiffies + HZ;
| 199 | +		}
| 200 | +	}
| 201 | +#endif
| 202 | +}
| 203 | +
| 204 | +static int init_cpu_associativity(void)
| 205 | +{
| 206 | +	vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
| 207 | +			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
| 208 | +	pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
| 209 | +			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
| 210 | +
| 211 | +	if (!vcpu_associativity || !pcpu_associativity) {
| 212 | +		pr_err("error allocating memory for associativity information\n");
| 213 | +		return -ENOMEM;
| 214 | +	}
| 215 | +
| 216 | +	return 0;
| 217 | +}
| 218 | +
| 219 | +static void destroy_cpu_associativity(void)
| 220 | +{
| 221 | +	kfree(vcpu_associativity);
| 222 | +	kfree(pcpu_associativity);
| 223 | +	vcpu_associativity = pcpu_associativity = 0;
| 224 | +}
| 225 | +
| 226 | +static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
| 227 | +{
| 228 | +	__be32 *assoc;
| 229 | +	int rc = 0;
| 230 | +
| 231 | +	assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
| 232 | +	if (!assoc[0]) {
| 233 | +		rc = hcall_vphn(cpu, flag, &assoc[0]);
| 234 | +		if (rc)
| 235 | +			return NULL;
| 236 | +	}
| 237 | +
| 238 | +	return assoc;
| 239 | +}
| 240 | +
| 241 | +static __be32 *get_pcpu_associativity(int cpu)
| 242 | +{
| 243 | +	return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
| 244 | +}
| 245 | +
| 246 | +static __be32 *get_vcpu_associativity(int cpu)
| 247 | +{
| 248 | +	return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
| 249 | +}
| 250 | +
| 251 | +static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
| 252 | +{
| 253 | +	__be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
| 254 | +
| 255 | +	if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
| 256 | +		return -EINVAL;
| 257 | +
| 258 | +	last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
| 259 | +	cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
| 260 | +
| 261 | +	if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
| 262 | +		return -EIO;
| 263 | +
| 264 | +	return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
| 265 | +}
| 266 | +
| 267 | +static int cpu_home_node_dispatch_distance(int disp_cpu)
| 268 | +{
| 269 | +	__be32 *disp_cpu_assoc, *vcpu_assoc;
| 270 | +	int vcpu_id = smp_processor_id();
| 271 | +
| 272 | +	if (disp_cpu >= NR_CPUS_H) {
| 273 | +		pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
| 274 | +				     disp_cpu, NR_CPUS_H);
| 275 | +		return -EINVAL;
| 276 | +	}
| 277 | +
| 278 | +	disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
| 279 | +	vcpu_assoc = get_vcpu_associativity(vcpu_id);
| 280 | +
| 281 | +	if (!disp_cpu_assoc || !vcpu_assoc)
| 282 | +		return -EIO;
| 283 | +
| 284 | +	return cpu_distance(disp_cpu_assoc, vcpu_assoc);
| 285 | +}
| 286 | +
| 287 | +static void update_vcpu_disp_stat(int disp_cpu)
| 288 | +{
| 289 | +	struct vcpu_dispatch_data *disp;
| 290 | +	int distance;
| 291 | +
| 292 | +	disp = this_cpu_ptr(&vcpu_disp_data);
| 293 | +	if (disp->last_disp_cpu == -1) {
| 294 | +		disp->last_disp_cpu = disp_cpu;
| 295 | +		return;
| 296 | +	}
| 297 | +
| 298 | +	disp->total_disp++;
| 299 | +
| 300 | +	if (disp->last_disp_cpu == disp_cpu ||
| 301 | +		(cpu_first_thread_sibling(disp->last_disp_cpu) ==
| 302 | +					cpu_first_thread_sibling(disp_cpu)))
| 303 | +		disp->same_cpu_disp++;
| 304 | +	else {
| 305 | +		distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
| 306 | +								disp_cpu);
| 307 | +		if (distance < 0)
| 308 | +			pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
| 309 | +					smp_processor_id());
| 310 | +		else {
| 311 | +			switch (distance) {
| 312 | +			case 0:
| 313 | +				disp->same_chip_disp++;
| 314 | +				break;
| 315 | +			case 1:
| 316 | +				disp->diff_chip_disp++;
| 317 | +				break;
| 318 | +			case 2:
| 319 | +				disp->far_chip_disp++;
| 320 | +				break;
| 321 | +			default:
| 322 | +				pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
| 323 | +							smp_processor_id(),
| 324 | +							disp->last_disp_cpu,
| 325 | +							disp_cpu,
| 326 | +							distance);
| 327 | +			}
| 328 | +		}
| 329 | +	}
| 330 | +
| 331 | +	distance = cpu_home_node_dispatch_distance(disp_cpu);
| 332 | +	if (distance < 0)
| 333 | +		pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
| 334 | +				smp_processor_id());
| 335 | +	else {
| 336 | +		switch (distance) {
| 337 | +		case 0:
| 338 | +			disp->numa_home_disp++;
| 339 | +			break;
| 340 | +		case 1:
| 341 | +			disp->numa_remote_disp++;
| 342 | +			break;
| 343 | +		case 2:
| 344 | +			disp->numa_far_disp++;
| 345 | +			break;
| 346 | +		default:
| 347 | +			pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
| 348 | +						smp_processor_id(),
| 349 | +						disp_cpu,
| 350 | +						distance);
| 351 | +		}
| 352 | +	}
| 353 | +
| 354 | +	disp->last_disp_cpu = disp_cpu;
| 355 | +}
| 356 | +
| 357 | +static void process_dtl_buffer(struct work_struct *work)
| 358 | +{
| 359 | +	struct dtl_entry dtle;
| 360 | +	u64 i = __this_cpu_read(dtl_entry_ridx);
| 361 | +	struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
| 362 | +	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
| 363 | +	struct lppaca *vpa = local_paca->lppaca_ptr;
| 364 | +	struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
| 365 | +
| 366 | +	if (!local_paca->dispatch_log)
| 367 | +		return;
| 368 | +
| 369 | +	/* if we have been migrated away, we cancel ourselves */
| 370 | +	if (d->cpu != smp_processor_id()) {
| 371 | +		pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
| 372 | +						smp_processor_id());
| 373 | +		return;
| 374 | +	}
| 375 | +
| 376 | +	if (i == be64_to_cpu(vpa->dtl_idx))
| 377 | +		goto out;
| 378 | +
| 379 | +	while (i < be64_to_cpu(vpa->dtl_idx)) {
| 380 | +		dtle = *dtl;
| 381 | +		barrier();
| 382 | +		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
| 383 | +			/* buffer has overflowed */
| 384 | +			pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
| 385 | +				d->cpu,
| 386 | +				be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
| 387 | +			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
| 388 | +			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
| 389 | +			continue;
| 390 | +		}
| 391 | +		update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
| 392 | +		++i;
| 393 | +		++dtl;
| 394 | +		if (dtl == dtl_end)
| 395 | +			dtl = local_paca->dispatch_log;
| 396 | +	}
| 397 | +
| 398 | +	__this_cpu_write(dtl_entry_ridx, i);
| 399 | +
| 400 | +out:
| 401 | +	schedule_delayed_work_on(d->cpu, to_delayed_work(work),
| 402 | +					HZ / vcpudispatch_stats_freq);
| 403 | +}
| 404 | +
| 405 | +static int dtl_worker_online(unsigned int cpu)
| 406 | +{
| 407 | +	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
| 408 | +
| 409 | +	memset(d, 0, sizeof(*d));
| 410 | +	INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
| 411 | +	d->cpu = cpu;
| 412 | +
| 413 | +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
| 414 | +	per_cpu(dtl_entry_ridx, cpu) = 0;
| 415 | +	register_dtl_buffer(cpu);
| 416 | +#else
| 417 | +	per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
| 418 | +#endif
| 419 | +
| 420 | +	schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
| 421 | +	return 0;
| 422 | +}
| 423 | +
| 424 | +static int dtl_worker_offline(unsigned int cpu)
| 425 | +{
| 426 | +	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
| 427 | +
| 428 | +	cancel_delayed_work_sync(&d->work);
| 429 | +
| 430 | +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
| 431 | +	unregister_dtl(get_hard_smp_processor_id(cpu));
| 432 | +#endif
| 433 | +
| 434 | +	return 0;
| 435 | +}
| 436 | +
| 437 | +static void set_global_dtl_mask(u8 mask)
| 438 | +{
| 439 | +	int cpu;
| 440 | +
| 441 | +	dtl_mask = mask;
| 442 | +	for_each_present_cpu(cpu)
| 443 | +		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
| 444 | +}
| 445 | +
| 446 | +static void reset_global_dtl_mask(void)
| 447 | +{
| 448 | +	int cpu;
| 449 | +
| 450 | +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
| 451 | +	dtl_mask = DTL_LOG_PREEMPT;
| 452 | +#else
| 453 | +	dtl_mask = 0;
| 454 | +#endif
| 455 | +	for_each_present_cpu(cpu)
| 456 | +		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
| 457 | +}
| 458 | +
| 459 | +static int dtl_worker_enable(unsigned long *time_limit)
| 460 | +{
| 461 | +	int rc = 0, state;
| 462 | +
| 463 | +	if (!write_trylock(&dtl_access_lock)) {
| 464 | +		rc = -EBUSY;
| 465 | +		goto out;
| 466 | +	}
| 467 | +
| 468 | +	set_global_dtl_mask(DTL_LOG_ALL);
| 469 | +
| 470 | +	/* Set up the DTL buffers and register them */
| 471 | +	alloc_dtl_buffers(time_limit);
| 472 | +
| 473 | +	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
| 474 | +					dtl_worker_online, dtl_worker_offline);
| 475 | +	if (state < 0) {
| 476 | +		pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
| 477 | +		free_dtl_buffers(time_limit);
| 478 | +		reset_global_dtl_mask();
| 479 | +		write_unlock(&dtl_access_lock);
| 480 | +		rc = -EINVAL;
| 481 | +		goto out;
| 482 | +	}
| 483 | +	dtl_worker_state = state;
| 484 | +
| 485 | +out:
| 486 | +	return rc;
| 487 | +}
| 488 | +
| 489 | +static void dtl_worker_disable(unsigned long *time_limit)
| 490 | +{
| 491 | +	cpuhp_remove_state(dtl_worker_state);
| 492 | +	free_dtl_buffers(time_limit);
| 493 | +	reset_global_dtl_mask();
| 494 | +	write_unlock(&dtl_access_lock);
| 495 | +}
| 496 | +
| 497 | +static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
| 498 | +		size_t count, loff_t *ppos)
| 499 | +{
| 500 | +	unsigned long time_limit = jiffies + HZ;
| 501 | +	struct vcpu_dispatch_data *disp;
| 502 | +	int rc, cmd, cpu;
| 503 | +	char buf[16];
| 504 | +
| 505 | +	if (count > 15)
| 506 | +		return -EINVAL;
| 507 | +
| 508 | +	if (copy_from_user(buf, p, count))
| 509 | +		return -EFAULT;
| 510 | +
| 511 | +	buf[count] = 0;
| 512 | +	rc = kstrtoint(buf, 0, &cmd);
| 513 | +	if (rc || cmd < 0 || cmd > 1) {
| 514 | +		pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
| 515 | +		return rc ? rc : -EINVAL;
| 516 | +	}
| 517 | +
| 518 | +	mutex_lock(&dtl_enable_mutex);
| 519 | +
| 520 | +	if ((cmd == 0 && !vcpudispatch_stats_on) ||
| 521 | +			(cmd == 1 && vcpudispatch_stats_on))
| 522 | +		goto out;
| 523 | +
| 524 | +	if (cmd) {
| 525 | +		rc = init_cpu_associativity();
| 526 | +		if (rc)
| 527 | +			goto out;
| 528 | +
| 529 | +		for_each_possible_cpu(cpu) {
| 530 | +			disp = per_cpu_ptr(&vcpu_disp_data, cpu);
| 531 | +			memset(disp, 0, sizeof(*disp));
| 532 | +			disp->last_disp_cpu = -1;
| 533 | +		}
| 534 | +
| 535 | +		rc = dtl_worker_enable(&time_limit);
| 536 | +		if (rc) {
| 537 | +			destroy_cpu_associativity();
| 538 | +			goto out;
| 539 | +		}
| 540 | +	} else {
| 541 | +		dtl_worker_disable(&time_limit);
| 542 | +		destroy_cpu_associativity();
| 543 | +	}
| 544 | +
| 545 | +	vcpudispatch_stats_on = cmd;
| 546 | +
| 547 | +out:
| 548 | +	mutex_unlock(&dtl_enable_mutex);
| 549 | +	if (rc)
| 550 | +		return rc;
| 551 | +	return count;
| 552 | +}
| 553 | +
| 554 | +static int vcpudispatch_stats_display(struct seq_file *p, void *v)
| 555 | +{
| 556 | +	int cpu;
| 557 | +	struct vcpu_dispatch_data *disp;
| 558 | +
| 559 | +	if (!vcpudispatch_stats_on) {
| 560 | +		seq_puts(p, "off\n");
| 561 | +		return 0;
| 562 | +	}
| 563 | +
| 564 | +	for_each_online_cpu(cpu) {
| 565 | +		disp = per_cpu_ptr(&vcpu_disp_data, cpu);
| 566 | +		seq_printf(p, "cpu%d", cpu);
| 567 | +		seq_put_decimal_ull(p, " ", disp->total_disp);
| 568 | +		seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
| 569 | +		seq_put_decimal_ull(p, " ", disp->same_chip_disp);
| 570 | +		seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
| 571 | +		seq_put_decimal_ull(p, " ", disp->far_chip_disp);
| 572 | +		seq_put_decimal_ull(p, " ", disp->numa_home_disp);
| 573 | +		seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
| 574 | +		seq_put_decimal_ull(p, " ", disp->numa_far_disp);
| 575 | +		seq_puts(p, "\n");
| 576 | +	}
| 577 | +
| 578 | +	return 0;
| 579 | +}
| 580 | +
| 581 | +static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
| 582 | +{
| 583 | +	return single_open(file, vcpudispatch_stats_display, NULL);
| 584 | +}
| 585 | +
| 586 | +static const struct proc_ops vcpudispatch_stats_proc_ops = {
| 587 | +	.proc_open	= vcpudispatch_stats_open,
| 588 | +	.proc_read	= seq_read,
| 589 | +	.proc_write	= vcpudispatch_stats_write,
| 590 | +	.proc_lseek	= seq_lseek,
| 591 | +	.proc_release	= single_release,
| 592 | +};
| 593 | +
| 594 | +static ssize_t vcpudispatch_stats_freq_write(struct file *file,
| 595 | +		const char __user *p, size_t count, loff_t *ppos)
| 596 | +{
| 597 | +	int rc, freq;
| 598 | +	char buf[16];
| 599 | +
| 600 | +	if (count > 15)
| 601 | +		return -EINVAL;
| 602 | +
| 603 | +	if (copy_from_user(buf, p, count))
| 604 | +		return -EFAULT;
| 605 | +
| 606 | +	buf[count] = 0;
| 607 | +	rc = kstrtoint(buf, 0, &freq);
| 608 | +	if (rc || freq < 1 || freq > HZ) {
| 609 | +		pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
| 610 | +				HZ);
| 611 | +		return rc ? rc : -EINVAL;
| 612 | +	}
| 613 | +
| 614 | +	vcpudispatch_stats_freq = freq;
| 615 | +
| 616 | +	return count;
| 617 | +}
| 618 | +
| 619 | +static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
| 620 | +{
| 621 | +	seq_printf(p, "%d\n", vcpudispatch_stats_freq);
| 622 | +	return 0;
| 623 | +}
| 624 | +
| 625 | +static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
| 626 | +{
| 627 | +	return single_open(file, vcpudispatch_stats_freq_display, NULL);
| 628 | +}
| 629 | +
| 630 | +static const struct proc_ops vcpudispatch_stats_freq_proc_ops = {
| 631 | +	.proc_open	= vcpudispatch_stats_freq_open,
| 632 | +	.proc_read	= seq_read,
| 633 | +	.proc_write	= vcpudispatch_stats_freq_write,
| 634 | +	.proc_lseek	= seq_lseek,
| 635 | +	.proc_release	= single_release,
| 636 | +};
| 637 | +
| 638 | +static int __init vcpudispatch_stats_procfs_init(void)
| 639 | +{
| 640 | +	/*
| 641 | +	 * Avoid smp_processor_id while preemptible. All CPUs should have
| 642 | +	 * the same value for lppaca_shared_proc.
| 643 | +	 */
| 644 | +	preempt_disable();
| 645 | +	if (!lppaca_shared_proc(get_lppaca())) {
| 646 | +		preempt_enable();
| 647 | +		return 0;
| 648 | +	}
| 649 | +	preempt_enable();
| 650 | +
| 651 | +	if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
| 652 | +					&vcpudispatch_stats_proc_ops))
| 653 | +		pr_err("vcpudispatch_stats: error creating procfs file\n");
| 654 | +	else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
| 655 | +					&vcpudispatch_stats_freq_proc_ops))
| 656 | +		pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
| 657 | +
| 658 | +	return 0;
| 659 | +}
| 660 | +
| 661 | +machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
| 662 | +#endif /* CONFIG_PPC_SPLPAR */
| 663 | +
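Putting the pieces above together, the interface is driven entirely through procfs. An illustrative session on a shared-processor LPAR (the counter values here are made up):

	# echo 1 > /proc/powerpc/vcpudispatch_stats
	# cat /proc/powerpc/vcpudispatch_stats
	cpu0 4002 4000 1 1 0 4000 0 2
	cpu1 4005 4001 2 2 0 4003 1 1

Per vcpudispatch_stats_display(), each line reports, for one online cpu: total dispatches; dispatches on the same cpu/core, same chip, different chip and far chip relative to the previous dispatch; then the home-node distances numa_home, numa_remote and numa_far. Writing a value between 1 and HZ to /proc/powerpc/vcpudispatch_stats_freq tunes how many times per second each worker drains its DTL buffer.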
68 | 664 | void vpa_init(int cpu)
69 | 665 | {
70 | 666 | 	int hwcpu = get_hard_smp_processor_id(cpu);
71 | 667 | 	unsigned long addr;
72 | 668 | 	long ret;
73 | | -	struct paca_struct *pp;
74 | | -	struct dtl_entry *dtl;
75 | 669 |
76 | 670 | 	/*
77 | 671 | 	 * The spec says it "may be problematic" if CPU x registers the VPA of
.. | .. |
112 | 706 | 	/*
113 | 707 | 	 * Register dispatch trace log, if one has been allocated.
114 | 708 | 	 */
115 | | -	pp = paca_ptrs[cpu];
116 | | -	dtl = pp->dispatch_log;
117 | | -	if (dtl) {
118 | | -		pp->dtl_ridx = 0;
119 | | -		pp->dtl_curr = dtl;
120 | | -		lppaca_of(cpu).dtl_idx = 0;
121 | | -
122 | | -		/* hypervisor reads buffer length from this field */
123 | | -		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
124 | | -		ret = register_dtl(hwcpu, __pa(dtl));
125 | | -		if (ret)
126 | | -			pr_err("WARNING: DTL registration of cpu %d (hw %d) "
127 | | -				"failed with %ld\n", smp_processor_id(),
128 | | -				hwcpu, ret);
129 | | -		lppaca_of(cpu).dtl_enable_mask = 2;
130 | | -	}
| 709 | +	register_dtl_buffer(cpu);
131 | 710 | }
132 | 711 |
133 | 712 | #ifdef CONFIG_PPC_BOOK3S_64
.. | .. |
204 | 783 |
205 | 784 | 	/* don't remove a bolted entry */
206 | 785 | 	lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
207 | | -				   (0x1UL << 4), &dummy1, &dummy2);
| 786 | +				   HPTE_V_BOLTED, &dummy1, &dummy2);
208 | 787 | 	if (lpar_rc == H_SUCCESS)
209 | 788 | 		return i;
210 | 789 |
.. | .. |
368 | 947 | 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
369 | 948 | 	want_v = hpte_encode_avpn(vpn, psize, ssize);
370 | 949 |
371 | | -	/* Bolted entries are always in the primary group */
| 950 | +	/*
| 951 | +	 * We try to keep bolted entries always in the primary hash, but in
| 952 | +	 * some cases we can find them in the secondary too.
| 953 | +	 */
372 | 954 | 	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
373 | 955 | 	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
374 | | -	if (slot < 0)
375 | | -		return -1;
| 956 | +	if (slot < 0) {
| 957 | +		/* Try in secondary */
| 958 | +		hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
| 959 | +		slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
| 960 | +		if (slot < 0)
| 961 | +			return -1;
| 962 | +	}
376 | 963 | 	return hpte_group + slot;
377 | 964 | }
378 | 965 |
.. | .. |
418 | 1005 | 	BUG_ON(lpar_rc != H_SUCCESS);
419 | 1006 | }
420 | 1007 |
| 1008 | +
| 1009 | +/*
| 1010 | + * As defined in PAPR section 14.5.4.1.8, the control mask doesn't
| 1011 | + * include the returned reference and change bits of the processed
| 1012 | + * PTE.
| 1013 | + */
| 1014 | +#define HBLKR_AVPN		0x0100000000000000UL
| 1015 | +#define HBLKR_CTRL_MASK		0xf800000000000000UL
| 1016 | +#define HBLKR_CTRL_SUCCESS	0x8000000000000000UL
| 1017 | +#define HBLKR_CTRL_ERRNOTFOUND	0x8800000000000000UL
| 1018 | +#define HBLKR_CTRL_ERRBUSY	0xa000000000000000UL
| 1019 | +
| 1020 | +/*
| 1021 | + * Returns true if this block size is supported for the specified segment
| 1022 | + * base page size and actual page size.
| 1023 | + *
| 1024 | + * Currently, only a block size of 8 is supported.
| 1025 | + */
| 1026 | +static inline bool is_supported_hlbkrm(int bpsize, int psize)
| 1027 | +{
| 1028 | +	return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
| 1029 | +}
| 1030 | +
| 1031 | +/**
| 1032 | + * H_BLOCK_REMOVE caller.
| 1033 | + * @idx should point to the latest @param entry set with a PTEX.
| 1034 | + * If a PTE cannot be processed because another CPU has already locked that
| 1035 | + * group, those entries are put back in @param starting at index 1.
| 1036 | + * If entries have to be retried and @retry_busy is set to true, these entries
| 1037 | + * are retried until success. If @retry_busy is set to false, the return value
| 1038 | + * is the number of entries yet to be processed.
| 1039 | + */
| 1040 | +static unsigned long call_block_remove(unsigned long idx, unsigned long *param,
| 1041 | +					bool retry_busy)
| 1042 | +{
| 1043 | +	unsigned long i, rc, new_idx;
| 1044 | +	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
| 1045 | +
| 1046 | +	if (idx < 2) {
| 1047 | +		pr_warn("Unexpected empty call to H_BLOCK_REMOVE");
| 1048 | +		return 0;
| 1049 | +	}
| 1050 | +again:
| 1051 | +	new_idx = 0;
| 1052 | +	if (idx > PLPAR_HCALL9_BUFSIZE) {
| 1053 | +		pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx);
| 1054 | +		idx = PLPAR_HCALL9_BUFSIZE;
| 1055 | +	} else if (idx < PLPAR_HCALL9_BUFSIZE)
| 1056 | +		param[idx] = HBR_END;
| 1057 | +
| 1058 | +	rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf,
| 1059 | +			  param[0], /* AVA */
| 1060 | +			  param[1], param[2], param[3], param[4], /* TS0-7 */
| 1061 | +			  param[5], param[6], param[7], param[8]);
| 1062 | +	if (rc == H_SUCCESS)
| 1063 | +		return 0;
| 1064 | +
| 1065 | +	BUG_ON(rc != H_PARTIAL);
| 1066 | +
| 1067 | +	/* Check that the unprocessed entries were 'not found' or 'busy' */
| 1068 | +	for (i = 0; i < idx-1; i++) {
| 1069 | +		unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK;
| 1070 | +
| 1071 | +		if (ctrl == HBLKR_CTRL_ERRBUSY) {
| 1072 | +			param[++new_idx] = param[i+1];
| 1073 | +			continue;
| 1074 | +		}
| 1075 | +
| 1076 | +		BUG_ON(ctrl != HBLKR_CTRL_SUCCESS
| 1077 | +		       && ctrl != HBLKR_CTRL_ERRNOTFOUND);
| 1078 | +	}
| 1079 | +
| 1080 | +	/*
| 1081 | +	 * If there were entries found busy, retry these entries if requested,
| 1082 | +	 * or if all the entries have to be retried.
| 1083 | +	 */
| 1084 | +	if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) {
| 1085 | +		idx = new_idx + 1;
| 1086 | +		goto again;
| 1087 | +	}
| 1088 | +
| 1089 | +	return new_idx;
| 1090 | +}
| 1091 | +
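To make the @param calling convention concrete, here is a minimal sketch of how the callers further down (hugepage_block_invalidate() and do_block_remove()) fill the buffer; vpn, slot, psize and ssize are assumed to be in hand:

	unsigned long param[PLPAR_HCALL9_BUFSIZE];
	unsigned long pix;

	/* param[0] carries the AVA of the naturally aligned 8-page block */
	param[0] = hpte_encode_avpn(vpn, psize, ssize);
	pix = 1;

	/* param[1..8] each carry one PTEX, tagged as an AVPN-qualified request */
	param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;

	/* with fewer than 8 entries, call_block_remove() appends HBR_END itself */
	(void)call_block_remove(pix, param, true);

Since plpar_hcall9() takes nine arguments after the return buffer, one hcall can carry the AVA plus at most PLPAR_HCALL9_BUFSIZE - 1 PTEX entries, which is why both callers flush and restart a block whenever pix reaches PLPAR_HCALL9_BUFSIZE.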
421 | 1092 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE
422 | 1093 | /*
423 | 1094 |  * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
.. | .. |
425 | 1096 |  */
426 | 1097 | #define PPC64_HUGE_HPTE_BATCH 12
427 | 1098 |
428 | | -static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
429 | | -					       unsigned long *vpn, int count,
430 | | -					       int psize, int ssize)
| 1099 | +static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn,
| 1100 | +				      int count, int psize, int ssize)
| 1101 | +{
| 1102 | +	unsigned long param[PLPAR_HCALL9_BUFSIZE];
| 1103 | +	unsigned long shift, current_vpgb, vpgb;
| 1104 | +	int i, pix = 0;
| 1105 | +
| 1106 | +	shift = mmu_psize_defs[psize].shift;
| 1107 | +
| 1108 | +	for (i = 0; i < count; i++) {
| 1109 | +		/*
| 1110 | +		 * Shift 3 more bits to the right to get an
| 1111 | +		 * 8-page aligned virtual address.
| 1112 | +		 */
| 1113 | +		vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3));
| 1114 | +		if (!pix || vpgb != current_vpgb) {
| 1115 | +			/*
| 1116 | +			 * Need to start a new 8-page block; flush
| 1117 | +			 * the current one if needed.
| 1118 | +			 */
| 1119 | +			if (pix)
| 1120 | +				(void)call_block_remove(pix, param, true);
| 1121 | +			current_vpgb = vpgb;
| 1122 | +			param[0] = hpte_encode_avpn(vpn[i], psize, ssize);
| 1123 | +			pix = 1;
| 1124 | +		}
| 1125 | +
| 1126 | +		param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i];
| 1127 | +		if (pix == PLPAR_HCALL9_BUFSIZE) {
| 1128 | +			pix = call_block_remove(pix, param, false);
| 1129 | +			/*
| 1130 | +			 * pix = 0 means that all the entries were
| 1131 | +			 * removed; we can start a new block.
| 1132 | +			 * Otherwise, there are entries to retry, and
| 1133 | +			 * pix points to the latest one, so we should
| 1134 | +			 * increment it and try to continue the same
| 1135 | +			 * block.
| 1136 | +			 */
| 1137 | +			if (pix)
| 1138 | +				pix++;
| 1139 | +		}
| 1140 | +	}
| 1141 | +	if (pix)
| 1142 | +		(void)call_block_remove(pix, param, true);
| 1143 | +}
| 1144 | +
| 1145 | +static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
| 1146 | +				     int count, int psize, int ssize)
431 | 1147 | {
432 | 1148 | 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
433 | 1149 | 	int i = 0, pix = 0, rc;
434 | | -	unsigned long flags = 0;
435 | | -	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
436 | | -
437 | | -	if (lock_tlbie)
438 | | -		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
439 | 1150 |
440 | 1151 | 	for (i = 0; i < count; i++) {
441 | 1152 |
.. | .. |
463 | 1174 | 				  param[6], param[7]);
464 | 1175 | 		BUG_ON(rc != H_SUCCESS);
465 | 1176 | 	}
| 1177 | +}
| 1178 | +
| 1179 | +static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
| 1180 | +						      unsigned long *vpn,
| 1181 | +						      int count, int psize,
| 1182 | +						      int ssize)
| 1183 | +{
| 1184 | +	unsigned long flags = 0;
| 1185 | +	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
| 1186 | +
| 1187 | +	if (lock_tlbie)
| 1188 | +		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
| 1189 | +
| 1190 | +	/* Assuming THP size is 16M */
| 1191 | +	if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
| 1192 | +		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
| 1193 | +	else
| 1194 | +		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
466 | 1195 |
467 | 1196 | 	if (lock_tlbie)
468 | 1197 | 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
.. | .. |
547 | 1276 | 	return 0;
548 | 1277 | }
549 | 1278 |
| 1279 | +
| 1280 | +static inline unsigned long compute_slot(real_pte_t pte,
| 1281 | +					 unsigned long vpn,
| 1282 | +					 unsigned long index,
| 1283 | +					 unsigned long shift,
| 1284 | +					 int ssize)
| 1285 | +{
| 1286 | +	unsigned long slot, hash, hidx;
| 1287 | +
| 1288 | +	hash = hpt_hash(vpn, shift, ssize);
| 1289 | +	hidx = __rpte_to_hidx(pte, index);
| 1290 | +	if (hidx & _PTEIDX_SECONDARY)
| 1291 | +		hash = ~hash;
| 1292 | +	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
| 1293 | +	slot += hidx & _PTEIDX_GROUP_IX;
| 1294 | +	return slot;
| 1295 | +}
| 1296 | +
| 1297 | +/**
| 1298 | + * The hcall H_BLOCK_REMOVE implies that the virtual pages to be processed are
| 1299 | + * "all within the same naturally aligned 8 page virtual address block".
| 1300 | + */
| 1301 | +static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
| 1302 | +			    unsigned long *param)
| 1303 | +{
| 1304 | +	unsigned long vpn;
| 1305 | +	unsigned long i, pix = 0;
| 1306 | +	unsigned long index, shift, slot, current_vpgb, vpgb;
| 1307 | +	real_pte_t pte;
| 1308 | +	int psize, ssize;
| 1309 | +
| 1310 | +	psize = batch->psize;
| 1311 | +	ssize = batch->ssize;
| 1312 | +
| 1313 | +	for (i = 0; i < number; i++) {
| 1314 | +		vpn = batch->vpn[i];
| 1315 | +		pte = batch->pte[i];
| 1316 | +		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
| 1317 | +			/*
| 1318 | +			 * Shift 3 more bits to the right to get an
| 1319 | +			 * 8-page aligned virtual address.
| 1320 | +			 */
| 1321 | +			vpgb = (vpn >> (shift - VPN_SHIFT + 3));
| 1322 | +			if (!pix || vpgb != current_vpgb) {
| 1323 | +				/*
| 1324 | +				 * Need to start a new 8-page block; flush
| 1325 | +				 * the current one if needed.
| 1326 | +				 */
| 1327 | +				if (pix)
| 1328 | +					(void)call_block_remove(pix, param,
| 1329 | +								true);
| 1330 | +				current_vpgb = vpgb;
| 1331 | +				param[0] = hpte_encode_avpn(vpn, psize,
| 1332 | +							    ssize);
| 1333 | +				pix = 1;
| 1334 | +			}
| 1335 | +
| 1336 | +			slot = compute_slot(pte, vpn, index, shift, ssize);
| 1337 | +			param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
| 1338 | +
| 1339 | +			if (pix == PLPAR_HCALL9_BUFSIZE) {
| 1340 | +				pix = call_block_remove(pix, param, false);
| 1341 | +				/*
| 1342 | +				 * pix = 0 means that all the entries were
| 1343 | +				 * removed; we can start a new block.
| 1344 | +				 * Otherwise, there are entries to retry, and
| 1345 | +				 * pix points to the latest one, so we should
| 1346 | +				 * increment it and try to continue the same
| 1347 | +				 * block.
| 1348 | +				 */
| 1349 | +				if (pix)
| 1350 | +					pix++;
| 1351 | +			}
| 1352 | +		} pte_iterate_hashed_end();
| 1353 | +	}
| 1354 | +
| 1355 | +	if (pix)
| 1356 | +		(void)call_block_remove(pix, param, true);
| 1357 | +}
| 1358 | +
| 1359 | +/*
| 1360 | + * TLB Block Invalidate Characteristics
| 1361 | + *
| 1362 | + * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
| 1363 | + * is able to process for each (segment base page size, actual page size) couple.
| 1364 | + *
| 1365 | + * The ibm,get-system-parameter RTAS call returns a buffer with the
| 1366 | + * following layout:
| 1367 | + *
| 1368 | + * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
| 1369 | + * -----------------
| 1370 | + * TLB Block Invalidate Specifiers:
| 1371 | + * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
| 1372 | + * [ 1 byte Number of page sizes (N) that are supported for the specified
| 1373 | + *          TLB invalidate block size ]
| 1374 | + * [ 1 byte Encoded segment base page size and actual page size
| 1375 | + *          MSB=0 means 4k segment base page size and actual page size
| 1376 | + *          MSB=1 the penc value in mmu_psize_def ]
| 1377 | + * ...
| 1378 | + * -----------------
| 1379 | + * Next TLB Block Invalidate Specifiers...
| 1380 | + * -----------------
| 1381 | + * [ 0 ]
| 1382 | + */
| 1383 | +static inline void set_hblkrm_bloc_size(int bpsize, int psize,
| 1384 | +					unsigned int block_size)
| 1385 | +{
| 1386 | +	if (block_size > hblkrm_size[bpsize][psize])
| 1387 | +		hblkrm_size[bpsize][psize] = block_size;
| 1388 | +}
| 1389 | +
| 1390 | +/*
| 1391 | + * Decode the Encoded segment base page size and actual page size.
| 1392 | + * PAPR specifies:
| 1393 | + *   - bit 7 is the L bit
| 1394 | + *   - bits 0-5 are the penc value
| 1395 | + * If the L bit is 0, this means 4K segment base page size and actual page size;
| 1396 | + * otherwise the penc value should be read.
| 1397 | + */
| 1398 | +#define HBLKRM_L_MASK		0x80
| 1399 | +#define HBLKRM_PENC_MASK	0x3f
| 1400 | +static inline void __init check_lp_set_hblkrm(unsigned int lp,
| 1401 | +					      unsigned int block_size)
| 1402 | +{
| 1403 | +	unsigned int bpsize, psize;
| 1404 | +
| 1405 | +	/* First, check the L bit; if not set, this means 4K */
| 1406 | +	if ((lp & HBLKRM_L_MASK) == 0) {
| 1407 | +		set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
| 1408 | +		return;
| 1409 | +	}
| 1410 | +
| 1411 | +	lp &= HBLKRM_PENC_MASK;
| 1412 | +	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
| 1413 | +		struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
| 1414 | +
| 1415 | +		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
| 1416 | +			if (def->penc[psize] == lp) {
| 1417 | +				set_hblkrm_bloc_size(bpsize, psize, block_size);
| 1418 | +				return;
| 1419 | +			}
| 1420 | +		}
| 1421 | +	}
| 1422 | +}
| 1423 | +
| 1424 | +#define SPLPAR_TLB_BIC_TOKEN		50
| 1425 | +
| 1426 | +/*
| 1427 | + * The size of the TLB Block Invalidate Characteristics is variable, but at
| 1428 | + * most it will be the number of possible page sizes * 2 + 10 bytes.
| 1429 | + * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
| 1430 | + * (128 bytes) for the buffer to get plenty of space.
| 1431 | + */
| 1432 | +#define SPLPAR_TLB_BIC_MAXLENGTH	128
| 1433 | +
| 1434 | +void __init pseries_lpar_read_hblkrm_characteristics(void)
| 1435 | +{
| 1436 | +	unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
| 1437 | +	int call_status, len, idx, bpsize;
| 1438 | +
| 1439 | +	if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
| 1440 | +		return;
| 1441 | +
| 1442 | +	spin_lock(&rtas_data_buf_lock);
| 1443 | +	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
| 1444 | +	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
| 1445 | +				NULL,
| 1446 | +				SPLPAR_TLB_BIC_TOKEN,
| 1447 | +				__pa(rtas_data_buf),
| 1448 | +				RTAS_DATA_BUF_SIZE);
| 1449 | +	memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
| 1450 | +	local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
| 1451 | +	spin_unlock(&rtas_data_buf_lock);
| 1452 | +
| 1453 | +	if (call_status != 0) {
| 1454 | +		pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
| 1455 | +			__FILE__, __func__, call_status);
| 1456 | +		return;
| 1457 | +	}
| 1458 | +
| 1459 | +	/*
| 1460 | +	 * The first two (2) bytes of the data in the buffer are the length of
| 1461 | +	 * the returned data, not counting these first two (2) bytes.
| 1462 | +	 */
| 1463 | +	len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
| 1464 | +	if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
| 1465 | +		pr_warn("%s too large returned buffer %d", __func__, len);
| 1466 | +		return;
| 1467 | +	}
| 1468 | +
| 1469 | +	idx = 2;
| 1470 | +	while (idx < len) {
| 1471 | +		u8 block_shift = local_buffer[idx++];
| 1472 | +		u32 block_size;
| 1473 | +		unsigned int npsize;
| 1474 | +
| 1475 | +		if (!block_shift)
| 1476 | +			break;
| 1477 | +
| 1478 | +		block_size = 1 << block_shift;
| 1479 | +
| 1480 | +		for (npsize = local_buffer[idx++];
| 1481 | +		     npsize > 0 && idx < len; npsize--)
| 1482 | +			check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
| 1483 | +					    block_size);
| 1484 | +	}
| 1485 | +
| 1486 | +	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
| 1487 | +		for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
| 1488 | +			if (hblkrm_size[bpsize][idx])
| 1489 | +				pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
| 1490 | +					bpsize, idx, hblkrm_size[bpsize][idx]);
| 1491 | +}
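As a worked (entirely hypothetical) example of the parsing above, a buffer beginning with the bytes

	00 05  03 02  00 8b  00

decodes as: length 0x0005, so len = 7 counting the length field itself; one specifier with block_shift = 3 (block_size = 8) covering npsize = 2 encodings, namely 0x00 (L bit clear, hence 4K base and actual page size) and 0x8b (L bit set, penc = 0x0b, matched against mmu_psize_defs[]); then a zero block_shift terminates the walk. Real penc values are implementation dependent, so 0x8b is illustrative only.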
| 1492 | +
550 | 1493 | /*
551 | 1494 |  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
552 | 1495 |  * lock.
.. | .. |
559 | 1502 | 	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
560 | 1503 | 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
561 | 1504 | 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
562 | | -	unsigned long hash, index, shift, hidx, slot;
| 1505 | +	unsigned long index, shift, slot;
563 | 1506 | 	real_pte_t pte;
564 | 1507 | 	int psize, ssize;
565 | 1508 |
566 | 1509 | 	if (lock_tlbie)
567 | 1510 | 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
| 1511 | +
| 1512 | +	if (is_supported_hlbkrm(batch->psize, batch->psize)) {
| 1513 | +		do_block_remove(number, batch, param);
| 1514 | +		goto out;
| 1515 | +	}
568 | 1516 |
569 | 1517 | 	psize = batch->psize;
570 | 1518 | 	ssize = batch->ssize;
.. | .. |
573 | 1521 | 		vpn = batch->vpn[i];
574 | 1522 | 		pte = batch->pte[i];
575 | 1523 | 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
576 | | -			hash = hpt_hash(vpn, shift, ssize);
577 | | -			hidx = __rpte_to_hidx(pte, index);
578 | | -			if (hidx & _PTEIDX_SECONDARY)
579 | | -				hash = ~hash;
580 | | -			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
581 | | -			slot += hidx & _PTEIDX_GROUP_IX;
| 1524 | +			slot = compute_slot(pte, vpn, index, shift, ssize);
582 | 1525 | 			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
583 | 1526 | 				/*
584 | 1527 | 				 * lpar doesn't use the passed actual page size
.. | .. |
609 | 1552 | 		BUG_ON(rc != H_SUCCESS);
610 | 1553 | 	}
611 | 1554 |
| 1555 | +out:
612 | 1556 | 	if (lock_tlbie)
613 | 1557 | 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
614 | 1558 | }
.. | .. |
693 | 1637 | 		break;
694 | 1638 |
695 | 1639 | 	case H_PARAMETER:
| 1640 | +		pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n");
696 | 1641 | 		return -EINVAL;
697 | 1642 | 	case H_RESOURCE:
| 1643 | +		pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n");
698 | 1644 | 		return -EPERM;
699 | 1645 | 	default:
700 | 1646 | 		pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
.. | .. |
711 | 1657 | 	if (rc != 0) {
712 | 1658 | 		switch (state.commit_rc) {
713 | 1659 | 		case H_PTEG_FULL:
714 | | -			pr_warn("Hash collision while resizing HPT\n");
715 | 1660 | 			return -ENOSPC;
716 | 1661 |
717 | 1662 | 		default:
736 | 1681 | |
---|
737 | 1682 | if (table_size) |
---|
738 | 1683 | flags |= PROC_TABLE_NEW; |
---|
739 | | - if (radix_enabled()) |
---|
740 | | - flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; |
---|
741 | | - else |
---|
| 1684 | + if (radix_enabled()) { |
---|
| 1685 | + flags |= PROC_TABLE_RADIX; |
---|
| 1686 | + if (mmu_has_feature(MMU_FTR_GTSE)) |
---|
| 1687 | + flags |= PROC_TABLE_GTSE; |
---|
| 1688 | + } else |
---|
742 | 1689 | flags |= PROC_TABLE_HPT_SLB; |
---|
743 | 1690 | for (;;) { |
---|
744 | 1691 | rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, |
---|
.. | .. |
---|
765 | 1712 | mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; |
---|
766 | 1713 | mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; |
---|
767 | 1714 | mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; |
---|
768 | | - register_process_table = pseries_lpar_register_process_table; |
---|
769 | 1715 | |
---|
770 | 1716 | if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) |
---|
771 | 1717 | mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; |
---|
| 1718 | + |
---|
| 1719 | + /* |
---|
| 1720 | + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall |
---|
| 1721 | + * to inform the hypervisor that we wish to use the HPT. |
---|
| 1722 | + */ |
---|
| 1723 | + if (cpu_has_feature(CPU_FTR_ARCH_300)) |
---|
| 1724 | + pseries_lpar_register_process_table(0, 0, 0); |
---|
772 | 1725 | } |
---|
773 | 1726 | |
---|
| 1727 | +#ifdef CONFIG_PPC_RADIX_MMU |
---|
774 | 1728 | void radix_init_pseries(void) |
---|
775 | 1729 | { |
---|
776 | 1730 | pr_info("Using radix MMU under hypervisor\n"); |
---|
777 | | - register_process_table = pseries_lpar_register_process_table; |
---|
| 1731 | + |
---|
| 1732 | + pseries_lpar_register_process_table(__pa(process_tb), |
---|
| 1733 | + 0, PRTB_SIZE_SHIFT - 12); |
---|
778 | 1734 | } |
---|
| 1735 | +#endif |
---|
779 | 1736 | |
---|
780 | 1737 | #ifdef CONFIG_PPC_SMLPAR |
---|
781 | 1738 | #define CMO_FREE_HINT_DEFAULT 1 |
---|
.. | .. |
---|
870 | 1827 | |
---|
871 | 1828 | /* |
---|
872 | 1829 | * Since the tracing code might execute hcalls we need to guard against |
---|
873 | | - * recursion. One example of this are spinlocks calling H_YIELD on |
---|
874 | | - * shared processor partitions. |
---|
| 1830 | + * recursion. |
---|
875 | 1831 | */ |
---|
876 | 1832 | static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); |
---|
877 | 1833 | |
---|
.. | .. |
---|
1062 | 2018 | return 0; |
---|
1063 | 2019 | |
---|
1064 | 2020 | vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); |
---|
1065 | | - if (!vpa_dir) { |
---|
1066 | | - pr_warn("%s: can't create vpa root dir\n", __func__); |
---|
1067 | | - return -ENOMEM; |
---|
1068 | | - } |
---|
1069 | 2021 | |
---|
1070 | 2022 | /* set up the per-cpu vpa file*/ |
---|
1071 | 2023 | for_each_possible_cpu(i) { |
---|
1072 | | - struct dentry *d; |
---|
1073 | | - |
---|
1074 | 2024 | sprintf(name, "cpu-%ld", i); |
---|
1075 | | - |
---|
1076 | | - d = debugfs_create_file(name, 0400, vpa_dir, (void *)i, |
---|
1077 | | - &vpa_fops); |
---|
1078 | | - if (!d) { |
---|
1079 | | - pr_warn("%s: can't create per-cpu vpa file\n", |
---|
1080 | | - __func__); |
---|
1081 | | - return -ENOMEM; |
---|
1082 | | - } |
---|
| 2025 | + debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops); |
---|
1083 | 2026 | } |
---|
1084 | 2027 | |
---|
1085 | 2028 | return 0; |
---|