2024-01-31 f70575805708cabdedea7498aaa3f710fde4d920
kernel/virt/kvm/kvm_main.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Kernel-based Virtual Machine driver for Linux
34 *
....@@ -10,10 +11,6 @@
1011 * Authors:
1112 * Avi Kivity <avi@qumranet.com>
1213 * Yaniv Kamay <yaniv@qumranet.com>
13
- *
14
- * This work is licensed under the terms of the GNU GPL, version 2. See
15
- * the COPYING file in the top-level directory.
16
- *
1714 */
1815
1916 #include <kvm/iodev.h>
....@@ -51,13 +48,13 @@
5148 #include <linux/slab.h>
5249 #include <linux/sort.h>
5350 #include <linux/bsearch.h>
54
-#include <linux/kthread.h>
5551 #include <linux/io.h>
52
+#include <linux/lockdep.h>
53
+#include <linux/kthread.h>
5654
5755 #include <asm/processor.h>
5856 #include <asm/ioctl.h>
5957 #include <linux/uaccess.h>
60
-#include <asm/pgtable.h>
6158
6259 #include "coalesced_mmio.h"
6360 #include "async_pf.h"
....@@ -82,6 +79,11 @@
8279 module_param(halt_poll_ns_grow, uint, 0644);
8380 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
8481
82
+/* The start value to grow halt_poll_ns from */
83
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
84
+module_param(halt_poll_ns_grow_start, uint, 0644);
85
+EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
86
+
8587 /* Default resets per-vcpu halt_poll_ns . */
8688 unsigned int halt_poll_ns_shrink;
8789 module_param(halt_poll_ns_shrink, uint, 0644);
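
The new halt_poll_ns_grow_start parameter gives the halt-polling grow path a
floor to start from, so a window that has shrunk to zero does not stay at zero
after multiplication. The fragment below is only an illustration of how such a
start value is typically applied; the function name and the max parameter are
invented for the sketch, and it is not the in-kernel grow helper itself.

/* Illustrative sketch: how a grow step can consume halt_poll_ns_grow_start. */
static unsigned int grow_halt_poll_ns_sketch(unsigned int cur,
                                             unsigned int grow,
                                             unsigned int grow_start,
                                             unsigned int max)
{
        unsigned int val = cur;

        if (!grow)                /* growing disabled via halt_poll_ns_grow=0 */
                return cur;

        val *= grow;              /* 0 * grow == 0, hence the explicit floor */
        if (val < grow_start)
                val = grow_start; /* first grow lands at 10us by default */
        if (val > max)
                val = max;

        return val;
}
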
....@@ -101,16 +103,18 @@
101103 static int kvm_usage_count;
102104 static atomic_t hardware_enable_failed;
103105
104
-struct kmem_cache *kvm_vcpu_cache;
105
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
106
+static struct kmem_cache *kvm_vcpu_cache;
106107
107108 static __read_mostly struct preempt_ops kvm_preempt_ops;
109
+static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
108110
109111 struct dentry *kvm_debugfs_dir;
110112 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
111113
112114 static int kvm_debugfs_num_entries;
113
-static const struct file_operations *stat_fops_per_vm[];
115
+static const struct file_operations stat_fops_per_vm;
116
+
117
+static struct file_operations kvm_chardev_ops;
114118
115119 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116120 unsigned long arg);
....@@ -119,21 +123,30 @@
119123 unsigned long arg);
120124 #define KVM_COMPAT(c) .compat_ioctl = (c)
121125 #else
126
+/*
127
+ * For architectures that don't implement a compat infrastructure,
128
+ * adopt a double line of defense:
129
+ * - Prevent a compat task from opening /dev/kvm
130
+ * - If the open has been done by a 64bit task, and the KVM fd
131
+ * passed to a compat task, let the ioctls fail.
132
+ */
122133 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
123134 unsigned long arg) { return -EINVAL; }
124
-#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl
135
+
136
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
137
+{
138
+ return is_compat_task() ? -ENODEV : 0;
139
+}
140
+#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
141
+ .open = kvm_no_compat_open
125142 #endif
126143 static int hardware_enable_all(void);
127144 static void hardware_disable_all(void);
128145
129146 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
130147
131
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
132
-
133148 __visible bool kvm_rebooting;
134149 EXPORT_SYMBOL_GPL(kvm_rebooting);
135
-
136
-static bool largepages_enabled = true;
137150
138151 #define KVM_EVENT_CREATE_VM 0
139152 #define KVM_EVENT_DESTROY_VM 1
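
On architectures without KVM compat support the two new hooks act as a double
line of defense: kvm_no_compat_open() refuses a 32-bit opener outright, and
kvm_no_compat_ioctl() still fails the ioctls if a 64-bit task opened the fd and
then handed it to a compat task. The abbreviated definition below shows how the
KVM_COMPAT() macro wires both handlers into a file_operations instance; the
name example_chardev_ops is made up and the real chardev fops in this file has
more fields, so treat it as a sketch.

/* Sketch: with !CONFIG_KVM_COMPAT, KVM_COMPAT() expands to the two
 * "no compat" handlers, so one definition covers both configurations. */
static struct file_operations example_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_dev_ioctl),      /* .compat_ioctl (+ .open gate) */
};
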
....@@ -141,8 +154,14 @@
141154 static unsigned long long kvm_createvm_count;
142155 static unsigned long long kvm_active_vms;
143156
157
+static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
158
+
144159 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
145160 unsigned long start, unsigned long end)
161
+{
162
+}
163
+
164
+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
146165 {
147166 }
148167
....@@ -175,12 +194,24 @@
175194 return true;
176195 }
177196
197
+bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
198
+{
199
+ struct page *page = pfn_to_page(pfn);
200
+
201
+ if (!PageTransCompoundMap(page))
202
+ return false;
203
+
204
+ return is_transparent_hugepage(compound_head(page));
205
+}
206
+
178207 /*
179208 * Switches to specified vcpu, until a matching vcpu_put()
180209 */
181210 void vcpu_load(struct kvm_vcpu *vcpu)
182211 {
183212 int cpu = get_cpu();
213
+
214
+ __this_cpu_write(kvm_running_vcpu, vcpu);
184215 preempt_notifier_register(&vcpu->preempt_notifier);
185216 kvm_arch_vcpu_load(vcpu, cpu);
186217 put_cpu();
....@@ -192,6 +223,7 @@
192223 preempt_disable();
193224 kvm_arch_vcpu_put(vcpu);
194225 preempt_notifier_unregister(&vcpu->preempt_notifier);
226
+ __this_cpu_write(kvm_running_vcpu, NULL);
195227 preempt_enable();
196228 }
197229 EXPORT_SYMBOL_GPL(vcpu_put);
....@@ -218,9 +250,13 @@
218250 {
219251 }
220252
221
-static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
253
+static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
222254 {
223
- if (unlikely(!cpus))
255
+ const struct cpumask *cpus;
256
+
257
+ if (likely(cpumask_available(tmp)))
258
+ cpus = tmp;
259
+ else
224260 cpus = cpu_online_mask;
225261
226262 if (cpumask_empty(cpus))
....@@ -230,28 +266,57 @@
230266 return true;
231267 }
232268
269
+static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
270
+ unsigned int req, cpumask_var_t tmp,
271
+ int current_cpu)
272
+{
273
+ int cpu;
274
+
275
+ kvm_make_request(req, vcpu);
276
+
277
+ if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
278
+ return;
279
+
280
+ /*
281
+ * tmp can be "unavailable" if cpumasks are allocated off stack as
282
+ * allocation of the mask is deliberately not fatal and is handled by
283
+ * falling back to kicking all online CPUs.
284
+ */
285
+ if (!cpumask_available(tmp))
286
+ return;
287
+
288
+ /*
289
+ * Note, the vCPU could get migrated to a different pCPU at any point
290
+ * after kvm_request_needs_ipi(), which could result in sending an IPI
291
+ * to the previous pCPU. But, that's OK because the purpose of the IPI
292
+ * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
293
+ * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
294
+ * after this point is also OK, as the requirement is only that KVM wait
295
+ * for vCPUs that were reading SPTEs _before_ any changes were
296
+ * finalized. See kvm_vcpu_kick() for more details on handling requests.
297
+ */
298
+ if (kvm_request_needs_ipi(vcpu, req)) {
299
+ cpu = READ_ONCE(vcpu->cpu);
300
+ if (cpu != -1 && cpu != current_cpu)
301
+ __cpumask_set_cpu(cpu, tmp);
302
+ }
303
+}
304
+
233305 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
306
+ struct kvm_vcpu *except,
234307 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
235308 {
236
- int i, cpu, me;
237309 struct kvm_vcpu *vcpu;
310
+ int i, me;
238311 bool called;
239312
240313 me = get_cpu();
241314
242
- kvm_for_each_vcpu(i, vcpu, kvm) {
243
- if (!test_bit(i, vcpu_bitmap))
315
+ for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
316
+ vcpu = kvm_get_vcpu(kvm, i);
317
+ if (!vcpu || vcpu == except)
244318 continue;
245
-
246
- kvm_make_request(req, vcpu);
247
- cpu = vcpu->cpu;
248
-
249
- if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
250
- continue;
251
-
252
- if (tmp != NULL && cpu != -1 && cpu != me &&
253
- kvm_request_needs_ipi(vcpu, req))
254
- __cpumask_set_cpu(cpu, tmp);
319
+ kvm_make_vcpu_request(kvm, vcpu, req, tmp, me);
255320 }
256321
257322 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
....@@ -260,19 +325,34 @@
260325 return called;
261326 }
262327
328
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
329
+ struct kvm_vcpu *except)
330
+{
331
+ struct kvm_vcpu *vcpu;
332
+ struct cpumask *cpus;
333
+ bool called;
334
+ int i, me;
335
+
336
+ me = get_cpu();
337
+
338
+ cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
339
+ cpumask_clear(cpus);
340
+
341
+ kvm_for_each_vcpu(i, vcpu, kvm) {
342
+ if (vcpu == except)
343
+ continue;
344
+ kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
345
+ }
346
+
347
+ called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
348
+ put_cpu();
349
+
350
+ return called;
351
+}
352
+
263353 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
264354 {
265
- cpumask_var_t cpus;
266
- bool called;
267
- static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
268
- = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};
269
-
270
- zalloc_cpumask_var(&cpus, GFP_ATOMIC);
271
-
272
- called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
273
-
274
- free_cpumask_var(cpus);
275
- return called;
355
+ return kvm_make_all_cpus_request_except(kvm, req, NULL);
276356 }
277357
278358 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
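
kvm_make_all_cpus_request() is now a thin wrapper around
kvm_make_all_cpus_request_except() with except == NULL, built on the per-CPU
cpu_kick_mask instead of an on-stack cpumask. As a usage illustration, a remote
TLB flush is the canonical caller: post the request on every vCPU and count a
remote flush only if somebody actually had to be kicked. The helper below is a
simplified sketch, not the exact kvm_flush_remote_tlbs() in this file (the real
one also consults an architecture fast path first).

/* Simplified caller sketch (arch-specific fast path omitted). */
static void flush_remote_tlbs_sketch(struct kvm *kvm)
{
        /*
         * The request helper returns true only if an IPI was sent, i.e. some
         * vCPU was in guest mode (or reading SPTEs) and had to be forced out.
         */
        if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
}
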
....@@ -308,57 +388,102 @@
308388 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
309389 }
310390
311
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
391
+static void kvm_flush_shadow_all(struct kvm *kvm)
312392 {
313
- struct page *page;
314
- int r;
393
+ kvm_arch_flush_shadow_all(kvm);
394
+ kvm_arch_guest_memory_reclaimed(kvm);
395
+}
315396
397
+#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
398
+static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
399
+ gfp_t gfp_flags)
400
+{
401
+ gfp_flags |= mc->gfp_zero;
402
+
403
+ if (mc->kmem_cache)
404
+ return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
405
+ else
406
+ return (void *)__get_free_page(gfp_flags);
407
+}
408
+
409
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
410
+{
411
+ void *obj;
412
+
413
+ if (mc->nobjs >= min)
414
+ return 0;
415
+ while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
416
+ obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
417
+ if (!obj)
418
+ return mc->nobjs >= min ? 0 : -ENOMEM;
419
+ mc->objects[mc->nobjs++] = obj;
420
+ }
421
+ return 0;
422
+}
423
+
424
+int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
425
+{
426
+ return mc->nobjs;
427
+}
428
+
429
+void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
430
+{
431
+ while (mc->nobjs) {
432
+ if (mc->kmem_cache)
433
+ kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
434
+ else
435
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
436
+ }
437
+}
438
+
439
+void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
440
+{
441
+ void *p;
442
+
443
+ if (WARN_ON(!mc->nobjs))
444
+ p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
445
+ else
446
+ p = mc->objects[--mc->nobjs];
447
+ BUG_ON(!p);
448
+ return p;
449
+}
450
+#endif
451
+
452
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
453
+{
316454 mutex_init(&vcpu->mutex);
317455 vcpu->cpu = -1;
318456 vcpu->kvm = kvm;
319457 vcpu->vcpu_id = id;
320458 vcpu->pid = NULL;
321
- init_swait_queue_head(&vcpu->wq);
459
+ rcuwait_init(&vcpu->wait);
322460 kvm_async_pf_vcpu_init(vcpu);
323461
324462 vcpu->pre_pcpu = -1;
325463 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
326464
327
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
328
- if (!page) {
329
- r = -ENOMEM;
330
- goto fail;
331
- }
332
- vcpu->run = page_address(page);
333
-
334465 kvm_vcpu_set_in_spin_loop(vcpu, false);
335466 kvm_vcpu_set_dy_eligible(vcpu, false);
336467 vcpu->preempted = false;
337
-
338
- r = kvm_arch_vcpu_init(vcpu);
339
- if (r < 0)
340
- goto fail_free_run;
341
- return 0;
342
-
343
-fail_free_run:
344
- free_page((unsigned long)vcpu->run);
345
-fail:
346
- return r;
468
+ vcpu->ready = false;
469
+ preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
347470 }
348
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
349471
350
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
472
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
351473 {
474
+ kvm_arch_vcpu_destroy(vcpu);
475
+
352476 /*
353
- * no need for rcu_read_lock as VCPU_RUN is the only place that
354
- * will change the vcpu->pid pointer and on uninit all file
355
- * descriptors are already gone.
477
+ * No need for rcu_read_lock as VCPU_RUN is the only place that changes
478
+ * the vcpu->pid pointer, and at destruction time all file descriptors
479
+ * are already gone.
356480 */
357481 put_pid(rcu_dereference_protected(vcpu->pid, 1));
358
- kvm_arch_vcpu_uninit(vcpu);
482
+
359483 free_page((unsigned long)vcpu->run);
484
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
360485 }
361
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
486
+EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
362487
363488 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
364489 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
....@@ -389,16 +514,16 @@
389514 idx = srcu_read_lock(&kvm->srcu);
390515 spin_lock(&kvm->mmu_lock);
391516 kvm->mmu_notifier_seq++;
392
- kvm_set_spte_hva(kvm, address, pte);
517
+
518
+ if (kvm_set_spte_hva(kvm, address, pte))
519
+ kvm_flush_remote_tlbs(kvm);
520
+
393521 spin_unlock(&kvm->mmu_lock);
394522 srcu_read_unlock(&kvm->srcu, idx);
395523 }
396524
397525 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
398
- struct mm_struct *mm,
399
- unsigned long start,
400
- unsigned long end,
401
- bool blockable)
526
+ const struct mmu_notifier_range *range)
402527 {
403528 struct kvm *kvm = mmu_notifier_to_kvm(mn);
404529 int need_tlb_flush = 0, idx;
....@@ -411,21 +536,21 @@
411536 * count is also read inside the mmu_lock critical section.
412537 */
413538 kvm->mmu_notifier_count++;
414
- need_tlb_flush = kvm_unmap_hva_range(kvm, start, end, blockable);
539
+ need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
540
+ range->flags);
415541 /* we've to flush the tlb before the pages can be freed */
416542 if (need_tlb_flush || kvm->tlbs_dirty)
417543 kvm_flush_remote_tlbs(kvm);
418544
419545 spin_unlock(&kvm->mmu_lock);
546
+ kvm_arch_guest_memory_reclaimed(kvm);
420547 srcu_read_unlock(&kvm->srcu, idx);
421548
422549 return 0;
423550 }
424551
425552 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
426
- struct mm_struct *mm,
427
- unsigned long start,
428
- unsigned long end)
553
+ const struct mmu_notifier_range *range)
429554 {
430555 struct kvm *kvm = mmu_notifier_to_kvm(mn);
431556
....@@ -522,12 +647,11 @@
522647 int idx;
523648
524649 idx = srcu_read_lock(&kvm->srcu);
525
- kvm_arch_flush_shadow_all(kvm);
650
+ kvm_flush_shadow_all(kvm);
526651 srcu_read_unlock(&kvm->srcu, idx);
527652 }
528653
529654 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
530
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
531655 .invalidate_range = kvm_mmu_notifier_invalidate_range,
532656 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
533657 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
....@@ -558,12 +682,12 @@
558682 int i;
559683 struct kvm_memslots *slots;
560684
561
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
685
+ slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
562686 if (!slots)
563687 return NULL;
564688
565689 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
566
- slots->id_to_index[i] = slots->memslots[i].id = i;
690
+ slots->id_to_index[i] = -1;
567691
568692 return slots;
569693 }
....@@ -577,18 +701,14 @@
577701 memslot->dirty_bitmap = NULL;
578702 }
579703
580
-/*
581
- * Free any memory in @free but not in @dont.
582
- */
583
-static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
584
- struct kvm_memory_slot *dont)
704
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
585705 {
586
- if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
587
- kvm_destroy_dirty_bitmap(free);
706
+ kvm_destroy_dirty_bitmap(slot);
588707
589
- kvm_arch_free_memslot(kvm, free, dont);
708
+ kvm_arch_free_memslot(kvm, slot);
590709
591
- free->npages = 0;
710
+ slot->flags = 0;
711
+ slot->npages = 0;
592712 }
593713
594714 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
....@@ -599,7 +719,7 @@
599719 return;
600720
601721 kvm_for_each_memslot(memslot, slots)
602
- kvm_free_memslot(kvm, memslot, NULL);
722
+ kvm_free_memslot(kvm, memslot);
603723
604724 kvfree(slots);
605725 }
....@@ -622,6 +742,8 @@
622742
623743 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
624744 {
745
+ static DEFINE_MUTEX(kvm_debugfs_lock);
746
+ struct dentry *dent;
625747 char dir_name[ITOA_MAX_LEN * 2];
626748 struct kvm_stat_data *stat_data;
627749 struct kvm_stats_debugfs_item *p;
....@@ -630,25 +752,37 @@
630752 return 0;
631753
632754 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
633
- kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
755
+ mutex_lock(&kvm_debugfs_lock);
756
+ dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
757
+ if (dent) {
758
+ pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
759
+ dput(dent);
760
+ mutex_unlock(&kvm_debugfs_lock);
761
+ return 0;
762
+ }
763
+ dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
764
+ mutex_unlock(&kvm_debugfs_lock);
765
+ if (IS_ERR(dent))
766
+ return 0;
634767
768
+ kvm->debugfs_dentry = dent;
635769 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
636770 sizeof(*kvm->debugfs_stat_data),
637
- GFP_KERNEL);
771
+ GFP_KERNEL_ACCOUNT);
638772 if (!kvm->debugfs_stat_data)
639773 return -ENOMEM;
640774
641775 for (p = debugfs_entries; p->name; p++) {
642
- stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
776
+ stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
643777 if (!stat_data)
644778 return -ENOMEM;
645779
646780 stat_data->kvm = kvm;
647
- stat_data->offset = p->offset;
648
- stat_data->mode = p->mode ? p->mode : 0644;
781
+ stat_data->dbgfs_item = p;
649782 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
650
- debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
651
- stat_data, stat_fops_per_vm[p->kind]);
783
+ debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
784
+ kvm->debugfs_dentry, stat_data,
785
+ &stat_fops_per_vm);
652786 }
653787 return 0;
654788 }
....@@ -672,8 +806,9 @@
672806
673807 static struct kvm *kvm_create_vm(unsigned long type)
674808 {
675
- int r, i;
676809 struct kvm *kvm = kvm_arch_alloc_vm();
810
+ int r = -ENOMEM;
811
+ int i;
677812
678813 if (!kvm)
679814 return ERR_PTR(-ENOMEM);
....@@ -685,12 +820,38 @@
685820 mutex_init(&kvm->lock);
686821 mutex_init(&kvm->irq_lock);
687822 mutex_init(&kvm->slots_lock);
688
- refcount_set(&kvm->users_count, 1);
689823 INIT_LIST_HEAD(&kvm->devices);
824
+
825
+ BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
826
+
827
+ if (init_srcu_struct(&kvm->srcu))
828
+ goto out_err_no_srcu;
829
+ if (init_srcu_struct(&kvm->irq_srcu))
830
+ goto out_err_no_irq_srcu;
831
+
832
+ refcount_set(&kvm->users_count, 1);
833
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
834
+ struct kvm_memslots *slots = kvm_alloc_memslots();
835
+
836
+ if (!slots)
837
+ goto out_err_no_arch_destroy_vm;
838
+ /* Generations must be different for each address space. */
839
+ slots->generation = i;
840
+ rcu_assign_pointer(kvm->memslots[i], slots);
841
+ }
842
+
843
+ for (i = 0; i < KVM_NR_BUSES; i++) {
844
+ rcu_assign_pointer(kvm->buses[i],
845
+ kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
846
+ if (!kvm->buses[i])
847
+ goto out_err_no_arch_destroy_vm;
848
+ }
849
+
850
+ kvm->max_halt_poll_ns = halt_poll_ns;
690851
691852 r = kvm_arch_init_vm(kvm, type);
692853 if (r)
693
- goto out_err_no_disable;
854
+ goto out_err_no_arch_destroy_vm;
694855
695856 r = hardware_enable_all();
696857 if (r)
....@@ -699,33 +860,6 @@
699860 #ifdef CONFIG_HAVE_KVM_IRQFD
700861 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
701862 #endif
702
-
703
- BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
704
-
705
- r = -ENOMEM;
706
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
707
- struct kvm_memslots *slots = kvm_alloc_memslots();
708
- if (!slots)
709
- goto out_err_no_srcu;
710
- /*
711
- * Generations must be different for each address space.
712
- * Init kvm generation close to the maximum to easily test the
713
- * code of handling generation number wrap-around.
714
- */
715
- slots->generation = i * 2 - 150;
716
- rcu_assign_pointer(kvm->memslots[i], slots);
717
- }
718
-
719
- if (init_srcu_struct(&kvm->srcu))
720
- goto out_err_no_srcu;
721
- if (init_srcu_struct(&kvm->irq_srcu))
722
- goto out_err_no_irq_srcu;
723
- for (i = 0; i < KVM_NR_BUSES; i++) {
724
- rcu_assign_pointer(kvm->buses[i],
725
- kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
726
- if (!kvm->buses[i])
727
- goto out_err_no_mmu_notifier;
728
- }
729863
730864 r = kvm_init_mmu_notifier(kvm);
731865 if (r)
....@@ -741,6 +875,16 @@
741875
742876 preempt_notifier_inc();
743877
878
+ /*
879
+ * When the fd passed to this ioctl() is opened it pins the module,
880
+ * but try_module_get() also prevents getting a reference if the module
881
+ * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
882
+ */
883
+ if (!try_module_get(kvm_chardev_ops.owner)) {
884
+ r = -ENODEV;
885
+ goto out_err;
886
+ }
887
+
744888 return kvm;
745889
746890 out_err:
....@@ -749,17 +893,19 @@
749893 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
750894 #endif
751895 out_err_no_mmu_notifier:
752
- cleanup_srcu_struct(&kvm->irq_srcu);
753
-out_err_no_irq_srcu:
754
- cleanup_srcu_struct(&kvm->srcu);
755
-out_err_no_srcu:
756896 hardware_disable_all();
757897 out_err_no_disable:
758
- refcount_set(&kvm->users_count, 0);
898
+ kvm_arch_destroy_vm(kvm);
899
+out_err_no_arch_destroy_vm:
900
+ WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
759901 for (i = 0; i < KVM_NR_BUSES; i++)
760902 kfree(kvm_get_bus(kvm, i));
761903 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
762904 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
905
+ cleanup_srcu_struct(&kvm->irq_srcu);
906
+out_err_no_irq_srcu:
907
+ cleanup_srcu_struct(&kvm->srcu);
908
+out_err_no_srcu:
763909 kvm_arch_free_vm(kvm);
764910 mmdrop(current->mm);
765911 return ERR_PTR(r);
....@@ -805,7 +951,7 @@
805951 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
806952 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
807953 #else
808
- kvm_arch_flush_shadow_all(kvm);
954
+ kvm_flush_shadow_all(kvm);
809955 #endif
810956 kvm_arch_destroy_vm(kvm);
811957 kvm_destroy_devices(kvm);
....@@ -817,6 +963,7 @@
817963 preempt_notifier_dec();
818964 hardware_disable_all();
819965 mmdrop(mm);
966
+ module_put(kvm_chardev_ops.owner);
820967 }
821968
822969 void kvm_get_kvm(struct kvm *kvm)
....@@ -832,6 +979,18 @@
832979 }
833980 EXPORT_SYMBOL_GPL(kvm_put_kvm);
834981
982
+/*
983
+ * Used to put a reference that was taken on behalf of an object associated
984
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
985
+ * of the new file descriptor fails and the reference cannot be transferred to
986
+ * its final owner. In such cases, the caller is still actively using @kvm and
987
+ * will fail miserably if the refcount unexpectedly hits zero.
988
+ */
989
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
990
+{
991
+ WARN_ON(refcount_dec_and_test(&kvm->users_count));
992
+}
993
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
835994
836995 static int kvm_vm_release(struct inode *inode, struct file *filp)
837996 {
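
kvm_put_kvm_no_destroy() exists for one situation: a reference was taken on
behalf of a file descriptor that never gets installed (for example a vCPU or
device creation failing late), and the caller still holds and uses kvm, so the
refcount must not be allowed to hit zero there. A hedged sketch of that caller
pattern follows; the fd name and the child_fops argument are invented.

/* Sketch of the fd-installation error path this helper is meant for. */
static int create_child_fd_sketch(struct kvm *kvm,
                                  const struct file_operations *child_fops)
{
        int fd;

        kvm_get_kvm(kvm);                       /* reference owned by the new fd */
        fd = anon_inode_getfd("kvm-child", child_fops, kvm,
                              O_RDWR | O_CLOEXEC);
        if (fd < 0)
                kvm_put_kvm_no_destroy(kvm);    /* fd never existed; kvm lives on */

        return fd;
}
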
....@@ -845,13 +1004,13 @@
8451004
8461005 /*
8471006 * Allocation size is twice as large as the actual dirty bitmap size.
848
- * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
1007
+ * See kvm_vm_ioctl_get_dirty_log() why this is needed.
8491008 */
850
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
1009
+static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
8511010 {
8521011 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
8531012
854
- memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
1013
+ memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
8551014 if (!memslot->dirty_bitmap)
8561015 return -ENOMEM;
8571016
....@@ -859,58 +1018,165 @@
8591018 }
8601019
8611020 /*
862
- * Insert memslot and re-sort memslots based on their GFN,
863
- * so binary search could be used to lookup GFN.
864
- * Sorting algorithm takes advantage of having initially
865
- * sorted array and known changed memslot position.
1021
+ * Delete a memslot by decrementing the number of used slots and shifting all
1022
+ * other entries in the array forward one spot.
8661023 */
867
-static void update_memslots(struct kvm_memslots *slots,
868
- struct kvm_memory_slot *new)
1024
+static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1025
+ struct kvm_memory_slot *memslot)
8691026 {
870
- int id = new->id;
871
- int i = slots->id_to_index[id];
8721027 struct kvm_memory_slot *mslots = slots->memslots;
1028
+ int i;
8731029
874
- WARN_ON(mslots[i].id != id);
875
- if (!new->npages) {
876
- WARN_ON(!mslots[i].npages);
877
- if (mslots[i].npages)
878
- slots->used_slots--;
879
- } else {
880
- if (!mslots[i].npages)
881
- slots->used_slots++;
882
- }
1030
+ if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1031
+ return;
8831032
884
- while (i < KVM_MEM_SLOTS_NUM - 1 &&
885
- new->base_gfn <= mslots[i + 1].base_gfn) {
886
- if (!mslots[i + 1].npages)
887
- break;
1033
+ slots->used_slots--;
1034
+
1035
+ if (atomic_read(&slots->lru_slot) >= slots->used_slots)
1036
+ atomic_set(&slots->lru_slot, 0);
1037
+
1038
+ for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
8881039 mslots[i] = mslots[i + 1];
8891040 slots->id_to_index[mslots[i].id] = i;
890
- i++;
8911041 }
1042
+ mslots[i] = *memslot;
1043
+ slots->id_to_index[memslot->id] = -1;
1044
+}
1045
+
1046
+/*
1047
+ * "Insert" a new memslot by incrementing the number of used slots. Returns
1048
+ * the new slot's initial index into the memslots array.
1049
+ */
1050
+static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1051
+{
1052
+ return slots->used_slots++;
1053
+}
1054
+
1055
+/*
1056
+ * Move a changed memslot backwards in the array by shifting existing slots
1057
+ * with a higher GFN toward the front of the array. Note, the changed memslot
1058
+ * itself is not preserved in the array, i.e. not swapped at this time, only
1059
+ * its new index into the array is tracked. Returns the changed memslot's
1060
+ * current index into the memslots array.
1061
+ */
1062
+static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1063
+ struct kvm_memory_slot *memslot)
1064
+{
1065
+ struct kvm_memory_slot *mslots = slots->memslots;
1066
+ int i;
1067
+
1068
+ if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1069
+ WARN_ON_ONCE(!slots->used_slots))
1070
+ return -1;
8921071
8931072 /*
894
- * The ">=" is needed when creating a slot with base_gfn == 0,
895
- * so that it moves before all those with base_gfn == npages == 0.
896
- *
897
- * On the other hand, if new->npages is zero, the above loop has
898
- * already left i pointing to the beginning of the empty part of
899
- * mslots, and the ">=" would move the hole backwards in this
900
- * case---which is wrong. So skip the loop when deleting a slot.
1073
+ * Move the target memslot backward in the array by shifting existing
1074
+ * memslots with a higher GFN (than the target memslot) towards the
1075
+ * front of the array.
9011076 */
902
- if (new->npages) {
903
- while (i > 0 &&
904
- new->base_gfn >= mslots[i - 1].base_gfn) {
905
- mslots[i] = mslots[i - 1];
906
- slots->id_to_index[mslots[i].id] = i;
907
- i--;
908
- }
909
- } else
910
- WARN_ON_ONCE(i != slots->used_slots);
1077
+ for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1078
+ if (memslot->base_gfn > mslots[i + 1].base_gfn)
1079
+ break;
9111080
912
- mslots[i] = *new;
913
- slots->id_to_index[mslots[i].id] = i;
1081
+ WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1082
+
1083
+ /* Shift the next memslot forward one and update its index. */
1084
+ mslots[i] = mslots[i + 1];
1085
+ slots->id_to_index[mslots[i].id] = i;
1086
+ }
1087
+ return i;
1088
+}
1089
+
1090
+/*
1091
+ * Move a changed memslot forwards in the array by shifting existing slots with
1092
+ * a lower GFN toward the back of the array. Note, the changed memslot itself
1093
+ * is not preserved in the array, i.e. not swapped at this time, only its new
1094
+ * index into the array is tracked. Returns the changed memslot's final index
1095
+ * into the memslots array.
1096
+ */
1097
+static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1098
+ struct kvm_memory_slot *memslot,
1099
+ int start)
1100
+{
1101
+ struct kvm_memory_slot *mslots = slots->memslots;
1102
+ int i;
1103
+
1104
+ for (i = start; i > 0; i--) {
1105
+ if (memslot->base_gfn < mslots[i - 1].base_gfn)
1106
+ break;
1107
+
1108
+ WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1109
+
1110
+ /* Shift the next memslot back one and update its index. */
1111
+ mslots[i] = mslots[i - 1];
1112
+ slots->id_to_index[mslots[i].id] = i;
1113
+ }
1114
+ return i;
1115
+}
1116
+
1117
+/*
1118
+ * Re-sort memslots based on their GFN to account for an added, deleted, or
1119
+ * moved memslot. Sorting memslots by GFN allows using a binary search during
1120
+ * memslot lookup.
1121
+ *
1122
+ * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry
1123
+ * at memslots[0] has the highest GFN.
1124
+ *
1125
+ * The sorting algorithm takes advantage of having initially sorted memslots
1126
+ * and knowing the position of the changed memslot. Sorting is also optimized
1127
+ * by not swapping the updated memslot and instead only shifting other memslots
1128
+ * and tracking the new index for the updated memslot. Only once its final
1129
+ * index is known is the updated memslot copied into its position in the array.
1130
+ *
1131
+ * - When deleting a memslot, the deleted memslot simply needs to be moved to
1132
+ * the end of the array.
1133
+ *
1134
+ * - When creating a memslot, the algorithm "inserts" the new memslot at the
1135
+ * end of the array and then moves it forward to its correct location.
1136
+ *
1137
+ * - When moving a memslot, the algorithm first moves the updated memslot
1138
+ * backward to handle the scenario where the memslot's GFN was changed to a
1139
+ * lower value. update_memslots() then falls through and runs the same flow
1140
+ * as creating a memslot to move the memslot forward to handle the scenario
1141
+ * where its GFN was changed to a higher value.
1142
+ *
1143
+ * Note, slots are sorted from highest->lowest instead of lowest->highest for
1144
+ * historical reasons. Originally, invalid memslots were denoted by having
1145
+ * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1146
+ * to the end of the array. The current algorithm uses dedicated logic to
1147
+ * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1148
+ *
1149
+ * The other historical motivation for highest->lowest was to improve the
1150
+ * performance of memslot lookup. KVM originally used a linear search starting
1151
+ * at memslots[0]. On x86, the largest memslot usually has one of the highest,
1152
+ * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1153
+ * single memslot above the 4gb boundary. As the largest memslot is also the
1154
+ * most likely to be referenced, sorting it to the front of the array was
1155
+ * advantageous. The current binary search starts from the middle of the array
1156
+ * and uses an LRU pointer to improve performance for all memslots and GFNs.
1157
+ */
1158
+static void update_memslots(struct kvm_memslots *slots,
1159
+ struct kvm_memory_slot *memslot,
1160
+ enum kvm_mr_change change)
1161
+{
1162
+ int i;
1163
+
1164
+ if (change == KVM_MR_DELETE) {
1165
+ kvm_memslot_delete(slots, memslot);
1166
+ } else {
1167
+ if (change == KVM_MR_CREATE)
1168
+ i = kvm_memslot_insert_back(slots);
1169
+ else
1170
+ i = kvm_memslot_move_backward(slots, memslot);
1171
+ i = kvm_memslot_move_forward(slots, memslot, i);
1172
+
1173
+ /*
1174
+ * Copy the memslot to its new position in memslots and update
1175
+ * its index accordingly.
1176
+ */
1177
+ slots->memslots[i] = *memslot;
1178
+ slots->id_to_index[memslot->id] = i;
1179
+ }
9141180 }
9151181
9161182 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
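
The comment block above is easiest to follow with a tiny worked example. The
standalone program below (plain userspace C, not kernel code) models the
highest-to-lowest GFN array and the "shift, don't swap" insertion: adding a
slot at GFN 0x8000 shifts the lower-GFN slot back one position, and the new
slot is copied in only once its final index is known.

#include <stdio.h>

/* Toy model of kvm_memslots::memslots[], sorted from highest to lowest GFN. */
struct toy_slot { int id; unsigned long base_gfn; unsigned long npages; };

/* CREATE path in miniature: append at the back, then move forward. */
static int toy_insert(struct toy_slot *slots, int used, struct toy_slot new)
{
        int i;

        for (i = used; i > 0 && new.base_gfn >= slots[i - 1].base_gfn; i--)
                slots[i] = slots[i - 1];        /* shift lower-GFN slots back */
        slots[i] = new;                         /* copy in at the final index */
        return used + 1;
}

int main(void)
{
        struct toy_slot slots[4] = { { 0, 0x100000, 0x1000 }, { 1, 0x0, 0x100 } };
        int used = 2, i;

        used = toy_insert(slots, used, (struct toy_slot){ 2, 0x8000, 0x800 });
        for (i = 0; i < used; i++)
                printf("memslots[%d]: id=%d base_gfn=0x%lx\n",
                       i, slots[i].id, slots[i].base_gfn);
        /* prints ids 0, 2, 1, i.e. GFNs 0x100000, 0x8000, 0x0 */
        return 0;
}
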
....@@ -931,36 +1197,148 @@
9311197 int as_id, struct kvm_memslots *slots)
9321198 {
9331199 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
934
- u64 gen;
1200
+ u64 gen = old_memslots->generation;
9351201
936
- /*
937
- * Set the low bit in the generation, which disables SPTE caching
938
- * until the end of synchronize_srcu_expedited.
939
- */
940
- WARN_ON(old_memslots->generation & 1);
941
- slots->generation = old_memslots->generation + 1;
1202
+ WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1203
+ slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
9421204
9431205 rcu_assign_pointer(kvm->memslots[as_id], slots);
9441206 synchronize_srcu_expedited(&kvm->srcu);
9451207
9461208 /*
947
- * Increment the new memslot generation a second time. This prevents
948
- * vm exits that race with memslot updates from caching a memslot
949
- * generation that will (potentially) be valid forever.
950
- *
1209
+ * Increment the new memslot generation a second time, dropping the
1210
+ * update in-progress flag and incrementing the generation based on
1211
+ * the number of address spaces. This provides a unique and easily
1212
+ * identifiable generation number while the memslots are in flux.
1213
+ */
1214
+ gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1215
+
1216
+ /*
9511217 * Generations must be unique even across address spaces. We do not need
9521218 * a global counter for that, instead the generation space is evenly split
9531219 * across address spaces. For example, with two address spaces, address
954
- * space 0 will use generations 0, 4, 8, ... while * address space 1 will
955
- * use generations 2, 6, 10, 14, ...
1220
+ * space 0 will use generations 0, 2, 4, ... while address space 1 will
1221
+ * use generations 1, 3, 5, ...
9561222 */
957
- gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
1223
+ gen += KVM_ADDRESS_SPACE_NUM;
9581224
9591225 kvm_arch_memslots_updated(kvm, gen);
9601226
9611227 slots->generation = gen;
9621228
9631229 return old_memslots;
1230
+}
1231
+
1232
+/*
1233
+ * Note, at a minimum, the current number of used slots must be allocated, even
1234
+ * when deleting a memslot, as we need a complete duplicate of the memslots for
1235
+ * use when invalidating a memslot prior to deleting/moving the memslot.
1236
+ */
1237
+static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1238
+ enum kvm_mr_change change)
1239
+{
1240
+ struct kvm_memslots *slots;
1241
+ size_t old_size, new_size;
1242
+
1243
+ old_size = sizeof(struct kvm_memslots) +
1244
+ (sizeof(struct kvm_memory_slot) * old->used_slots);
1245
+
1246
+ if (change == KVM_MR_CREATE)
1247
+ new_size = old_size + sizeof(struct kvm_memory_slot);
1248
+ else
1249
+ new_size = old_size;
1250
+
1251
+ slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1252
+ if (likely(slots))
1253
+ memcpy(slots, old, old_size);
1254
+
1255
+ return slots;
1256
+}
1257
+
1258
+static int kvm_set_memslot(struct kvm *kvm,
1259
+ const struct kvm_userspace_memory_region *mem,
1260
+ struct kvm_memory_slot *old,
1261
+ struct kvm_memory_slot *new, int as_id,
1262
+ enum kvm_mr_change change)
1263
+{
1264
+ struct kvm_memory_slot *slot;
1265
+ struct kvm_memslots *slots;
1266
+ int r;
1267
+
1268
+ slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1269
+ if (!slots)
1270
+ return -ENOMEM;
1271
+
1272
+ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1273
+ /*
1274
+ * Note, the INVALID flag needs to be in the appropriate entry
1275
+ * in the freshly allocated memslots, not in @old or @new.
1276
+ */
1277
+ slot = id_to_memslot(slots, old->id);
1278
+ slot->flags |= KVM_MEMSLOT_INVALID;
1279
+
1280
+ /*
1281
+ * We can re-use the old memslots, the only difference from the
1282
+ * newly installed memslots is the invalid flag, which will get
1283
+ * dropped by update_memslots anyway. We'll also revert to the
1284
+ * old memslots if preparing the new memory region fails.
1285
+ */
1286
+ slots = install_new_memslots(kvm, as_id, slots);
1287
+
1288
+ /* From this point no new shadow pages pointing to a deleted,
1289
+ * or moved, memslot will be created.
1290
+ *
1291
+ * validation of sp->gfn happens in:
1292
+ * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1293
+ * - kvm_is_visible_gfn (mmu_check_root)
1294
+ */
1295
+ kvm_arch_flush_shadow_memslot(kvm, slot);
1296
+ kvm_arch_guest_memory_reclaimed(kvm);
1297
+ }
1298
+
1299
+ r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1300
+ if (r)
1301
+ goto out_slots;
1302
+
1303
+ update_memslots(slots, new, change);
1304
+ slots = install_new_memslots(kvm, as_id, slots);
1305
+
1306
+ kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1307
+
1308
+ kvfree(slots);
1309
+ return 0;
1310
+
1311
+out_slots:
1312
+ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1313
+ slots = install_new_memslots(kvm, as_id, slots);
1314
+ kvfree(slots);
1315
+ return r;
1316
+}
1317
+
1318
+static int kvm_delete_memslot(struct kvm *kvm,
1319
+ const struct kvm_userspace_memory_region *mem,
1320
+ struct kvm_memory_slot *old, int as_id)
1321
+{
1322
+ struct kvm_memory_slot new;
1323
+ int r;
1324
+
1325
+ if (!old->npages)
1326
+ return -EINVAL;
1327
+
1328
+ memset(&new, 0, sizeof(new));
1329
+ new.id = old->id;
1330
+ /*
1331
+ * This is only for debugging purpose; it should never be referenced
1332
+ * for a removed memslot.
1333
+ */
1334
+ new.as_id = as_id;
1335
+
1336
+ r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1337
+ if (r)
1338
+ return r;
1339
+
1340
+ kvm_free_memslot(kvm, old);
1341
+ return 0;
9641342 }
9651343
9661344 /*
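
To make the generation handling in install_new_memslots() concrete: while an
update is in flight readers see the old generation with the update-in-progress
flag OR'd in, and the final install drops the flag and steps the counter by
KVM_ADDRESS_SPACE_NUM, so with two address spaces, space 0 walks 0, 2, 4, ...
and space 1 walks 1, 3, 5, .... The small userspace program below simulates
just that arithmetic; the bit chosen for the flag is a placeholder, not the
value defined in kvm_host.h.

#include <stdio.h>
#include <stdint.h>

#define GEN_UPDATE_IN_PROGRESS  (1ULL << 63)    /* placeholder for the real flag */
#define ADDRESS_SPACE_NUM       2

static uint64_t install_new_memslots_sim(uint64_t old_gen)
{
        /* What readers can observe while the SRCU swap is in progress. */
        uint64_t transient = old_gen | GEN_UPDATE_IN_PROGRESS;

        /* Final value: clear the flag, step by the number of address spaces. */
        return (transient & ~GEN_UPDATE_IN_PROGRESS) + ADDRESS_SPACE_NUM;
}

int main(void)
{
        uint64_t gen = 0;       /* address space 0 starts at generation 0 */
        int i;

        for (i = 0; i < 3; i++) {
                gen = install_new_memslots_sim(gen);
                printf("generation after update %d: %llu\n",
                       i + 1, (unsigned long long)gen); /* 2, 4, 6 */
        }
        return 0;
}
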
....@@ -974,163 +1352,120 @@
9741352 int __kvm_set_memory_region(struct kvm *kvm,
9751353 const struct kvm_userspace_memory_region *mem)
9761354 {
977
- int r;
978
- gfn_t base_gfn;
979
- unsigned long npages;
980
- struct kvm_memory_slot *slot;
9811355 struct kvm_memory_slot old, new;
982
- struct kvm_memslots *slots = NULL, *old_memslots;
983
- int as_id, id;
1356
+ struct kvm_memory_slot *tmp;
9841357 enum kvm_mr_change change;
1358
+ int as_id, id;
1359
+ int r;
9851360
9861361 r = check_memory_region_flags(mem);
9871362 if (r)
988
- goto out;
1363
+ return r;
9891364
990
- r = -EINVAL;
9911365 as_id = mem->slot >> 16;
9921366 id = (u16)mem->slot;
9931367
9941368 /* General sanity checks */
995
- if (mem->memory_size & (PAGE_SIZE - 1))
996
- goto out;
1369
+ if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1370
+ (mem->memory_size != (unsigned long)mem->memory_size))
1371
+ return -EINVAL;
9971372 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
998
- goto out;
1373
+ return -EINVAL;
9991374 /* We can read the guest memory with __xxx_user() later on. */
1000
- if ((id < KVM_USER_MEM_SLOTS) &&
1001
- ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1002
- !access_ok(VERIFY_WRITE,
1003
- (void __user *)(unsigned long)mem->userspace_addr,
1004
- mem->memory_size)))
1005
- goto out;
1375
+ if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1376
+ (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1377
+ !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1378
+ mem->memory_size))
1379
+ return -EINVAL;
10061380 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1007
- goto out;
1381
+ return -EINVAL;
10081382 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1009
- goto out;
1383
+ return -EINVAL;
10101384
1011
- slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1012
- base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1013
- npages = mem->memory_size >> PAGE_SHIFT;
1014
-
1015
- if (npages > KVM_MEM_MAX_NR_PAGES)
1016
- goto out;
1017
-
1018
- new = old = *slot;
1019
-
1020
- new.id = id;
1021
- new.base_gfn = base_gfn;
1022
- new.npages = npages;
1023
- new.flags = mem->flags;
1024
-
1025
- if (npages) {
1026
- if (!old.npages)
1027
- change = KVM_MR_CREATE;
1028
- else { /* Modify an existing slot. */
1029
- if ((mem->userspace_addr != old.userspace_addr) ||
1030
- (npages != old.npages) ||
1031
- ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1032
- goto out;
1033
-
1034
- if (base_gfn != old.base_gfn)
1035
- change = KVM_MR_MOVE;
1036
- else if (new.flags != old.flags)
1037
- change = KVM_MR_FLAGS_ONLY;
1038
- else { /* Nothing to change. */
1039
- r = 0;
1040
- goto out;
1041
- }
1042
- }
1385
+ /*
1386
+ * Make a full copy of the old memslot, the pointer will become stale
1387
+ * when the memslots are re-sorted by update_memslots(), and the old
1388
+ * memslot needs to be referenced after calling update_memslots(), e.g.
1389
+ * to free its resources and for arch specific behavior.
1390
+ */
1391
+ tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1392
+ if (tmp) {
1393
+ old = *tmp;
1394
+ tmp = NULL;
10431395 } else {
1044
- if (!old.npages)
1045
- goto out;
1396
+ memset(&old, 0, sizeof(old));
1397
+ old.id = id;
1398
+ }
10461399
1047
- change = KVM_MR_DELETE;
1048
- new.base_gfn = 0;
1049
- new.flags = 0;
1400
+ if (!mem->memory_size)
1401
+ return kvm_delete_memslot(kvm, mem, &old, as_id);
1402
+
1403
+ new.as_id = as_id;
1404
+ new.id = id;
1405
+ new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1406
+ new.npages = mem->memory_size >> PAGE_SHIFT;
1407
+ new.flags = mem->flags;
1408
+ new.userspace_addr = mem->userspace_addr;
1409
+
1410
+ if (new.npages > KVM_MEM_MAX_NR_PAGES)
1411
+ return -EINVAL;
1412
+
1413
+ if (!old.npages) {
1414
+ change = KVM_MR_CREATE;
1415
+ new.dirty_bitmap = NULL;
1416
+ memset(&new.arch, 0, sizeof(new.arch));
1417
+ } else { /* Modify an existing slot. */
1418
+ if ((new.userspace_addr != old.userspace_addr) ||
1419
+ (new.npages != old.npages) ||
1420
+ ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1421
+ return -EINVAL;
1422
+
1423
+ if (new.base_gfn != old.base_gfn)
1424
+ change = KVM_MR_MOVE;
1425
+ else if (new.flags != old.flags)
1426
+ change = KVM_MR_FLAGS_ONLY;
1427
+ else /* Nothing to change. */
1428
+ return 0;
1429
+
1430
+ /* Copy dirty_bitmap and arch from the current memslot. */
1431
+ new.dirty_bitmap = old.dirty_bitmap;
1432
+ memcpy(&new.arch, &old.arch, sizeof(new.arch));
10501433 }
10511434
10521435 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
10531436 /* Check for overlaps */
1054
- r = -EEXIST;
1055
- kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
1056
- if (slot->id == id)
1437
+ kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1438
+ if (tmp->id == id)
10571439 continue;
1058
- if (!((base_gfn + npages <= slot->base_gfn) ||
1059
- (base_gfn >= slot->base_gfn + slot->npages)))
1060
- goto out;
1440
+ if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1441
+ (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1442
+ return -EEXIST;
10611443 }
10621444 }
10631445
1064
- /* Free page dirty bitmap if unneeded */
1446
+ /* Allocate/free page dirty bitmap as needed */
10651447 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
10661448 new.dirty_bitmap = NULL;
1449
+ else if (!new.dirty_bitmap) {
1450
+ r = kvm_alloc_dirty_bitmap(&new);
1451
+ if (r)
1452
+ return r;
10671453
1068
- r = -ENOMEM;
1069
- if (change == KVM_MR_CREATE) {
1070
- new.userspace_addr = mem->userspace_addr;
1071
-
1072
- if (kvm_arch_create_memslot(kvm, &new, npages))
1073
- goto out_free;
1454
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1455
+ bitmap_set(new.dirty_bitmap, 0, new.npages);
10741456 }
10751457
1076
- /* Allocate page dirty bitmap if needed */
1077
- if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1078
- if (kvm_create_dirty_bitmap(&new) < 0)
1079
- goto out_free;
1080
- }
1081
-
1082
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
1083
- if (!slots)
1084
- goto out_free;
1085
- memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
1086
-
1087
- if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
1088
- slot = id_to_memslot(slots, id);
1089
- slot->flags |= KVM_MEMSLOT_INVALID;
1090
-
1091
- old_memslots = install_new_memslots(kvm, as_id, slots);
1092
-
1093
- /* From this point no new shadow pages pointing to a deleted,
1094
- * or moved, memslot will be created.
1095
- *
1096
- * validation of sp->gfn happens in:
1097
- * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1098
- * - kvm_is_visible_gfn (mmu_check_roots)
1099
- */
1100
- kvm_arch_flush_shadow_memslot(kvm, slot);
1101
-
1102
- /*
1103
- * We can re-use the old_memslots from above, the only difference
1104
- * from the currently installed memslots is the invalid flag. This
1105
- * will get overwritten by update_memslots anyway.
1106
- */
1107
- slots = old_memslots;
1108
- }
1109
-
1110
- r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
1458
+ r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
11111459 if (r)
1112
- goto out_slots;
1460
+ goto out_bitmap;
11131461
1114
- /* actual memory is freed via old in kvm_free_memslot below */
1115
- if (change == KVM_MR_DELETE) {
1116
- new.dirty_bitmap = NULL;
1117
- memset(&new.arch, 0, sizeof(new.arch));
1118
- }
1119
-
1120
- update_memslots(slots, &new);
1121
- old_memslots = install_new_memslots(kvm, as_id, slots);
1122
-
1123
- kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
1124
-
1125
- kvm_free_memslot(kvm, &old, &new);
1126
- kvfree(old_memslots);
1462
+ if (old.dirty_bitmap && !new.dirty_bitmap)
1463
+ kvm_destroy_dirty_bitmap(&old);
11271464 return 0;
11281465
1129
-out_slots:
1130
- kvfree(slots);
1131
-out_free:
1132
- kvm_free_memslot(kvm, &new, &old);
1133
-out:
1466
+out_bitmap:
1467
+ if (new.dirty_bitmap && !old.dirty_bitmap)
1468
+ kvm_destroy_dirty_bitmap(&new);
11341469 return r;
11351470 }
11361471 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
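
For reference, this is the ioctl that userspace drives against
__kvm_set_memory_region(). The standalone fragment below registers anonymous
memory as guest slot 0 with dirty logging enabled; mmap and ioctl error
handling is deliberately elided, so treat it as a usage sketch rather than
production code.

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Register @size bytes of anonymous memory as guest RAM at GPA 0. */
static int register_guest_ram(int vm_fd, size_t size)
{
        void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .flags = KVM_MEM_LOG_DIRTY_PAGES,       /* enable dirty tracking */
                .guest_phys_addr = 0,
                .memory_size = size,
                .userspace_addr = (unsigned long)mem,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
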
....@@ -1156,14 +1491,24 @@
11561491 return kvm_set_memory_region(kvm, mem);
11571492 }
11581493
1159
-int kvm_get_dirty_log(struct kvm *kvm,
1160
- struct kvm_dirty_log *log, int *is_dirty)
1494
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1495
+/**
1496
+ * kvm_get_dirty_log - get a snapshot of dirty pages
1497
+ * @kvm: pointer to kvm instance
1498
+ * @log: slot id and address to which we copy the log
1499
+ * @is_dirty: set to '1' if any dirty pages were found
1500
+ * @memslot: set to the associated memslot, always valid on success
1501
+ */
1502
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1503
+ int *is_dirty, struct kvm_memory_slot **memslot)
11611504 {
11621505 struct kvm_memslots *slots;
1163
- struct kvm_memory_slot *memslot;
11641506 int i, as_id, id;
11651507 unsigned long n;
11661508 unsigned long any = 0;
1509
+
1510
+ *memslot = NULL;
1511
+ *is_dirty = 0;
11671512
11681513 as_id = log->slot >> 16;
11691514 id = (u16)log->slot;
....@@ -1171,16 +1516,18 @@
11711516 return -EINVAL;
11721517
11731518 slots = __kvm_memslots(kvm, as_id);
1174
- memslot = id_to_memslot(slots, id);
1175
- if (!memslot->dirty_bitmap)
1519
+ *memslot = id_to_memslot(slots, id);
1520
+ if (!(*memslot) || !(*memslot)->dirty_bitmap)
11761521 return -ENOENT;
11771522
1178
- n = kvm_dirty_bitmap_bytes(memslot);
1523
+ kvm_arch_sync_dirty_log(kvm, *memslot);
1524
+
1525
+ n = kvm_dirty_bitmap_bytes(*memslot);
11791526
11801527 for (i = 0; !any && i < n/sizeof(long); ++i)
1181
- any = memslot->dirty_bitmap[i];
1528
+ any = (*memslot)->dirty_bitmap[i];
11821529
1183
- if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1530
+ if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
11841531 return -EFAULT;
11851532
11861533 if (any)
....@@ -1189,13 +1536,12 @@
11891536 }
11901537 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
11911538
1192
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1539
+#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
11931540 /**
1194
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
1195
- * are dirty write protect them for next write.
1541
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1542
+ * and reenable dirty page tracking for the corresponding pages.
11961543 * @kvm: pointer to kvm instance
11971544 * @log: slot id and address to which we copy the log
1198
- * @is_dirty: flag set if any page is dirty
11991545 *
12001546 * We need to keep it in mind that VCPU threads can write to the bitmap
12011547 * concurrently. So, to avoid losing track of dirty pages we keep the
....@@ -1212,8 +1558,7 @@
12121558 * exiting to userspace will be logged for the next call.
12131559 *
12141560 */
1215
-int kvm_get_dirty_log_protect(struct kvm *kvm,
1216
- struct kvm_dirty_log *log, bool *is_dirty)
1561
+static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
12171562 {
12181563 struct kvm_memslots *slots;
12191564 struct kvm_memory_slot *memslot;
....@@ -1221,6 +1566,7 @@
12211566 unsigned long n;
12221567 unsigned long *dirty_bitmap;
12231568 unsigned long *dirty_bitmap_buffer;
1569
+ bool flush;
12241570
12251571 as_id = log->slot >> 16;
12261572 id = (u16)log->slot;
....@@ -1229,55 +1575,180 @@
12291575
12301576 slots = __kvm_memslots(kvm, as_id);
12311577 memslot = id_to_memslot(slots, id);
1232
-
1233
- dirty_bitmap = memslot->dirty_bitmap;
1234
- if (!dirty_bitmap)
1578
+ if (!memslot || !memslot->dirty_bitmap)
12351579 return -ENOENT;
12361580
1581
+ dirty_bitmap = memslot->dirty_bitmap;
1582
+
1583
+ kvm_arch_sync_dirty_log(kvm, memslot);
1584
+
12371585 n = kvm_dirty_bitmap_bytes(memslot);
1586
+ flush = false;
1587
+ if (kvm->manual_dirty_log_protect) {
1588
+ /*
1589
+ * Unlike kvm_get_dirty_log, we always return false in *flush,
1590
+ * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
1591
+ * is some code duplication between this function and
1592
+ * kvm_get_dirty_log, but hopefully all architecture
1593
+ * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
1594
+ * can be eliminated.
1595
+ */
1596
+ dirty_bitmap_buffer = dirty_bitmap;
1597
+ } else {
1598
+ dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1599
+ memset(dirty_bitmap_buffer, 0, n);
12381600
1239
- dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1240
- memset(dirty_bitmap_buffer, 0, n);
1601
+ spin_lock(&kvm->mmu_lock);
1602
+ for (i = 0; i < n / sizeof(long); i++) {
1603
+ unsigned long mask;
1604
+ gfn_t offset;
12411605
1242
- spin_lock(&kvm->mmu_lock);
1243
- *is_dirty = false;
1244
- for (i = 0; i < n / sizeof(long); i++) {
1245
- unsigned long mask;
1246
- gfn_t offset;
1606
+ if (!dirty_bitmap[i])
1607
+ continue;
12471608
1248
- if (!dirty_bitmap[i])
1249
- continue;
1609
+ flush = true;
1610
+ mask = xchg(&dirty_bitmap[i], 0);
1611
+ dirty_bitmap_buffer[i] = mask;
12501612
1251
- *is_dirty = true;
1252
-
1253
- mask = xchg(&dirty_bitmap[i], 0);
1254
- dirty_bitmap_buffer[i] = mask;
1255
-
1256
- if (mask) {
12571613 offset = i * BITS_PER_LONG;
12581614 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
12591615 offset, mask);
12601616 }
1617
+ spin_unlock(&kvm->mmu_lock);
12611618 }
12621619
1263
- spin_unlock(&kvm->mmu_lock);
1620
+ if (flush)
1621
+ kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1622
+
12641623 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
12651624 return -EFAULT;
12661625 return 0;
12671626 }
1268
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1269
-#endif
12701627
1271
-bool kvm_largepages_enabled(void)
1628
+
1629
+/**
1630
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
1631
+ * @kvm: kvm instance
1632
+ * @log: slot id and address to which we copy the log
1633
+ *
1634
+ * Steps 1-4 below provide general overview of dirty page logging. See
1635
+ * kvm_get_dirty_log_protect() function description for additional details.
1636
+ *
1637
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
1638
+ * always flush the TLB (step 4) even if previous step failed and the dirty
1639
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
1640
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
1641
+ * writes will be marked dirty for next log read.
1642
+ *
1643
+ * 1. Take a snapshot of the bit and clear it if needed.
1644
+ * 2. Write protect the corresponding page.
1645
+ * 3. Copy the snapshot to the userspace.
1646
+ * 4. Flush TLB's if needed.
1647
+ */
1648
+static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1649
+ struct kvm_dirty_log *log)
12721650 {
1273
- return largepages_enabled;
1651
+ int r;
1652
+
1653
+ mutex_lock(&kvm->slots_lock);
1654
+
1655
+ r = kvm_get_dirty_log_protect(kvm, log);
1656
+
1657
+ mutex_unlock(&kvm->slots_lock);
1658
+ return r;
12741659 }
12751660
1276
-void kvm_disable_largepages(void)
1661
+/**
1662
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1663
+ * and reenable dirty page tracking for the corresponding pages.
1664
+ * @kvm: pointer to kvm instance
1665
+ * @log: slot id and address from which to fetch the bitmap of dirty pages
1666
+ */
1667
+static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1668
+ struct kvm_clear_dirty_log *log)
12771669 {
1278
- largepages_enabled = false;
1670
+ struct kvm_memslots *slots;
1671
+ struct kvm_memory_slot *memslot;
1672
+ int as_id, id;
1673
+ gfn_t offset;
1674
+ unsigned long i, n;
1675
+ unsigned long *dirty_bitmap;
1676
+ unsigned long *dirty_bitmap_buffer;
1677
+ bool flush;
1678
+
1679
+ as_id = log->slot >> 16;
1680
+ id = (u16)log->slot;
1681
+ if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1682
+ return -EINVAL;
1683
+
1684
+ if (log->first_page & 63)
1685
+ return -EINVAL;
1686
+
1687
+ slots = __kvm_memslots(kvm, as_id);
1688
+ memslot = id_to_memslot(slots, id);
1689
+ if (!memslot || !memslot->dirty_bitmap)
1690
+ return -ENOENT;
1691
+
1692
+ dirty_bitmap = memslot->dirty_bitmap;
1693
+
1694
+ n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
1695
+
1696
+ if (log->first_page > memslot->npages ||
1697
+ log->num_pages > memslot->npages - log->first_page ||
1698
+ (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
1699
+ return -EINVAL;
1700
+
1701
+ kvm_arch_sync_dirty_log(kvm, memslot);
1702
+
1703
+ flush = false;
1704
+ dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1705
+ if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1706
+ return -EFAULT;
1707
+
1708
+ spin_lock(&kvm->mmu_lock);
1709
+ for (offset = log->first_page, i = offset / BITS_PER_LONG,
1710
+ n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
1711
+ i++, offset += BITS_PER_LONG) {
1712
+ unsigned long mask = *dirty_bitmap_buffer++;
1713
+ atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1714
+ if (!mask)
1715
+ continue;
1716
+
1717
+ mask &= atomic_long_fetch_andnot(mask, p);
1718
+
1719
+ /*
1720
+ * mask contains the bits that really have been cleared. This
1721
+ * never includes any bits beyond the length of the memslot (if
1722
+ * the length is not aligned to 64 pages), therefore it is not
1723
+ * a problem if userspace sets them in log->dirty_bitmap.
1724
+ */
1725
+ if (mask) {
1726
+ flush = true;
1727
+ kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1728
+ offset, mask);
1729
+ }
1730
+ }
1731
+ spin_unlock(&kvm->mmu_lock);
1732
+
1733
+ if (flush)
1734
+ kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1735
+
1736
+ return 0;
12791737 }
1280
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
1738
+
1739
+static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
1740
+ struct kvm_clear_dirty_log *log)
1741
+{
1742
+ int r;
1743
+
1744
+ mutex_lock(&kvm->slots_lock);
1745
+
1746
+ r = kvm_clear_dirty_log_protect(kvm, log);
1747
+
1748
+ mutex_unlock(&kvm->slots_lock);
1749
+ return r;
1750
+}
1751
+#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
12811752
12821753 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
12831754 {
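
With the manual protect capability (KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2) enabled,
the two functions above split dirty tracking into a fetch step and an explicit
clear/re-protect step. The userspace sketch below shows that two-step flow;
error paths and the actual copying of dirty pages are elided, and the helper
name is invented.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch the dirty bitmap for @slot, then clear/re-protect what was read. */
static int harvest_dirty_pages_sketch(int vm_fd, __u32 slot, __u32 npages)
{
        size_t bytes = ((size_t)npages + 63) / 64 * 8;  /* one bit per page */
        void *bitmap = calloc(1, bytes);
        struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
        struct kvm_clear_dirty_log clear = {
                .slot = slot,
                .first_page = 0,
                .num_pages = npages,    /* 64-aligned unless it ends the slot */
                .dirty_bitmap = bitmap,
        };
        int ret;

        ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);

        /* ... migrate/copy the pages whose bits are set in bitmap ... */

        if (!ret)
                ret = ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);

        free(bitmap);
        return ret;
}
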
....@@ -1294,13 +1765,17 @@
12941765 {
12951766 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
12961767
1297
- if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
1298
- memslot->flags & KVM_MEMSLOT_INVALID)
1299
- return false;
1300
-
1301
- return true;
1768
+ return kvm_is_visible_memslot(memslot);
13021769 }
13031770 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1771
+
1772
+bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1773
+{
1774
+ struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1775
+
1776
+ return kvm_is_visible_memslot(memslot);
1777
+}
1778
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
13041779
13051780 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
13061781 {
....@@ -1313,7 +1788,7 @@
13131788 if (kvm_is_error_hva(addr))
13141789 return PAGE_SIZE;
13151790
1316
- down_read(&current->mm->mmap_sem);
1791
+ mmap_read_lock(current->mm);
13171792 vma = find_vma(current->mm, addr);
13181793 if (!vma)
13191794 goto out;
....@@ -1321,7 +1796,7 @@
13211796 size = vma_kernel_pagesize(vma);
13221797
13231798 out:
1324
- up_read(&current->mm->mmap_sem);
1799
+ mmap_read_unlock(current->mm);
13251800
13261801 return size;
13271802 }
....@@ -1372,8 +1847,12 @@
13721847 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
13731848
13741849 /*
1375
- * If writable is set to false, the hva returned by this function is only
1376
- * allowed to be read.
1850
+ * Return the hva of a @gfn and the R/W attribute if possible.
1851
+ *
1852
+ * @slot: the kvm_memory_slot which contains @gfn
1853
+ * @gfn: the gfn to be translated
1854
+ * @writable: used to return the read/write attribute of the @slot if the hva
1855
+ * is valid and @writable is not NULL
13771856 */
13781857 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
13791858 gfn_t gfn, bool *writable)
....@@ -1411,13 +1890,12 @@
14111890 /*
14121891 * The fast path to get the writable pfn which will be stored in @pfn,
14131892 * true indicates success, otherwise false is returned. It's also the
1414
- * only part that runs if we can are in atomic context.
1893
+ * only part that runs if we are in atomic context.
14151894 */
14161895 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
14171896 bool *writable, kvm_pfn_t *pfn)
14181897 {
14191898 struct page *page[1];
1420
- int npages;
14211899
14221900 /*
14231901 * Fast pin a writable pfn only if it is a write fault request
....@@ -1427,8 +1905,7 @@
14271905 if (!(write_fault || writable))
14281906 return false;
14291907
1430
- npages = __get_user_pages_fast(addr, 1, 1, page);
1431
- if (npages == 1) {
1908
+ if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
14321909 *pfn = page_to_pfn(page[0]);
14331910
14341911 if (writable)
....@@ -1468,7 +1945,7 @@
14681945 if (unlikely(!write_fault) && writable) {
14691946 struct page *wpage;
14701947
1471
- if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
1948
+ if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
14721949 *writable = true;
14731950 put_page(page);
14741951 page = wpage;
....@@ -1506,14 +1983,14 @@
15061983 spinlock_t *ptl;
15071984 int r;
15081985
1509
- r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
1986
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
15101987 if (r) {
15111988 /*
15121989 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
15131990 * not call the fault handler, so do it here.
15141991 */
15151992 bool unlocked = false;
1516
- r = fixup_user_fault(current, current->mm, addr,
1993
+ r = fixup_user_fault(current->mm, addr,
15171994 (write_fault ? FAULT_FLAG_WRITE : 0),
15181995 &unlocked);
15191996 if (unlocked)
....@@ -1521,7 +1998,7 @@
15211998 if (r)
15221999 return r;
15232000
1524
- r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
2001
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
15252002 if (r)
15262003 return r;
15272004 }
....@@ -1596,7 +2073,7 @@
15962073 if (npages == 1)
15972074 return pfn;
15982075
1599
- down_read(&current->mm->mmap_sem);
2076
+ mmap_read_lock(current->mm);
16002077 if (npages == -EHWPOISON ||
16012078 (!async && check_user_page_hwpoison(addr))) {
16022079 pfn = KVM_PFN_ERR_HWPOISON;
....@@ -1620,7 +2097,7 @@
16202097 pfn = KVM_PFN_ERR_FAULT;
16212098 }
16222099 exit:
1623
- up_read(&current->mm->mmap_sem);
2100
+ mmap_read_unlock(current->mm);
16242101 return pfn;
16252102 }
16262103
....@@ -1673,12 +2150,6 @@
16732150 }
16742151 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
16752152
1676
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1677
-{
1678
- return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1679
-}
1680
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1681
-
16822153 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
16832154 {
16842155 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
....@@ -1710,7 +2181,7 @@
17102181 if (entry < nr_pages)
17112182 return 0;
17122183
1713
- return __get_user_pages_fast(addr, nr_pages, 1, pages);
2184
+ return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
17142185 }
17152186 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
17162187
....@@ -1924,20 +2395,28 @@
19242395 }
19252396 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
19262397
2398
+static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
2399
+{
2400
+ if (!pfn_valid(pfn))
2401
+ return false;
2402
+
2403
+ /*
2404
+ * Per page-flags.h, pages tagged PG_reserved "should in general not be
2405
+ * touched (e.g. set dirty) except by its owner".
2406
+ */
2407
+ return !PageReserved(pfn_to_page(pfn));
2408
+}
2409
+
19272410 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
19282411 {
1929
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
1930
- struct page *page = pfn_to_page(pfn);
1931
-
1932
- if (!PageReserved(page))
1933
- SetPageDirty(page);
1934
- }
2412
+ if (kvm_is_ad_tracked_pfn(pfn))
2413
+ SetPageDirty(pfn_to_page(pfn));
19352414 }
19362415 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
19372416
19382417 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
19392418 {
1940
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2419
+ if (kvm_is_ad_tracked_pfn(pfn))
19412420 mark_page_accessed(pfn_to_page(pfn));
19422421 }
19432422 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
....@@ -2047,17 +2526,6 @@
20472526 return 0;
20482527 }
20492528
2050
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
2051
- unsigned long len)
2052
-{
2053
- gfn_t gfn = gpa >> PAGE_SHIFT;
2054
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2055
- int offset = offset_in_page(gpa);
2056
-
2057
- return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2058
-}
2059
-EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
2060
-
20612529 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
20622530 void *data, unsigned long len)
20632531 {
....@@ -2155,30 +2623,34 @@
21552623 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
21562624 gfn_t nr_pages_avail;
21572625
2158
- ghc->gpa = gpa;
2626
+ /* Update ghc->generation before performing any error checks. */
21592627 ghc->generation = slots->generation;
2160
- ghc->len = len;
2161
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2162
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
2163
- if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
2164
- ghc->hva += offset;
2165
- } else {
2166
- /*
2167
- * If the requested region crosses two memslots, we still
2168
- * verify that the entire region is valid here.
2169
- */
2170
- while (start_gfn <= end_gfn) {
2171
- nr_pages_avail = 0;
2172
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2173
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2174
- &nr_pages_avail);
2175
- if (kvm_is_error_hva(ghc->hva))
2176
- return -EFAULT;
2177
- start_gfn += nr_pages_avail;
2178
- }
2179
- /* Use the slow path for cross page reads and writes. */
2180
- ghc->memslot = NULL;
2628
+
2629
+ if (start_gfn > end_gfn) {
2630
+ ghc->hva = KVM_HVA_ERR_BAD;
2631
+ return -EINVAL;
21812632 }
2633
+
2634
+ /*
2635
+ * If the requested region crosses two memslots, we still
2636
+ * verify that the entire region is valid here.
2637
+ */
2638
+ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2639
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2640
+ ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2641
+ &nr_pages_avail);
2642
+ if (kvm_is_error_hva(ghc->hva))
2643
+ return -EFAULT;
2644
+ }
2645
+
2646
+ /* Use the slow path for cross page reads and writes. */
2647
+ if (nr_pages_needed == 1)
2648
+ ghc->hva += offset;
2649
+ else
2650
+ ghc->memslot = NULL;
2651
+
2652
+ ghc->gpa = gpa;
2653
+ ghc->len = len;
21822654 return 0;
21832655 }
21842656
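For reference, a typical consumer of this cache initializes it once per guest address and then goes through the *_cached() accessors. A hypothetical kernel-side sketch (illustrative only, not part of this patch; the shared-page layout and both wrappers are invented, only the two kvm_* calls are existing API):

/* Illustrative sketch, assumes <linux/kvm_host.h>.  The structure and the
 * hypothetical_* helpers are made up for illustration. */
struct hypothetical_shared_page {
        u64 seq;
        u64 flags;
};

static int hypothetical_setup(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                              gpa_t gpa)
{
        /* Validates the whole range and caches memslot + hva up front. */
        return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa,
                                         sizeof(struct hypothetical_shared_page));
}

static int hypothetical_publish_seq(struct kvm *kvm,
                                    struct gfn_to_hva_cache *ghc, u64 seq)
{
        /*
         * Normally a plain __copy_to_user() on the cached hva; the cache is
         * re-initialized transparently (see kvm_write_guest_offset_cached
         * below) when slots->generation has changed.
         */
        return kvm_write_guest_offset_cached(kvm, ghc, &seq,
                        offsetof(struct hypothetical_shared_page, seq),
                        sizeof(seq));
}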
....@@ -2198,10 +2670,13 @@
21982670 int r;
21992671 gpa_t gpa = ghc->gpa + offset;
22002672
2201
- BUG_ON(len + offset > ghc->len);
2673
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2674
+ return -EINVAL;
22022675
2203
- if (slots->generation != ghc->generation)
2204
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2676
+ if (slots->generation != ghc->generation) {
2677
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2678
+ return -EFAULT;
2679
+ }
22052680
22062681 if (kvm_is_error_hva(ghc->hva))
22072682 return -EFAULT;
....@@ -2225,28 +2700,40 @@
22252700 }
22262701 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
22272702
2228
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2229
- void *data, unsigned long len)
2703
+int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2704
+ void *data, unsigned int offset,
2705
+ unsigned long len)
22302706 {
22312707 struct kvm_memslots *slots = kvm_memslots(kvm);
22322708 int r;
2709
+ gpa_t gpa = ghc->gpa + offset;
22332710
2234
- BUG_ON(len > ghc->len);
2711
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2712
+ return -EINVAL;
22352713
2236
- if (slots->generation != ghc->generation)
2237
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2714
+ if (slots->generation != ghc->generation) {
2715
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2716
+ return -EFAULT;
2717
+ }
22382718
22392719 if (kvm_is_error_hva(ghc->hva))
22402720 return -EFAULT;
22412721
22422722 if (unlikely(!ghc->memslot))
2243
- return kvm_read_guest(kvm, ghc->gpa, data, len);
2723
+ return kvm_read_guest(kvm, gpa, data, len);
22442724
2245
- r = __copy_from_user(data, (void __user *)ghc->hva, len);
2725
+ r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
22462726 if (r)
22472727 return -EFAULT;
22482728
22492729 return 0;
2730
+}
2731
+EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2732
+
2733
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2734
+ void *data, unsigned long len)
2735
+{
2736
+ return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
22502737 }
22512738 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
22522739
....@@ -2277,8 +2764,7 @@
22772764 }
22782765 EXPORT_SYMBOL_GPL(kvm_clear_guest);
22792766
2280
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2281
- gfn_t gfn)
2767
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn)
22822768 {
22832769 if (memslot && memslot->dirty_bitmap) {
22842770 unsigned long rel_gfn = gfn - memslot->base_gfn;
....@@ -2286,6 +2772,7 @@
22862772 set_bit_le(rel_gfn, memslot->dirty_bitmap);
22872773 }
22882774 }
2775
+EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
22892776
22902777 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
22912778 {
....@@ -2330,33 +2817,40 @@
23302817
23312818 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
23322819 {
2333
- unsigned int old, val, grow;
2820
+ unsigned int old, val, grow, grow_start;
23342821
23352822 old = val = vcpu->halt_poll_ns;
2823
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23362824 grow = READ_ONCE(halt_poll_ns_grow);
2337
- /* 10us base */
2338
- if (val == 0 && grow)
2339
- val = 10000;
2340
- else
2341
- val *= grow;
2825
+ if (!grow)
2826
+ goto out;
23422827
2343
- if (val > halt_poll_ns)
2344
- val = halt_poll_ns;
2828
+ val *= grow;
2829
+ if (val < grow_start)
2830
+ val = grow_start;
2831
+
2832
+ if (val > vcpu->kvm->max_halt_poll_ns)
2833
+ val = vcpu->kvm->max_halt_poll_ns;
23452834
23462835 vcpu->halt_poll_ns = val;
2836
+out:
23472837 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
23482838 }
23492839
23502840 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
23512841 {
2352
- unsigned int old, val, shrink;
2842
+ unsigned int old, val, shrink, grow_start;
23532843
23542844 old = val = vcpu->halt_poll_ns;
23552845 shrink = READ_ONCE(halt_poll_ns_shrink);
2846
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23562847 if (shrink == 0)
23572848 val = 0;
23582849 else
23592850 val /= shrink;
2851
+
2852
+ if (val < grow_start)
2853
+ val = 0;
23602854
23612855 vcpu->halt_poll_ns = val;
23622856 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
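The grow path above no longer special-cases a zero starting value: it always multiplies, then clamps to the new grow_start floor and the per-VM max_halt_poll_ns ceiling. A standalone restatement of the same arithmetic (illustrative only; the parameters are passed in explicitly instead of being read from the module and VM state):

/* Illustrative re-statement of the growth rule, not part of the patch.
 * E.g. with grow = 2, grow_start = 10000 and a cap of 500000:
 *   0 -> 10000 -> 20000 -> 40000 -> ... -> 320000 -> 500000 (clamped). */
static unsigned int next_halt_poll_ns(unsigned int val, unsigned int grow,
                                      unsigned int grow_start,
                                      unsigned int max_halt_poll_ns)
{
        if (!grow)
                return val;             /* growing disabled, leave unchanged */

        val *= grow;
        if (val < grow_start)
                val = grow_start;       /* first step lands on grow_start */
        if (val > max_halt_poll_ns)
                val = max_halt_poll_ns;

        return val;
}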
....@@ -2382,18 +2876,28 @@
23822876 return ret;
23832877 }
23842878
2879
+static inline void
2880
+update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2881
+{
2882
+ if (waited)
2883
+ vcpu->stat.halt_poll_fail_ns += poll_ns;
2884
+ else
2885
+ vcpu->stat.halt_poll_success_ns += poll_ns;
2886
+}
2887
+
23852888 /*
23862889 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
23872890 */
23882891 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
23892892 {
2390
- ktime_t start, cur;
2391
- DECLARE_SWAITQUEUE(wait);
2893
+ ktime_t start, cur, poll_end;
23922894 bool waited = false;
23932895 u64 block_ns;
23942896
2395
- start = cur = ktime_get();
2396
- if (vcpu->halt_poll_ns) {
2897
+ kvm_arch_vcpu_blocking(vcpu);
2898
+
2899
+ start = cur = poll_end = ktime_get();
2900
+ if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
23972901 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
23982902
23992903 ++vcpu->stat.halt_attempted_poll;
....@@ -2408,14 +2912,14 @@
24082912 ++vcpu->stat.halt_poll_invalid;
24092913 goto out;
24102914 }
2411
- cur = ktime_get();
2412
- } while (single_task_running() && ktime_before(cur, stop));
2915
+ poll_end = cur = ktime_get();
2916
+ } while (single_task_running() && !need_resched() &&
2917
+ ktime_before(cur, stop));
24132918 }
24142919
2415
- kvm_arch_vcpu_blocking(vcpu);
2416
-
2920
+ prepare_to_rcuwait(&vcpu->wait);
24172921 for (;;) {
2418
- prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2922
+ set_current_state(TASK_INTERRUPTIBLE);
24192923
24202924 if (kvm_vcpu_check_block(vcpu) < 0)
24212925 break;
....@@ -2423,28 +2927,33 @@
24232927 waited = true;
24242928 schedule();
24252929 }
2426
-
2427
- finish_swait(&vcpu->wq, &wait);
2930
+ finish_rcuwait(&vcpu->wait);
24282931 cur = ktime_get();
2429
-
2430
- kvm_arch_vcpu_unblocking(vcpu);
24312932 out:
2933
+ kvm_arch_vcpu_unblocking(vcpu);
24322934 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
24332935
2434
- if (!vcpu_valid_wakeup(vcpu))
2435
- shrink_halt_poll_ns(vcpu);
2436
- else if (halt_poll_ns) {
2437
- if (block_ns <= vcpu->halt_poll_ns)
2438
- ;
2439
- /* we had a long block, shrink polling */
2440
- else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
2936
+ update_halt_poll_stats(
2937
+ vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
2938
+
2939
+ if (!kvm_arch_no_poll(vcpu)) {
2940
+ if (!vcpu_valid_wakeup(vcpu)) {
24412941 shrink_halt_poll_ns(vcpu);
2442
- /* we had a short halt and our poll time is too small */
2443
- else if (vcpu->halt_poll_ns < halt_poll_ns &&
2444
- block_ns < halt_poll_ns)
2445
- grow_halt_poll_ns(vcpu);
2446
- } else
2447
- vcpu->halt_poll_ns = 0;
2942
+ } else if (vcpu->kvm->max_halt_poll_ns) {
2943
+ if (block_ns <= vcpu->halt_poll_ns)
2944
+ ;
2945
+ /* we had a long block, shrink polling */
2946
+ else if (vcpu->halt_poll_ns &&
2947
+ block_ns > vcpu->kvm->max_halt_poll_ns)
2948
+ shrink_halt_poll_ns(vcpu);
2949
+ /* we had a short halt and our poll time is too small */
2950
+ else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
2951
+ block_ns < vcpu->kvm->max_halt_poll_ns)
2952
+ grow_halt_poll_ns(vcpu);
2953
+ } else {
2954
+ vcpu->halt_poll_ns = 0;
2955
+ }
2956
+ }
24482957
24492958 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
24502959 kvm_arch_vcpu_block_finish(vcpu);
....@@ -2453,11 +2962,11 @@
24532962
24542963 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
24552964 {
2456
- struct swait_queue_head *wqp;
2965
+ struct rcuwait *waitp;
24572966
2458
- wqp = kvm_arch_vcpu_wq(vcpu);
2459
- if (swq_has_sleeper(wqp)) {
2460
- swake_up_one(wqp);
2967
+ waitp = kvm_arch_vcpu_get_wait(vcpu);
2968
+ if (rcuwait_wake_up(waitp)) {
2969
+ WRITE_ONCE(vcpu->ready, true);
24612970 ++vcpu->stat.halt_wakeup;
24622971 return true;
24632972 }
....@@ -2472,16 +2981,24 @@
24722981 */
24732982 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
24742983 {
2475
- int me;
2476
- int cpu = vcpu->cpu;
2984
+ int me, cpu;
24772985
24782986 if (kvm_vcpu_wake_up(vcpu))
24792987 return;
24802988
2989
+ /*
2990
+ * Note, the vCPU could get migrated to a different pCPU at any point
2991
+ * after kvm_arch_vcpu_should_kick(), which could result in sending an
2992
+ * IPI to the previous pCPU. But, that's ok because the purpose of the
2993
+ * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
2994
+ * vCPU also requires it to leave IN_GUEST_MODE.
2995
+ */
24812996 me = get_cpu();
2482
- if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
2483
- if (kvm_arch_vcpu_should_kick(vcpu))
2997
+ if (kvm_arch_vcpu_should_kick(vcpu)) {
2998
+ cpu = READ_ONCE(vcpu->cpu);
2999
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
24843000 smp_send_reschedule(cpu);
3001
+ }
24853002 put_cpu();
24863003 }
24873004 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
....@@ -2513,7 +3030,7 @@
25133030 *
25143031 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
25153032 * (preempted lock holder), indicated by @in_spin_loop.
2516
- * Set at the beiginning and cleared at the end of interception/PLE handler.
3033
+ * Set at the beginning and cleared at the end of interception/PLE handler.
25173034 *
25183035 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
25193036 * chance last time (mostly it has become eligible now since we have probably
....@@ -2594,13 +3111,15 @@
25943111 continue;
25953112 } else if (pass && i > last_boosted_vcpu)
25963113 break;
2597
- if (!READ_ONCE(vcpu->preempted))
3114
+ if (!READ_ONCE(vcpu->ready))
25983115 continue;
25993116 if (vcpu == me)
26003117 continue;
2601
- if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
3118
+ if (rcuwait_active(&vcpu->wait) &&
3119
+ !vcpu_dy_runnable(vcpu))
26023120 continue;
2603
- if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
3121
+ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3122
+ !kvm_arch_vcpu_in_kernel(vcpu))
26043123 continue;
26053124 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
26063125 continue;
....@@ -2659,7 +3178,6 @@
26593178 {
26603179 struct kvm_vcpu *vcpu = filp->private_data;
26613180
2662
- debugfs_remove_recursive(vcpu->debugfs_dentry);
26633181 kvm_put_kvm(vcpu->kvm);
26643182 return 0;
26653183 }
....@@ -2683,30 +3201,21 @@
26833201 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
26843202 }
26853203
2686
-static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3204
+static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
26873205 {
3206
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3207
+ struct dentry *debugfs_dentry;
26883208 char dir_name[ITOA_MAX_LEN * 2];
2689
- int ret;
2690
-
2691
- if (!kvm_arch_has_vcpu_debugfs())
2692
- return 0;
26933209
26943210 if (!debugfs_initialized())
2695
- return 0;
3211
+ return;
26963212
26973213 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
2698
- vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
2699
- vcpu->kvm->debugfs_dentry);
2700
- if (!vcpu->debugfs_dentry)
2701
- return -ENOMEM;
3214
+ debugfs_dentry = debugfs_create_dir(dir_name,
3215
+ vcpu->kvm->debugfs_dentry);
27023216
2703
- ret = kvm_arch_create_vcpu_debugfs(vcpu);
2704
- if (ret < 0) {
2705
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2706
- return ret;
2707
- }
2708
-
2709
- return 0;
3217
+ kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3218
+#endif
27103219 }
27113220
27123221 /*
....@@ -2716,6 +3225,7 @@
27163225 {
27173226 int r;
27183227 struct kvm_vcpu *vcpu;
3228
+ struct page *page;
27193229
27203230 if (id >= KVM_MAX_VCPU_ID)
27213231 return -EINVAL;
....@@ -2729,21 +3239,29 @@
27293239 kvm->created_vcpus++;
27303240 mutex_unlock(&kvm->lock);
27313241
2732
- vcpu = kvm_arch_vcpu_create(kvm, id);
2733
- if (IS_ERR(vcpu)) {
2734
- r = PTR_ERR(vcpu);
3242
+ r = kvm_arch_vcpu_precreate(kvm, id);
3243
+ if (r)
3244
+ goto vcpu_decrement;
3245
+
3246
+ vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3247
+ if (!vcpu) {
3248
+ r = -ENOMEM;
27353249 goto vcpu_decrement;
27363250 }
27373251
2738
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
3252
+ BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3253
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3254
+ if (!page) {
3255
+ r = -ENOMEM;
3256
+ goto vcpu_free;
3257
+ }
3258
+ vcpu->run = page_address(page);
27393259
2740
- r = kvm_arch_vcpu_setup(vcpu);
2741
- if (r)
2742
- goto vcpu_destroy;
3260
+ kvm_vcpu_init(vcpu, kvm, id);
27433261
2744
- r = kvm_create_vcpu_debugfs(vcpu);
3262
+ r = kvm_arch_vcpu_create(vcpu);
27453263 if (r)
2746
- goto vcpu_destroy;
3264
+ goto vcpu_free_run_page;
27473265
27483266 mutex_lock(&kvm->lock);
27493267 if (kvm_get_vcpu_by_id(kvm, id)) {
....@@ -2758,7 +3276,7 @@
27583276 kvm_get_kvm(kvm);
27593277 r = create_vcpu_fd(vcpu);
27603278 if (r < 0) {
2761
- kvm_put_kvm(kvm);
3279
+ kvm_put_kvm_no_destroy(kvm);
27623280 goto unlock_vcpu_destroy;
27633281 }
27643282
....@@ -2773,13 +3291,16 @@
27733291
27743292 mutex_unlock(&kvm->lock);
27753293 kvm_arch_vcpu_postcreate(vcpu);
3294
+ kvm_create_vcpu_debugfs(vcpu);
27763295 return r;
27773296
27783297 unlock_vcpu_destroy:
27793298 mutex_unlock(&kvm->lock);
2780
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2781
-vcpu_destroy:
27823299 kvm_arch_vcpu_destroy(vcpu);
3300
+vcpu_free_run_page:
3301
+ free_page((unsigned long)vcpu->run);
3302
+vcpu_free:
3303
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
27833304 vcpu_decrement:
27843305 mutex_lock(&kvm->lock);
27853306 kvm->created_vcpus--;
....@@ -2807,7 +3328,7 @@
28073328 struct kvm_fpu *fpu = NULL;
28083329 struct kvm_sregs *kvm_sregs = NULL;
28093330
2810
- if (vcpu->kvm->mm != current->mm)
3331
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
28113332 return -EIO;
28123333
28133334 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
....@@ -2844,7 +3365,7 @@
28443365 synchronize_rcu();
28453366 put_pid(oldpid);
28463367 }
2847
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
3368
+ r = kvm_arch_vcpu_ioctl_run(vcpu);
28483369 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
28493370 break;
28503371 }
....@@ -2852,7 +3373,7 @@
28523373 struct kvm_regs *kvm_regs;
28533374
28543375 r = -ENOMEM;
2855
- kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
3376
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
28563377 if (!kvm_regs)
28573378 goto out;
28583379 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
....@@ -2869,7 +3390,6 @@
28693390 case KVM_SET_REGS: {
28703391 struct kvm_regs *kvm_regs;
28713392
2872
- r = -ENOMEM;
28733393 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
28743394 if (IS_ERR(kvm_regs)) {
28753395 r = PTR_ERR(kvm_regs);
....@@ -2880,7 +3400,8 @@
28803400 break;
28813401 }
28823402 case KVM_GET_SREGS: {
2883
- kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
3403
+ kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3404
+ GFP_KERNEL_ACCOUNT);
28843405 r = -ENOMEM;
28853406 if (!kvm_sregs)
28863407 goto out;
....@@ -2972,7 +3493,7 @@
29723493 break;
29733494 }
29743495 case KVM_GET_FPU: {
2975
- fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
3496
+ fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
29763497 r = -ENOMEM;
29773498 if (!fpu)
29783499 goto out;
....@@ -3013,7 +3534,7 @@
30133534 void __user *argp = compat_ptr(arg);
30143535 int r;
30153536
3016
- if (vcpu->kvm->mm != current->mm)
3537
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
30173538 return -EIO;
30183539
30193540 switch (ioctl) {
....@@ -3031,7 +3552,8 @@
30313552 if (kvm_sigmask.len != sizeof(compat_sigset_t))
30323553 goto out;
30333554 r = -EFAULT;
3034
- if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
3555
+ if (get_compat_sigset(&sigset,
3556
+ (compat_sigset_t __user *)sigmask_arg->sigset))
30353557 goto out;
30363558 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
30373559 } else
....@@ -3046,6 +3568,16 @@
30463568 return r;
30473569 }
30483570 #endif
3571
+
3572
+static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3573
+{
3574
+ struct kvm_device *dev = filp->private_data;
3575
+
3576
+ if (dev->ops->mmap)
3577
+ return dev->ops->mmap(dev, vma);
3578
+
3579
+ return -ENODEV;
3580
+}
30493581
30503582 static int kvm_device_ioctl_attr(struct kvm_device *dev,
30513583 int (*accessor)(struct kvm_device *dev,
....@@ -3068,7 +3600,7 @@
30683600 {
30693601 struct kvm_device *dev = filp->private_data;
30703602
3071
- if (dev->kvm->mm != current->mm)
3603
+ if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
30723604 return -EIO;
30733605
30743606 switch (ioctl) {
....@@ -3091,6 +3623,13 @@
30913623 struct kvm_device *dev = filp->private_data;
30923624 struct kvm *kvm = dev->kvm;
30933625
3626
+ if (dev->ops->release) {
3627
+ mutex_lock(&kvm->lock);
3628
+ list_del(&dev->vm_node);
3629
+ dev->ops->release(dev);
3630
+ mutex_unlock(&kvm->lock);
3631
+ }
3632
+
30943633 kvm_put_kvm(kvm);
30953634 return 0;
30963635 }
....@@ -3099,6 +3638,7 @@
30993638 .unlocked_ioctl = kvm_device_ioctl,
31003639 .release = kvm_device_release,
31013640 KVM_COMPAT(kvm_device_ioctl),
3641
+ .mmap = kvm_device_mmap,
31023642 };
31033643
31043644 struct kvm_device *kvm_device_from_filp(struct file *filp)
....@@ -3109,14 +3649,14 @@
31093649 return filp->private_data;
31103650 }
31113651
3112
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3652
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
31133653 #ifdef CONFIG_KVM_MPIC
31143654 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
31153655 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
31163656 #endif
31173657 };
31183658
3119
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
3659
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
31203660 {
31213661 if (type >= ARRAY_SIZE(kvm_device_ops_table))
31223662 return -ENOSPC;
....@@ -3137,7 +3677,7 @@
31373677 static int kvm_ioctl_create_device(struct kvm *kvm,
31383678 struct kvm_create_device *cd)
31393679 {
3140
- struct kvm_device_ops *ops = NULL;
3680
+ const struct kvm_device_ops *ops = NULL;
31413681 struct kvm_device *dev;
31423682 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
31433683 int type;
....@@ -3154,7 +3694,7 @@
31543694 if (test)
31553695 return 0;
31563696
3157
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
3697
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
31583698 if (!dev)
31593699 return -ENOMEM;
31603700
....@@ -3177,11 +3717,14 @@
31773717 kvm_get_kvm(kvm);
31783718 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
31793719 if (ret < 0) {
3180
- kvm_put_kvm(kvm);
3720
+ kvm_put_kvm_no_destroy(kvm);
31813721 mutex_lock(&kvm->lock);
31823722 list_del(&dev->vm_node);
3723
+ if (ops->release)
3724
+ ops->release(dev);
31833725 mutex_unlock(&kvm->lock);
3184
- ops->destroy(dev);
3726
+ if (ops->destroy)
3727
+ ops->destroy(dev);
31853728 return ret;
31863729 }
31873730
....@@ -3205,10 +3748,18 @@
32053748 #endif
32063749 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
32073750 case KVM_CAP_CHECK_EXTENSION_VM:
3751
+ case KVM_CAP_ENABLE_CAP_VM:
3752
+ case KVM_CAP_HALT_POLL:
32083753 return 1;
32093754 #ifdef CONFIG_KVM_MMIO
32103755 case KVM_CAP_COALESCED_MMIO:
32113756 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3757
+ case KVM_CAP_COALESCED_PIO:
3758
+ return 1;
3759
+#endif
3760
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3761
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3762
+ return KVM_DIRTY_LOG_MANUAL_CAPS;
32123763 #endif
32133764 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
32143765 case KVM_CAP_IRQ_ROUTING:
....@@ -3218,10 +3769,47 @@
32183769 case KVM_CAP_MULTI_ADDRESS_SPACE:
32193770 return KVM_ADDRESS_SPACE_NUM;
32203771 #endif
3772
+ case KVM_CAP_NR_MEMSLOTS:
3773
+ return KVM_USER_MEM_SLOTS;
32213774 default:
32223775 break;
32233776 }
32243777 return kvm_vm_ioctl_check_extension(kvm, arg);
3778
+}
3779
+
3780
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3781
+ struct kvm_enable_cap *cap)
3782
+{
3783
+ return -EINVAL;
3784
+}
3785
+
3786
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3787
+ struct kvm_enable_cap *cap)
3788
+{
3789
+ switch (cap->cap) {
3790
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3791
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3792
+ u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3793
+
3794
+ if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3795
+ allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3796
+
3797
+ if (cap->flags || (cap->args[0] & ~allowed_options))
3798
+ return -EINVAL;
3799
+ kvm->manual_dirty_log_protect = cap->args[0];
3800
+ return 0;
3801
+ }
3802
+#endif
3803
+ case KVM_CAP_HALT_POLL: {
3804
+ if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3805
+ return -EINVAL;
3806
+
3807
+ kvm->max_halt_poll_ns = cap->args[0];
3808
+ return 0;
3809
+ }
3810
+ default:
3811
+ return kvm_vm_ioctl_enable_cap(kvm, cap);
3812
+ }
32253813 }
32263814
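The generic handler above lets userspace cap per-VM halt polling at runtime through KVM_ENABLE_CAP. A minimal userspace sketch (illustrative only, not part of this patch; vm_fd is an assumed, already-created VM descriptor):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative sketch: cap halt polling for this VM at 100us.  The value
 * must fit in an unsigned int, matching the check in the handler above. */
static int set_halt_poll_cap(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_HALT_POLL,
                .args[0] = 100000,      /* max_halt_poll_ns, in nanoseconds */
        };

        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_HALT_POLL) <= 0)
                return -1;              /* capability not offered by this kernel */

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}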
32273815 static long kvm_vm_ioctl(struct file *filp,
....@@ -3231,12 +3819,21 @@
32313819 void __user *argp = (void __user *)arg;
32323820 int r;
32333821
3234
- if (kvm->mm != current->mm)
3822
+ if (kvm->mm != current->mm || kvm->vm_bugged)
32353823 return -EIO;
32363824 switch (ioctl) {
32373825 case KVM_CREATE_VCPU:
32383826 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
32393827 break;
3828
+ case KVM_ENABLE_CAP: {
3829
+ struct kvm_enable_cap cap;
3830
+
3831
+ r = -EFAULT;
3832
+ if (copy_from_user(&cap, argp, sizeof(cap)))
3833
+ goto out;
3834
+ r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3835
+ break;
3836
+ }
32403837 case KVM_SET_USER_MEMORY_REGION: {
32413838 struct kvm_userspace_memory_region kvm_userspace_mem;
32423839
....@@ -3257,6 +3854,17 @@
32573854 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
32583855 break;
32593856 }
3857
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3858
+ case KVM_CLEAR_DIRTY_LOG: {
3859
+ struct kvm_clear_dirty_log log;
3860
+
3861
+ r = -EFAULT;
3862
+ if (copy_from_user(&log, argp, sizeof(log)))
3863
+ goto out;
3864
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3865
+ break;
3866
+ }
3867
+#endif
32603868 #ifdef CONFIG_KVM_MMIO
32613869 case KVM_REGISTER_COALESCED_MMIO: {
32623870 struct kvm_coalesced_mmio_zone zone;
....@@ -3347,21 +3955,18 @@
33473955 if (routing.flags)
33483956 goto out;
33493957 if (routing.nr) {
3350
- r = -ENOMEM;
3351
- entries = vmalloc(array_size(sizeof(*entries),
3352
- routing.nr));
3353
- if (!entries)
3354
- goto out;
3355
- r = -EFAULT;
33563958 urouting = argp;
3357
- if (copy_from_user(entries, urouting->entries,
3358
- routing.nr * sizeof(*entries)))
3359
- goto out_free_irq_routing;
3959
+ entries = vmemdup_user(urouting->entries,
3960
+ array_size(sizeof(*entries),
3961
+ routing.nr));
3962
+ if (IS_ERR(entries)) {
3963
+ r = PTR_ERR(entries);
3964
+ goto out;
3965
+ }
33603966 }
33613967 r = kvm_set_irq_routing(kvm, entries, routing.nr,
33623968 routing.flags);
3363
-out_free_irq_routing:
3364
- vfree(entries);
3969
+ kvfree(entries);
33653970 break;
33663971 }
33673972 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
....@@ -3403,15 +4008,54 @@
34034008 };
34044009 };
34054010
4011
+struct compat_kvm_clear_dirty_log {
4012
+ __u32 slot;
4013
+ __u32 num_pages;
4014
+ __u64 first_page;
4015
+ union {
4016
+ compat_uptr_t dirty_bitmap; /* one bit per page */
4017
+ __u64 padding2;
4018
+ };
4019
+};
4020
+
4021
+long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
4022
+ unsigned long arg)
4023
+{
4024
+ return -ENOTTY;
4025
+}
4026
+
34064027 static long kvm_vm_compat_ioctl(struct file *filp,
34074028 unsigned int ioctl, unsigned long arg)
34084029 {
34094030 struct kvm *kvm = filp->private_data;
34104031 int r;
34114032
3412
- if (kvm->mm != current->mm)
4033
+ if (kvm->mm != current->mm || kvm->vm_bugged)
34134034 return -EIO;
4035
+
4036
+ r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
4037
+ if (r != -ENOTTY)
4038
+ return r;
4039
+
34144040 switch (ioctl) {
4041
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4042
+ case KVM_CLEAR_DIRTY_LOG: {
4043
+ struct compat_kvm_clear_dirty_log compat_log;
4044
+ struct kvm_clear_dirty_log log;
4045
+
4046
+ if (copy_from_user(&compat_log, (void __user *)arg,
4047
+ sizeof(compat_log)))
4048
+ return -EFAULT;
4049
+ log.slot = compat_log.slot;
4050
+ log.num_pages = compat_log.num_pages;
4051
+ log.first_page = compat_log.first_page;
4052
+ log.padding2 = compat_log.padding2;
4053
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4054
+
4055
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4056
+ break;
4057
+ }
4058
+#endif
34154059 case KVM_GET_DIRTY_LOG: {
34164060 struct compat_kvm_dirty_log compat_log;
34174061 struct kvm_dirty_log log;
....@@ -3749,6 +4393,7 @@
37494393 r = __kvm_io_bus_write(vcpu, bus, &range, val);
37504394 return r < 0 ? r : 0;
37514395 }
4396
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
37524397
37534398 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
37544399 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
....@@ -3799,7 +4444,6 @@
37994444
38004445 return -EOPNOTSUPP;
38014446 }
3802
-EXPORT_SYMBOL_GPL(kvm_io_bus_write);
38034447
38044448 /* kvm_io_bus_read - called under kvm->slots_lock */
38054449 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
....@@ -3821,7 +4465,6 @@
38214465 return r < 0 ? r : 0;
38224466 }
38234467
3824
-
38254468 /* Caller must hold slots_lock. */
38264469 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
38274470 int len, struct kvm_io_device *dev)
....@@ -3838,8 +4481,8 @@
38384481 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
38394482 return -ENOSPC;
38404483
3841
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
3842
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4484
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4485
+ GFP_KERNEL_ACCOUNT);
38434486 if (!new_bus)
38444487 return -ENOMEM;
38454488
....@@ -3866,15 +4509,15 @@
38664509 }
38674510
38684511 /* Caller must hold slots_lock. */
3869
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3870
- struct kvm_io_device *dev)
4512
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4513
+ struct kvm_io_device *dev)
38714514 {
38724515 int i, j;
38734516 struct kvm_io_bus *new_bus, *bus;
38744517
38754518 bus = kvm_get_bus(kvm, bus_idx);
38764519 if (!bus)
3877
- return;
4520
+ return 0;
38784521
38794522 for (i = 0; i < bus->dev_count; i++)
38804523 if (bus->range[i].dev == dev) {
....@@ -3882,16 +4525,22 @@
38824525 }
38834526
38844527 if (i == bus->dev_count)
3885
- return;
4528
+ return 0;
38864529
3887
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
3888
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4530
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4531
+ GFP_KERNEL_ACCOUNT);
38894532 if (new_bus) {
3890
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4533
+ memcpy(new_bus, bus, struct_size(bus, range, i));
38914534 new_bus->dev_count--;
38924535 memcpy(new_bus->range + i, bus->range + i + 1,
3893
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
3894
- } else {
4536
+ flex_array_size(new_bus, range, new_bus->dev_count - i));
4537
+ }
4538
+
4539
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4540
+ synchronize_srcu_expedited(&kvm->srcu);
4541
+
4542
+ /* Destroy the old bus _after_ installing the (null) bus. */
4543
+ if (!new_bus) {
38954544 pr_err("kvm: failed to shrink bus, removing it completely\n");
38964545 for (j = 0; j < bus->dev_count; j++) {
38974546 if (j == i)
....@@ -3900,10 +4549,8 @@
39004549 }
39014550 }
39024551
3903
- rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
3904
- synchronize_srcu_expedited(&kvm->srcu);
39054552 kfree(bus);
3906
- return;
4553
+ return new_bus ? 0 : -ENOMEM;
39074554 }
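The rework above is careful about ordering: the shrunk (or NULL) bus is published and an SRCU grace period elapses before the old bus and, in the failure case, its remaining devices are torn down. The same publish-then-reclaim pattern in isolation (illustrative sketch with a hypothetical demo_bus pointer; only the RCU/SRCU primitives are real API):

/* Illustrative sketch, not part of the patch; assumes <linux/srcu.h>,
 * <linux/rcupdate.h> and <linux/slab.h>.  demo_bus/demo_bus_ptr are made up. */
struct demo_bus { int dev_count; };
static struct demo_bus __rcu *demo_bus_ptr;

static void demo_replace_bus(struct srcu_struct *srcu, struct demo_bus *new_bus)
{
        struct demo_bus *old_bus = rcu_dereference_protected(demo_bus_ptr, 1);

        rcu_assign_pointer(demo_bus_ptr, new_bus);      /* publish, may be NULL */
        synchronize_srcu_expedited(srcu);               /* wait out SRCU readers */
        kfree(old_bus);                                 /* old copy now unreachable */
}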
39084555
39094556 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
....@@ -3948,8 +4595,9 @@
39484595 return -ENOENT;
39494596
39504597 if (simple_attr_open(inode, file, get,
3951
- stat_data->mode & S_IWUGO ? set : NULL,
3952
- fmt)) {
4598
+ KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4599
+ ? set : NULL,
4600
+ fmt)) {
39534601 kvm_put_kvm(stat_data->kvm);
39544602 return -ENOMEM;
39554603 }
....@@ -3968,105 +4616,111 @@
39684616 return 0;
39694617 }
39704618
3971
-static int vm_stat_get_per_vm(void *data, u64 *val)
4619
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
39724620 {
3973
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3974
-
3975
- *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
4621
+ *val = *(ulong *)((void *)kvm + offset);
39764622
39774623 return 0;
39784624 }
39794625
3980
-static int vm_stat_clear_per_vm(void *data, u64 val)
4626
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
39814627 {
3982
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3983
-
3984
- if (val)
3985
- return -EINVAL;
3986
-
3987
- *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
4628
+ *(ulong *)((void *)kvm + offset) = 0;
39884629
39894630 return 0;
39904631 }
39914632
3992
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
3993
-{
3994
- __simple_attr_check_format("%llu\n", 0ull);
3995
- return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
3996
- vm_stat_clear_per_vm, "%llu\n");
3997
-}
3998
-
3999
-static const struct file_operations vm_stat_get_per_vm_fops = {
4000
- .owner = THIS_MODULE,
4001
- .open = vm_stat_get_per_vm_open,
4002
- .release = kvm_debugfs_release,
4003
- .read = simple_attr_read,
4004
- .write = simple_attr_write,
4005
- .llseek = no_llseek,
4006
-};
4007
-
4008
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
4633
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
40094634 {
40104635 int i;
4011
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40124636 struct kvm_vcpu *vcpu;
40134637
40144638 *val = 0;
40154639
4016
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4017
- *val += *(u64 *)((void *)vcpu + stat_data->offset);
4640
+ kvm_for_each_vcpu(i, vcpu, kvm)
4641
+ *val += *(u64 *)((void *)vcpu + offset);
40184642
40194643 return 0;
40204644 }
40214645
4022
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
4646
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
40234647 {
40244648 int i;
4025
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40264649 struct kvm_vcpu *vcpu;
4650
+
4651
+ kvm_for_each_vcpu(i, vcpu, kvm)
4652
+ *(u64 *)((void *)vcpu + offset) = 0;
4653
+
4654
+ return 0;
4655
+}
4656
+
4657
+static int kvm_stat_data_get(void *data, u64 *val)
4658
+{
4659
+ int r = -EFAULT;
4660
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4661
+
4662
+ switch (stat_data->dbgfs_item->kind) {
4663
+ case KVM_STAT_VM:
4664
+ r = kvm_get_stat_per_vm(stat_data->kvm,
4665
+ stat_data->dbgfs_item->offset, val);
4666
+ break;
4667
+ case KVM_STAT_VCPU:
4668
+ r = kvm_get_stat_per_vcpu(stat_data->kvm,
4669
+ stat_data->dbgfs_item->offset, val);
4670
+ break;
4671
+ }
4672
+
4673
+ return r;
4674
+}
4675
+
4676
+static int kvm_stat_data_clear(void *data, u64 val)
4677
+{
4678
+ int r = -EFAULT;
4679
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40274680
40284681 if (val)
40294682 return -EINVAL;
40304683
4031
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4032
- *(u64 *)((void *)vcpu + stat_data->offset) = 0;
4684
+ switch (stat_data->dbgfs_item->kind) {
4685
+ case KVM_STAT_VM:
4686
+ r = kvm_clear_stat_per_vm(stat_data->kvm,
4687
+ stat_data->dbgfs_item->offset);
4688
+ break;
4689
+ case KVM_STAT_VCPU:
4690
+ r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4691
+ stat_data->dbgfs_item->offset);
4692
+ break;
4693
+ }
40334694
4034
- return 0;
4695
+ return r;
40354696 }
40364697
4037
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
4698
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
40384699 {
40394700 __simple_attr_check_format("%llu\n", 0ull);
4040
- return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
4041
- vcpu_stat_clear_per_vm, "%llu\n");
4701
+ return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4702
+ kvm_stat_data_clear, "%llu\n");
40424703 }
40434704
4044
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
4045
- .owner = THIS_MODULE,
4046
- .open = vcpu_stat_get_per_vm_open,
4705
+static const struct file_operations stat_fops_per_vm = {
4706
+ .owner = THIS_MODULE,
4707
+ .open = kvm_stat_data_open,
40474708 .release = kvm_debugfs_release,
4048
- .read = simple_attr_read,
4049
- .write = simple_attr_write,
4050
- .llseek = no_llseek,
4051
-};
4052
-
4053
-static const struct file_operations *stat_fops_per_vm[] = {
4054
- [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
4055
- [KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
4709
+ .read = simple_attr_read,
4710
+ .write = simple_attr_write,
4711
+ .llseek = no_llseek,
40564712 };
40574713
40584714 static int vm_stat_get(void *_offset, u64 *val)
40594715 {
40604716 unsigned offset = (long)_offset;
40614717 struct kvm *kvm;
4062
- struct kvm_stat_data stat_tmp = {.offset = offset};
40634718 u64 tmp_val;
40644719
40654720 *val = 0;
40664721 mutex_lock(&kvm_lock);
40674722 list_for_each_entry(kvm, &vm_list, vm_list) {
4068
- stat_tmp.kvm = kvm;
4069
- vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4723
+ kvm_get_stat_per_vm(kvm, offset, &tmp_val);
40704724 *val += tmp_val;
40714725 }
40724726 mutex_unlock(&kvm_lock);
....@@ -4077,15 +4731,13 @@
40774731 {
40784732 unsigned offset = (long)_offset;
40794733 struct kvm *kvm;
4080
- struct kvm_stat_data stat_tmp = {.offset = offset};
40814734
40824735 if (val)
40834736 return -EINVAL;
40844737
40854738 mutex_lock(&kvm_lock);
40864739 list_for_each_entry(kvm, &vm_list, vm_list) {
4087
- stat_tmp.kvm = kvm;
4088
- vm_stat_clear_per_vm((void *)&stat_tmp, 0);
4740
+ kvm_clear_stat_per_vm(kvm, offset);
40894741 }
40904742 mutex_unlock(&kvm_lock);
40914743
....@@ -4098,14 +4750,12 @@
40984750 {
40994751 unsigned offset = (long)_offset;
41004752 struct kvm *kvm;
4101
- struct kvm_stat_data stat_tmp = {.offset = offset};
41024753 u64 tmp_val;
41034754
41044755 *val = 0;
41054756 mutex_lock(&kvm_lock);
41064757 list_for_each_entry(kvm, &vm_list, vm_list) {
4107
- stat_tmp.kvm = kvm;
4108
- vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4758
+ kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
41094759 *val += tmp_val;
41104760 }
41114761 mutex_unlock(&kvm_lock);
....@@ -4116,15 +4766,13 @@
41164766 {
41174767 unsigned offset = (long)_offset;
41184768 struct kvm *kvm;
4119
- struct kvm_stat_data stat_tmp = {.offset = offset};
41204769
41214770 if (val)
41224771 return -EINVAL;
41234772
41244773 mutex_lock(&kvm_lock);
41254774 list_for_each_entry(kvm, &vm_list, vm_list) {
4126
- stat_tmp.kvm = kvm;
4127
- vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
4775
+ kvm_clear_stat_per_vcpu(kvm, offset);
41284776 }
41294777 mutex_unlock(&kvm_lock);
41304778
....@@ -4158,7 +4806,7 @@
41584806 active = kvm_active_vms;
41594807 mutex_unlock(&kvm_lock);
41604808
4161
- env = kzalloc(sizeof(*env), GFP_KERNEL);
4809
+ env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
41624810 if (!env)
41634811 return;
41644812
....@@ -4173,8 +4821,8 @@
41734821 }
41744822 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
41754823
4176
- if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4177
- char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
4824
+ if (kvm->debugfs_dentry) {
4825
+ char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
41784826
41794827 if (p) {
41804828 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
....@@ -4197,9 +4845,8 @@
41974845
41984846 kvm_debugfs_num_entries = 0;
41994847 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4200
- int mode = p->mode ? p->mode : 0644;
4201
- debugfs_create_file(p->name, mode, kvm_debugfs_dir,
4202
- (void *)(long)p->offset,
4848
+ debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4849
+ kvm_debugfs_dir, (void *)(long)p->offset,
42034850 stat_fops[p->kind]);
42044851 }
42054852 }
....@@ -4214,7 +4861,9 @@
42144861 static void kvm_resume(void)
42154862 {
42164863 if (kvm_usage_count) {
4217
- WARN_ON(raw_spin_is_locked(&kvm_count_lock));
4864
+#ifdef CONFIG_LOCKDEP
4865
+ WARN_ON(lockdep_is_held(&kvm_count_lock));
4866
+#endif
42184867 hardware_enable_nolock(NULL);
42194868 }
42204869 }
....@@ -4234,11 +4883,11 @@
42344883 {
42354884 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42364885
4237
- if (vcpu->preempted)
4238
- vcpu->preempted = false;
4886
+ WRITE_ONCE(vcpu->preempted, false);
4887
+ WRITE_ONCE(vcpu->ready, false);
42394888
4889
+ __this_cpu_write(kvm_running_vcpu, vcpu);
42404890 kvm_arch_sched_in(vcpu, cpu);
4241
-
42424891 kvm_arch_vcpu_load(vcpu, cpu);
42434892 }
42444893
....@@ -4247,14 +4896,59 @@
42474896 {
42484897 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42494898
4250
- if (current->state == TASK_RUNNING)
4251
- vcpu->preempted = true;
4899
+ if (current->state == TASK_RUNNING) {
4900
+ WRITE_ONCE(vcpu->preempted, true);
4901
+ WRITE_ONCE(vcpu->ready, true);
4902
+ }
42524903 kvm_arch_vcpu_put(vcpu);
4904
+ __this_cpu_write(kvm_running_vcpu, NULL);
4905
+}
4906
+
4907
+/**
4908
+ * kvm_get_running_vcpu - get the vcpu running on the current CPU.
4909
+ *
4910
+ * We can disable preemption locally around accessing the per-CPU variable,
4911
+ * and use the resolved vcpu pointer after enabling preemption again,
4912
+ * because even if the current thread is migrated to another CPU, reading
4913
+ * the per-CPU value later will give us the same value as we update the
4914
+ * per-CPU variable in the preempt notifier handlers.
4915
+ */
4916
+struct kvm_vcpu *kvm_get_running_vcpu(void)
4917
+{
4918
+ struct kvm_vcpu *vcpu;
4919
+
4920
+ preempt_disable();
4921
+ vcpu = __this_cpu_read(kvm_running_vcpu);
4922
+ preempt_enable();
4923
+
4924
+ return vcpu;
4925
+}
4926
+EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
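kvm_get_running_vcpu() gives arch code a way to resolve the vCPU loaded on the current CPU without threading a vcpu pointer through every callback, e.g. from an interrupt or notifier context. A hypothetical caller (illustrative only; the handler name is made up, kvm_get_running_vcpu() and vcpu->vcpu_id are the real API):

/* Illustrative sketch, not part of the patch. */
static void hypothetical_event_handler(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

        /* NULL when this CPU is not currently running a vCPU. */
        if (!vcpu)
                return;

        pr_debug("event hit while vCPU %d was loaded\n", vcpu->vcpu_id);
}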
4927
+
4928
+/**
4929
+ * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
4930
+ */
4931
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
4932
+{
4933
+ return &kvm_running_vcpu;
4934
+}
4935
+
4936
+struct kvm_cpu_compat_check {
4937
+ void *opaque;
4938
+ int *ret;
4939
+};
4940
+
4941
+static void check_processor_compat(void *data)
4942
+{
4943
+ struct kvm_cpu_compat_check *c = data;
4944
+
4945
+ *c->ret = kvm_arch_check_processor_compat(c->opaque);
42534946 }
42544947
42554948 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
42564949 struct module *module)
42574950 {
4951
+ struct kvm_cpu_compat_check c;
42584952 int r;
42594953 int cpu;
42604954
....@@ -4278,16 +4972,16 @@
42784972 goto out_free_0;
42794973 }
42804974
4281
- r = kvm_arch_hardware_setup();
4975
+ r = kvm_arch_hardware_setup(opaque);
42824976 if (r < 0)
4283
- goto out_free_0a;
4977
+ goto out_free_1;
42844978
4979
+ c.ret = &r;
4980
+ c.opaque = opaque;
42854981 for_each_online_cpu(cpu) {
4286
- smp_call_function_single(cpu,
4287
- kvm_arch_check_processor_compat,
4288
- &r, 1);
4982
+ smp_call_function_single(cpu, check_processor_compat, &c, 1);
42894983 if (r < 0)
4290
- goto out_free_1;
4984
+ goto out_free_2;
42914985 }
42924986
42934987 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
....@@ -4310,19 +5004,21 @@
43105004 goto out_free_3;
43115005 }
43125006
5007
+ for_each_possible_cpu(cpu) {
5008
+ if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5009
+ GFP_KERNEL, cpu_to_node(cpu))) {
5010
+ r = -ENOMEM;
5011
+ goto out_free_4;
5012
+ }
5013
+ }
5014
+
43135015 r = kvm_async_pf_init();
43145016 if (r)
4315
- goto out_free;
5017
+ goto out_free_4;
43165018
43175019 kvm_chardev_ops.owner = module;
43185020 kvm_vm_fops.owner = module;
43195021 kvm_vcpu_fops.owner = module;
4320
-
4321
- r = misc_register(&kvm_dev);
4322
- if (r) {
4323
- pr_err("kvm: misc device register failed\n");
4324
- goto out_unreg;
4325
- }
43265022
43275023 register_syscore_ops(&kvm_syscore_ops);
43285024
....@@ -4332,21 +5028,35 @@
43325028 kvm_init_debug();
43335029
43345030 r = kvm_vfio_ops_init();
4335
- WARN_ON(r);
5031
+ if (WARN_ON_ONCE(r))
5032
+ goto err_vfio;
5033
+
5034
+ /*
5035
+ * Registration _must_ be the very last thing done, as this exposes
5036
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
5037
+ */
5038
+ r = misc_register(&kvm_dev);
5039
+ if (r) {
5040
+ pr_err("kvm: misc device register failed\n");
5041
+ goto err_register;
5042
+ }
43365043
43375044 return 0;
43385045
4339
-out_unreg:
5046
+err_register:
5047
+ kvm_vfio_ops_exit();
5048
+err_vfio:
43405049 kvm_async_pf_deinit();
4341
-out_free:
5050
+out_free_4:
5051
+ for_each_possible_cpu(cpu)
5052
+ free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
43425053 kmem_cache_destroy(kvm_vcpu_cache);
43435054 out_free_3:
43445055 unregister_reboot_notifier(&kvm_reboot_notifier);
43455056 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
43465057 out_free_2:
4347
-out_free_1:
43485058 kvm_arch_hardware_unsetup();
4349
-out_free_0a:
5059
+out_free_1:
43505060 free_cpumask_var(cpus_hardware_enabled);
43515061 out_free_0:
43525062 kvm_irqfd_exit();
....@@ -4359,8 +5069,18 @@
43595069
43605070 void kvm_exit(void)
43615071 {
4362
- debugfs_remove_recursive(kvm_debugfs_dir);
5072
+ int cpu;
5073
+
5074
+ /*
5075
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
5076
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
5077
+ * to KVM while the module is being stopped.
5078
+ */
43635079 misc_deregister(&kvm_dev);
5080
+
5081
+ debugfs_remove_recursive(kvm_debugfs_dir);
5082
+ for_each_possible_cpu(cpu)
5083
+ free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
43645084 kmem_cache_destroy(kvm_vcpu_cache);
43655085 kvm_async_pf_deinit();
43665086 unregister_syscore_ops(&kvm_syscore_ops);