forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/virt/kvm/kvm_main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
@@ -10,10 +11,6 @@
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
  */

 #include <kvm/iodev.h>
@@ -51,13 +48,13 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
-#include <linux/kthread.h>
 #include <linux/io.h>
+#include <linux/lockdep.h>
+#include <linux/kthread.h>

 #include <asm/processor.h>
 #include <asm/ioctl.h>
 #include <linux/uaccess.h>
-#include <asm/pgtable.h>

 #include "coalesced_mmio.h"
 #include "async_pf.h"
@@ -82,6 +79,11 @@
 module_param(halt_poll_ns_grow, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

+/* The start value to grow halt_poll_ns from */
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
+module_param(halt_poll_ns_grow_start, uint, 0644);
+EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
+
 /* Default resets per-vcpu halt_poll_ns . */
 unsigned int halt_poll_ns_shrink;
 module_param(halt_poll_ns_shrink, uint, 0644);
@@ -101,16 +103,18 @@
 static int kvm_usage_count;
 static atomic_t hardware_enable_failed;

-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+static struct kmem_cache *kvm_vcpu_cache;

 static __read_mostly struct preempt_ops kvm_preempt_ops;
+static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

 static int kvm_debugfs_num_entries;
-static const struct file_operations *stat_fops_per_vm[];
+static const struct file_operations stat_fops_per_vm;
+
+static struct file_operations kvm_chardev_ops;

 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                            unsigned long arg);
@@ -119,21 +123,30 @@
                                   unsigned long arg);
 #define KVM_COMPAT(c) .compat_ioctl = (c)
 #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ *   passed to a compat task, let the ioctls fail.
+ */
 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                 unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+        return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
+                      .open = kvm_no_compat_open
 #endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);

 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
-
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
-
-static bool largepages_enabled = true;

 #define KVM_EVENT_CREATE_VM 0
 #define KVM_EVENT_DESTROY_VM 1
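
For orientation, the net effect of the new !CONFIG_KVM_COMPAT path is easiest to see as an expanded file_operations initializer. The sketch below is illustrative only (the struct name is invented; the real initializers appear later in this file and still use the KVM_COMPAT() macro):

/* Illustrative expansion of KVM_COMPAT(kvm_dev_ioctl) on a !CONFIG_KVM_COMPAT build. */
static struct file_operations example_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .compat_ioctl   = kvm_no_compat_ioctl, /* second line of defense: ioctls from compat tasks fail */
        .open           = kvm_no_compat_open,  /* first line of defense: compat tasks cannot open /dev/kvm */
        .llseek         = noop_llseek,
};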
@@ -143,6 +156,10 @@

 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                                                    unsigned long start, unsigned long end)
+{
+}
+
+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
 }

@@ -175,12 +192,24 @@
         return true;
 }

+bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
+{
+        struct page *page = pfn_to_page(pfn);
+
+        if (!PageTransCompoundMap(page))
+                return false;
+
+        return is_transparent_hugepage(compound_head(page));
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
 void vcpu_load(struct kvm_vcpu *vcpu)
 {
         int cpu = get_cpu();
+
+        __this_cpu_write(kvm_running_vcpu, vcpu);
         preempt_notifier_register(&vcpu->preempt_notifier);
         kvm_arch_vcpu_load(vcpu, cpu);
         put_cpu();
@@ -192,6 +221,7 @@
         preempt_disable();
         kvm_arch_vcpu_put(vcpu);
         preempt_notifier_unregister(&vcpu->preempt_notifier);
+        __this_cpu_write(kvm_running_vcpu, NULL);
         preempt_enable();
 }
 EXPORT_SYMBOL_GPL(vcpu_put);
@@ -231,6 +261,7 @@
 }

 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+                                 struct kvm_vcpu *except,
                                  unsigned long *vcpu_bitmap, cpumask_var_t tmp)
 {
         int i, cpu, me;
@@ -240,7 +271,8 @@
         me = get_cpu();

         kvm_for_each_vcpu(i, vcpu, kvm) {
-                if (!test_bit(i, vcpu_bitmap))
+                if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
+                    vcpu == except)
                         continue;

                 kvm_make_request(req, vcpu);
@@ -260,19 +292,23 @@
         return called;
 }

-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
+                                      struct kvm_vcpu *except)
 {
         cpumask_var_t cpus;
         bool called;
-        static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
-                = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};

         zalloc_cpumask_var(&cpus, GFP_ATOMIC);

-        called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
+        called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);

         free_cpumask_var(cpus);
         return called;
+}
+
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+{
+        return kvm_make_all_cpus_request_except(kvm, req, NULL);
 }

 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
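
The new *_except variant exists so a vCPU can kick every other vCPU without also sending the request to itself. A minimal usage sketch (the wrapper function is invented for illustration; KVM_REQ_TLB_FLUSH is the generic request used elsewhere in KVM):

/* Illustrative caller, not part of this patch. */
static void example_flush_all_but_self(struct kvm_vcpu *vcpu)
{
        /* The current vCPU has already flushed locally; kick everyone else. */
        kvm_make_all_cpus_request_except(vcpu->kvm, KVM_REQ_TLB_FLUSH, vcpu);
}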
@@ -308,57 +344,102 @@
         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }

-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+static void kvm_flush_shadow_all(struct kvm *kvm)
 {
-        struct page *page;
-        int r;
+        kvm_arch_flush_shadow_all(kvm);
+        kvm_arch_guest_memory_reclaimed(kvm);
+}

+#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
+                                               gfp_t gfp_flags)
+{
+        gfp_flags |= mc->gfp_zero;
+
+        if (mc->kmem_cache)
+                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
+        else
+                return (void *)__get_free_page(gfp_flags);
+}
+
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+        void *obj;
+
+        if (mc->nobjs >= min)
+                return 0;
+        while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
+                obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+                if (!obj)
+                        return mc->nobjs >= min ? 0 : -ENOMEM;
+                mc->objects[mc->nobjs++] = obj;
+        }
+        return 0;
+}
+
+int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
+{
+        return mc->nobjs;
+}
+
+void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+        while (mc->nobjs) {
+                if (mc->kmem_cache)
+                        kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
+                else
+                        free_page((unsigned long)mc->objects[--mc->nobjs]);
+        }
+}
+
+void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+        void *p;
+
+        if (WARN_ON(!mc->nobjs))
+                p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
+        else
+                p = mc->objects[--mc->nobjs];
+        BUG_ON(!p);
+        return p;
+}
+#endif
+
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
         mutex_init(&vcpu->mutex);
         vcpu->cpu = -1;
         vcpu->kvm = kvm;
         vcpu->vcpu_id = id;
         vcpu->pid = NULL;
-        init_swait_queue_head(&vcpu->wq);
+        rcuwait_init(&vcpu->wait);
         kvm_async_pf_vcpu_init(vcpu);

         vcpu->pre_pcpu = -1;
         INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

-        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-        if (!page) {
-                r = -ENOMEM;
-                goto fail;
-        }
-        vcpu->run = page_address(page);
-
         kvm_vcpu_set_in_spin_loop(vcpu, false);
         kvm_vcpu_set_dy_eligible(vcpu, false);
         vcpu->preempted = false;
-
-        r = kvm_arch_vcpu_init(vcpu);
-        if (r < 0)
-                goto fail_free_run;
-        return 0;
-
-fail_free_run:
-        free_page((unsigned long)vcpu->run);
-fail:
-        return r;
+        vcpu->ready = false;
+        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);

-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+        kvm_arch_vcpu_destroy(vcpu);
+
         /*
-         * no need for rcu_read_lock as VCPU_RUN is the only place that
-         * will change the vcpu->pid pointer and on uninit all file
-         * descriptors are already gone.
+         * No need for rcu_read_lock as VCPU_RUN is the only place that changes
+         * the vcpu->pid pointer, and at destruction time all file descriptors
+         * are already gone.
          */
         put_pid(rcu_dereference_protected(vcpu->pid, 1));
-        kvm_arch_vcpu_uninit(vcpu);
+
         free_page((unsigned long)vcpu->run);
+        kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
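
The kvm_mmu_*_memory_cache() helpers added above let architecture code pre-fill a small object pool while sleeping is still allowed and then allocate from it under mmu_lock, where blocking allocations are off limits. A minimal sketch of the intended calling pattern, with the fault handler and teardown hook invented for illustration:

/* Illustrative arch-side usage, not part of this patch. */
static int example_page_fault(struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache *mc)
{
        void *pt;
        int r;

        /* Top up outside the lock; may sleep and may fail with -ENOMEM. */
        r = kvm_mmu_topup_memory_cache(mc, 4 /* assumed worst case per fault */);
        if (r)
                return r;

        spin_lock(&vcpu->kvm->mmu_lock);
        /* Never sleeps; falls back to GFP_ATOMIC only if the cache ran dry. */
        pt = kvm_mmu_memory_cache_alloc(mc);
        /* ... link pt into the shadow page tables here ... */
        spin_unlock(&vcpu->kvm->mmu_lock);

        return pt ? 0 : -ENOMEM;
}

static void example_vcpu_teardown(struct kvm_mmu_memory_cache *mc)
{
        /* Return everything that was topped up but never consumed. */
        kvm_mmu_free_memory_cache(mc);
}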
@@ -389,16 +470,16 @@
         idx = srcu_read_lock(&kvm->srcu);
         spin_lock(&kvm->mmu_lock);
         kvm->mmu_notifier_seq++;
-        kvm_set_spte_hva(kvm, address, pte);
+
+        if (kvm_set_spte_hva(kvm, address, pte))
+                kvm_flush_remote_tlbs(kvm);
+
         spin_unlock(&kvm->mmu_lock);
         srcu_read_unlock(&kvm->srcu, idx);
 }

 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-                                        struct mm_struct *mm,
-                                        unsigned long start,
-                                        unsigned long end,
-                                        bool blockable)
+                                        const struct mmu_notifier_range *range)
 {
         struct kvm *kvm = mmu_notifier_to_kvm(mn);
         int need_tlb_flush = 0, idx;
@@ -411,21 +492,21 @@
          * count is also read inside the mmu_lock critical section.
          */
         kvm->mmu_notifier_count++;
-        need_tlb_flush = kvm_unmap_hva_range(kvm, start, end, blockable);
+        need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
+                                             range->flags);
         /* we've to flush the tlb before the pages can be freed */
         if (need_tlb_flush || kvm->tlbs_dirty)
                 kvm_flush_remote_tlbs(kvm);

         spin_unlock(&kvm->mmu_lock);
+        kvm_arch_guest_memory_reclaimed(kvm);
         srcu_read_unlock(&kvm->srcu, idx);

         return 0;
 }

 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
-                                        struct mm_struct *mm,
-                                        unsigned long start,
-                                        unsigned long end)
+                                        const struct mmu_notifier_range *range)
 {
         struct kvm *kvm = mmu_notifier_to_kvm(mn);

@@ -522,12 +603,11 @@
         int idx;

         idx = srcu_read_lock(&kvm->srcu);
-        kvm_arch_flush_shadow_all(kvm);
+        kvm_flush_shadow_all(kvm);
         srcu_read_unlock(&kvm->srcu, idx);
 }

 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
-        .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
         .invalidate_range       = kvm_mmu_notifier_invalidate_range,
         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
@@ -558,12 +638,12 @@
         int i;
         struct kvm_memslots *slots;

-        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
         if (!slots)
                 return NULL;

         for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
-                slots->id_to_index[i] = slots->memslots[i].id = i;
+                slots->id_to_index[i] = -1;

         return slots;
 }
@@ -577,18 +657,14 @@
         memslot->dirty_bitmap = NULL;
 }

-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-                kvm_destroy_dirty_bitmap(free);
+        kvm_destroy_dirty_bitmap(slot);

-        kvm_arch_free_memslot(kvm, free, dont);
+        kvm_arch_free_memslot(kvm, slot);

-        free->npages = 0;
+        slot->flags = 0;
+        slot->npages = 0;
 }

 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
@@ -599,7 +675,7 @@
                 return;

         kvm_for_each_memslot(memslot, slots)
-                kvm_free_memslot(kvm, memslot, NULL);
+                kvm_free_memslot(kvm, memslot);

         kvfree(slots);
 }
@@ -622,6 +698,8 @@

 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 {
+        static DEFINE_MUTEX(kvm_debugfs_lock);
+        struct dentry *dent;
         char dir_name[ITOA_MAX_LEN * 2];
         struct kvm_stat_data *stat_data;
         struct kvm_stats_debugfs_item *p;
@@ -630,25 +708,37 @@
                 return 0;

         snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
-        kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+        mutex_lock(&kvm_debugfs_lock);
+        dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
+        if (dent) {
+                pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
+                dput(dent);
+                mutex_unlock(&kvm_debugfs_lock);
+                return 0;
+        }
+        dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+        mutex_unlock(&kvm_debugfs_lock);
+        if (IS_ERR(dent))
+                return 0;

+        kvm->debugfs_dentry = dent;
         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
                                          sizeof(*kvm->debugfs_stat_data),
-                                         GFP_KERNEL);
+                                         GFP_KERNEL_ACCOUNT);
         if (!kvm->debugfs_stat_data)
                 return -ENOMEM;

         for (p = debugfs_entries; p->name; p++) {
-                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                 if (!stat_data)
                         return -ENOMEM;

                 stat_data->kvm = kvm;
-                stat_data->offset = p->offset;
-                stat_data->mode = p->mode ? p->mode : 0644;
+                stat_data->dbgfs_item = p;
                 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-                debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
-                                    stat_data, stat_fops_per_vm[p->kind]);
+                debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+                                    kvm->debugfs_dentry, stat_data,
+                                    &stat_fops_per_vm);
         }
         return 0;
 }
@@ -672,8 +762,9 @@

 static struct kvm *kvm_create_vm(unsigned long type)
 {
-        int r, i;
         struct kvm *kvm = kvm_arch_alloc_vm();
+        int r = -ENOMEM;
+        int i;

         if (!kvm)
                 return ERR_PTR(-ENOMEM);
@@ -685,12 +776,38 @@
         mutex_init(&kvm->lock);
         mutex_init(&kvm->irq_lock);
         mutex_init(&kvm->slots_lock);
-        refcount_set(&kvm->users_count, 1);
         INIT_LIST_HEAD(&kvm->devices);
+
+        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
+
+        if (init_srcu_struct(&kvm->srcu))
+                goto out_err_no_srcu;
+        if (init_srcu_struct(&kvm->irq_srcu))
+                goto out_err_no_irq_srcu;
+
+        refcount_set(&kvm->users_count, 1);
+        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+                struct kvm_memslots *slots = kvm_alloc_memslots();
+
+                if (!slots)
+                        goto out_err_no_arch_destroy_vm;
+                /* Generations must be different for each address space. */
+                slots->generation = i;
+                rcu_assign_pointer(kvm->memslots[i], slots);
+        }
+
+        for (i = 0; i < KVM_NR_BUSES; i++) {
+                rcu_assign_pointer(kvm->buses[i],
+                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
+                if (!kvm->buses[i])
+                        goto out_err_no_arch_destroy_vm;
+        }
+
+        kvm->max_halt_poll_ns = halt_poll_ns;

         r = kvm_arch_init_vm(kvm, type);
         if (r)
-                goto out_err_no_disable;
+                goto out_err_no_arch_destroy_vm;

         r = hardware_enable_all();
         if (r)
@@ -699,33 +816,6 @@
 #ifdef CONFIG_HAVE_KVM_IRQFD
         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
-
-        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
-
-        r = -ENOMEM;
-        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-                struct kvm_memslots *slots = kvm_alloc_memslots();
-                if (!slots)
-                        goto out_err_no_srcu;
-                /*
-                 * Generations must be different for each address space.
-                 * Init kvm generation close to the maximum to easily test the
-                 * code of handling generation number wrap-around.
-                 */
-                slots->generation = i * 2 - 150;
-                rcu_assign_pointer(kvm->memslots[i], slots);
-        }
-
-        if (init_srcu_struct(&kvm->srcu))
-                goto out_err_no_srcu;
-        if (init_srcu_struct(&kvm->irq_srcu))
-                goto out_err_no_irq_srcu;
-        for (i = 0; i < KVM_NR_BUSES; i++) {
-                rcu_assign_pointer(kvm->buses[i],
-                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
-                if (!kvm->buses[i])
-                        goto out_err_no_mmu_notifier;
-        }

         r = kvm_init_mmu_notifier(kvm);
         if (r)
@@ -741,6 +831,16 @@

         preempt_notifier_inc();

+        /*
+         * When the fd passed to this ioctl() is opened it pins the module,
+         * but try_module_get() also prevents getting a reference if the module
+         * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
+         */
+        if (!try_module_get(kvm_chardev_ops.owner)) {
+                r = -ENODEV;
+                goto out_err;
+        }
+
         return kvm;

 out_err:
@@ -749,17 +849,19 @@
         mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-        cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-        cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
         hardware_disable_all();
 out_err_no_disable:
-        refcount_set(&kvm->users_count, 0);
+        kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
         for (i = 0; i < KVM_NR_BUSES; i++)
                 kfree(kvm_get_bus(kvm, i));
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+        cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+        cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
         kvm_arch_free_vm(kvm);
         mmdrop(current->mm);
         return ERR_PTR(r);
@@ -805,7 +907,7 @@
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
-        kvm_arch_flush_shadow_all(kvm);
+        kvm_flush_shadow_all(kvm);
 #endif
         kvm_arch_destroy_vm(kvm);
         kvm_destroy_devices(kvm);
@@ -817,6 +919,7 @@
         preempt_notifier_dec();
         hardware_disable_all();
         mmdrop(mm);
+        module_put(kvm_chardev_ops.owner);
 }

 void kvm_get_kvm(struct kvm *kvm)
@@ -832,6 +935,18 @@
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);

+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner. In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+        WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
@@ -845,13 +960,13 @@

 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
+ * See kvm_vm_ioctl_get_dirty_log() why this is needed.
  */
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
         unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

-        memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
+        memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
         if (!memslot->dirty_bitmap)
                 return -ENOMEM;

@@ -859,58 +974,165 @@
 }

 /*
- * Insert memslot and re-sort memslots based on their GFN,
- * so binary search could be used to lookup GFN.
- * Sorting algorithm takes advantage of having initially
- * sorted array and known changed memslot position.
+ * Delete a memslot by decrementing the number of used slots and shifting all
+ * other entries in the array forward one spot.
 */
-static void update_memslots(struct kvm_memslots *slots,
-                            struct kvm_memory_slot *new)
+static inline void kvm_memslot_delete(struct kvm_memslots *slots,
+                                      struct kvm_memory_slot *memslot)
 {
-        int id = new->id;
-        int i = slots->id_to_index[id];
         struct kvm_memory_slot *mslots = slots->memslots;
+        int i;

-        WARN_ON(mslots[i].id != id);
-        if (!new->npages) {
-                WARN_ON(!mslots[i].npages);
-                if (mslots[i].npages)
-                        slots->used_slots--;
-        } else {
-                if (!mslots[i].npages)
-                        slots->used_slots++;
-        }
+        if (WARN_ON(slots->id_to_index[memslot->id] == -1))
+                return;

-        while (i < KVM_MEM_SLOTS_NUM - 1 &&
-               new->base_gfn <= mslots[i + 1].base_gfn) {
-                if (!mslots[i + 1].npages)
-                        break;
+        slots->used_slots--;
+
+        if (atomic_read(&slots->lru_slot) >= slots->used_slots)
+                atomic_set(&slots->lru_slot, 0);
+
+        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
                 mslots[i] = mslots[i + 1];
                 slots->id_to_index[mslots[i].id] = i;
-                i++;
         }
+        mslots[i] = *memslot;
+        slots->id_to_index[memslot->id] = -1;
+}
+
+/*
+ * "Insert" a new memslot by incrementing the number of used slots. Returns
+ * the new slot's initial index into the memslots array.
+ */
+static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
+{
+        return slots->used_slots++;
+}
+
+/*
+ * Move a changed memslot backwards in the array by shifting existing slots
+ * with a higher GFN toward the front of the array. Note, the changed memslot
+ * itself is not preserved in the array, i.e. not swapped at this time, only
+ * its new index into the array is tracked. Returns the changed memslot's
+ * current index into the memslots array.
+ */
+static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
+                                            struct kvm_memory_slot *memslot)
+{
+        struct kvm_memory_slot *mslots = slots->memslots;
+        int i;
+
+        if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
+            WARN_ON_ONCE(!slots->used_slots))
+                return -1;

         /*
-         * The ">=" is needed when creating a slot with base_gfn == 0,
-         * so that it moves before all those with base_gfn == npages == 0.
-         *
-         * On the other hand, if new->npages is zero, the above loop has
-         * already left i pointing to the beginning of the empty part of
-         * mslots, and the ">=" would move the hole backwards in this
-         * case---which is wrong. So skip the loop when deleting a slot.
+         * Move the target memslot backward in the array by shifting existing
+         * memslots with a higher GFN (than the target memslot) towards the
+         * front of the array.
          */
-        if (new->npages) {
-                while (i > 0 &&
-                       new->base_gfn >= mslots[i - 1].base_gfn) {
-                        mslots[i] = mslots[i - 1];
-                        slots->id_to_index[mslots[i].id] = i;
-                        i--;
-                }
-        } else
-                WARN_ON_ONCE(i != slots->used_slots);
+        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
+                if (memslot->base_gfn > mslots[i + 1].base_gfn)
+                        break;

-        mslots[i] = *new;
-        slots->id_to_index[mslots[i].id] = i;
+                WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
+
+                /* Shift the next memslot forward one and update its index. */
+                mslots[i] = mslots[i + 1];
+                slots->id_to_index[mslots[i].id] = i;
+        }
+        return i;
+}
+
+/*
+ * Move a changed memslot forwards in the array by shifting existing slots with
+ * a lower GFN toward the back of the array. Note, the changed memslot itself
+ * is not preserved in the array, i.e. not swapped at this time, only its new
+ * index into the array is tracked. Returns the changed memslot's final index
+ * into the memslots array.
+ */
+static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
+                                           struct kvm_memory_slot *memslot,
+                                           int start)
+{
+        struct kvm_memory_slot *mslots = slots->memslots;
+        int i;
+
+        for (i = start; i > 0; i--) {
+                if (memslot->base_gfn < mslots[i - 1].base_gfn)
+                        break;
+
+                WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
+
+                /* Shift the next memslot back one and update its index. */
+                mslots[i] = mslots[i - 1];
+                slots->id_to_index[mslots[i].id] = i;
+        }
+        return i;
+}
+
+/*
+ * Re-sort memslots based on their GFN to account for an added, deleted, or
+ * moved memslot. Sorting memslots by GFN allows using a binary search during
+ * memslot lookup.
+ *
+ * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry
+ * at memslots[0] has the highest GFN.
+ *
+ * The sorting algorithm takes advantage of having initially sorted memslots
+ * and knowing the position of the changed memslot. Sorting is also optimized
+ * by not swapping the updated memslot and instead only shifting other memslots
+ * and tracking the new index for the updated memslot. Only once its final
+ * index is known is the updated memslot copied into its position in the array.
+ *
+ * - When deleting a memslot, the deleted memslot simply needs to be moved to
+ *   the end of the array.
+ *
+ * - When creating a memslot, the algorithm "inserts" the new memslot at the
+ *   end of the array and then shifts it forward to its correct location.
+ *
+ * - When moving a memslot, the algorithm first moves the updated memslot
+ *   backward to handle the scenario where the memslot's GFN was changed to a
+ *   lower value. update_memslots() then falls through and runs the same flow
+ *   as creating a memslot to move the memslot forward to handle the scenario
+ *   where its GFN was changed to a higher value.
+ *
+ * Note, slots are sorted from highest->lowest instead of lowest->highest for
+ * historical reasons. Originally, invalid memslots were denoted by having
+ * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
+ * to the end of the array. The current algorithm uses dedicated logic to
+ * delete a memslot and thus does not rely on invalid memslots having GFN=0.
+ *
+ * The other historical motivation for highest->lowest was to improve the
+ * performance of memslot lookup. KVM originally used a linear search starting
+ * at memslots[0]. On x86, the largest memslot usually has one of the highest,
+ * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
+ * single memslot above the 4gb boundary. As the largest memslot is also the
+ * most likely to be referenced, sorting it to the front of the array was
+ * advantageous. The current binary search starts from the middle of the array
+ * and uses an LRU pointer to improve performance for all memslots and GFNs.
+ */
+static void update_memslots(struct kvm_memslots *slots,
+                            struct kvm_memory_slot *memslot,
+                            enum kvm_mr_change change)
+{
+        int i;
+
+        if (change == KVM_MR_DELETE) {
+                kvm_memslot_delete(slots, memslot);
+        } else {
+                if (change == KVM_MR_CREATE)
+                        i = kvm_memslot_insert_back(slots);
+                else
+                        i = kvm_memslot_move_backward(slots, memslot);
+                i = kvm_memslot_move_forward(slots, memslot, i);
+
+                /*
+                 * Copy the memslot to its new position in memslots and update
+                 * its index accordingly.
+                 */
+                slots->memslots[i] = *memslot;
+                slots->id_to_index[memslot->id] = i;
+        }
 }

 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
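
Because the array is kept sorted from highest base_gfn down to lowest, a GFN lookup becomes a binary search over a descending array. A simplified sketch of that lookup (the real helper is search_memslots() in kvm_host.h, which additionally keeps an LRU hint; this standalone version is illustrative only):

/* Simplified illustration of a lookup over the descending-sorted array. */
static struct kvm_memory_slot *example_find_slot(struct kvm_memslots *slots, gfn_t gfn)
{
        struct kvm_memory_slot *m = slots->memslots;
        int lo = 0, hi = slots->used_slots - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;

                if (gfn >= m[mid].base_gfn && gfn < m[mid].base_gfn + m[mid].npages)
                        return &m[mid];     /* gfn falls inside this slot */
                if (gfn >= m[mid].base_gfn + m[mid].npages)
                        hi = mid - 1;       /* higher GFNs live toward index 0 */
                else
                        lo = mid + 1;       /* lower GFNs live toward the tail */
        }
        return NULL;
}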
@@ -931,36 +1153,148 @@
                 int as_id, struct kvm_memslots *slots)
 {
         struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
-        u64 gen;
+        u64 gen = old_memslots->generation;

-        /*
-         * Set the low bit in the generation, which disables SPTE caching
-         * until the end of synchronize_srcu_expedited.
-         */
-        WARN_ON(old_memslots->generation & 1);
-        slots->generation = old_memslots->generation + 1;
+        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

         rcu_assign_pointer(kvm->memslots[as_id], slots);
         synchronize_srcu_expedited(&kvm->srcu);

         /*
-         * Increment the new memslot generation a second time. This prevents
-         * vm exits that race with memslot updates from caching a memslot
-         * generation that will (potentially) be valid forever.
-         *
+         * Increment the new memslot generation a second time, dropping the
+         * update in-progress flag and incrementing the generation based on
+         * the number of address spaces. This provides a unique and easily
+         * identifiable generation number while the memslots are in flux.
+         */
+        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+
+        /*
          * Generations must be unique even across address spaces. We do not need
          * a global counter for that, instead the generation space is evenly split
          * across address spaces. For example, with two address spaces, address
-         * space 0 will use generations 0, 4, 8, ... while * address space 1 will
-         * use generations 2, 6, 10, 14, ...
+         * space 0 will use generations 0, 2, 4, ... while address space 1 will
+         * use generations 1, 3, 5, ...
          */
-        gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
+        gen += KVM_ADDRESS_SPACE_NUM;

         kvm_arch_memslots_updated(kvm, gen);

         slots->generation = gen;

         return old_memslots;
+}
+
+/*
+ * Note, at a minimum, the current number of used slots must be allocated, even
+ * when deleting a memslot, as we need a complete duplicate of the memslots for
+ * use when invalidating a memslot prior to deleting/moving the memslot.
+ */
+static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
+                                             enum kvm_mr_change change)
+{
+        struct kvm_memslots *slots;
+        size_t old_size, new_size;
+
+        old_size = sizeof(struct kvm_memslots) +
+                   (sizeof(struct kvm_memory_slot) * old->used_slots);
+
+        if (change == KVM_MR_CREATE)
+                new_size = old_size + sizeof(struct kvm_memory_slot);
+        else
+                new_size = old_size;
+
+        slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
+        if (likely(slots))
+                memcpy(slots, old, old_size);
+
+        return slots;
+}
+
+static int kvm_set_memslot(struct kvm *kvm,
+                           const struct kvm_userspace_memory_region *mem,
+                           struct kvm_memory_slot *old,
+                           struct kvm_memory_slot *new, int as_id,
+                           enum kvm_mr_change change)
+{
+        struct kvm_memory_slot *slot;
+        struct kvm_memslots *slots;
+        int r;
+
+        slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
+        if (!slots)
+                return -ENOMEM;
+
+        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+                /*
+                 * Note, the INVALID flag needs to be in the appropriate entry
+                 * in the freshly allocated memslots, not in @old or @new.
+                 */
+                slot = id_to_memslot(slots, old->id);
+                slot->flags |= KVM_MEMSLOT_INVALID;
+
+                /*
+                 * We can re-use the old memslots, the only difference from the
+                 * newly installed memslots is the invalid flag, which will get
+                 * dropped by update_memslots anyway. We'll also revert to the
+                 * old memslots if preparing the new memory region fails.
+                 */
+                slots = install_new_memslots(kvm, as_id, slots);
+
+                /* From this point no new shadow pages pointing to a deleted,
+                 * or moved, memslot will be created.
+                 *
+                 * validation of sp->gfn happens in:
+                 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+                 * - kvm_is_visible_gfn (mmu_check_root)
+                 */
+                kvm_arch_flush_shadow_memslot(kvm, slot);
+                kvm_arch_guest_memory_reclaimed(kvm);
+        }
+
+        r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
+        if (r)
+                goto out_slots;
+
+        update_memslots(slots, new, change);
+        slots = install_new_memslots(kvm, as_id, slots);
+
+        kvm_arch_commit_memory_region(kvm, mem, old, new, change);
+
+        kvfree(slots);
+        return 0;
+
+out_slots:
+        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+                slots = install_new_memslots(kvm, as_id, slots);
+        kvfree(slots);
+        return r;
+}
+
+static int kvm_delete_memslot(struct kvm *kvm,
+                              const struct kvm_userspace_memory_region *mem,
+                              struct kvm_memory_slot *old, int as_id)
+{
+        struct kvm_memory_slot new;
+        int r;
+
+        if (!old->npages)
+                return -EINVAL;
+
+        memset(&new, 0, sizeof(new));
+        new.id = old->id;
+        /*
+         * This is only for debugging purpose; it should never be referenced
+         * for a removed memslot.
+         */
+        new.as_id = as_id;
+
+        r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
+        if (r)
+                return r;
+
+        kvm_free_memslot(kvm, old);
+        return 0;
 }

 /*
@@ -974,163 +1308,120 @@
 int __kvm_set_memory_region(struct kvm *kvm,
                             const struct kvm_userspace_memory_region *mem)
 {
-        int r;
-        gfn_t base_gfn;
-        unsigned long npages;
-        struct kvm_memory_slot *slot;
         struct kvm_memory_slot old, new;
-        struct kvm_memslots *slots = NULL, *old_memslots;
-        int as_id, id;
+        struct kvm_memory_slot *tmp;
         enum kvm_mr_change change;
+        int as_id, id;
+        int r;

         r = check_memory_region_flags(mem);
         if (r)
-                goto out;
+                return r;

-        r = -EINVAL;
         as_id = mem->slot >> 16;
         id = (u16)mem->slot;

         /* General sanity checks */
-        if (mem->memory_size & (PAGE_SIZE - 1))
-                goto out;
+        if ((mem->memory_size & (PAGE_SIZE - 1)) ||
+            (mem->memory_size != (unsigned long)mem->memory_size))
+                return -EINVAL;
         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-                goto out;
+                return -EINVAL;
         /* We can read the guest memory with __xxx_user() later on. */
-        if ((id < KVM_USER_MEM_SLOTS) &&
-            ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
-             !access_ok(VERIFY_WRITE,
-                        (void __user *)(unsigned long)mem->userspace_addr,
-                        mem->memory_size)))
-                goto out;
+        if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+            (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
+            !access_ok((void __user *)(unsigned long)mem->userspace_addr,
+                       mem->memory_size))
+                return -EINVAL;
         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
-                goto out;
+                return -EINVAL;
         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-                goto out;
+                return -EINVAL;

-        slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
-        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-        npages = mem->memory_size >> PAGE_SHIFT;
-
-        if (npages > KVM_MEM_MAX_NR_PAGES)
-                goto out;
-
-        new = old = *slot;
-
-        new.id = id;
-        new.base_gfn = base_gfn;
-        new.npages = npages;
-        new.flags = mem->flags;
-
-        if (npages) {
-                if (!old.npages)
-                        change = KVM_MR_CREATE;
-                else { /* Modify an existing slot. */
-                        if ((mem->userspace_addr != old.userspace_addr) ||
-                            (npages != old.npages) ||
-                            ((new.flags ^ old.flags) & KVM_MEM_READONLY))
-                                goto out;
-
-                        if (base_gfn != old.base_gfn)
-                                change = KVM_MR_MOVE;
-                        else if (new.flags != old.flags)
-                                change = KVM_MR_FLAGS_ONLY;
-                        else { /* Nothing to change. */
-                                r = 0;
-                                goto out;
-                        }
-                }
+        /*
+         * Make a full copy of the old memslot, the pointer will become stale
+         * when the memslots are re-sorted by update_memslots(), and the old
+         * memslot needs to be referenced after calling update_memslots(), e.g.
+         * to free its resources and for arch specific behavior.
+         */
+        tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+        if (tmp) {
+                old = *tmp;
+                tmp = NULL;
         } else {
-                if (!old.npages)
-                        goto out;
+                memset(&old, 0, sizeof(old));
+                old.id = id;
+        }

-                change = KVM_MR_DELETE;
-                new.base_gfn = 0;
-                new.flags = 0;
+        if (!mem->memory_size)
+                return kvm_delete_memslot(kvm, mem, &old, as_id);
+
+        new.as_id = as_id;
+        new.id = id;
+        new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+        new.npages = mem->memory_size >> PAGE_SHIFT;
+        new.flags = mem->flags;
+        new.userspace_addr = mem->userspace_addr;
+
+        if (new.npages > KVM_MEM_MAX_NR_PAGES)
+                return -EINVAL;
+
+        if (!old.npages) {
+                change = KVM_MR_CREATE;
+                new.dirty_bitmap = NULL;
+                memset(&new.arch, 0, sizeof(new.arch));
+        } else { /* Modify an existing slot. */
+                if ((new.userspace_addr != old.userspace_addr) ||
+                    (new.npages != old.npages) ||
+                    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
+                        return -EINVAL;
+
+                if (new.base_gfn != old.base_gfn)
+                        change = KVM_MR_MOVE;
+                else if (new.flags != old.flags)
+                        change = KVM_MR_FLAGS_ONLY;
+                else /* Nothing to change. */
+                        return 0;
+
+                /* Copy dirty_bitmap and arch from the current memslot. */
+                new.dirty_bitmap = old.dirty_bitmap;
+                memcpy(&new.arch, &old.arch, sizeof(new.arch));
         }

         if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
                 /* Check for overlaps */
-                r = -EEXIST;
-                kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
-                        if (slot->id == id)
+                kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
+                        if (tmp->id == id)
                                 continue;
-                        if (!((base_gfn + npages <= slot->base_gfn) ||
-                              (base_gfn >= slot->base_gfn + slot->npages)))
-                                goto out;
+                        if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
+                              (new.base_gfn >= tmp->base_gfn + tmp->npages)))
+                                return -EEXIST;
                 }
         }

-        /* Free page dirty bitmap if unneeded */
+        /* Allocate/free page dirty bitmap as needed */
         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
                 new.dirty_bitmap = NULL;
+        else if (!new.dirty_bitmap) {
+                r = kvm_alloc_dirty_bitmap(&new);
+                if (r)
+                        return r;

-        r = -ENOMEM;
-        if (change == KVM_MR_CREATE) {
-                new.userspace_addr = mem->userspace_addr;
-
-                if (kvm_arch_create_memslot(kvm, &new, npages))
-                        goto out_free;
+                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+                        bitmap_set(new.dirty_bitmap, 0, new.npages);
         }

-        /* Allocate page dirty bitmap if needed */
-        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-                if (kvm_create_dirty_bitmap(&new) < 0)
-                        goto out_free;
-        }
-
-        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-        if (!slots)
-                goto out_free;
-        memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
-
-        if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-                slot = id_to_memslot(slots, id);
-                slot->flags |= KVM_MEMSLOT_INVALID;
-
-                old_memslots = install_new_memslots(kvm, as_id, slots);
-
-                /* From this point no new shadow pages pointing to a deleted,
-                 * or moved, memslot will be created.
-                 *
-                 * validation of sp->gfn happens in:
-                 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                 * - kvm_is_visible_gfn (mmu_check_roots)
-                 */
-                kvm_arch_flush_shadow_memslot(kvm, slot);
-
-                /*
-                 * We can re-use the old_memslots from above, the only difference
-                 * from the currently installed memslots is the invalid flag. This
-                 * will get overwritten by update_memslots anyway.
-                 */
-                slots = old_memslots;
-        }
-
-        r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
+        r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
         if (r)
-                goto out_slots;
+                goto out_bitmap;

-        /* actual memory is freed via old in kvm_free_memslot below */
-        if (change == KVM_MR_DELETE) {
-                new.dirty_bitmap = NULL;
-                memset(&new.arch, 0, sizeof(new.arch));
-        }
-
-        update_memslots(slots, &new);
-        old_memslots = install_new_memslots(kvm, as_id, slots);
-
-        kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
-
-        kvm_free_memslot(kvm, &old, &new);
-        kvfree(old_memslots);
+        if (old.dirty_bitmap && !new.dirty_bitmap)
+                kvm_destroy_dirty_bitmap(&old);
         return 0;

-out_slots:
-        kvfree(slots);
-out_free:
-        kvm_free_memslot(kvm, &new, &old);
-out:
+out_bitmap:
+        if (new.dirty_bitmap && !old.dirty_bitmap)
+                kvm_destroy_dirty_bitmap(&new);
         return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -1156,14 +1447,24 @@
         return kvm_set_memory_region(kvm, mem);
 }

-int kvm_get_dirty_log(struct kvm *kvm,
-                      struct kvm_dirty_log *log, int *is_dirty)
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log - get a snapshot of dirty pages
+ * @kvm:        pointer to kvm instance
+ * @log:        slot id and address to which we copy the log
+ * @is_dirty:   set to '1' if any dirty pages were found
+ * @memslot:    set to the associated memslot, always valid on success
+ */
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
+                      int *is_dirty, struct kvm_memory_slot **memslot)
 {
         struct kvm_memslots *slots;
-        struct kvm_memory_slot *memslot;
         int i, as_id, id;
         unsigned long n;
         unsigned long any = 0;
+
+        *memslot = NULL;
+        *is_dirty = 0;

         as_id = log->slot >> 16;
         id = (u16)log->slot;
@@ -1171,16 +1472,18 @@
                 return -EINVAL;

         slots = __kvm_memslots(kvm, as_id);
-        memslot = id_to_memslot(slots, id);
-        if (!memslot->dirty_bitmap)
+        *memslot = id_to_memslot(slots, id);
+        if (!(*memslot) || !(*memslot)->dirty_bitmap)
                 return -ENOENT;

-        n = kvm_dirty_bitmap_bytes(memslot);
+        kvm_arch_sync_dirty_log(kvm, *memslot);
+
+        n = kvm_dirty_bitmap_bytes(*memslot);

         for (i = 0; !any && i < n/sizeof(long); ++i)
-                any = memslot->dirty_bitmap[i];
+                any = (*memslot)->dirty_bitmap[i];

-        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+        if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
                 return -EFAULT;

         if (any)
@@ -1189,13 +1492,12 @@
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- *      are dirty write protect them for next write.
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
+ *      and reenable dirty page tracking for the corresponding pages.
  * @kvm:        pointer to kvm instance
  * @log:        slot id and address to which we copy the log
- * @is_dirty:   flag set if any page is dirty
  *
  * We need to keep it in mind that VCPU threads can write to the bitmap
  * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1212,8 +1514,7 @@
  * exiting to userspace will be logged for the next call.
  *
  */
-int kvm_get_dirty_log_protect(struct kvm *kvm,
-                              struct kvm_dirty_log *log, bool *is_dirty)
+static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 {
         struct kvm_memslots *slots;
         struct kvm_memory_slot *memslot;
@@ -1221,6 +1522,7 @@
         unsigned long n;
         unsigned long *dirty_bitmap;
         unsigned long *dirty_bitmap_buffer;
+        bool flush;

         as_id = log->slot >> 16;
         id = (u16)log->slot;
@@ -1229,55 +1531,180 @@

         slots = __kvm_memslots(kvm, as_id);
         memslot = id_to_memslot(slots, id);
-
-        dirty_bitmap = memslot->dirty_bitmap;
-        if (!dirty_bitmap)
+        if (!memslot || !memslot->dirty_bitmap)
                 return -ENOENT;

+        dirty_bitmap = memslot->dirty_bitmap;
+
+        kvm_arch_sync_dirty_log(kvm, memslot);
+
         n = kvm_dirty_bitmap_bytes(memslot);
+        flush = false;
+        if (kvm->manual_dirty_log_protect) {
+                /*
+                 * Unlike kvm_get_dirty_log, we always return false in *flush,
+                 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
+                 * is some code duplication between this function and
+                 * kvm_get_dirty_log, but hopefully all architectures
+                 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+                 * can be eliminated.
+                 */
+                dirty_bitmap_buffer = dirty_bitmap;
+        } else {
+                dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+                memset(dirty_bitmap_buffer, 0, n);

-        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
-        memset(dirty_bitmap_buffer, 0, n);
+                spin_lock(&kvm->mmu_lock);
+                for (i = 0; i < n / sizeof(long); i++) {
+                        unsigned long mask;
+                        gfn_t offset;

-        spin_lock(&kvm->mmu_lock);
-        *is_dirty = false;
-        for (i = 0; i < n / sizeof(long); i++) {
-                unsigned long mask;
-                gfn_t offset;
+                        if (!dirty_bitmap[i])
+                                continue;

-                if (!dirty_bitmap[i])
-                        continue;
+                        flush = true;
+                        mask = xchg(&dirty_bitmap[i], 0);
+                        dirty_bitmap_buffer[i] = mask;

-                *is_dirty = true;
-
-                mask = xchg(&dirty_bitmap[i], 0);
-                dirty_bitmap_buffer[i] = mask;
-
-                if (mask) {
                         offset = i * BITS_PER_LONG;
                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                 offset, mask);
                 }
+                spin_unlock(&kvm->mmu_lock);
         }

-        spin_unlock(&kvm->mmu_lock);
+        if (flush)
+                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+
         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
                 return -EFAULT;
         return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
-#endif

-bool kvm_largepages_enabled(void)
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide a general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Copy the snapshot to the userspace.
+ *   4. Flush TLBs if needed.
+ */
+static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                      struct kvm_dirty_log *log)
 {
-        return largepages_enabled;
+        int r;
+
+        mutex_lock(&kvm->slots_lock);
+
+        r = kvm_get_dirty_log_protect(kvm, log);
+
+        mutex_unlock(&kvm->slots_lock);
+        return r;
 }

-void kvm_disable_largepages(void)
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ *      and reenable dirty page tracking for the corresponding pages.
+ * @kvm:        pointer to kvm instance
+ * @log:        slot id and address from which to fetch the bitmap of dirty pages
+ */
+static int kvm_clear_dirty_log_protect(struct kvm *kvm,
+                                       struct kvm_clear_dirty_log *log)
 {
-        largepages_enabled = false;
+        struct kvm_memslots *slots;
+        struct kvm_memory_slot *memslot;
+        int as_id, id;
+        gfn_t offset;
+        unsigned long i, n;
+        unsigned long *dirty_bitmap;
+        unsigned long *dirty_bitmap_buffer;
+        bool flush;
+
+        as_id = log->slot >> 16;
+        id = (u16)log->slot;
+        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+                return -EINVAL;
+
+        if (log->first_page & 63)
+                return -EINVAL;
+
+        slots = __kvm_memslots(kvm, as_id);
+        memslot = id_to_memslot(slots, id);
+        if (!memslot || !memslot->dirty_bitmap)
+                return -ENOENT;
+
+        dirty_bitmap = memslot->dirty_bitmap;
+
+        n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
+
+        if (log->first_page > memslot->npages ||
+            log->num_pages > memslot->npages - log->first_page ||
+            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+                return -EINVAL;
+
+        kvm_arch_sync_dirty_log(kvm, memslot);
+
+        flush = false;
+        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+                return -EFAULT;
+
+        spin_lock(&kvm->mmu_lock);
+        for (offset = log->first_page, i = offset / BITS_PER_LONG,
+             n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
+             i++, offset += BITS_PER_LONG) {
+                unsigned long mask = *dirty_bitmap_buffer++;
+                atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+                if (!mask)
+                        continue;
+
+                mask &= atomic_long_fetch_andnot(mask, p);
+
+                /*
+                 * mask contains the bits that really have been cleared. This
+                 * never includes any bits beyond the length of the memslot (if
+                 * the length is not aligned to 64 pages), therefore it is not
+                 * a problem if userspace sets them in log->dirty_bitmap.
+                 */
+                if (mask) {
+                        flush = true;
+                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+                                                                offset, mask);
+                }
+        }
+        spin_unlock(&kvm->mmu_lock);
+
+        if (flush)
+                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+
+        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+
+static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
+                                        struct kvm_clear_dirty_log *log)
+{
+        int r;
+
+        mutex_lock(&kvm->slots_lock);
+
+        r = kvm_clear_dirty_log_protect(kvm, log);
+
+        mutex_unlock(&kvm->slots_lock);
+        return r;
+}
+#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
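
From userspace, the split introduced here shows up as two ioctls: KVM_GET_DIRTY_LOG only snapshots the bitmap when the manual-protect capability is enabled, and KVM_CLEAR_DIRTY_LOG re-arms write protection for the pages the caller has consumed. A hedged sketch of the calling sequence (vm_fd, slot, npages and bitmap are assumed to exist; error handling omitted):

/* Illustrative userspace usage, not part of this patch. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_harvest_dirty_pages(int vm_fd, __u32 slot, __u64 npages, void *bitmap)
{
        struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
        struct kvm_clear_dirty_log clear = {
                .slot = slot,
                .first_page = 0,
                .num_pages = npages,    /* must be 64-aligned unless it reaches the end of the slot */
                .dirty_bitmap = bitmap,
        };

        ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);     /* snapshot only; no flush until the clear below */
        /* ... migrate/transmit the pages marked in bitmap ... */
        ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear); /* write-protect those pages again */
}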
@@ -1294,13 +1721,17 @@
 {
         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

-        if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
-            memslot->flags & KVM_MEMSLOT_INVALID)
-                return false;
-
-        return true;
+        return kvm_is_visible_memslot(memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+        struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+        return kvm_is_visible_memslot(memslot);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);

 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
@@ -1313,7 +1744,7 @@
         if (kvm_is_error_hva(addr))
                 return PAGE_SIZE;

-        down_read(&current->mm->mmap_sem);
+        mmap_read_lock(current->mm);
         vma = find_vma(current->mm, addr);
         if (!vma)
                 goto out;
@@ -1321,7 +1752,7 @@
         size = vma_kernel_pagesize(vma);

 out:
-        up_read(&current->mm->mmap_sem);
+        mmap_read_unlock(current->mm);

         return size;
 }
@@ -1372,8 +1803,12 @@
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

 /*
- * If writable is set to false, the hva returned by this function is only
- * allowed to be read.
+ * Return the hva of a @gfn and the R/W attribute if possible.
+ *
+ * @slot: the kvm_memory_slot which contains @gfn
+ * @gfn: the gfn to be translated
+ * @writable: used to return the read/write attribute of the @slot if the hva
+ * is valid and @writable is not NULL
  */
 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
                                       gfn_t gfn, bool *writable)
@@ -1411,13 +1846,12 @@
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned. It's also the
- * only part that runs if we can are in atomic context.
+ * only part that runs if we can in atomic context.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
                             bool *writable, kvm_pfn_t *pfn)
 {
         struct page *page[1];
-        int npages;

         /*
          * Fast pin a writable pfn only if it is a write fault request
@@ -1427,8 +1861,7 @@
         if (!(write_fault || writable))
                 return false;

-        npages = __get_user_pages_fast(addr, 1, 1, page);
-        if (npages == 1) {
+        if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
                 *pfn = page_to_pfn(page[0]);

                 if (writable)
@@ -1468,7 +1901,7 @@
         if (unlikely(!write_fault) && writable) {
                 struct page *wpage;

-                if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
+                if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
                         *writable = true;
                         put_page(page);
                         page = wpage;
@@ -1506,14 +1939,14 @@
         spinlock_t *ptl;
         int r;

-        r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
+        r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
         if (r) {
                 /*
                  * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
                  * not call the fault handler, so do it here.
                  */
                 bool unlocked = false;
-                r = fixup_user_fault(current, current->mm, addr,
+                r = fixup_user_fault(current->mm, addr,
                                      (write_fault ? FAULT_FLAG_WRITE : 0),
                                      &unlocked);
                 if (unlocked)
@@ -1521,7 +1954,7 @@
                 if (r)
                         return r;

-                r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
+                r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
                 if (r)
                         return r;
         }
@@ -1596,7 +2029,7 @@
         if (npages == 1)
                 return pfn;

-        down_read(&current->mm->mmap_sem);
+        mmap_read_lock(current->mm);
         if (npages == -EHWPOISON ||
               (!async && check_user_page_hwpoison(addr))) {
                 pfn = KVM_PFN_ERR_HWPOISON;
@@ -1620,7 +2053,7 @@
                 pfn = KVM_PFN_ERR_FAULT;
         }
 exit:
-        up_read(&current->mm->mmap_sem);
+        mmap_read_unlock(current->mm);
         return pfn;
 }

....@@ -1673,12 +2106,6 @@
16732106 }
16742107 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
16752108
1676
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1677
-{
1678
- return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1679
-}
1680
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1681
-
16822109 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
16832110 {
16842111 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
....@@ -1710,7 +2137,7 @@
17102137 if (entry < nr_pages)
17112138 return 0;
17122139
1713
- return __get_user_pages_fast(addr, nr_pages, 1, pages);
2140
+ return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
17142141 }
17152142 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
17162143
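The pinning hunks above switch from __get_user_pages_fast(addr, n, 1, pages) to get_user_page_fast_only()/get_user_pages_fast_only(), which take FOLL_* flags instead of a bare write argument. A small sketch of the single-page fast-pin shape as hva_to_pfn_fast() now uses it (kernel context):

/* Sketch: opportunistically pin one writable page without sleeping. */
static bool fast_pin_writable_sketch(unsigned long addr, kvm_pfn_t *pfn)
{
	struct page *page[1];

	/* was: __get_user_pages_fast(addr, 1, 1, page) == 1 */
	if (!get_user_page_fast_only(addr, FOLL_WRITE, page))
		return false;

	*pfn = page_to_pfn(page[0]);
	return true;
}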
....@@ -1924,20 +2351,28 @@
19242351 }
19252352 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
19262353
2354
+static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
2355
+{
2356
+ if (!pfn_valid(pfn))
2357
+ return false;
2358
+
2359
+ /*
2360
+ * Per page-flags.h, pages tagged PG_reserved "should in general not be
2361
+ * touched (e.g. set dirty) except by its owner".
2362
+ */
2363
+ return !PageReserved(pfn_to_page(pfn));
2364
+}
2365
+
19272366 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
19282367 {
1929
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
1930
- struct page *page = pfn_to_page(pfn);
1931
-
1932
- if (!PageReserved(page))
1933
- SetPageDirty(page);
1934
- }
2368
+ if (kvm_is_ad_tracked_pfn(pfn))
2369
+ SetPageDirty(pfn_to_page(pfn));
19352370 }
19362371 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
19372372
19382373 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
19392374 {
1940
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2375
+ if (kvm_is_ad_tracked_pfn(pfn))
19412376 mark_page_accessed(pfn_to_page(pfn));
19422377 }
19432378 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
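kvm_is_ad_tracked_pfn() now gives kvm_set_pfn_dirty() and kvm_set_pfn_accessed() a single gate: the pfn must be valid and its struct page must not be PG_reserved. A brief, hedged sketch of the typical caller pattern, modelled on kvm_release_pfn_dirty() just above (the release helpers are defined elsewhere in this file):

/* Sketch: mark a mapped guest page dirty before dropping the reference.
 * kvm_set_pfn_dirty() is now a no-op unless kvm_is_ad_tracked_pfn(pfn). */
static void release_dirty_pfn_sketch(kvm_pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}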
....@@ -2047,17 +2482,6 @@
20472482 return 0;
20482483 }
20492484
2050
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
2051
- unsigned long len)
2052
-{
2053
- gfn_t gfn = gpa >> PAGE_SHIFT;
2054
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2055
- int offset = offset_in_page(gpa);
2056
-
2057
- return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2058
-}
2059
-EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
2060
-
20612485 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
20622486 void *data, unsigned long len)
20632487 {
....@@ -2155,30 +2579,34 @@
21552579 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
21562580 gfn_t nr_pages_avail;
21572581
2158
- ghc->gpa = gpa;
2582
+ /* Update ghc->generation before performing any error checks. */
21592583 ghc->generation = slots->generation;
2160
- ghc->len = len;
2161
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2162
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
2163
- if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
2164
- ghc->hva += offset;
2165
- } else {
2166
- /*
2167
- * If the requested region crosses two memslots, we still
2168
- * verify that the entire region is valid here.
2169
- */
2170
- while (start_gfn <= end_gfn) {
2171
- nr_pages_avail = 0;
2172
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2173
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2174
- &nr_pages_avail);
2175
- if (kvm_is_error_hva(ghc->hva))
2176
- return -EFAULT;
2177
- start_gfn += nr_pages_avail;
2178
- }
2179
- /* Use the slow path for cross page reads and writes. */
2180
- ghc->memslot = NULL;
2584
+
2585
+ if (start_gfn > end_gfn) {
2586
+ ghc->hva = KVM_HVA_ERR_BAD;
2587
+ return -EINVAL;
21812588 }
2589
+
2590
+ /*
2591
+ * If the requested region crosses two memslots, we still
2592
+ * verify that the entire region is valid here.
2593
+ */
2594
+ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2595
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2596
+ ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2597
+ &nr_pages_avail);
2598
+ if (kvm_is_error_hva(ghc->hva))
2599
+ return -EFAULT;
2600
+ }
2601
+
2602
+ /* Use the slow path for cross page reads and writes. */
2603
+ if (nr_pages_needed == 1)
2604
+ ghc->hva += offset;
2605
+ else
2606
+ ghc->memslot = NULL;
2607
+
2608
+ ghc->gpa = gpa;
2609
+ ghc->len = len;
21822610 return 0;
21832611 }
21842612
....@@ -2198,10 +2626,13 @@
21982626 int r;
21992627 gpa_t gpa = ghc->gpa + offset;
22002628
2201
- BUG_ON(len + offset > ghc->len);
2629
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2630
+ return -EINVAL;
22022631
2203
- if (slots->generation != ghc->generation)
2204
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2632
+ if (slots->generation != ghc->generation) {
2633
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2634
+ return -EFAULT;
2635
+ }
22052636
22062637 if (kvm_is_error_hva(ghc->hva))
22072638 return -EFAULT;
....@@ -2225,28 +2656,40 @@
22252656 }
22262657 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
22272658
2228
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2229
- void *data, unsigned long len)
2659
+int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2660
+ void *data, unsigned int offset,
2661
+ unsigned long len)
22302662 {
22312663 struct kvm_memslots *slots = kvm_memslots(kvm);
22322664 int r;
2665
+ gpa_t gpa = ghc->gpa + offset;
22332666
2234
- BUG_ON(len > ghc->len);
2667
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2668
+ return -EINVAL;
22352669
2236
- if (slots->generation != ghc->generation)
2237
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2670
+ if (slots->generation != ghc->generation) {
2671
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2672
+ return -EFAULT;
2673
+ }
22382674
22392675 if (kvm_is_error_hva(ghc->hva))
22402676 return -EFAULT;
22412677
22422678 if (unlikely(!ghc->memslot))
2243
- return kvm_read_guest(kvm, ghc->gpa, data, len);
2679
+ return kvm_read_guest(kvm, gpa, data, len);
22442680
2245
- r = __copy_from_user(data, (void __user *)ghc->hva, len);
2681
+ r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
22462682 if (r)
22472683 return -EFAULT;
22482684
22492685 return 0;
2686
+}
2687
+EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2688
+
2689
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2690
+ void *data, unsigned long len)
2691
+{
2692
+ return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
22502693 }
22512694 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
22522695
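With the offset-aware variant in place, kvm_read_guest_cached() becomes a thin wrapper over kvm_read_guest_offset_cached(), mirroring the write side. A hedged sketch of how arch code typically drives the cache: initialize it once for a guest-physical region, then access individual fields by offset. The struct layout below is purely illustrative; kvm_gfn_to_hva_cache_init() is the public initializer for struct gfn_to_hva_cache.

/* Sketch (kernel context): cache the gpa->hva translation of a small
 * guest-shared structure and update one field through the cache. */
struct shared_area {			/* illustrative guest-visible layout */
	u32 flags;
	u32 token;
};

static int publish_token_sketch(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				gpa_t gpa, u32 token)
{
	int r;

	r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(struct shared_area));
	if (r)
		return r;

	return kvm_write_guest_offset_cached(kvm, ghc, &token,
					     offsetof(struct shared_area, token),
					     sizeof(token));
}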
....@@ -2277,8 +2720,7 @@
22772720 }
22782721 EXPORT_SYMBOL_GPL(kvm_clear_guest);
22792722
2280
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2281
- gfn_t gfn)
2723
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn)
22822724 {
22832725 if (memslot && memslot->dirty_bitmap) {
22842726 unsigned long rel_gfn = gfn - memslot->base_gfn;
....@@ -2286,6 +2728,7 @@
22862728 set_bit_le(rel_gfn, memslot->dirty_bitmap);
22872729 }
22882730 }
2731
+EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
22892732
22902733 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
22912734 {
....@@ -2330,33 +2773,40 @@
23302773
23312774 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
23322775 {
2333
- unsigned int old, val, grow;
2776
+ unsigned int old, val, grow, grow_start;
23342777
23352778 old = val = vcpu->halt_poll_ns;
2779
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23362780 grow = READ_ONCE(halt_poll_ns_grow);
2337
- /* 10us base */
2338
- if (val == 0 && grow)
2339
- val = 10000;
2340
- else
2341
- val *= grow;
2781
+ if (!grow)
2782
+ goto out;
23422783
2343
- if (val > halt_poll_ns)
2344
- val = halt_poll_ns;
2784
+ val *= grow;
2785
+ if (val < grow_start)
2786
+ val = grow_start;
2787
+
2788
+ if (val > vcpu->kvm->max_halt_poll_ns)
2789
+ val = vcpu->kvm->max_halt_poll_ns;
23452790
23462791 vcpu->halt_poll_ns = val;
2792
+out:
23472793 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
23482794 }
23492795
23502796 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
23512797 {
2352
- unsigned int old, val, shrink;
2798
+ unsigned int old, val, shrink, grow_start;
23532799
23542800 old = val = vcpu->halt_poll_ns;
23552801 shrink = READ_ONCE(halt_poll_ns_shrink);
2802
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23562803 if (shrink == 0)
23572804 val = 0;
23582805 else
23592806 val /= shrink;
2807
+
2808
+ if (val < grow_start)
2809
+ val = 0;
23602810
23612811 vcpu->halt_poll_ns = val;
23622812 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
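grow_halt_poll_ns() now multiplies by halt_poll_ns_grow, clamps the result between halt_poll_ns_grow_start and the per-VM kvm->max_halt_poll_ns, and does nothing when growth is disabled; shrink_halt_poll_ns() divides by halt_poll_ns_shrink and snaps anything below grow_start back to zero. A standalone simulation of that arithmetic (the factor of 2, the 10000 ns floor and the 500000 ns cap are assumptions chosen for illustration):

#include <stdio.h>

static unsigned int grow_poll(unsigned int val, unsigned int factor,
			      unsigned int grow_start, unsigned int max)
{
	if (!factor)
		return val;		/* growth disabled: leave value alone */
	val *= factor;
	if (val < grow_start)
		val = grow_start;	/* first grow starts from the floor */
	if (val > max)
		val = max;		/* never exceed the per-VM cap */
	return val;
}

static unsigned int shrink_poll(unsigned int val, unsigned int divisor,
				unsigned int grow_start)
{
	val = divisor ? val / divisor : 0;
	return val < grow_start ? 0 : val;	/* collapse tiny windows to 0 */
}

int main(void)
{
	unsigned int ns = 0;

	for (int i = 0; i < 5; i++) {
		ns = grow_poll(ns, 2, 10000, 500000);
		printf("grow %d -> %u ns\n", i + 1, ns);
	}
	printf("shrink   -> %u ns\n", shrink_poll(ns, 2, 10000));
	return 0;
}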
....@@ -2382,18 +2832,28 @@
23822832 return ret;
23832833 }
23842834
2835
+static inline void
2836
+update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2837
+{
2838
+ if (waited)
2839
+ vcpu->stat.halt_poll_fail_ns += poll_ns;
2840
+ else
2841
+ vcpu->stat.halt_poll_success_ns += poll_ns;
2842
+}
2843
+
23852844 /*
23862845 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
23872846 */
23882847 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
23892848 {
2390
- ktime_t start, cur;
2391
- DECLARE_SWAITQUEUE(wait);
2849
+ ktime_t start, cur, poll_end;
23922850 bool waited = false;
23932851 u64 block_ns;
23942852
2395
- start = cur = ktime_get();
2396
- if (vcpu->halt_poll_ns) {
2853
+ kvm_arch_vcpu_blocking(vcpu);
2854
+
2855
+ start = cur = poll_end = ktime_get();
2856
+ if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
23972857 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
23982858
23992859 ++vcpu->stat.halt_attempted_poll;
....@@ -2408,14 +2868,14 @@
24082868 ++vcpu->stat.halt_poll_invalid;
24092869 goto out;
24102870 }
2411
- cur = ktime_get();
2412
- } while (single_task_running() && ktime_before(cur, stop));
2871
+ poll_end = cur = ktime_get();
2872
+ } while (single_task_running() && !need_resched() &&
2873
+ ktime_before(cur, stop));
24132874 }
24142875
2415
- kvm_arch_vcpu_blocking(vcpu);
2416
-
2876
+ prepare_to_rcuwait(&vcpu->wait);
24172877 for (;;) {
2418
- prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2878
+ set_current_state(TASK_INTERRUPTIBLE);
24192879
24202880 if (kvm_vcpu_check_block(vcpu) < 0)
24212881 break;
....@@ -2423,28 +2883,33 @@
24232883 waited = true;
24242884 schedule();
24252885 }
2426
-
2427
- finish_swait(&vcpu->wq, &wait);
2886
+ finish_rcuwait(&vcpu->wait);
24282887 cur = ktime_get();
2429
-
2430
- kvm_arch_vcpu_unblocking(vcpu);
24312888 out:
2889
+ kvm_arch_vcpu_unblocking(vcpu);
24322890 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
24332891
2434
- if (!vcpu_valid_wakeup(vcpu))
2435
- shrink_halt_poll_ns(vcpu);
2436
- else if (halt_poll_ns) {
2437
- if (block_ns <= vcpu->halt_poll_ns)
2438
- ;
2439
- /* we had a long block, shrink polling */
2440
- else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
2892
+ update_halt_poll_stats(
2893
+ vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
2894
+
2895
+ if (!kvm_arch_no_poll(vcpu)) {
2896
+ if (!vcpu_valid_wakeup(vcpu)) {
24412897 shrink_halt_poll_ns(vcpu);
2442
- /* we had a short halt and our poll time is too small */
2443
- else if (vcpu->halt_poll_ns < halt_poll_ns &&
2444
- block_ns < halt_poll_ns)
2445
- grow_halt_poll_ns(vcpu);
2446
- } else
2447
- vcpu->halt_poll_ns = 0;
2898
+ } else if (vcpu->kvm->max_halt_poll_ns) {
2899
+ if (block_ns <= vcpu->halt_poll_ns)
2900
+ ;
2901
+ /* we had a long block, shrink polling */
2902
+ else if (vcpu->halt_poll_ns &&
2903
+ block_ns > vcpu->kvm->max_halt_poll_ns)
2904
+ shrink_halt_poll_ns(vcpu);
2905
+ /* we had a short halt and our poll time is too small */
2906
+ else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
2907
+ block_ns < vcpu->kvm->max_halt_poll_ns)
2908
+ grow_halt_poll_ns(vcpu);
2909
+ } else {
2910
+ vcpu->halt_poll_ns = 0;
2911
+ }
2912
+ }
24482913
24492914 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
24502915 kvm_arch_vcpu_block_finish(vcpu);
....@@ -2453,11 +2918,11 @@
24532918
24542919 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
24552920 {
2456
- struct swait_queue_head *wqp;
2921
+ struct rcuwait *waitp;
24572922
2458
- wqp = kvm_arch_vcpu_wq(vcpu);
2459
- if (swq_has_sleeper(wqp)) {
2460
- swake_up_one(wqp);
2923
+ waitp = kvm_arch_vcpu_get_wait(vcpu);
2924
+ if (rcuwait_wake_up(waitp)) {
2925
+ WRITE_ONCE(vcpu->ready, true);
24612926 ++vcpu->stat.halt_wakeup;
24622927 return true;
24632928 }
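kvm_vcpu_block() now parks the task on an rcuwait instead of an swait queue, and kvm_vcpu_wake_up() relies on rcuwait_wake_up(), whose return value says whether a sleeper was actually woken. A hedged sketch of the generic rcuwait shape the new code follows (helpers from linux/rcuwait.h; the condition callback is a placeholder, not a KVM API):

/* Sleeper side: the same prepare/loop/finish shape used by kvm_vcpu_block(). */
static void rcuwait_sleep_sketch(struct rcuwait *w, bool (*done)(void *), void *arg)
{
	prepare_to_rcuwait(w);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (done(arg))
			break;
		schedule();
	}
	finish_rcuwait(w);
}

/* Waker side: rcuwait_wake_up() returns nonzero only if someone was sleeping. */
static void rcuwait_kick_sketch(struct rcuwait *w)
{
	if (rcuwait_wake_up(w))
		pr_debug("woke a parked vcpu thread\n");
}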
....@@ -2513,7 +2978,7 @@
25132978 *
25142979 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
25152980 * (preempted lock holder), indicated by @in_spin_loop.
2516
- * Set at the beiginning and cleared at the end of interception/PLE handler.
2981
+ * Set at the beginning and cleared at the end of interception/PLE handler.
25172982 *
25182983 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
25192984 * chance last time (mostly it has become eligible now since we have probably
....@@ -2594,13 +3059,15 @@
25943059 continue;
25953060 } else if (pass && i > last_boosted_vcpu)
25963061 break;
2597
- if (!READ_ONCE(vcpu->preempted))
3062
+ if (!READ_ONCE(vcpu->ready))
25983063 continue;
25993064 if (vcpu == me)
26003065 continue;
2601
- if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
3066
+ if (rcuwait_active(&vcpu->wait) &&
3067
+ !vcpu_dy_runnable(vcpu))
26023068 continue;
2603
- if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
3069
+ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3070
+ !kvm_arch_vcpu_in_kernel(vcpu))
26043071 continue;
26053072 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
26063073 continue;
....@@ -2659,7 +3126,6 @@
26593126 {
26603127 struct kvm_vcpu *vcpu = filp->private_data;
26613128
2662
- debugfs_remove_recursive(vcpu->debugfs_dentry);
26633129 kvm_put_kvm(vcpu->kvm);
26643130 return 0;
26653131 }
....@@ -2683,30 +3149,21 @@
26833149 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
26843150 }
26853151
2686
-static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3152
+static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
26873153 {
3154
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3155
+ struct dentry *debugfs_dentry;
26883156 char dir_name[ITOA_MAX_LEN * 2];
2689
- int ret;
2690
-
2691
- if (!kvm_arch_has_vcpu_debugfs())
2692
- return 0;
26933157
26943158 if (!debugfs_initialized())
2695
- return 0;
3159
+ return;
26963160
26973161 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
2698
- vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
2699
- vcpu->kvm->debugfs_dentry);
2700
- if (!vcpu->debugfs_dentry)
2701
- return -ENOMEM;
3162
+ debugfs_dentry = debugfs_create_dir(dir_name,
3163
+ vcpu->kvm->debugfs_dentry);
27023164
2703
- ret = kvm_arch_create_vcpu_debugfs(vcpu);
2704
- if (ret < 0) {
2705
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2706
- return ret;
2707
- }
2708
-
2709
- return 0;
3165
+ kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3166
+#endif
27103167 }
27113168
27123169 /*
....@@ -2716,6 +3173,7 @@
27163173 {
27173174 int r;
27183175 struct kvm_vcpu *vcpu;
3176
+ struct page *page;
27193177
27203178 if (id >= KVM_MAX_VCPU_ID)
27213179 return -EINVAL;
....@@ -2729,21 +3187,29 @@
27293187 kvm->created_vcpus++;
27303188 mutex_unlock(&kvm->lock);
27313189
2732
- vcpu = kvm_arch_vcpu_create(kvm, id);
2733
- if (IS_ERR(vcpu)) {
2734
- r = PTR_ERR(vcpu);
3190
+ r = kvm_arch_vcpu_precreate(kvm, id);
3191
+ if (r)
3192
+ goto vcpu_decrement;
3193
+
3194
+ vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3195
+ if (!vcpu) {
3196
+ r = -ENOMEM;
27353197 goto vcpu_decrement;
27363198 }
27373199
2738
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
3200
+ BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3201
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3202
+ if (!page) {
3203
+ r = -ENOMEM;
3204
+ goto vcpu_free;
3205
+ }
3206
+ vcpu->run = page_address(page);
27393207
2740
- r = kvm_arch_vcpu_setup(vcpu);
2741
- if (r)
2742
- goto vcpu_destroy;
3208
+ kvm_vcpu_init(vcpu, kvm, id);
27433209
2744
- r = kvm_create_vcpu_debugfs(vcpu);
3210
+ r = kvm_arch_vcpu_create(vcpu);
27453211 if (r)
2746
- goto vcpu_destroy;
3212
+ goto vcpu_free_run_page;
27473213
27483214 mutex_lock(&kvm->lock);
27493215 if (kvm_get_vcpu_by_id(kvm, id)) {
....@@ -2758,7 +3224,7 @@
27583224 kvm_get_kvm(kvm);
27593225 r = create_vcpu_fd(vcpu);
27603226 if (r < 0) {
2761
- kvm_put_kvm(kvm);
3227
+ kvm_put_kvm_no_destroy(kvm);
27623228 goto unlock_vcpu_destroy;
27633229 }
27643230
....@@ -2773,13 +3239,16 @@
27733239
27743240 mutex_unlock(&kvm->lock);
27753241 kvm_arch_vcpu_postcreate(vcpu);
3242
+ kvm_create_vcpu_debugfs(vcpu);
27763243 return r;
27773244
27783245 unlock_vcpu_destroy:
27793246 mutex_unlock(&kvm->lock);
2780
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2781
-vcpu_destroy:
27823247 kvm_arch_vcpu_destroy(vcpu);
3248
+vcpu_free_run_page:
3249
+ free_page((unsigned long)vcpu->run);
3250
+vcpu_free:
3251
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
27833252 vcpu_decrement:
27843253 mutex_lock(&kvm->lock);
27853254 kvm->created_vcpus--;
....@@ -2807,7 +3276,7 @@
28073276 struct kvm_fpu *fpu = NULL;
28083277 struct kvm_sregs *kvm_sregs = NULL;
28093278
2810
- if (vcpu->kvm->mm != current->mm)
3279
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
28113280 return -EIO;
28123281
28133282 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
....@@ -2844,7 +3313,7 @@
28443313 synchronize_rcu();
28453314 put_pid(oldpid);
28463315 }
2847
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
3316
+ r = kvm_arch_vcpu_ioctl_run(vcpu);
28483317 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
28493318 break;
28503319 }
....@@ -2852,7 +3321,7 @@
28523321 struct kvm_regs *kvm_regs;
28533322
28543323 r = -ENOMEM;
2855
- kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
3324
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
28563325 if (!kvm_regs)
28573326 goto out;
28583327 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
....@@ -2869,7 +3338,6 @@
28693338 case KVM_SET_REGS: {
28703339 struct kvm_regs *kvm_regs;
28713340
2872
- r = -ENOMEM;
28733341 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
28743342 if (IS_ERR(kvm_regs)) {
28753343 r = PTR_ERR(kvm_regs);
....@@ -2880,7 +3348,8 @@
28803348 break;
28813349 }
28823350 case KVM_GET_SREGS: {
2883
- kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
3351
+ kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3352
+ GFP_KERNEL_ACCOUNT);
28843353 r = -ENOMEM;
28853354 if (!kvm_sregs)
28863355 goto out;
....@@ -2972,7 +3441,7 @@
29723441 break;
29733442 }
29743443 case KVM_GET_FPU: {
2975
- fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
3444
+ fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
29763445 r = -ENOMEM;
29773446 if (!fpu)
29783447 goto out;
....@@ -3013,7 +3482,7 @@
30133482 void __user *argp = compat_ptr(arg);
30143483 int r;
30153484
3016
- if (vcpu->kvm->mm != current->mm)
3485
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
30173486 return -EIO;
30183487
30193488 switch (ioctl) {
....@@ -3031,7 +3500,8 @@
30313500 if (kvm_sigmask.len != sizeof(compat_sigset_t))
30323501 goto out;
30333502 r = -EFAULT;
3034
- if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
3503
+ if (get_compat_sigset(&sigset,
3504
+ (compat_sigset_t __user *)sigmask_arg->sigset))
30353505 goto out;
30363506 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
30373507 } else
....@@ -3046,6 +3516,16 @@
30463516 return r;
30473517 }
30483518 #endif
3519
+
3520
+static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3521
+{
3522
+ struct kvm_device *dev = filp->private_data;
3523
+
3524
+ if (dev->ops->mmap)
3525
+ return dev->ops->mmap(dev, vma);
3526
+
3527
+ return -ENODEV;
3528
+}
30493529
30503530 static int kvm_device_ioctl_attr(struct kvm_device *dev,
30513531 int (*accessor)(struct kvm_device *dev,
....@@ -3068,7 +3548,7 @@
30683548 {
30693549 struct kvm_device *dev = filp->private_data;
30703550
3071
- if (dev->kvm->mm != current->mm)
3551
+ if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
30723552 return -EIO;
30733553
30743554 switch (ioctl) {
....@@ -3091,6 +3571,13 @@
30913571 struct kvm_device *dev = filp->private_data;
30923572 struct kvm *kvm = dev->kvm;
30933573
3574
+ if (dev->ops->release) {
3575
+ mutex_lock(&kvm->lock);
3576
+ list_del(&dev->vm_node);
3577
+ dev->ops->release(dev);
3578
+ mutex_unlock(&kvm->lock);
3579
+ }
3580
+
30943581 kvm_put_kvm(kvm);
30953582 return 0;
30963583 }
....@@ -3099,6 +3586,7 @@
30993586 .unlocked_ioctl = kvm_device_ioctl,
31003587 .release = kvm_device_release,
31013588 KVM_COMPAT(kvm_device_ioctl),
3589
+ .mmap = kvm_device_mmap,
31023590 };
31033591
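kvm_device_fops gains a .mmap handler that forwards to dev->ops->mmap, and device teardown now honours an optional ops->release callback under kvm->lock. A hedged sketch of what a backend supplying these optional hooks might look like; the device name, handlers and type below are hypothetical, only the kvm_device_ops fields and kvm_register_device_ops() come from this file and its headers:

/* Sketch only: a hypothetical in-kernel device using the optional hooks. */
static int demo_dev_create(struct kvm_device *dev, u32 type) { return 0; }
static void demo_dev_release(struct kvm_device *dev) { /* drop VM-held state */ }
static int demo_dev_mmap(struct kvm_device *dev, struct vm_area_struct *vma)
{
	return -EINVAL;		/* placeholder: map device state to userspace */
}

static const struct kvm_device_ops demo_device_ops = {
	.name    = "demo",
	.create  = demo_dev_create,
	.release = demo_dev_release,	/* called with kvm->lock held on last put */
	.mmap    = demo_dev_mmap,
};

/* Registration would use the now const-qualified API, e.g.:
 *	kvm_register_device_ops(&demo_device_ops, some_kvm_dev_type);
 */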
31043592 struct kvm_device *kvm_device_from_filp(struct file *filp)
....@@ -3109,14 +3597,14 @@
31093597 return filp->private_data;
31103598 }
31113599
3112
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3600
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
31133601 #ifdef CONFIG_KVM_MPIC
31143602 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
31153603 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
31163604 #endif
31173605 };
31183606
3119
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
3607
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
31203608 {
31213609 if (type >= ARRAY_SIZE(kvm_device_ops_table))
31223610 return -ENOSPC;
....@@ -3137,7 +3625,7 @@
31373625 static int kvm_ioctl_create_device(struct kvm *kvm,
31383626 struct kvm_create_device *cd)
31393627 {
3140
- struct kvm_device_ops *ops = NULL;
3628
+ const struct kvm_device_ops *ops = NULL;
31413629 struct kvm_device *dev;
31423630 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
31433631 int type;
....@@ -3154,7 +3642,7 @@
31543642 if (test)
31553643 return 0;
31563644
3157
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
3645
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
31583646 if (!dev)
31593647 return -ENOMEM;
31603648
....@@ -3177,11 +3665,14 @@
31773665 kvm_get_kvm(kvm);
31783666 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
31793667 if (ret < 0) {
3180
- kvm_put_kvm(kvm);
3668
+ kvm_put_kvm_no_destroy(kvm);
31813669 mutex_lock(&kvm->lock);
31823670 list_del(&dev->vm_node);
3671
+ if (ops->release)
3672
+ ops->release(dev);
31833673 mutex_unlock(&kvm->lock);
3184
- ops->destroy(dev);
3674
+ if (ops->destroy)
3675
+ ops->destroy(dev);
31853676 return ret;
31863677 }
31873678
....@@ -3205,10 +3696,18 @@
32053696 #endif
32063697 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
32073698 case KVM_CAP_CHECK_EXTENSION_VM:
3699
+ case KVM_CAP_ENABLE_CAP_VM:
3700
+ case KVM_CAP_HALT_POLL:
32083701 return 1;
32093702 #ifdef CONFIG_KVM_MMIO
32103703 case KVM_CAP_COALESCED_MMIO:
32113704 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3705
+ case KVM_CAP_COALESCED_PIO:
3706
+ return 1;
3707
+#endif
3708
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3709
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3710
+ return KVM_DIRTY_LOG_MANUAL_CAPS;
32123711 #endif
32133712 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
32143713 case KVM_CAP_IRQ_ROUTING:
....@@ -3218,10 +3717,47 @@
32183717 case KVM_CAP_MULTI_ADDRESS_SPACE:
32193718 return KVM_ADDRESS_SPACE_NUM;
32203719 #endif
3720
+ case KVM_CAP_NR_MEMSLOTS:
3721
+ return KVM_USER_MEM_SLOTS;
32213722 default:
32223723 break;
32233724 }
32243725 return kvm_vm_ioctl_check_extension(kvm, arg);
3726
+}
3727
+
3728
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3729
+ struct kvm_enable_cap *cap)
3730
+{
3731
+ return -EINVAL;
3732
+}
3733
+
3734
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3735
+ struct kvm_enable_cap *cap)
3736
+{
3737
+ switch (cap->cap) {
3738
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3739
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3740
+ u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3741
+
3742
+ if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3743
+ allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3744
+
3745
+ if (cap->flags || (cap->args[0] & ~allowed_options))
3746
+ return -EINVAL;
3747
+ kvm->manual_dirty_log_protect = cap->args[0];
3748
+ return 0;
3749
+ }
3750
+#endif
3751
+ case KVM_CAP_HALT_POLL: {
3752
+ if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3753
+ return -EINVAL;
3754
+
3755
+ kvm->max_halt_poll_ns = cap->args[0];
3756
+ return 0;
3757
+ }
3758
+ default:
3759
+ return kvm_vm_ioctl_enable_cap(kvm, cap);
3760
+ }
32253761 }
32263762
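kvm_vm_ioctl_enable_cap_generic() makes KVM_ENABLE_CAP a VM-level ioctl that currently understands KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and KVM_CAP_HALT_POLL, where args[0] must fit in an unsigned int and becomes kvm->max_halt_poll_ns. A hedged userspace sketch of enabling the per-VM halt-poll cap; it assumes a kernel and uapi header new enough to define KVM_CAP_HALT_POLL, error handling is minimal, and the 100 us value is arbitrary:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_enable_cap cap;
	int sys_fd, vm_fd;

	sys_fd = open("/dev/kvm", O_RDWR);
	if (sys_fd < 0)
		return 1;
	vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
	if (vm_fd < 0)
		return 1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HALT_POLL;
	cap.args[0] = 100000;		/* cap per-vCPU halt polling at 100 us */

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_HALT_POLL) > 0 &&
	    ioctl(vm_fd, KVM_ENABLE_CAP, &cap) == 0)
		printf("max_halt_poll_ns set to %llu\n",
		       (unsigned long long)cap.args[0]);
	return 0;
}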
32273763 static long kvm_vm_ioctl(struct file *filp,
....@@ -3231,12 +3767,21 @@
32313767 void __user *argp = (void __user *)arg;
32323768 int r;
32333769
3234
- if (kvm->mm != current->mm)
3770
+ if (kvm->mm != current->mm || kvm->vm_bugged)
32353771 return -EIO;
32363772 switch (ioctl) {
32373773 case KVM_CREATE_VCPU:
32383774 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
32393775 break;
3776
+ case KVM_ENABLE_CAP: {
3777
+ struct kvm_enable_cap cap;
3778
+
3779
+ r = -EFAULT;
3780
+ if (copy_from_user(&cap, argp, sizeof(cap)))
3781
+ goto out;
3782
+ r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3783
+ break;
3784
+ }
32403785 case KVM_SET_USER_MEMORY_REGION: {
32413786 struct kvm_userspace_memory_region kvm_userspace_mem;
32423787
....@@ -3257,6 +3802,17 @@
32573802 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
32583803 break;
32593804 }
3805
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3806
+ case KVM_CLEAR_DIRTY_LOG: {
3807
+ struct kvm_clear_dirty_log log;
3808
+
3809
+ r = -EFAULT;
3810
+ if (copy_from_user(&log, argp, sizeof(log)))
3811
+ goto out;
3812
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3813
+ break;
3814
+ }
3815
+#endif
32603816 #ifdef CONFIG_KVM_MMIO
32613817 case KVM_REGISTER_COALESCED_MMIO: {
32623818 struct kvm_coalesced_mmio_zone zone;
....@@ -3347,21 +3903,18 @@
33473903 if (routing.flags)
33483904 goto out;
33493905 if (routing.nr) {
3350
- r = -ENOMEM;
3351
- entries = vmalloc(array_size(sizeof(*entries),
3352
- routing.nr));
3353
- if (!entries)
3354
- goto out;
3355
- r = -EFAULT;
33563906 urouting = argp;
3357
- if (copy_from_user(entries, urouting->entries,
3358
- routing.nr * sizeof(*entries)))
3359
- goto out_free_irq_routing;
3907
+ entries = vmemdup_user(urouting->entries,
3908
+ array_size(sizeof(*entries),
3909
+ routing.nr));
3910
+ if (IS_ERR(entries)) {
3911
+ r = PTR_ERR(entries);
3912
+ goto out;
3913
+ }
33603914 }
33613915 r = kvm_set_irq_routing(kvm, entries, routing.nr,
33623916 routing.flags);
3363
-out_free_irq_routing:
3364
- vfree(entries);
3917
+ kvfree(entries);
33653918 break;
33663919 }
33673920 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
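The KVM_SET_GSI_ROUTING handler above drops the vmalloc()+copy_from_user() pair in favour of vmemdup_user(), which duplicates the user buffer in one call and pairs with kvfree(). A short sketch of the generalized pattern (kernel context; the function name is illustrative):

/* Sketch of the vmemdup_user()/kvfree() pattern used above. */
static int copy_entries_sketch(const void __user *uptr, size_t n, size_t size)
{
	void *buf = vmemdup_user(uptr, array_size(n, size));

	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* ... consume the n * size bytes in buf ... */

	kvfree(buf);
	return 0;
}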
....@@ -3403,15 +3956,54 @@
34033956 };
34043957 };
34053958
3959
+struct compat_kvm_clear_dirty_log {
3960
+ __u32 slot;
3961
+ __u32 num_pages;
3962
+ __u64 first_page;
3963
+ union {
3964
+ compat_uptr_t dirty_bitmap; /* one bit per page */
3965
+ __u64 padding2;
3966
+ };
3967
+};
3968
+
3969
+long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
3970
+ unsigned long arg)
3971
+{
3972
+ return -ENOTTY;
3973
+}
3974
+
34063975 static long kvm_vm_compat_ioctl(struct file *filp,
34073976 unsigned int ioctl, unsigned long arg)
34083977 {
34093978 struct kvm *kvm = filp->private_data;
34103979 int r;
34113980
3412
- if (kvm->mm != current->mm)
3981
+ if (kvm->mm != current->mm || kvm->vm_bugged)
34133982 return -EIO;
3983
+
3984
+ r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
3985
+ if (r != -ENOTTY)
3986
+ return r;
3987
+
34143988 switch (ioctl) {
3989
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3990
+ case KVM_CLEAR_DIRTY_LOG: {
3991
+ struct compat_kvm_clear_dirty_log compat_log;
3992
+ struct kvm_clear_dirty_log log;
3993
+
3994
+ if (copy_from_user(&compat_log, (void __user *)arg,
3995
+ sizeof(compat_log)))
3996
+ return -EFAULT;
3997
+ log.slot = compat_log.slot;
3998
+ log.num_pages = compat_log.num_pages;
3999
+ log.first_page = compat_log.first_page;
4000
+ log.padding2 = compat_log.padding2;
4001
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4002
+
4003
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4004
+ break;
4005
+ }
4006
+#endif
34154007 case KVM_GET_DIRTY_LOG: {
34164008 struct compat_kvm_dirty_log compat_log;
34174009 struct kvm_dirty_log log;
....@@ -3749,6 +4341,7 @@
37494341 r = __kvm_io_bus_write(vcpu, bus, &range, val);
37504342 return r < 0 ? r : 0;
37514343 }
4344
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
37524345
37534346 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
37544347 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
....@@ -3799,7 +4392,6 @@
37994392
38004393 return -EOPNOTSUPP;
38014394 }
3802
-EXPORT_SYMBOL_GPL(kvm_io_bus_write);
38034395
38044396 /* kvm_io_bus_read - called under kvm->slots_lock */
38054397 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
....@@ -3821,7 +4413,6 @@
38214413 return r < 0 ? r : 0;
38224414 }
38234415
3824
-
38254416 /* Caller must hold slots_lock. */
38264417 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
38274418 int len, struct kvm_io_device *dev)
....@@ -3838,8 +4429,8 @@
38384429 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
38394430 return -ENOSPC;
38404431
3841
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
3842
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4432
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4433
+ GFP_KERNEL_ACCOUNT);
38434434 if (!new_bus)
38444435 return -ENOMEM;
38454436
....@@ -3866,15 +4457,15 @@
38664457 }
38674458
38684459 /* Caller must hold slots_lock. */
3869
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3870
- struct kvm_io_device *dev)
4460
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4461
+ struct kvm_io_device *dev)
38714462 {
38724463 int i, j;
38734464 struct kvm_io_bus *new_bus, *bus;
38744465
38754466 bus = kvm_get_bus(kvm, bus_idx);
38764467 if (!bus)
3877
- return;
4468
+ return 0;
38784469
38794470 for (i = 0; i < bus->dev_count; i++)
38804471 if (bus->range[i].dev == dev) {
....@@ -3882,16 +4473,22 @@
38824473 }
38834474
38844475 if (i == bus->dev_count)
3885
- return;
4476
+ return 0;
38864477
3887
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
3888
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4478
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4479
+ GFP_KERNEL_ACCOUNT);
38894480 if (new_bus) {
3890
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4481
+ memcpy(new_bus, bus, struct_size(bus, range, i));
38914482 new_bus->dev_count--;
38924483 memcpy(new_bus->range + i, bus->range + i + 1,
3893
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
3894
- } else {
4484
+ flex_array_size(new_bus, range, new_bus->dev_count - i));
4485
+ }
4486
+
4487
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4488
+ synchronize_srcu_expedited(&kvm->srcu);
4489
+
4490
+ /* Destroy the old bus _after_ installing the (null) bus. */
4491
+ if (!new_bus) {
38954492 pr_err("kvm: failed to shrink bus, removing it completely\n");
38964493 for (j = 0; j < bus->dev_count; j++) {
38974494 if (j == i)
....@@ -3900,10 +4497,8 @@
39004497 }
39014498 }
39024499
3903
- rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
3904
- synchronize_srcu_expedited(&kvm->srcu);
39054500 kfree(bus);
3906
- return;
4501
+ return new_bus ? 0 : -ENOMEM;
39074502 }
39084503
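kvm_io_bus_unregister_dev() now publishes the shrunken (or NULL) bus with rcu_assign_pointer() and waits out an SRCU grace period before destroying the old array, and it returns -ENOMEM, after tearing the whole bus down, when the smaller copy cannot be allocated. A compact sketch of the publish-then-free ordering the change enforces (kernel context, caller holds kvm->slots_lock as the comments above require):

/* Sketch: swap an SRCU-protected bus pointer, then free the old copy only
 * after readers of the previous value are guaranteed to have finished. */
static void replace_bus_sketch(struct kvm *kvm, int bus_idx,
			       struct kvm_io_bus *new_bus)
{
	struct kvm_io_bus *old_bus = kvm_get_bus(kvm, bus_idx);

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);		/* wait out readers */
	kfree(old_bus);					/* now safe to destroy */
}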
39094504 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
....@@ -3948,8 +4543,9 @@
39484543 return -ENOENT;
39494544
39504545 if (simple_attr_open(inode, file, get,
3951
- stat_data->mode & S_IWUGO ? set : NULL,
3952
- fmt)) {
4546
+ KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4547
+ ? set : NULL,
4548
+ fmt)) {
39534549 kvm_put_kvm(stat_data->kvm);
39544550 return -ENOMEM;
39554551 }
....@@ -3968,105 +4564,111 @@
39684564 return 0;
39694565 }
39704566
3971
-static int vm_stat_get_per_vm(void *data, u64 *val)
4567
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
39724568 {
3973
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3974
-
3975
- *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
4569
+ *val = *(ulong *)((void *)kvm + offset);
39764570
39774571 return 0;
39784572 }
39794573
3980
-static int vm_stat_clear_per_vm(void *data, u64 val)
4574
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
39814575 {
3982
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3983
-
3984
- if (val)
3985
- return -EINVAL;
3986
-
3987
- *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
4576
+ *(ulong *)((void *)kvm + offset) = 0;
39884577
39894578 return 0;
39904579 }
39914580
3992
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
3993
-{
3994
- __simple_attr_check_format("%llu\n", 0ull);
3995
- return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
3996
- vm_stat_clear_per_vm, "%llu\n");
3997
-}
3998
-
3999
-static const struct file_operations vm_stat_get_per_vm_fops = {
4000
- .owner = THIS_MODULE,
4001
- .open = vm_stat_get_per_vm_open,
4002
- .release = kvm_debugfs_release,
4003
- .read = simple_attr_read,
4004
- .write = simple_attr_write,
4005
- .llseek = no_llseek,
4006
-};
4007
-
4008
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
4581
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
40094582 {
40104583 int i;
4011
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40124584 struct kvm_vcpu *vcpu;
40134585
40144586 *val = 0;
40154587
4016
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4017
- *val += *(u64 *)((void *)vcpu + stat_data->offset);
4588
+ kvm_for_each_vcpu(i, vcpu, kvm)
4589
+ *val += *(u64 *)((void *)vcpu + offset);
40184590
40194591 return 0;
40204592 }
40214593
4022
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
4594
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
40234595 {
40244596 int i;
4025
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40264597 struct kvm_vcpu *vcpu;
4598
+
4599
+ kvm_for_each_vcpu(i, vcpu, kvm)
4600
+ *(u64 *)((void *)vcpu + offset) = 0;
4601
+
4602
+ return 0;
4603
+}
4604
+
4605
+static int kvm_stat_data_get(void *data, u64 *val)
4606
+{
4607
+ int r = -EFAULT;
4608
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4609
+
4610
+ switch (stat_data->dbgfs_item->kind) {
4611
+ case KVM_STAT_VM:
4612
+ r = kvm_get_stat_per_vm(stat_data->kvm,
4613
+ stat_data->dbgfs_item->offset, val);
4614
+ break;
4615
+ case KVM_STAT_VCPU:
4616
+ r = kvm_get_stat_per_vcpu(stat_data->kvm,
4617
+ stat_data->dbgfs_item->offset, val);
4618
+ break;
4619
+ }
4620
+
4621
+ return r;
4622
+}
4623
+
4624
+static int kvm_stat_data_clear(void *data, u64 val)
4625
+{
4626
+ int r = -EFAULT;
4627
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40274628
40284629 if (val)
40294630 return -EINVAL;
40304631
4031
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4032
- *(u64 *)((void *)vcpu + stat_data->offset) = 0;
4632
+ switch (stat_data->dbgfs_item->kind) {
4633
+ case KVM_STAT_VM:
4634
+ r = kvm_clear_stat_per_vm(stat_data->kvm,
4635
+ stat_data->dbgfs_item->offset);
4636
+ break;
4637
+ case KVM_STAT_VCPU:
4638
+ r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4639
+ stat_data->dbgfs_item->offset);
4640
+ break;
4641
+ }
40334642
4034
- return 0;
4643
+ return r;
40354644 }
40364645
4037
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
4646
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
40384647 {
40394648 __simple_attr_check_format("%llu\n", 0ull);
4040
- return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
4041
- vcpu_stat_clear_per_vm, "%llu\n");
4649
+ return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4650
+ kvm_stat_data_clear, "%llu\n");
40424651 }
40434652
4044
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
4045
- .owner = THIS_MODULE,
4046
- .open = vcpu_stat_get_per_vm_open,
4653
+static const struct file_operations stat_fops_per_vm = {
4654
+ .owner = THIS_MODULE,
4655
+ .open = kvm_stat_data_open,
40474656 .release = kvm_debugfs_release,
4048
- .read = simple_attr_read,
4049
- .write = simple_attr_write,
4050
- .llseek = no_llseek,
4051
-};
4052
-
4053
-static const struct file_operations *stat_fops_per_vm[] = {
4054
- [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
4055
- [KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
4657
+ .read = simple_attr_read,
4658
+ .write = simple_attr_write,
4659
+ .llseek = no_llseek,
40564660 };
40574661
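The debugfs rework above replaces the per-kind file_operations pair with a single stat_fops_per_vm whose get/clear callbacks dispatch on stat_data->dbgfs_item->kind and read the counter straight out of struct kvm or struct kvm_vcpu at a recorded byte offset. A standalone illustration of that offset-based access (the struct here is a stand-in, not the kernel layout):

#include <stddef.h>
#include <stdio.h>

struct demo_vm {			/* stand-in for struct kvm's stat fields */
	unsigned long remote_tlb_flush;
	unsigned long mmu_cache_miss;
};

static unsigned long read_stat(const void *vm, size_t offset)
{
	return *(const unsigned long *)((const char *)vm + offset);
}

int main(void)
{
	struct demo_vm vm = { .remote_tlb_flush = 7, .mmu_cache_miss = 42 };

	printf("%lu\n", read_stat(&vm, offsetof(struct demo_vm, mmu_cache_miss)));
	return 0;
}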
40584662 static int vm_stat_get(void *_offset, u64 *val)
40594663 {
40604664 unsigned offset = (long)_offset;
40614665 struct kvm *kvm;
4062
- struct kvm_stat_data stat_tmp = {.offset = offset};
40634666 u64 tmp_val;
40644667
40654668 *val = 0;
40664669 mutex_lock(&kvm_lock);
40674670 list_for_each_entry(kvm, &vm_list, vm_list) {
4068
- stat_tmp.kvm = kvm;
4069
- vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4671
+ kvm_get_stat_per_vm(kvm, offset, &tmp_val);
40704672 *val += tmp_val;
40714673 }
40724674 mutex_unlock(&kvm_lock);
....@@ -4077,15 +4679,13 @@
40774679 {
40784680 unsigned offset = (long)_offset;
40794681 struct kvm *kvm;
4080
- struct kvm_stat_data stat_tmp = {.offset = offset};
40814682
40824683 if (val)
40834684 return -EINVAL;
40844685
40854686 mutex_lock(&kvm_lock);
40864687 list_for_each_entry(kvm, &vm_list, vm_list) {
4087
- stat_tmp.kvm = kvm;
4088
- vm_stat_clear_per_vm((void *)&stat_tmp, 0);
4688
+ kvm_clear_stat_per_vm(kvm, offset);
40894689 }
40904690 mutex_unlock(&kvm_lock);
40914691
....@@ -4098,14 +4698,12 @@
40984698 {
40994699 unsigned offset = (long)_offset;
41004700 struct kvm *kvm;
4101
- struct kvm_stat_data stat_tmp = {.offset = offset};
41024701 u64 tmp_val;
41034702
41044703 *val = 0;
41054704 mutex_lock(&kvm_lock);
41064705 list_for_each_entry(kvm, &vm_list, vm_list) {
4107
- stat_tmp.kvm = kvm;
4108
- vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4706
+ kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
41094707 *val += tmp_val;
41104708 }
41114709 mutex_unlock(&kvm_lock);
....@@ -4116,15 +4714,13 @@
41164714 {
41174715 unsigned offset = (long)_offset;
41184716 struct kvm *kvm;
4119
- struct kvm_stat_data stat_tmp = {.offset = offset};
41204717
41214718 if (val)
41224719 return -EINVAL;
41234720
41244721 mutex_lock(&kvm_lock);
41254722 list_for_each_entry(kvm, &vm_list, vm_list) {
4126
- stat_tmp.kvm = kvm;
4127
- vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
4723
+ kvm_clear_stat_per_vcpu(kvm, offset);
41284724 }
41294725 mutex_unlock(&kvm_lock);
41304726
....@@ -4158,7 +4754,7 @@
41584754 active = kvm_active_vms;
41594755 mutex_unlock(&kvm_lock);
41604756
4161
- env = kzalloc(sizeof(*env), GFP_KERNEL);
4757
+ env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
41624758 if (!env)
41634759 return;
41644760
....@@ -4173,8 +4769,8 @@
41734769 }
41744770 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
41754771
4176
- if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4177
- char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
4772
+ if (kvm->debugfs_dentry) {
4773
+ char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
41784774
41794775 if (p) {
41804776 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
....@@ -4197,9 +4793,8 @@
41974793
41984794 kvm_debugfs_num_entries = 0;
41994795 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4200
- int mode = p->mode ? p->mode : 0644;
4201
- debugfs_create_file(p->name, mode, kvm_debugfs_dir,
4202
- (void *)(long)p->offset,
4796
+ debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4797
+ kvm_debugfs_dir, (void *)(long)p->offset,
42034798 stat_fops[p->kind]);
42044799 }
42054800 }
....@@ -4214,7 +4809,9 @@
42144809 static void kvm_resume(void)
42154810 {
42164811 if (kvm_usage_count) {
4217
- WARN_ON(raw_spin_is_locked(&kvm_count_lock));
4812
+#ifdef CONFIG_LOCKDEP
4813
+ WARN_ON(lockdep_is_held(&kvm_count_lock));
4814
+#endif
42184815 hardware_enable_nolock(NULL);
42194816 }
42204817 }
....@@ -4234,11 +4831,11 @@
42344831 {
42354832 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42364833
4237
- if (vcpu->preempted)
4238
- vcpu->preempted = false;
4834
+ WRITE_ONCE(vcpu->preempted, false);
4835
+ WRITE_ONCE(vcpu->ready, false);
42394836
4837
+ __this_cpu_write(kvm_running_vcpu, vcpu);
42404838 kvm_arch_sched_in(vcpu, cpu);
4241
-
42424839 kvm_arch_vcpu_load(vcpu, cpu);
42434840 }
42444841
....@@ -4247,14 +4844,59 @@
42474844 {
42484845 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42494846
4250
- if (current->state == TASK_RUNNING)
4251
- vcpu->preempted = true;
4847
+ if (current->state == TASK_RUNNING) {
4848
+ WRITE_ONCE(vcpu->preempted, true);
4849
+ WRITE_ONCE(vcpu->ready, true);
4850
+ }
42524851 kvm_arch_vcpu_put(vcpu);
4852
+ __this_cpu_write(kvm_running_vcpu, NULL);
4853
+}
4854
+
4855
+/**
4856
+ * kvm_get_running_vcpu - get the vcpu running on the current CPU.
4857
+ *
4858
+ * We can disable preemption locally around accessing the per-CPU variable,
4859
+ * and use the resolved vcpu pointer after enabling preemption again,
4860
+ * because even if the current thread is migrated to another CPU, reading
4861
+ * the per-CPU value later will give us the same value as we update the
4862
+ * per-CPU variable in the preempt notifier handlers.
4863
+ */
4864
+struct kvm_vcpu *kvm_get_running_vcpu(void)
4865
+{
4866
+ struct kvm_vcpu *vcpu;
4867
+
4868
+ preempt_disable();
4869
+ vcpu = __this_cpu_read(kvm_running_vcpu);
4870
+ preempt_enable();
4871
+
4872
+ return vcpu;
4873
+}
4874
+EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
4875
+
4876
+/**
4877
+ * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
4878
+ */
4879
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
4880
+{
4881
+ return &kvm_running_vcpu;
4882
+}
4883
+
4884
+struct kvm_cpu_compat_check {
4885
+ void *opaque;
4886
+ int *ret;
4887
+};
4888
+
4889
+static void check_processor_compat(void *data)
4890
+{
4891
+ struct kvm_cpu_compat_check *c = data;
4892
+
4893
+ *c->ret = kvm_arch_check_processor_compat(c->opaque);
42534894 }
42544895
42554896 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
42564897 struct module *module)
42574898 {
4899
+ struct kvm_cpu_compat_check c;
42584900 int r;
42594901 int cpu;
42604902
....@@ -4278,16 +4920,16 @@
42784920 goto out_free_0;
42794921 }
42804922
4281
- r = kvm_arch_hardware_setup();
4923
+ r = kvm_arch_hardware_setup(opaque);
42824924 if (r < 0)
4283
- goto out_free_0a;
4925
+ goto out_free_1;
42844926
4927
+ c.ret = &r;
4928
+ c.opaque = opaque;
42854929 for_each_online_cpu(cpu) {
4286
- smp_call_function_single(cpu,
4287
- kvm_arch_check_processor_compat,
4288
- &r, 1);
4930
+ smp_call_function_single(cpu, check_processor_compat, &c, 1);
42894931 if (r < 0)
4290
- goto out_free_1;
4932
+ goto out_free_2;
42914933 }
42924934
42934935 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
....@@ -4344,9 +4986,8 @@
43444986 unregister_reboot_notifier(&kvm_reboot_notifier);
43454987 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
43464988 out_free_2:
4347
-out_free_1:
43484989 kvm_arch_hardware_unsetup();
4349
-out_free_0a:
4990
+out_free_1:
43504991 free_cpumask_var(cpus_hardware_enabled);
43514992 out_free_0:
43524993 kvm_irqfd_exit();