forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/virt/kvm/kvm_main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
@@ -10,10 +11,6 @@
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
  */

 #include <kvm/iodev.h>
@@ -51,13 +48,13 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
-#include <linux/kthread.h>
 #include <linux/io.h>
+#include <linux/lockdep.h>
+#include <linux/kthread.h>

 #include <asm/processor.h>
 #include <asm/ioctl.h>
 #include <linux/uaccess.h>
-#include <asm/pgtable.h>

 #include "coalesced_mmio.h"
 #include "async_pf.h"
@@ -82,6 +79,11 @@
 module_param(halt_poll_ns_grow, uint, 0644);
 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

+/* The start value to grow halt_poll_ns from */
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
+module_param(halt_poll_ns_grow_start, uint, 0644);
+EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
+
 /* Default resets per-vcpu halt_poll_ns . */
 unsigned int halt_poll_ns_shrink;
 module_param(halt_poll_ns_shrink, uint, 0644);
@@ -101,16 +103,18 @@
 static int kvm_usage_count;
 static atomic_t hardware_enable_failed;

-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+static struct kmem_cache *kvm_vcpu_cache;

 static __read_mostly struct preempt_ops kvm_preempt_ops;
+static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

 static int kvm_debugfs_num_entries;
-static const struct file_operations *stat_fops_per_vm[];
+static const struct file_operations stat_fops_per_vm;
+
+static struct file_operations kvm_chardev_ops;

 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                            unsigned long arg);
@@ -119,21 +123,30 @@
                                   unsigned long arg);
 #define KVM_COMPAT(c) .compat_ioctl = (c)
 #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ *   passed to a compat task, let the ioctls fail.
+ */
 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                 unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+        return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
+                      .open = kvm_no_compat_open
 #endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);

 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
-
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
-
-static bool largepages_enabled = true;

 #define KVM_EVENT_CREATE_VM 0
 #define KVM_EVENT_DESTROY_VM 1
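
For orientation, the net effect of the new !CONFIG_KVM_COMPAT path is easiest to see as an expanded file_operations initializer. The sketch below is illustrative only (the struct name is invented; the real initializers appear later in this file and still use the KVM_COMPAT() macro):

/* Illustrative expansion of KVM_COMPAT(kvm_dev_ioctl) on a !CONFIG_KVM_COMPAT build. */
static struct file_operations example_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .compat_ioctl   = kvm_no_compat_ioctl, /* second line of defense: ioctls from compat tasks fail */
        .open           = kvm_no_compat_open,  /* first line of defense: compat tasks cannot open /dev/kvm */
        .llseek         = noop_llseek,
};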
@@ -143,6 +156,10 @@

 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                                                    unsigned long start, unsigned long end)
+{
+}
+
+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
 }

@@ -175,12 +192,24 @@
         return true;
 }

+bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
+{
+        struct page *page = pfn_to_page(pfn);
+
+        if (!PageTransCompoundMap(page))
+                return false;
+
+        return is_transparent_hugepage(compound_head(page));
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
 void vcpu_load(struct kvm_vcpu *vcpu)
 {
         int cpu = get_cpu();
+
+        __this_cpu_write(kvm_running_vcpu, vcpu);
         preempt_notifier_register(&vcpu->preempt_notifier);
         kvm_arch_vcpu_load(vcpu, cpu);
         put_cpu();
@@ -192,6 +221,7 @@
         preempt_disable();
         kvm_arch_vcpu_put(vcpu);
         preempt_notifier_unregister(&vcpu->preempt_notifier);
+        __this_cpu_write(kvm_running_vcpu, NULL);
         preempt_enable();
 }
 EXPORT_SYMBOL_GPL(vcpu_put);
@@ -231,6 +261,7 @@
 }

 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+                                 struct kvm_vcpu *except,
                                  unsigned long *vcpu_bitmap, cpumask_var_t tmp)
 {
         int i, cpu, me;
@@ -240,7 +271,8 @@
         me = get_cpu();

         kvm_for_each_vcpu(i, vcpu, kvm) {
-                if (!test_bit(i, vcpu_bitmap))
+                if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
+                    vcpu == except)
                         continue;

                 kvm_make_request(req, vcpu);
@@ -260,19 +292,23 @@
         return called;
 }

-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
+                                      struct kvm_vcpu *except)
 {
         cpumask_var_t cpus;
         bool called;
-        static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
-                = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};

         zalloc_cpumask_var(&cpus, GFP_ATOMIC);

-        called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
+        called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);

         free_cpumask_var(cpus);
         return called;
+}
+
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+{
+        return kvm_make_all_cpus_request_except(kvm, req, NULL);
 }

 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
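
The new *_except variant exists so a vCPU can kick every other vCPU without also sending the request to itself. A minimal usage sketch (the wrapper function is invented for illustration; KVM_REQ_TLB_FLUSH is the generic request used elsewhere in KVM):

/* Illustrative caller, not part of this patch. */
static void example_flush_all_but_self(struct kvm_vcpu *vcpu)
{
        /* The current vCPU has already flushed locally; kick everyone else. */
        kvm_make_all_cpus_request_except(vcpu->kvm, KVM_REQ_TLB_FLUSH, vcpu);
}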
@@ -308,57 +344,102 @@
         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }

-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+static void kvm_flush_shadow_all(struct kvm *kvm)
 {
-        struct page *page;
-        int r;
+        kvm_arch_flush_shadow_all(kvm);
+        kvm_arch_guest_memory_reclaimed(kvm);
+}

+#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
+                                               gfp_t gfp_flags)
+{
+        gfp_flags |= mc->gfp_zero;
+
+        if (mc->kmem_cache)
+                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
+        else
+                return (void *)__get_free_page(gfp_flags);
+}
+
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+        void *obj;
+
+        if (mc->nobjs >= min)
+                return 0;
+        while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
+                obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+                if (!obj)
+                        return mc->nobjs >= min ? 0 : -ENOMEM;
+                mc->objects[mc->nobjs++] = obj;
+        }
+        return 0;
+}
+
+int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
+{
+        return mc->nobjs;
+}
+
+void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+        while (mc->nobjs) {
+                if (mc->kmem_cache)
+                        kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
+                else
+                        free_page((unsigned long)mc->objects[--mc->nobjs]);
+        }
+}
+
+void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+        void *p;
+
+        if (WARN_ON(!mc->nobjs))
+                p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
+        else
+                p = mc->objects[--mc->nobjs];
+        BUG_ON(!p);
+        return p;
+}
+#endif
+
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
         mutex_init(&vcpu->mutex);
         vcpu->cpu = -1;
         vcpu->kvm = kvm;
         vcpu->vcpu_id = id;
         vcpu->pid = NULL;
-        init_swait_queue_head(&vcpu->wq);
+        rcuwait_init(&vcpu->wait);
         kvm_async_pf_vcpu_init(vcpu);

         vcpu->pre_pcpu = -1;
         INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

-        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-        if (!page) {
-                r = -ENOMEM;
-                goto fail;
-        }
-        vcpu->run = page_address(page);
-
         kvm_vcpu_set_in_spin_loop(vcpu, false);
         kvm_vcpu_set_dy_eligible(vcpu, false);
         vcpu->preempted = false;
-
-        r = kvm_arch_vcpu_init(vcpu);
-        if (r < 0)
-                goto fail_free_run;
-        return 0;
-
-fail_free_run:
-        free_page((unsigned long)vcpu->run);
-fail:
-        return r;
+        vcpu->ready = false;
+        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);

-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+        kvm_arch_vcpu_destroy(vcpu);
+
         /*
-         * no need for rcu_read_lock as VCPU_RUN is the only place that
-         * will change the vcpu->pid pointer and on uninit all file
-         * descriptors are already gone.
+         * No need for rcu_read_lock as VCPU_RUN is the only place that changes
+         * the vcpu->pid pointer, and at destruction time all file descriptors
+         * are already gone.
          */
         put_pid(rcu_dereference_protected(vcpu->pid, 1));
-        kvm_arch_vcpu_uninit(vcpu);
+
         free_page((unsigned long)vcpu->run);
+        kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
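
The kvm_mmu_*_memory_cache() helpers added above let architecture code pre-fill a small object pool while sleeping is still allowed and then allocate from it under mmu_lock, where blocking allocations are off limits. A minimal sketch of the intended calling pattern, with the fault handler and teardown hook invented for illustration:

/* Illustrative arch-side usage, not part of this patch. */
static int example_page_fault(struct kvm_vcpu *vcpu, struct kvm_mmu_memory_cache *mc)
{
        void *pt;
        int r;

        /* Top up outside the lock; may sleep and may fail with -ENOMEM. */
        r = kvm_mmu_topup_memory_cache(mc, 4 /* assumed worst case per fault */);
        if (r)
                return r;

        spin_lock(&vcpu->kvm->mmu_lock);
        /* Never sleeps; falls back to GFP_ATOMIC only if the cache ran dry. */
        pt = kvm_mmu_memory_cache_alloc(mc);
        /* ... link pt into the shadow page tables here ... */
        spin_unlock(&vcpu->kvm->mmu_lock);

        return pt ? 0 : -ENOMEM;
}

static void example_vcpu_teardown(struct kvm_mmu_memory_cache *mc)
{
        /* Return everything that was topped up but never consumed. */
        kvm_mmu_free_memory_cache(mc);
}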
@@ -389,16 +470,16 @@
         idx = srcu_read_lock(&kvm->srcu);
         spin_lock(&kvm->mmu_lock);
         kvm->mmu_notifier_seq++;
-        kvm_set_spte_hva(kvm, address, pte);
+
+        if (kvm_set_spte_hva(kvm, address, pte))
+                kvm_flush_remote_tlbs(kvm);
+
         spin_unlock(&kvm->mmu_lock);
         srcu_read_unlock(&kvm->srcu, idx);
 }

 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-                                        struct mm_struct *mm,
-                                        unsigned long start,
-                                        unsigned long end,
-                                        bool blockable)
+                                        const struct mmu_notifier_range *range)
 {
         struct kvm *kvm = mmu_notifier_to_kvm(mn);
         int need_tlb_flush = 0, idx;
@@ -411,21 +492,21 @@
          * count is also read inside the mmu_lock critical section.
          */
         kvm->mmu_notifier_count++;
-        need_tlb_flush = kvm_unmap_hva_range(kvm, start, end, blockable);
+        need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
+                                             range->flags);
         /* we've to flush the tlb before the pages can be freed */
         if (need_tlb_flush || kvm->tlbs_dirty)
                 kvm_flush_remote_tlbs(kvm);

         spin_unlock(&kvm->mmu_lock);
+        kvm_arch_guest_memory_reclaimed(kvm);
         srcu_read_unlock(&kvm->srcu, idx);

         return 0;
 }

 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
-                                        struct mm_struct *mm,
-                                        unsigned long start,
-                                        unsigned long end)
+                                        const struct mmu_notifier_range *range)
 {
         struct kvm *kvm = mmu_notifier_to_kvm(mn);

@@ -522,12 +603,11 @@
         int idx;

         idx = srcu_read_lock(&kvm->srcu);
-        kvm_arch_flush_shadow_all(kvm);
+        kvm_flush_shadow_all(kvm);
         srcu_read_unlock(&kvm->srcu, idx);
 }

 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
-        .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
         .invalidate_range       = kvm_mmu_notifier_invalidate_range,
         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
@@ -558,12 +638,12 @@
         int i;
         struct kvm_memslots *slots;

-        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
         if (!slots)
                 return NULL;

         for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
-                slots->id_to_index[i] = slots->memslots[i].id = i;
+                slots->id_to_index[i] = -1;

         return slots;
 }
@@ -577,18 +657,14 @@
         memslot->dirty_bitmap = NULL;
 }

-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-                             struct kvm_memory_slot *dont)
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-                kvm_destroy_dirty_bitmap(free);
+        kvm_destroy_dirty_bitmap(slot);

-        kvm_arch_free_memslot(kvm, free, dont);
+        kvm_arch_free_memslot(kvm, slot);

-        free->npages = 0;
+        slot->flags = 0;
+        slot->npages = 0;
 }

 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
@@ -599,7 +675,7 @@
                 return;

         kvm_for_each_memslot(memslot, slots)
-                kvm_free_memslot(kvm, memslot, NULL);
+                kvm_free_memslot(kvm, memslot);

         kvfree(slots);
 }
@@ -622,6 +698,8 @@

 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 {
+        static DEFINE_MUTEX(kvm_debugfs_lock);
+        struct dentry *dent;
         char dir_name[ITOA_MAX_LEN * 2];
         struct kvm_stat_data *stat_data;
         struct kvm_stats_debugfs_item *p;
@@ -630,25 +708,37 @@
                 return 0;

         snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
-        kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+        mutex_lock(&kvm_debugfs_lock);
+        dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
+        if (dent) {
+                pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
+                dput(dent);
+                mutex_unlock(&kvm_debugfs_lock);
+                return 0;
+        }
+        dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+        mutex_unlock(&kvm_debugfs_lock);
+        if (IS_ERR(dent))
+                return 0;

+        kvm->debugfs_dentry = dent;
         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
                                          sizeof(*kvm->debugfs_stat_data),
-                                         GFP_KERNEL);
+                                         GFP_KERNEL_ACCOUNT);
         if (!kvm->debugfs_stat_data)
                 return -ENOMEM;

         for (p = debugfs_entries; p->name; p++) {
-                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                 if (!stat_data)
                         return -ENOMEM;

                 stat_data->kvm = kvm;
-                stat_data->offset = p->offset;
-                stat_data->mode = p->mode ? p->mode : 0644;
+                stat_data->dbgfs_item = p;
                 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-                debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
-                                    stat_data, stat_fops_per_vm[p->kind]);
+                debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+                                    kvm->debugfs_dentry, stat_data,
+                                    &stat_fops_per_vm);
         }
         return 0;
 }
@@ -672,8 +762,9 @@

 static struct kvm *kvm_create_vm(unsigned long type)
 {
-        int r, i;
         struct kvm *kvm = kvm_arch_alloc_vm();
+        int r = -ENOMEM;
+        int i;

         if (!kvm)
                 return ERR_PTR(-ENOMEM);
@@ -685,12 +776,38 @@
         mutex_init(&kvm->lock);
         mutex_init(&kvm->irq_lock);
         mutex_init(&kvm->slots_lock);
-        refcount_set(&kvm->users_count, 1);
         INIT_LIST_HEAD(&kvm->devices);
+
+        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
+
+        if (init_srcu_struct(&kvm->srcu))
+                goto out_err_no_srcu;
+        if (init_srcu_struct(&kvm->irq_srcu))
+                goto out_err_no_irq_srcu;
+
+        refcount_set(&kvm->users_count, 1);
+        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+                struct kvm_memslots *slots = kvm_alloc_memslots();
+
+                if (!slots)
+                        goto out_err_no_arch_destroy_vm;
+                /* Generations must be different for each address space. */
+                slots->generation = i;
+                rcu_assign_pointer(kvm->memslots[i], slots);
+        }
+
+        for (i = 0; i < KVM_NR_BUSES; i++) {
+                rcu_assign_pointer(kvm->buses[i],
+                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
+                if (!kvm->buses[i])
+                        goto out_err_no_arch_destroy_vm;
+        }
+
+        kvm->max_halt_poll_ns = halt_poll_ns;

         r = kvm_arch_init_vm(kvm, type);
         if (r)
-                goto out_err_no_disable;
+                goto out_err_no_arch_destroy_vm;

         r = hardware_enable_all();
         if (r)
@@ -699,33 +816,6 @@
 #ifdef CONFIG_HAVE_KVM_IRQFD
         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
-
-        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
-
-        r = -ENOMEM;
-        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-                struct kvm_memslots *slots = kvm_alloc_memslots();
-                if (!slots)
-                        goto out_err_no_srcu;
-                /*
-                 * Generations must be different for each address space.
-                 * Init kvm generation close to the maximum to easily test the
-                 * code of handling generation number wrap-around.
-                 */
-                slots->generation = i * 2 - 150;
-                rcu_assign_pointer(kvm->memslots[i], slots);
-        }
-
-        if (init_srcu_struct(&kvm->srcu))
-                goto out_err_no_srcu;
-        if (init_srcu_struct(&kvm->irq_srcu))
-                goto out_err_no_irq_srcu;
-        for (i = 0; i < KVM_NR_BUSES; i++) {
-                rcu_assign_pointer(kvm->buses[i],
-                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
-                if (!kvm->buses[i])
-                        goto out_err_no_mmu_notifier;
-        }

         r = kvm_init_mmu_notifier(kvm);
         if (r)
@@ -741,6 +831,16 @@

         preempt_notifier_inc();

+        /*
+         * When the fd passed to this ioctl() is opened it pins the module,
+         * but try_module_get() also prevents getting a reference if the module
+         * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
+         */
+        if (!try_module_get(kvm_chardev_ops.owner)) {
+                r = -ENODEV;
+                goto out_err;
+        }
+
         return kvm;

 out_err:
@@ -749,17 +849,19 @@
         mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-        cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-        cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
         hardware_disable_all();
 out_err_no_disable:
-        refcount_set(&kvm->users_count, 0);
+        kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
         for (i = 0; i < KVM_NR_BUSES; i++)
                 kfree(kvm_get_bus(kvm, i));
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+        cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+        cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
         kvm_arch_free_vm(kvm);
         mmdrop(current->mm);
         return ERR_PTR(r);
@@ -805,7 +907,7 @@
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
-        kvm_arch_flush_shadow_all(kvm);
+        kvm_flush_shadow_all(kvm);
 #endif
         kvm_arch_destroy_vm(kvm);
         kvm_destroy_devices(kvm);
@@ -817,6 +919,7 @@
         preempt_notifier_dec();
         hardware_disable_all();
         mmdrop(mm);
+        module_put(kvm_chardev_ops.owner);
 }

 void kvm_get_kvm(struct kvm *kvm)
@@ -832,6 +935,18 @@
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);

+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner. In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+        WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
@@ -845,13 +960,13 @@

 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
+ * See kvm_vm_ioctl_get_dirty_log() why this is needed.
  */
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
         unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

-        memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
+        memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
         if (!memslot->dirty_bitmap)
                 return -ENOMEM;

@@ -859,58 +974,165 @@
 }

 /*
- * Insert memslot and re-sort memslots based on their GFN,
- * so binary search could be used to lookup GFN.
- * Sorting algorithm takes advantage of having initially
- * sorted array and known changed memslot position.
+ * Delete a memslot by decrementing the number of used slots and shifting all
+ * other entries in the array forward one spot.
 */
-static void update_memslots(struct kvm_memslots *slots,
-                            struct kvm_memory_slot *new)
+static inline void kvm_memslot_delete(struct kvm_memslots *slots,
+                                      struct kvm_memory_slot *memslot)
 {
-        int id = new->id;
-        int i = slots->id_to_index[id];
         struct kvm_memory_slot *mslots = slots->memslots;
+        int i;

-        WARN_ON(mslots[i].id != id);
-        if (!new->npages) {
-                WARN_ON(!mslots[i].npages);
-                if (mslots[i].npages)
-                        slots->used_slots--;
-        } else {
-                if (!mslots[i].npages)
-                        slots->used_slots++;
-        }
+        if (WARN_ON(slots->id_to_index[memslot->id] == -1))
+                return;

-        while (i < KVM_MEM_SLOTS_NUM - 1 &&
-               new->base_gfn <= mslots[i + 1].base_gfn) {
-                if (!mslots[i + 1].npages)
-                        break;
+        slots->used_slots--;
+
+        if (atomic_read(&slots->lru_slot) >= slots->used_slots)
+                atomic_set(&slots->lru_slot, 0);
+
+        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
                 mslots[i] = mslots[i + 1];
                 slots->id_to_index[mslots[i].id] = i;
-                i++;
         }
+        mslots[i] = *memslot;
+        slots->id_to_index[memslot->id] = -1;
+}
+
+/*
+ * "Insert" a new memslot by incrementing the number of used slots. Returns
+ * the new slot's initial index into the memslots array.
+ */
+static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
+{
+        return slots->used_slots++;
+}
+
+/*
+ * Move a changed memslot backwards in the array by shifting existing slots
+ * with a higher GFN toward the front of the array. Note, the changed memslot
+ * itself is not preserved in the array, i.e. not swapped at this time, only
+ * its new index into the array is tracked. Returns the changed memslot's
+ * current index into the memslots array.
+ */
+static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
+                                            struct kvm_memory_slot *memslot)
+{
+        struct kvm_memory_slot *mslots = slots->memslots;
+        int i;
+
+        if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
+            WARN_ON_ONCE(!slots->used_slots))
+                return -1;

         /*
-         * The ">=" is needed when creating a slot with base_gfn == 0,
-         * so that it moves before all those with base_gfn == npages == 0.
-         *
-         * On the other hand, if new->npages is zero, the above loop has
-         * already left i pointing to the beginning of the empty part of
-         * mslots, and the ">=" would move the hole backwards in this
-         * case---which is wrong. So skip the loop when deleting a slot.
+         * Move the target memslot backward in the array by shifting existing
+         * memslots with a higher GFN (than the target memslot) towards the
+         * front of the array.
          */
-        if (new->npages) {
-                while (i > 0 &&
-                       new->base_gfn >= mslots[i - 1].base_gfn) {
-                        mslots[i] = mslots[i - 1];
-                        slots->id_to_index[mslots[i].id] = i;
-                        i--;
-                }
-        } else
-                WARN_ON_ONCE(i != slots->used_slots);
+        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
+                if (memslot->base_gfn > mslots[i + 1].base_gfn)
+                        break;

-        mslots[i] = *new;
-        slots->id_to_index[mslots[i].id] = i;
+                WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
+
+                /* Shift the next memslot forward one and update its index. */
+                mslots[i] = mslots[i + 1];
+                slots->id_to_index[mslots[i].id] = i;
+        }
+        return i;
+}
+
+/*
+ * Move a changed memslot forwards in the array by shifting existing slots with
+ * a lower GFN toward the back of the array. Note, the changed memslot itself
+ * is not preserved in the array, i.e. not swapped at this time, only its new
+ * index into the array is tracked. Returns the changed memslot's final index
+ * into the memslots array.
+ */
+static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
+                                           struct kvm_memory_slot *memslot,
+                                           int start)
+{
+        struct kvm_memory_slot *mslots = slots->memslots;
+        int i;
+
+        for (i = start; i > 0; i--) {
+                if (memslot->base_gfn < mslots[i - 1].base_gfn)
+                        break;
+
+                WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
+
+                /* Shift the next memslot back one and update its index. */
+                mslots[i] = mslots[i - 1];
+                slots->id_to_index[mslots[i].id] = i;
+        }
+        return i;
+}
+
+/*
+ * Re-sort memslots based on their GFN to account for an added, deleted, or
+ * moved memslot. Sorting memslots by GFN allows using a binary search during
+ * memslot lookup.
+ *
+ * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry
+ * at memslots[0] has the highest GFN.
+ *
+ * The sorting algorithm takes advantage of having initially sorted memslots
+ * and knowing the position of the changed memslot. Sorting is also optimized
+ * by not swapping the updated memslot and instead only shifting other memslots
+ * and tracking the new index for the updated memslot. Only once its final
+ * index is known is the updated memslot copied into its position in the array.
+ *
+ * - When deleting a memslot, the deleted memslot simply needs to be moved to
+ *   the end of the array.
+ *
+ * - When creating a memslot, the algorithm "inserts" the new memslot at the
+ *   end of the array and then shifts it forward to its correct location.
+ *
+ * - When moving a memslot, the algorithm first moves the updated memslot
+ *   backward to handle the scenario where the memslot's GFN was changed to a
+ *   lower value. update_memslots() then falls through and runs the same flow
+ *   as creating a memslot to move the memslot forward to handle the scenario
+ *   where its GFN was changed to a higher value.
+ *
+ * Note, slots are sorted from highest->lowest instead of lowest->highest for
+ * historical reasons. Originally, invalid memslots were denoted by having
+ * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
+ * to the end of the array. The current algorithm uses dedicated logic to
+ * delete a memslot and thus does not rely on invalid memslots having GFN=0.
+ *
+ * The other historical motivation for highest->lowest was to improve the
+ * performance of memslot lookup. KVM originally used a linear search starting
+ * at memslots[0]. On x86, the largest memslot usually has one of the highest,
+ * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
+ * single memslot above the 4gb boundary. As the largest memslot is also the
+ * most likely to be referenced, sorting it to the front of the array was
+ * advantageous. The current binary search starts from the middle of the array
+ * and uses an LRU pointer to improve performance for all memslots and GFNs.
+ */
+static void update_memslots(struct kvm_memslots *slots,
+                            struct kvm_memory_slot *memslot,
+                            enum kvm_mr_change change)
+{
+        int i;
+
+        if (change == KVM_MR_DELETE) {
+                kvm_memslot_delete(slots, memslot);
+        } else {
+                if (change == KVM_MR_CREATE)
+                        i = kvm_memslot_insert_back(slots);
+                else
+                        i = kvm_memslot_move_backward(slots, memslot);
+                i = kvm_memslot_move_forward(slots, memslot, i);
+
+                /*
+                 * Copy the memslot to its new position in memslots and update
+                 * its index accordingly.
+                 */
+                slots->memslots[i] = *memslot;
+                slots->id_to_index[memslot->id] = i;
+        }
 }

 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
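
Because the array is kept sorted from highest base_gfn down to lowest, a GFN lookup becomes a binary search over a descending array. A simplified sketch of that lookup (the real helper is search_memslots() in kvm_host.h, which additionally keeps an LRU hint; this standalone version is illustrative only):

/* Simplified illustration of a lookup over the descending-sorted array. */
static struct kvm_memory_slot *example_find_slot(struct kvm_memslots *slots, gfn_t gfn)
{
        struct kvm_memory_slot *m = slots->memslots;
        int lo = 0, hi = slots->used_slots - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;

                if (gfn >= m[mid].base_gfn && gfn < m[mid].base_gfn + m[mid].npages)
                        return &m[mid];     /* gfn falls inside this slot */
                if (gfn >= m[mid].base_gfn + m[mid].npages)
                        hi = mid - 1;       /* higher GFNs live toward index 0 */
                else
                        lo = mid + 1;       /* lower GFNs live toward the tail */
        }
        return NULL;
}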
@@ -931,36 +1153,148 @@
                 int as_id, struct kvm_memslots *slots)
 {
         struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
-        u64 gen;
+        u64 gen = old_memslots->generation;

-        /*
-         * Set the low bit in the generation, which disables SPTE caching
-         * until the end of synchronize_srcu_expedited.
-         */
-        WARN_ON(old_memslots->generation & 1);
-        slots->generation = old_memslots->generation + 1;
+        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

         rcu_assign_pointer(kvm->memslots[as_id], slots);
         synchronize_srcu_expedited(&kvm->srcu);

         /*
-         * Increment the new memslot generation a second time. This prevents
-         * vm exits that race with memslot updates from caching a memslot
-         * generation that will (potentially) be valid forever.
-         *
+         * Increment the new memslot generation a second time, dropping the
+         * update in-progress flag and incrementing the generation based on
+         * the number of address spaces. This provides a unique and easily
+         * identifiable generation number while the memslots are in flux.
+         */
+        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+
+        /*
          * Generations must be unique even across address spaces. We do not need
          * a global counter for that, instead the generation space is evenly split
          * across address spaces. For example, with two address spaces, address
-         * space 0 will use generations 0, 4, 8, ... while * address space 1 will
-         * use generations 2, 6, 10, 14, ...
+         * space 0 will use generations 0, 2, 4, ... while address space 1 will
+         * use generations 1, 3, 5, ...
          */
-        gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
+        gen += KVM_ADDRESS_SPACE_NUM;

         kvm_arch_memslots_updated(kvm, gen);

         slots->generation = gen;

         return old_memslots;
+}
+
+/*
+ * Note, at a minimum, the current number of used slots must be allocated, even
+ * when deleting a memslot, as we need a complete duplicate of the memslots for
+ * use when invalidating a memslot prior to deleting/moving the memslot.
+ */
+static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
+                                             enum kvm_mr_change change)
+{
+        struct kvm_memslots *slots;
+        size_t old_size, new_size;
+
+        old_size = sizeof(struct kvm_memslots) +
+                   (sizeof(struct kvm_memory_slot) * old->used_slots);
+
+        if (change == KVM_MR_CREATE)
+                new_size = old_size + sizeof(struct kvm_memory_slot);
+        else
+                new_size = old_size;
+
+        slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
+        if (likely(slots))
+                memcpy(slots, old, old_size);
+
+        return slots;
+}
+
+static int kvm_set_memslot(struct kvm *kvm,
+                           const struct kvm_userspace_memory_region *mem,
+                           struct kvm_memory_slot *old,
+                           struct kvm_memory_slot *new, int as_id,
+                           enum kvm_mr_change change)
+{
+        struct kvm_memory_slot *slot;
+        struct kvm_memslots *slots;
+        int r;
+
+        slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
+        if (!slots)
+                return -ENOMEM;
+
+        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+                /*
+                 * Note, the INVALID flag needs to be in the appropriate entry
+                 * in the freshly allocated memslots, not in @old or @new.
+                 */
+                slot = id_to_memslot(slots, old->id);
+                slot->flags |= KVM_MEMSLOT_INVALID;
+
+                /*
+                 * We can re-use the old memslots, the only difference from the
+                 * newly installed memslots is the invalid flag, which will get
+                 * dropped by update_memslots anyway. We'll also revert to the
+                 * old memslots if preparing the new memory region fails.
+                 */
+                slots = install_new_memslots(kvm, as_id, slots);
+
+                /* From this point no new shadow pages pointing to a deleted,
+                 * or moved, memslot will be created.
+                 *
+                 * validation of sp->gfn happens in:
+                 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+                 * - kvm_is_visible_gfn (mmu_check_root)
+                 */
+                kvm_arch_flush_shadow_memslot(kvm, slot);
+                kvm_arch_guest_memory_reclaimed(kvm);
+        }
+
+        r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
+        if (r)
+                goto out_slots;
+
+        update_memslots(slots, new, change);
+        slots = install_new_memslots(kvm, as_id, slots);
+
+        kvm_arch_commit_memory_region(kvm, mem, old, new, change);
+
+        kvfree(slots);
+        return 0;
+
+out_slots:
+        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+                slots = install_new_memslots(kvm, as_id, slots);
+        kvfree(slots);
+        return r;
+}
+
+static int kvm_delete_memslot(struct kvm *kvm,
+                              const struct kvm_userspace_memory_region *mem,
+                              struct kvm_memory_slot *old, int as_id)
+{
+        struct kvm_memory_slot new;
+        int r;
+
+        if (!old->npages)
+                return -EINVAL;
+
+        memset(&new, 0, sizeof(new));
+        new.id = old->id;
+        /*
+         * This is only for debugging purpose; it should never be referenced
+         * for a removed memslot.
+         */
+        new.as_id = as_id;
+
+        r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
+        if (r)
+                return r;
+
+        kvm_free_memslot(kvm, old);
+        return 0;
 }

 /*
@@ -974,163 +1308,120 @@
 int __kvm_set_memory_region(struct kvm *kvm,
                             const struct kvm_userspace_memory_region *mem)
 {
-        int r;
-        gfn_t base_gfn;
-        unsigned long npages;
-        struct kvm_memory_slot *slot;
         struct kvm_memory_slot old, new;
-        struct kvm_memslots *slots = NULL, *old_memslots;
-        int as_id, id;
+        struct kvm_memory_slot *tmp;
         enum kvm_mr_change change;
+        int as_id, id;
+        int r;

         r = check_memory_region_flags(mem);
         if (r)
-                goto out;
+                return r;

-        r = -EINVAL;
         as_id = mem->slot >> 16;
         id = (u16)mem->slot;

         /* General sanity checks */
-        if (mem->memory_size & (PAGE_SIZE - 1))
-                goto out;
+        if ((mem->memory_size & (PAGE_SIZE - 1)) ||
+            (mem->memory_size != (unsigned long)mem->memory_size))
+                return -EINVAL;
         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-                goto out;
+                return -EINVAL;
         /* We can read the guest memory with __xxx_user() later on. */
-        if ((id < KVM_USER_MEM_SLOTS) &&
-            ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
-             !access_ok(VERIFY_WRITE,
-                        (void __user *)(unsigned long)mem->userspace_addr,
-                        mem->memory_size)))
-                goto out;
+        if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+            (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
+            !access_ok((void __user *)(unsigned long)mem->userspace_addr,
+                       mem->memory_size))
+                return -EINVAL;
         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
-                goto out;
+                return -EINVAL;
         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-                goto out;
+                return -EINVAL;

-        slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
-        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-        npages = mem->memory_size >> PAGE_SHIFT;
-
-        if (npages > KVM_MEM_MAX_NR_PAGES)
-                goto out;
-
-        new = old = *slot;
-
-        new.id = id;
-        new.base_gfn = base_gfn;
-        new.npages = npages;
-        new.flags = mem->flags;
-
-        if (npages) {
-                if (!old.npages)
-                        change = KVM_MR_CREATE;
-                else { /* Modify an existing slot. */
-                        if ((mem->userspace_addr != old.userspace_addr) ||
-                            (npages != old.npages) ||
-                            ((new.flags ^ old.flags) & KVM_MEM_READONLY))
-                                goto out;
-
-                        if (base_gfn != old.base_gfn)
-                                change = KVM_MR_MOVE;
-                        else if (new.flags != old.flags)
-                                change = KVM_MR_FLAGS_ONLY;
-                        else { /* Nothing to change. */
-                                r = 0;
-                                goto out;
-                        }
-                }
+        /*
+         * Make a full copy of the old memslot, the pointer will become stale
+         * when the memslots are re-sorted by update_memslots(), and the old
+         * memslot needs to be referenced after calling update_memslots(), e.g.
+         * to free its resources and for arch specific behavior.
+         */
+        tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+        if (tmp) {
+                old = *tmp;
+                tmp = NULL;
         } else {
-                if (!old.npages)
-                        goto out;
+                memset(&old, 0, sizeof(old));
+                old.id = id;
+        }

-                change = KVM_MR_DELETE;
-                new.base_gfn = 0;
-                new.flags = 0;
+        if (!mem->memory_size)
+                return kvm_delete_memslot(kvm, mem, &old, as_id);
+
+        new.as_id = as_id;
+        new.id = id;
+        new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+        new.npages = mem->memory_size >> PAGE_SHIFT;
+        new.flags = mem->flags;
+        new.userspace_addr = mem->userspace_addr;
+
+        if (new.npages > KVM_MEM_MAX_NR_PAGES)
+                return -EINVAL;
+
+        if (!old.npages) {
+                change = KVM_MR_CREATE;
+                new.dirty_bitmap = NULL;
+                memset(&new.arch, 0, sizeof(new.arch));
+        } else { /* Modify an existing slot. */
+                if ((new.userspace_addr != old.userspace_addr) ||
+                    (new.npages != old.npages) ||
+                    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
+                        return -EINVAL;
+
+                if (new.base_gfn != old.base_gfn)
+                        change = KVM_MR_MOVE;
+                else if (new.flags != old.flags)
+                        change = KVM_MR_FLAGS_ONLY;
+                else /* Nothing to change. */
+                        return 0;
+
+                /* Copy dirty_bitmap and arch from the current memslot. */
+                new.dirty_bitmap = old.dirty_bitmap;
+                memcpy(&new.arch, &old.arch, sizeof(new.arch));
         }

         if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
                 /* Check for overlaps */
-                r = -EEXIST;
-                kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
-                        if (slot->id == id)
+                kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
+                        if (tmp->id == id)
                                 continue;
-                        if (!((base_gfn + npages <= slot->base_gfn) ||
-                              (base_gfn >= slot->base_gfn + slot->npages)))
-                                goto out;
+                        if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
+                              (new.base_gfn >= tmp->base_gfn + tmp->npages)))
+                                return -EEXIST;
                 }
         }

-        /* Free page dirty bitmap if unneeded */
+        /* Allocate/free page dirty bitmap as needed */
         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
                 new.dirty_bitmap = NULL;
+        else if (!new.dirty_bitmap) {
+                r = kvm_alloc_dirty_bitmap(&new);
+                if (r)
+                        return r;

-        r = -ENOMEM;
-        if (change == KVM_MR_CREATE) {
-                new.userspace_addr = mem->userspace_addr;
-
-                if (kvm_arch_create_memslot(kvm, &new, npages))
-                        goto out_free;
+                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+                        bitmap_set(new.dirty_bitmap, 0, new.npages);
         }

-        /* Allocate page dirty bitmap if needed */
-        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-                if (kvm_create_dirty_bitmap(&new) < 0)
-                        goto out_free;
-        }
-
-        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-        if (!slots)
-                goto out_free;
-        memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
-
-        if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-                slot = id_to_memslot(slots, id);
-                slot->flags |= KVM_MEMSLOT_INVALID;
-
-                old_memslots = install_new_memslots(kvm, as_id, slots);
-
-                /* From this point no new shadow pages pointing to a deleted,
-                 * or moved, memslot will be created.
-                 *
-                 * validation of sp->gfn happens in:
-                 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                 * - kvm_is_visible_gfn (mmu_check_roots)
-                 */
-                kvm_arch_flush_shadow_memslot(kvm, slot);
-
-                /*
-                 * We can re-use the old_memslots from above, the only difference
-                 * from the currently installed memslots is the invalid flag. This
-                 * will get overwritten by update_memslots anyway.
-                 */
-                slots = old_memslots;
-        }
-
-        r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
+        r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
         if (r)
-                goto out_slots;
+                goto out_bitmap;

-        /* actual memory is freed via old in kvm_free_memslot below */
-        if (change == KVM_MR_DELETE) {
-                new.dirty_bitmap = NULL;
-                memset(&new.arch, 0, sizeof(new.arch));
-        }
-
-        update_memslots(slots, &new);
-        old_memslots = install_new_memslots(kvm, as_id, slots);
-
-        kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
-
-        kvm_free_memslot(kvm, &old, &new);
-        kvfree(old_memslots);
+        if (old.dirty_bitmap && !new.dirty_bitmap)
+                kvm_destroy_dirty_bitmap(&old);
         return 0;

-out_slots:
-        kvfree(slots);
-out_free:
-        kvm_free_memslot(kvm, &new, &old);
-out:
+out_bitmap:
+        if (new.dirty_bitmap && !old.dirty_bitmap)
+                kvm_destroy_dirty_bitmap(&new);
         return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -1156,14 +1447,24 @@
         return kvm_set_memory_region(kvm, mem);
 }

-int kvm_get_dirty_log(struct kvm *kvm,
-                      struct kvm_dirty_log *log, int *is_dirty)
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log - get a snapshot of dirty pages
+ * @kvm:        pointer to kvm instance
+ * @log:        slot id and address to which we copy the log
+ * @is_dirty:   set to '1' if any dirty pages were found
+ * @memslot:    set to the associated memslot, always valid on success
+ */
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
+                      int *is_dirty, struct kvm_memory_slot **memslot)
 {
         struct kvm_memslots *slots;
-        struct kvm_memory_slot *memslot;
         int i, as_id, id;
         unsigned long n;
         unsigned long any = 0;
+
+        *memslot = NULL;
+        *is_dirty = 0;

         as_id = log->slot >> 16;
         id = (u16)log->slot;
@@ -1171,16 +1472,18 @@
                 return -EINVAL;

         slots = __kvm_memslots(kvm, as_id);
-        memslot = id_to_memslot(slots, id);
-        if (!memslot->dirty_bitmap)
+        *memslot = id_to_memslot(slots, id);
+        if (!(*memslot) || !(*memslot)->dirty_bitmap)
                 return -ENOENT;

-        n = kvm_dirty_bitmap_bytes(memslot);
+        kvm_arch_sync_dirty_log(kvm, *memslot);
+
+        n = kvm_dirty_bitmap_bytes(*memslot);

         for (i = 0; !any && i < n/sizeof(long); ++i)
-                any = memslot->dirty_bitmap[i];
+                any = (*memslot)->dirty_bitmap[i];

-        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+        if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
                 return -EFAULT;

         if (any)
@@ -1189,13 +1492,12 @@
 }
 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- *      are dirty write protect them for next write.
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
+ *      and reenable dirty page tracking for the corresponding pages.
  * @kvm:        pointer to kvm instance
  * @log:        slot id and address to which we copy the log
- * @is_dirty:   flag set if any page is dirty
  *
  * We need to keep it in mind that VCPU threads can write to the bitmap
  * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1212,8 +1514,7 @@
  * exiting to userspace will be logged for the next call.
  *
  */
-int kvm_get_dirty_log_protect(struct kvm *kvm,
-                              struct kvm_dirty_log *log, bool *is_dirty)
+static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 {
         struct kvm_memslots *slots;
         struct kvm_memory_slot *memslot;
@@ -1221,6 +1522,7 @@
         unsigned long n;
         unsigned long *dirty_bitmap;
         unsigned long *dirty_bitmap_buffer;
+        bool flush;

         as_id = log->slot >> 16;
         id = (u16)log->slot;
@@ -1229,55 +1531,180 @@

         slots = __kvm_memslots(kvm, as_id);
         memslot = id_to_memslot(slots, id);
-
-        dirty_bitmap = memslot->dirty_bitmap;
-        if (!dirty_bitmap)
+        if (!memslot || !memslot->dirty_bitmap)
                 return -ENOENT;

+        dirty_bitmap = memslot->dirty_bitmap;
+
+        kvm_arch_sync_dirty_log(kvm, memslot);
+
         n = kvm_dirty_bitmap_bytes(memslot);
+        flush = false;
+        if (kvm->manual_dirty_log_protect) {
+                /*
+                 * Unlike kvm_get_dirty_log, we always return false in *flush,
+                 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
+                 * is some code duplication between this function and
+                 * kvm_get_dirty_log, but hopefully all architectures
+                 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+                 * can be eliminated.
+                 */
+                dirty_bitmap_buffer = dirty_bitmap;
+        } else {
+                dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+                memset(dirty_bitmap_buffer, 0, n);

-        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
-        memset(dirty_bitmap_buffer, 0, n);
+                spin_lock(&kvm->mmu_lock);
+                for (i = 0; i < n / sizeof(long); i++) {
+                        unsigned long mask;
+                        gfn_t offset;

-        spin_lock(&kvm->mmu_lock);
-        *is_dirty = false;
-        for (i = 0; i < n / sizeof(long); i++) {
-                unsigned long mask;
-                gfn_t offset;
+                        if (!dirty_bitmap[i])
+                                continue;

-                if (!dirty_bitmap[i])
-                        continue;
+                        flush = true;
+                        mask = xchg(&dirty_bitmap[i], 0);
+                        dirty_bitmap_buffer[i] = mask;

-                *is_dirty = true;
-
-                mask = xchg(&dirty_bitmap[i], 0);
-                dirty_bitmap_buffer[i] = mask;
-
-                if (mask) {
                         offset = i * BITS_PER_LONG;
                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                 offset, mask);
                 }
+                spin_unlock(&kvm->mmu_lock);
         }

-        spin_unlock(&kvm->mmu_lock);
+        if (flush)
+                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+
         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
                 return -EFAULT;
         return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
-#endif

-bool kvm_largepages_enabled(void)
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide a general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Copy the snapshot to the userspace.
+ *   4. Flush TLBs if needed.
+ */
+static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                      struct kvm_dirty_log *log)
 {
-        return largepages_enabled;
+        int r;
+
+        mutex_lock(&kvm->slots_lock);
+
+        r = kvm_get_dirty_log_protect(kvm, log);
+
+        mutex_unlock(&kvm->slots_lock);
+        return r;
 }

-void kvm_disable_largepages(void)
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ *      and reenable dirty page tracking for the corresponding pages.
+ * @kvm:        pointer to kvm instance
+ * @log:        slot id and address from which to fetch the bitmap of dirty pages
+ */
+static int kvm_clear_dirty_log_protect(struct kvm *kvm,
+                                       struct kvm_clear_dirty_log *log)
 {
-        largepages_enabled = false;
+        struct kvm_memslots *slots;
+        struct kvm_memory_slot *memslot;
+        int as_id, id;
+        gfn_t offset;
+        unsigned long i, n;
+        unsigned long *dirty_bitmap;
+        unsigned long *dirty_bitmap_buffer;
+        bool flush;
+
+        as_id = log->slot >> 16;
+        id = (u16)log->slot;
+        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+                return -EINVAL;
+
+        if (log->first_page & 63)
+                return -EINVAL;
+
+        slots = __kvm_memslots(kvm, as_id);
+        memslot = id_to_memslot(slots, id);
+        if (!memslot || !memslot->dirty_bitmap)
+                return -ENOENT;
+
+        dirty_bitmap = memslot->dirty_bitmap;
+
+        n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
+
+        if (log->first_page > memslot->npages ||
+            log->num_pages > memslot->npages - log->first_page ||
+            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+                return -EINVAL;
+
+        kvm_arch_sync_dirty_log(kvm, memslot);
+
+        flush = false;
+        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+                return -EFAULT;
+
+        spin_lock(&kvm->mmu_lock);
+        for (offset = log->first_page, i = offset / BITS_PER_LONG,
+             n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
+             i++, offset += BITS_PER_LONG) {
+                unsigned long mask = *dirty_bitmap_buffer++;
+                atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+                if (!mask)
+                        continue;
+
+                mask &= atomic_long_fetch_andnot(mask, p);
+
+                /*
+                 * mask contains the bits that really have been cleared. This
+                 * never includes any bits beyond the length of the memslot (if
+                 * the length is not aligned to 64 pages), therefore it is not
+                 * a problem if userspace sets them in log->dirty_bitmap.
+                 */
+                if (mask) {
+                        flush = true;
+                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+                                                                offset, mask);
+                }
+        }
+        spin_unlock(&kvm->mmu_lock);
+
+        if (flush)
+                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+
+        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+
+static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
+                                        struct kvm_clear_dirty_log *log)
+{
+        int r;
+
+        mutex_lock(&kvm->slots_lock);
+
+        r = kvm_clear_dirty_log_protect(kvm, log);
+
+        mutex_unlock(&kvm->slots_lock);
+        return r;
+}
+#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
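
From userspace, the split introduced here shows up as two ioctls: KVM_GET_DIRTY_LOG only snapshots the bitmap when the manual-protect capability is enabled, and KVM_CLEAR_DIRTY_LOG re-arms write protection for the pages the caller has consumed. A hedged sketch of the calling sequence (vm_fd, slot, npages and bitmap are assumed to exist; error handling omitted):

/* Illustrative userspace usage, not part of this patch. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void example_harvest_dirty_pages(int vm_fd, __u32 slot, __u64 npages, void *bitmap)
{
        struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
        struct kvm_clear_dirty_log clear = {
                .slot = slot,
                .first_page = 0,
                .num_pages = npages,    /* must be 64-aligned unless it reaches the end of the slot */
                .dirty_bitmap = bitmap,
        };

        ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);     /* snapshot only; no flush until the clear below */
        /* ... migrate/transmit the pages marked in bitmap ... */
        ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear); /* write-protect those pages again */
}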
@@ -1294,13 +1721,17 @@
 {
         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

-        if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
-            memslot->flags & KVM_MEMSLOT_INVALID)
-                return false;
-
-        return true;
+        return kvm_is_visible_memslot(memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+        struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+        return kvm_is_visible_memslot(memslot);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);

 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
@@ -1313,7 +1744,7 @@
         if (kvm_is_error_hva(addr))
                 return PAGE_SIZE;

-        down_read(&current->mm->mmap_sem);
+        mmap_read_lock(current->mm);
         vma = find_vma(current->mm, addr);
         if (!vma)
                 goto out;
@@ -1321,7 +1752,7 @@
         size = vma_kernel_pagesize(vma);

 out:
-        up_read(&current->mm->mmap_sem);
+        mmap_read_unlock(current->mm);

         return size;
 }
@@ -1372,8 +1803,12 @@
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

 /*
- * If writable is set to false, the hva returned by this function is only
- * allowed to be read.
+ * Return the hva of a @gfn and the R/W attribute if possible.
+ *
+ * @slot: the kvm_memory_slot which contains @gfn
+ * @gfn: the gfn to be translated
+ * @writable: used to return the read/write attribute of the @slot if the hva
+ * is valid and @writable is not NULL
  */
 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
                                       gfn_t gfn, bool *writable)
@@ -1411,13 +1846,12 @@
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned. It's also the
- * only part that runs if we can are in atomic context.
+ * only part that runs if we can in atomic context.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
                             bool *writable, kvm_pfn_t *pfn)
 {
         struct page *page[1];
-        int npages;

         /*
          * Fast pin a writable pfn only if it is a write fault request
@@ -1427,8 +1861,7 @@
         if (!(write_fault || writable))
                 return false;

-        npages = __get_user_pages_fast(addr, 1, 1, page);
-        if (npages == 1) {
+        if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
                 *pfn = page_to_pfn(page[0]);

                 if (writable)
@@ -1468,7 +1901,7 @@
         if (unlikely(!write_fault) && writable) {
                 struct page *wpage;

-                if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
+                if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
                         *writable = true;
                         put_page(page);
                         page = wpage;
@@ -1506,14 +1939,14 @@
         spinlock_t *ptl;
         int r;

-        r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
+        r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
         if (r) {
                 /*
                  * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
                  * not call the fault handler, so do it here.
                  */
                 bool unlocked = false;
-                r = fixup_user_fault(current, current->mm, addr,
+                r = fixup_user_fault(current->mm, addr,
                                      (write_fault ? FAULT_FLAG_WRITE : 0),
                                      &unlocked);
                 if (unlocked)
@@ -1521,7 +1954,7 @@
                 if (r)
                         return r;

-                r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
+                r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
                 if (r)
                         return r;
         }
@@ -1596,7 +2029,7 @@
         if (npages == 1)
                 return pfn;

-        down_read(&current->mm->mmap_sem);
+        mmap_read_lock(current->mm);
         if (npages == -EHWPOISON ||
               (!async && check_user_page_hwpoison(addr))) {
                 pfn = KVM_PFN_ERR_HWPOISON;
@@ -1620,7 +2053,7 @@
                 pfn = KVM_PFN_ERR_FAULT;
         }
 exit:
-        up_read(&current->mm->mmap_sem);
+        mmap_read_unlock(current->mm);
         return pfn;
 }

....@@ -1673,12 +2106,6 @@
16732106 }
16742107 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
16752108
1676
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1677
-{
1678
- return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1679
-}
1680
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1681
-
16822109 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
16832110 {
16842111 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
....@@ -1710,7 +2137,7 @@
17102137 if (entry < nr_pages)
17112138 return 0;
17122139
1713
- return __get_user_pages_fast(addr, nr_pages, 1, pages);
2140
+ return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
17142141 }
17152142 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
17162143
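The pinning hunks above switch from __get_user_pages_fast(addr, n, 1, pages) to get_user_page_fast_only()/get_user_pages_fast_only(), which take FOLL_* flags instead of a bare write argument. A small sketch of the single-page fast-pin shape as hva_to_pfn_fast() now uses it (kernel context):

/* Sketch: opportunistically pin one writable page without sleeping. */
static bool fast_pin_writable_sketch(unsigned long addr, kvm_pfn_t *pfn)
{
	struct page *page[1];

	/* was: __get_user_pages_fast(addr, 1, 1, page) == 1 */
	if (!get_user_page_fast_only(addr, FOLL_WRITE, page))
		return false;

	*pfn = page_to_pfn(page[0]);
	return true;
}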
....@@ -1924,20 +2351,28 @@
19242351 }
19252352 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
19262353
2354
+static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
2355
+{
2356
+ if (!pfn_valid(pfn))
2357
+ return false;
2358
+
2359
+ /*
2360
+ * Per page-flags.h, pages tagged PG_reserved "should in general not be
2361
+ * touched (e.g. set dirty) except by its owner".
2362
+ */
2363
+ return !PageReserved(pfn_to_page(pfn));
2364
+}
2365
+
19272366 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
19282367 {
1929
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
1930
- struct page *page = pfn_to_page(pfn);
1931
-
1932
- if (!PageReserved(page))
1933
- SetPageDirty(page);
1934
- }
2368
+ if (kvm_is_ad_tracked_pfn(pfn))
2369
+ SetPageDirty(pfn_to_page(pfn));
19352370 }
19362371 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
19372372
19382373 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
19392374 {
1940
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2375
+ if (kvm_is_ad_tracked_pfn(pfn))
19412376 mark_page_accessed(pfn_to_page(pfn));
19422377 }
19432378 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
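kvm_is_ad_tracked_pfn() now gives kvm_set_pfn_dirty() and kvm_set_pfn_accessed() a single gate: the pfn must be valid and its struct page must not be PG_reserved. A brief, hedged sketch of the typical caller pattern, modelled on kvm_release_pfn_dirty() just above (the release helpers are defined elsewhere in this file):

/* Sketch: mark a mapped guest page dirty before dropping the reference.
 * kvm_set_pfn_dirty() is now a no-op unless kvm_is_ad_tracked_pfn(pfn). */
static void release_dirty_pfn_sketch(kvm_pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}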
....@@ -2047,17 +2482,6 @@
20472482 return 0;
20482483 }
20492484
2050
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
2051
- unsigned long len)
2052
-{
2053
- gfn_t gfn = gpa >> PAGE_SHIFT;
2054
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2055
- int offset = offset_in_page(gpa);
2056
-
2057
- return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2058
-}
2059
-EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
2060
-
20612485 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
20622486 void *data, unsigned long len)
20632487 {
....@@ -2155,30 +2579,34 @@
21552579 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
21562580 gfn_t nr_pages_avail;
21572581
2158
- ghc->gpa = gpa;
2582
+ /* Update ghc->generation before performing any error checks. */
21592583 ghc->generation = slots->generation;
2160
- ghc->len = len;
2161
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2162
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
2163
- if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
2164
- ghc->hva += offset;
2165
- } else {
2166
- /*
2167
- * If the requested region crosses two memslots, we still
2168
- * verify that the entire region is valid here.
2169
- */
2170
- while (start_gfn <= end_gfn) {
2171
- nr_pages_avail = 0;
2172
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2173
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2174
- &nr_pages_avail);
2175
- if (kvm_is_error_hva(ghc->hva))
2176
- return -EFAULT;
2177
- start_gfn += nr_pages_avail;
2178
- }
2179
- /* Use the slow path for cross page reads and writes. */
2180
- ghc->memslot = NULL;
2584
+
2585
+ if (start_gfn > end_gfn) {
2586
+ ghc->hva = KVM_HVA_ERR_BAD;
2587
+ return -EINVAL;
21812588 }
2589
+
2590
+ /*
2591
+ * If the requested region crosses two memslots, we still
2592
+ * verify that the entire region is valid here.
2593
+ */
2594
+ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2595
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2596
+ ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2597
+ &nr_pages_avail);
2598
+ if (kvm_is_error_hva(ghc->hva))
2599
+ return -EFAULT;
2600
+ }
2601
+
2602
+ /* Use the slow path for cross page reads and writes. */
2603
+ if (nr_pages_needed == 1)
2604
+ ghc->hva += offset;
2605
+ else
2606
+ ghc->memslot = NULL;
2607
+
2608
+ ghc->gpa = gpa;
2609
+ ghc->len = len;
21822610 return 0;
21832611 }
21842612
....@@ -2198,10 +2626,13 @@
21982626 int r;
21992627 gpa_t gpa = ghc->gpa + offset;
22002628
2201
- BUG_ON(len + offset > ghc->len);
2629
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2630
+ return -EINVAL;
22022631
2203
- if (slots->generation != ghc->generation)
2204
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2632
+ if (slots->generation != ghc->generation) {
2633
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2634
+ return -EFAULT;
2635
+ }
22052636
22062637 if (kvm_is_error_hva(ghc->hva))
22072638 return -EFAULT;
....@@ -2225,28 +2656,40 @@
22252656 }
22262657 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
22272658
2228
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2229
- void *data, unsigned long len)
2659
+int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2660
+ void *data, unsigned int offset,
2661
+ unsigned long len)
22302662 {
22312663 struct kvm_memslots *slots = kvm_memslots(kvm);
22322664 int r;
2665
+ gpa_t gpa = ghc->gpa + offset;
22332666
2234
- BUG_ON(len > ghc->len);
2667
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2668
+ return -EINVAL;
22352669
2236
- if (slots->generation != ghc->generation)
2237
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2670
+ if (slots->generation != ghc->generation) {
2671
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2672
+ return -EFAULT;
2673
+ }
22382674
22392675 if (kvm_is_error_hva(ghc->hva))
22402676 return -EFAULT;
22412677
22422678 if (unlikely(!ghc->memslot))
2243
- return kvm_read_guest(kvm, ghc->gpa, data, len);
2679
+ return kvm_read_guest(kvm, gpa, data, len);
22442680
2245
- r = __copy_from_user(data, (void __user *)ghc->hva, len);
2681
+ r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
22462682 if (r)
22472683 return -EFAULT;
22482684
22492685 return 0;
2686
+}
2687
+EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2688
+
2689
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2690
+ void *data, unsigned long len)
2691
+{
2692
+ return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
22502693 }
22512694 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
22522695
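With the offset-aware variant in place, kvm_read_guest_cached() becomes a thin wrapper over kvm_read_guest_offset_cached(), mirroring the write side. A hedged sketch of how arch code typically drives the cache: initialize it once for a guest-physical region, then access individual fields by offset. The struct layout below is purely illustrative; kvm_gfn_to_hva_cache_init() is the public initializer for struct gfn_to_hva_cache.

/* Sketch (kernel context): cache the gpa->hva translation of a small
 * guest-shared structure and update one field through the cache. */
struct shared_area {			/* illustrative guest-visible layout */
	u32 flags;
	u32 token;
};

static int publish_token_sketch(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				gpa_t gpa, u32 token)
{
	int r;

	r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(struct shared_area));
	if (r)
		return r;

	return kvm_write_guest_offset_cached(kvm, ghc, &token,
					     offsetof(struct shared_area, token),
					     sizeof(token));
}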
....@@ -2277,8 +2720,7 @@
22772720 }
22782721 EXPORT_SYMBOL_GPL(kvm_clear_guest);
22792722
2280
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2281
- gfn_t gfn)
2723
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn)
22822724 {
22832725 if (memslot && memslot->dirty_bitmap) {
22842726 unsigned long rel_gfn = gfn - memslot->base_gfn;
....@@ -2286,6 +2728,7 @@
22862728 set_bit_le(rel_gfn, memslot->dirty_bitmap);
22872729 }
22882730 }
2731
+EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
22892732
22902733 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
22912734 {
....@@ -2330,33 +2773,40 @@
23302773
23312774 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
23322775 {
2333
- unsigned int old, val, grow;
2776
+ unsigned int old, val, grow, grow_start;
23342777
23352778 old = val = vcpu->halt_poll_ns;
2779
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23362780 grow = READ_ONCE(halt_poll_ns_grow);
2337
- /* 10us base */
2338
- if (val == 0 && grow)
2339
- val = 10000;
2340
- else
2341
- val *= grow;
2781
+ if (!grow)
2782
+ goto out;
23422783
2343
- if (val > halt_poll_ns)
2344
- val = halt_poll_ns;
2784
+ val *= grow;
2785
+ if (val < grow_start)
2786
+ val = grow_start;
2787
+
2788
+ if (val > vcpu->kvm->max_halt_poll_ns)
2789
+ val = vcpu->kvm->max_halt_poll_ns;
23452790
23462791 vcpu->halt_poll_ns = val;
2792
+out:
23472793 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
23482794 }
23492795
23502796 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
23512797 {
2352
- unsigned int old, val, shrink;
2798
+ unsigned int old, val, shrink, grow_start;
23532799
23542800 old = val = vcpu->halt_poll_ns;
23552801 shrink = READ_ONCE(halt_poll_ns_shrink);
2802
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23562803 if (shrink == 0)
23572804 val = 0;
23582805 else
23592806 val /= shrink;
2807
+
2808
+ if (val < grow_start)
2809
+ val = 0;
23602810
23612811 vcpu->halt_poll_ns = val;
23622812 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
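grow_halt_poll_ns() now multiplies by halt_poll_ns_grow, clamps the result between halt_poll_ns_grow_start and the per-VM kvm->max_halt_poll_ns, and does nothing when growth is disabled; shrink_halt_poll_ns() divides by halt_poll_ns_shrink and snaps anything below grow_start back to zero. A standalone simulation of that arithmetic (the factor of 2, the 10000 ns floor and the 500000 ns cap are assumptions chosen for illustration):

#include <stdio.h>

static unsigned int grow_poll(unsigned int val, unsigned int factor,
			      unsigned int grow_start, unsigned int max)
{
	if (!factor)
		return val;		/* growth disabled: leave value alone */
	val *= factor;
	if (val < grow_start)
		val = grow_start;	/* first grow starts from the floor */
	if (val > max)
		val = max;		/* never exceed the per-VM cap */
	return val;
}

static unsigned int shrink_poll(unsigned int val, unsigned int divisor,
				unsigned int grow_start)
{
	val = divisor ? val / divisor : 0;
	return val < grow_start ? 0 : val;	/* collapse tiny windows to 0 */
}

int main(void)
{
	unsigned int ns = 0;

	for (int i = 0; i < 5; i++) {
		ns = grow_poll(ns, 2, 10000, 500000);
		printf("grow %d -> %u ns\n", i + 1, ns);
	}
	printf("shrink   -> %u ns\n", shrink_poll(ns, 2, 10000));
	return 0;
}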
....@@ -2382,18 +2832,28 @@
23822832 return ret;
23832833 }
23842834
2835
+static inline void
2836
+update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2837
+{
2838
+ if (waited)
2839
+ vcpu->stat.halt_poll_fail_ns += poll_ns;
2840
+ else
2841
+ vcpu->stat.halt_poll_success_ns += poll_ns;
2842
+}
2843
+
23852844 /*
23862845 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
23872846 */
23882847 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
23892848 {
2390
- ktime_t start, cur;
2391
- DECLARE_SWAITQUEUE(wait);
2849
+ ktime_t start, cur, poll_end;
23922850 bool waited = false;
23932851 u64 block_ns;
23942852
2395
- start = cur = ktime_get();
2396
- if (vcpu->halt_poll_ns) {
2853
+ kvm_arch_vcpu_blocking(vcpu);
2854
+
2855
+ start = cur = poll_end = ktime_get();
2856
+ if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
23972857 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
23982858
23992859 ++vcpu->stat.halt_attempted_poll;
....@@ -2408,14 +2868,14 @@
24082868 ++vcpu->stat.halt_poll_invalid;
24092869 goto out;
24102870 }
2411
- cur = ktime_get();
2412
- } while (single_task_running() && ktime_before(cur, stop));
2871
+ poll_end = cur = ktime_get();
2872
+ } while (single_task_running() && !need_resched() &&
2873
+ ktime_before(cur, stop));
24132874 }
24142875
2415
- kvm_arch_vcpu_blocking(vcpu);
2416
-
2876
+ prepare_to_rcuwait(&vcpu->wait);
24172877 for (;;) {
2418
- prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2878
+ set_current_state(TASK_INTERRUPTIBLE);
24192879
24202880 if (kvm_vcpu_check_block(vcpu) < 0)
24212881 break;
....@@ -2423,28 +2883,33 @@
24232883 waited = true;
24242884 schedule();
24252885 }
2426
-
2427
- finish_swait(&vcpu->wq, &wait);
2886
+ finish_rcuwait(&vcpu->wait);
24282887 cur = ktime_get();
2429
-
2430
- kvm_arch_vcpu_unblocking(vcpu);
24312888 out:
2889
+ kvm_arch_vcpu_unblocking(vcpu);
24322890 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
24332891
2434
- if (!vcpu_valid_wakeup(vcpu))
2435
- shrink_halt_poll_ns(vcpu);
2436
- else if (halt_poll_ns) {
2437
- if (block_ns <= vcpu->halt_poll_ns)
2438
- ;
2439
- /* we had a long block, shrink polling */
2440
- else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
2892
+ update_halt_poll_stats(
2893
+ vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
2894
+
2895
+ if (!kvm_arch_no_poll(vcpu)) {
2896
+ if (!vcpu_valid_wakeup(vcpu)) {
24412897 shrink_halt_poll_ns(vcpu);
2442
- /* we had a short halt and our poll time is too small */
2443
- else if (vcpu->halt_poll_ns < halt_poll_ns &&
2444
- block_ns < halt_poll_ns)
2445
- grow_halt_poll_ns(vcpu);
2446
- } else
2447
- vcpu->halt_poll_ns = 0;
2898
+ } else if (vcpu->kvm->max_halt_poll_ns) {
2899
+ if (block_ns <= vcpu->halt_poll_ns)
2900
+ ;
2901
+ /* we had a long block, shrink polling */
2902
+ else if (vcpu->halt_poll_ns &&
2903
+ block_ns > vcpu->kvm->max_halt_poll_ns)
2904
+ shrink_halt_poll_ns(vcpu);
2905
+ /* we had a short halt and our poll time is too small */
2906
+ else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
2907
+ block_ns < vcpu->kvm->max_halt_poll_ns)
2908
+ grow_halt_poll_ns(vcpu);
2909
+ } else {
2910
+ vcpu->halt_poll_ns = 0;
2911
+ }
2912
+ }
24482913
24492914 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
24502915 kvm_arch_vcpu_block_finish(vcpu);
....@@ -2453,11 +2918,11 @@
24532918
24542919 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
24552920 {
2456
- struct swait_queue_head *wqp;
2921
+ struct rcuwait *waitp;
24572922
2458
- wqp = kvm_arch_vcpu_wq(vcpu);
2459
- if (swq_has_sleeper(wqp)) {
2460
- swake_up_one(wqp);
2923
+ waitp = kvm_arch_vcpu_get_wait(vcpu);
2924
+ if (rcuwait_wake_up(waitp)) {
2925
+ WRITE_ONCE(vcpu->ready, true);
24612926 ++vcpu->stat.halt_wakeup;
24622927 return true;
24632928 }
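kvm_vcpu_block() now parks the task on an rcuwait instead of an swait queue, and kvm_vcpu_wake_up() relies on rcuwait_wake_up(), whose return value says whether a sleeper was actually woken. A hedged sketch of the generic rcuwait shape the new code follows (helpers from linux/rcuwait.h; the condition callback is a placeholder, not a KVM API):

/* Sleeper side: the same prepare/loop/finish shape used by kvm_vcpu_block(). */
static void rcuwait_sleep_sketch(struct rcuwait *w, bool (*done)(void *), void *arg)
{
	prepare_to_rcuwait(w);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (done(arg))
			break;
		schedule();
	}
	finish_rcuwait(w);
}

/* Waker side: rcuwait_wake_up() returns nonzero only if someone was sleeping. */
static void rcuwait_kick_sketch(struct rcuwait *w)
{
	if (rcuwait_wake_up(w))
		pr_debug("woke a parked vcpu thread\n");
}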
....@@ -2513,7 +2978,7 @@
25132978 *
25142979 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
25152980 * (preempted lock holder), indicated by @in_spin_loop.
2516
- * Set at the beiginning and cleared at the end of interception/PLE handler.
2981
+ * Set at the beginning and cleared at the end of interception/PLE handler.
25172982 *
25182983 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
25192984 * chance last time (mostly it has become eligible now since we have probably
....@@ -2594,13 +3059,15 @@
25943059 continue;
25953060 } else if (pass && i > last_boosted_vcpu)
25963061 break;
2597
- if (!READ_ONCE(vcpu->preempted))
3062
+ if (!READ_ONCE(vcpu->ready))
25983063 continue;
25993064 if (vcpu == me)
26003065 continue;
2601
- if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
3066
+ if (rcuwait_active(&vcpu->wait) &&
3067
+ !vcpu_dy_runnable(vcpu))
26023068 continue;
2603
- if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
3069
+ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3070
+ !kvm_arch_vcpu_in_kernel(vcpu))
26043071 continue;
26053072 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
26063073 continue;
....@@ -2659,7 +3126,6 @@
26593126 {
26603127 struct kvm_vcpu *vcpu = filp->private_data;
26613128
2662
- debugfs_remove_recursive(vcpu->debugfs_dentry);
26633129 kvm_put_kvm(vcpu->kvm);
26643130 return 0;
26653131 }
....@@ -2683,30 +3149,21 @@
26833149 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
26843150 }
26853151
2686
-static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3152
+static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
26873153 {
3154
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3155
+ struct dentry *debugfs_dentry;
26883156 char dir_name[ITOA_MAX_LEN * 2];
2689
- int ret;
2690
-
2691
- if (!kvm_arch_has_vcpu_debugfs())
2692
- return 0;
26933157
26943158 if (!debugfs_initialized())
2695
- return 0;
3159
+ return;
26963160
26973161 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
2698
- vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
2699
- vcpu->kvm->debugfs_dentry);
2700
- if (!vcpu->debugfs_dentry)
2701
- return -ENOMEM;
3162
+ debugfs_dentry = debugfs_create_dir(dir_name,
3163
+ vcpu->kvm->debugfs_dentry);
27023164
2703
- ret = kvm_arch_create_vcpu_debugfs(vcpu);
2704
- if (ret < 0) {
2705
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2706
- return ret;
2707
- }
2708
-
2709
- return 0;
3165
+ kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3166
+#endif
27103167 }
27113168
27123169 /*
....@@ -2716,6 +3173,7 @@
27163173 {
27173174 int r;
27183175 struct kvm_vcpu *vcpu;
3176
+ struct page *page;
27193177
27203178 if (id >= KVM_MAX_VCPU_ID)
27213179 return -EINVAL;
....@@ -2729,21 +3187,29 @@
27293187 kvm->created_vcpus++;
27303188 mutex_unlock(&kvm->lock);
27313189
2732
- vcpu = kvm_arch_vcpu_create(kvm, id);
2733
- if (IS_ERR(vcpu)) {
2734
- r = PTR_ERR(vcpu);
3190
+ r = kvm_arch_vcpu_precreate(kvm, id);
3191
+ if (r)
3192
+ goto vcpu_decrement;
3193
+
3194
+ vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3195
+ if (!vcpu) {
3196
+ r = -ENOMEM;
27353197 goto vcpu_decrement;
27363198 }
27373199
2738
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
3200
+ BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3201
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3202
+ if (!page) {
3203
+ r = -ENOMEM;
3204
+ goto vcpu_free;
3205
+ }
3206
+ vcpu->run = page_address(page);
27393207
2740
- r = kvm_arch_vcpu_setup(vcpu);
2741
- if (r)
2742
- goto vcpu_destroy;
3208
+ kvm_vcpu_init(vcpu, kvm, id);
27433209
2744
- r = kvm_create_vcpu_debugfs(vcpu);
3210
+ r = kvm_arch_vcpu_create(vcpu);
27453211 if (r)
2746
- goto vcpu_destroy;
3212
+ goto vcpu_free_run_page;
27473213
27483214 mutex_lock(&kvm->lock);
27493215 if (kvm_get_vcpu_by_id(kvm, id)) {
....@@ -2758,7 +3224,7 @@
27583224 kvm_get_kvm(kvm);
27593225 r = create_vcpu_fd(vcpu);
27603226 if (r < 0) {
2761
- kvm_put_kvm(kvm);
3227
+ kvm_put_kvm_no_destroy(kvm);
27623228 goto unlock_vcpu_destroy;
27633229 }
27643230
....@@ -2773,13 +3239,16 @@
27733239
27743240 mutex_unlock(&kvm->lock);
27753241 kvm_arch_vcpu_postcreate(vcpu);
3242
+ kvm_create_vcpu_debugfs(vcpu);
27763243 return r;
27773244
27783245 unlock_vcpu_destroy:
27793246 mutex_unlock(&kvm->lock);
2780
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2781
-vcpu_destroy:
27823247 kvm_arch_vcpu_destroy(vcpu);
3248
+vcpu_free_run_page:
3249
+ free_page((unsigned long)vcpu->run);
3250
+vcpu_free:
3251
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
27833252 vcpu_decrement:
27843253 mutex_lock(&kvm->lock);
27853254 kvm->created_vcpus--;
....@@ -2807,7 +3276,7 @@
28073276 struct kvm_fpu *fpu = NULL;
28083277 struct kvm_sregs *kvm_sregs = NULL;
28093278
2810
- if (vcpu->kvm->mm != current->mm)
3279
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
28113280 return -EIO;
28123281
28133282 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
....@@ -2844,7 +3313,7 @@
28443313 synchronize_rcu();
28453314 put_pid(oldpid);
28463315 }
2847
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
3316
+ r = kvm_arch_vcpu_ioctl_run(vcpu);
28483317 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
28493318 break;
28503319 }
....@@ -2852,7 +3321,7 @@
28523321 struct kvm_regs *kvm_regs;
28533322
28543323 r = -ENOMEM;
2855
- kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
3324
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
28563325 if (!kvm_regs)
28573326 goto out;
28583327 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
....@@ -2869,7 +3338,6 @@
28693338 case KVM_SET_REGS: {
28703339 struct kvm_regs *kvm_regs;
28713340
2872
- r = -ENOMEM;
28733341 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
28743342 if (IS_ERR(kvm_regs)) {
28753343 r = PTR_ERR(kvm_regs);
....@@ -2880,7 +3348,8 @@
28803348 break;
28813349 }
28823350 case KVM_GET_SREGS: {
2883
- kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
3351
+ kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3352
+ GFP_KERNEL_ACCOUNT);
28843353 r = -ENOMEM;
28853354 if (!kvm_sregs)
28863355 goto out;
....@@ -2972,7 +3441,7 @@
29723441 break;
29733442 }
29743443 case KVM_GET_FPU: {
2975
- fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
3444
+ fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
29763445 r = -ENOMEM;
29773446 if (!fpu)
29783447 goto out;
....@@ -3013,7 +3482,7 @@
30133482 void __user *argp = compat_ptr(arg);
30143483 int r;
30153484
3016
- if (vcpu->kvm->mm != current->mm)
3485
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
30173486 return -EIO;
30183487
30193488 switch (ioctl) {
....@@ -3031,7 +3500,8 @@
30313500 if (kvm_sigmask.len != sizeof(compat_sigset_t))
30323501 goto out;
30333502 r = -EFAULT;
3034
- if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
3503
+ if (get_compat_sigset(&sigset,
3504
+ (compat_sigset_t __user *)sigmask_arg->sigset))
30353505 goto out;
30363506 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
30373507 } else
....@@ -3046,6 +3516,16 @@
30463516 return r;
30473517 }
30483518 #endif
3519
+
3520
+static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3521
+{
3522
+ struct kvm_device *dev = filp->private_data;
3523
+
3524
+ if (dev->ops->mmap)
3525
+ return dev->ops->mmap(dev, vma);
3526
+
3527
+ return -ENODEV;
3528
+}
30493529
30503530 static int kvm_device_ioctl_attr(struct kvm_device *dev,
30513531 int (*accessor)(struct kvm_device *dev,
....@@ -3068,7 +3548,7 @@
30683548 {
30693549 struct kvm_device *dev = filp->private_data;
30703550
3071
- if (dev->kvm->mm != current->mm)
3551
+ if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
30723552 return -EIO;
30733553
30743554 switch (ioctl) {
....@@ -3091,6 +3571,13 @@
30913571 struct kvm_device *dev = filp->private_data;
30923572 struct kvm *kvm = dev->kvm;
30933573
3574
+ if (dev->ops->release) {
3575
+ mutex_lock(&kvm->lock);
3576
+ list_del(&dev->vm_node);
3577
+ dev->ops->release(dev);
3578
+ mutex_unlock(&kvm->lock);
3579
+ }
3580
+
30943581 kvm_put_kvm(kvm);
30953582 return 0;
30963583 }
....@@ -3099,6 +3586,7 @@
30993586 .unlocked_ioctl = kvm_device_ioctl,
31003587 .release = kvm_device_release,
31013588 KVM_COMPAT(kvm_device_ioctl),
3589
+ .mmap = kvm_device_mmap,
31023590 };
31033591
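kvm_device_fops gains a .mmap handler that forwards to dev->ops->mmap, and device teardown now honours an optional ops->release callback under kvm->lock. A hedged sketch of what a backend supplying these optional hooks might look like; the device name, handlers and type below are hypothetical, only the kvm_device_ops fields and kvm_register_device_ops() come from this file and its headers:

/* Sketch only: a hypothetical in-kernel device using the optional hooks. */
static int demo_dev_create(struct kvm_device *dev, u32 type) { return 0; }
static void demo_dev_release(struct kvm_device *dev) { /* drop VM-held state */ }
static int demo_dev_mmap(struct kvm_device *dev, struct vm_area_struct *vma)
{
	return -EINVAL;		/* placeholder: map device state to userspace */
}

static const struct kvm_device_ops demo_device_ops = {
	.name    = "demo",
	.create  = demo_dev_create,
	.release = demo_dev_release,	/* called with kvm->lock held on last put */
	.mmap    = demo_dev_mmap,
};

/* Registration would use the now const-qualified API, e.g.:
 *	kvm_register_device_ops(&demo_device_ops, some_kvm_dev_type);
 */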
31043592 struct kvm_device *kvm_device_from_filp(struct file *filp)
....@@ -3109,14 +3597,14 @@
31093597 return filp->private_data;
31103598 }
31113599
3112
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3600
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
31133601 #ifdef CONFIG_KVM_MPIC
31143602 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
31153603 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
31163604 #endif
31173605 };
31183606
3119
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
3607
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
31203608 {
31213609 if (type >= ARRAY_SIZE(kvm_device_ops_table))
31223610 return -ENOSPC;
....@@ -3137,7 +3625,7 @@
31373625 static int kvm_ioctl_create_device(struct kvm *kvm,
31383626 struct kvm_create_device *cd)
31393627 {
3140
- struct kvm_device_ops *ops = NULL;
3628
+ const struct kvm_device_ops *ops = NULL;
31413629 struct kvm_device *dev;
31423630 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
31433631 int type;
....@@ -3154,7 +3642,7 @@
31543642 if (test)
31553643 return 0;
31563644
3157
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
3645
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
31583646 if (!dev)
31593647 return -ENOMEM;
31603648
....@@ -3177,11 +3665,14 @@
31773665 kvm_get_kvm(kvm);
31783666 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
31793667 if (ret < 0) {
3180
- kvm_put_kvm(kvm);
3668
+ kvm_put_kvm_no_destroy(kvm);
31813669 mutex_lock(&kvm->lock);
31823670 list_del(&dev->vm_node);
3671
+ if (ops->release)
3672
+ ops->release(dev);
31833673 mutex_unlock(&kvm->lock);
3184
- ops->destroy(dev);
3674
+ if (ops->destroy)
3675
+ ops->destroy(dev);
31853676 return ret;
31863677 }
31873678
....@@ -3205,10 +3696,18 @@
32053696 #endif
32063697 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
32073698 case KVM_CAP_CHECK_EXTENSION_VM:
3699
+ case KVM_CAP_ENABLE_CAP_VM:
3700
+ case KVM_CAP_HALT_POLL:
32083701 return 1;
32093702 #ifdef CONFIG_KVM_MMIO
32103703 case KVM_CAP_COALESCED_MMIO:
32113704 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3705
+ case KVM_CAP_COALESCED_PIO:
3706
+ return 1;
3707
+#endif
3708
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3709
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3710
+ return KVM_DIRTY_LOG_MANUAL_CAPS;
32123711 #endif
32133712 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
32143713 case KVM_CAP_IRQ_ROUTING:
....@@ -3218,10 +3717,47 @@
32183717 case KVM_CAP_MULTI_ADDRESS_SPACE:
32193718 return KVM_ADDRESS_SPACE_NUM;
32203719 #endif
3720
+ case KVM_CAP_NR_MEMSLOTS:
3721
+ return KVM_USER_MEM_SLOTS;
32213722 default:
32223723 break;
32233724 }
32243725 return kvm_vm_ioctl_check_extension(kvm, arg);
3726
+}
3727
+
3728
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3729
+ struct kvm_enable_cap *cap)
3730
+{
3731
+ return -EINVAL;
3732
+}
3733
+
3734
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3735
+ struct kvm_enable_cap *cap)
3736
+{
3737
+ switch (cap->cap) {
3738
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3739
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3740
+ u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3741
+
3742
+ if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3743
+ allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3744
+
3745
+ if (cap->flags || (cap->args[0] & ~allowed_options))
3746
+ return -EINVAL;
3747
+ kvm->manual_dirty_log_protect = cap->args[0];
3748
+ return 0;
3749
+ }
3750
+#endif
3751
+ case KVM_CAP_HALT_POLL: {
3752
+ if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3753
+ return -EINVAL;
3754
+
3755
+ kvm->max_halt_poll_ns = cap->args[0];
3756
+ return 0;
3757
+ }
3758
+ default:
3759
+ return kvm_vm_ioctl_enable_cap(kvm, cap);
3760
+ }
32253761 }
32263762
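kvm_vm_ioctl_enable_cap_generic() makes KVM_ENABLE_CAP a VM-level ioctl that currently understands KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and KVM_CAP_HALT_POLL, where args[0] must fit in an unsigned int and becomes kvm->max_halt_poll_ns. A hedged userspace sketch of enabling the per-VM halt-poll cap; it assumes a kernel and uapi header new enough to define KVM_CAP_HALT_POLL, error handling is minimal, and the 100 us value is arbitrary:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_enable_cap cap;
	int sys_fd, vm_fd;

	sys_fd = open("/dev/kvm", O_RDWR);
	if (sys_fd < 0)
		return 1;
	vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
	if (vm_fd < 0)
		return 1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HALT_POLL;
	cap.args[0] = 100000;		/* cap per-vCPU halt polling at 100 us */

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_HALT_POLL) > 0 &&
	    ioctl(vm_fd, KVM_ENABLE_CAP, &cap) == 0)
		printf("max_halt_poll_ns set to %llu\n",
		       (unsigned long long)cap.args[0]);
	return 0;
}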
32273763 static long kvm_vm_ioctl(struct file *filp,
....@@ -3231,12 +3767,21 @@
32313767 void __user *argp = (void __user *)arg;
32323768 int r;
32333769
3234
- if (kvm->mm != current->mm)
3770
+ if (kvm->mm != current->mm || kvm->vm_bugged)
32353771 return -EIO;
32363772 switch (ioctl) {
32373773 case KVM_CREATE_VCPU:
32383774 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
32393775 break;
3776
+ case KVM_ENABLE_CAP: {
3777
+ struct kvm_enable_cap cap;
3778
+
3779
+ r = -EFAULT;
3780
+ if (copy_from_user(&cap, argp, sizeof(cap)))
3781
+ goto out;
3782
+ r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3783
+ break;
3784
+ }
32403785 case KVM_SET_USER_MEMORY_REGION: {
32413786 struct kvm_userspace_memory_region kvm_userspace_mem;
32423787
....@@ -3257,6 +3802,17 @@
32573802 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
32583803 break;
32593804 }
3805
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3806
+ case KVM_CLEAR_DIRTY_LOG: {
3807
+ struct kvm_clear_dirty_log log;
3808
+
3809
+ r = -EFAULT;
3810
+ if (copy_from_user(&log, argp, sizeof(log)))
3811
+ goto out;
3812
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3813
+ break;
3814
+ }
3815
+#endif
32603816 #ifdef CONFIG_KVM_MMIO
32613817 case KVM_REGISTER_COALESCED_MMIO: {
32623818 struct kvm_coalesced_mmio_zone zone;
....@@ -3347,21 +3903,18 @@
33473903 if (routing.flags)
33483904 goto out;
33493905 if (routing.nr) {
3350
- r = -ENOMEM;
3351
- entries = vmalloc(array_size(sizeof(*entries),
3352
- routing.nr));
3353
- if (!entries)
3354
- goto out;
3355
- r = -EFAULT;
33563906 urouting = argp;
3357
- if (copy_from_user(entries, urouting->entries,
3358
- routing.nr * sizeof(*entries)))
3359
- goto out_free_irq_routing;
3907
+ entries = vmemdup_user(urouting->entries,
3908
+ array_size(sizeof(*entries),
3909
+ routing.nr));
3910
+ if (IS_ERR(entries)) {
3911
+ r = PTR_ERR(entries);
3912
+ goto out;
3913
+ }
33603914 }
33613915 r = kvm_set_irq_routing(kvm, entries, routing.nr,
33623916 routing.flags);
3363
-out_free_irq_routing:
3364
- vfree(entries);
3917
+ kvfree(entries);
33653918 break;
33663919 }
33673920 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
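The KVM_SET_GSI_ROUTING handler above drops the vmalloc()+copy_from_user() pair in favour of vmemdup_user(), which duplicates the user buffer in one call and pairs with kvfree(). A short sketch of the generalized pattern (kernel context; the function name is illustrative):

/* Sketch of the vmemdup_user()/kvfree() pattern used above. */
static int copy_entries_sketch(const void __user *uptr, size_t n, size_t size)
{
	void *buf = vmemdup_user(uptr, array_size(n, size));

	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* ... consume the n * size bytes in buf ... */

	kvfree(buf);
	return 0;
}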
....@@ -3403,15 +3956,54 @@
34033956 };
34043957 };
34053958
3959
+struct compat_kvm_clear_dirty_log {
3960
+ __u32 slot;
3961
+ __u32 num_pages;
3962
+ __u64 first_page;
3963
+ union {
3964
+ compat_uptr_t dirty_bitmap; /* one bit per page */
3965
+ __u64 padding2;
3966
+ };
3967
+};
3968
+
3969
+long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
3970
+ unsigned long arg)
3971
+{
3972
+ return -ENOTTY;
3973
+}
3974
+
34063975 static long kvm_vm_compat_ioctl(struct file *filp,
34073976 unsigned int ioctl, unsigned long arg)
34083977 {
34093978 struct kvm *kvm = filp->private_data;
34103979 int r;
34113980
3412
- if (kvm->mm != current->mm)
3981
+ if (kvm->mm != current->mm || kvm->vm_bugged)
34133982 return -EIO;
3983
+
3984
+ r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
3985
+ if (r != -ENOTTY)
3986
+ return r;
3987
+
34143988 switch (ioctl) {
3989
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3990
+ case KVM_CLEAR_DIRTY_LOG: {
3991
+ struct compat_kvm_clear_dirty_log compat_log;
3992
+ struct kvm_clear_dirty_log log;
3993
+
3994
+ if (copy_from_user(&compat_log, (void __user *)arg,
3995
+ sizeof(compat_log)))
3996
+ return -EFAULT;
3997
+ log.slot = compat_log.slot;
3998
+ log.num_pages = compat_log.num_pages;
3999
+ log.first_page = compat_log.first_page;
4000
+ log.padding2 = compat_log.padding2;
4001
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4002
+
4003
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4004
+ break;
4005
+ }
4006
+#endif
34154007 case KVM_GET_DIRTY_LOG: {
34164008 struct compat_kvm_dirty_log compat_log;
34174009 struct kvm_dirty_log log;
....@@ -3749,6 +4341,7 @@
37494341 r = __kvm_io_bus_write(vcpu, bus, &range, val);
37504342 return r < 0 ? r : 0;
37514343 }
4344
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
37524345
37534346 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
37544347 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
....@@ -3799,7 +4392,6 @@
37994392
38004393 return -EOPNOTSUPP;
38014394 }
3802
-EXPORT_SYMBOL_GPL(kvm_io_bus_write);
38034395
38044396 /* kvm_io_bus_read - called under kvm->slots_lock */
38054397 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
....@@ -3821,7 +4413,6 @@
38214413 return r < 0 ? r : 0;
38224414 }
38234415
3824
-
38254416 /* Caller must hold slots_lock. */
38264417 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
38274418 int len, struct kvm_io_device *dev)
....@@ -3838,8 +4429,8 @@
38384429 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
38394430 return -ENOSPC;
38404431
3841
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
3842
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4432
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4433
+ GFP_KERNEL_ACCOUNT);
38434434 if (!new_bus)
38444435 return -ENOMEM;
38454436
....@@ -3866,15 +4457,15 @@
38664457 }
38674458
38684459 /* Caller must hold slots_lock. */
3869
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3870
- struct kvm_io_device *dev)
4460
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4461
+ struct kvm_io_device *dev)
38714462 {
38724463 int i, j;
38734464 struct kvm_io_bus *new_bus, *bus;
38744465
38754466 bus = kvm_get_bus(kvm, bus_idx);
38764467 if (!bus)
3877
- return;
4468
+ return 0;
38784469
38794470 for (i = 0; i < bus->dev_count; i++)
38804471 if (bus->range[i].dev == dev) {
....@@ -3882,16 +4473,22 @@
38824473 }
38834474
38844475 if (i == bus->dev_count)
3885
- return;
4476
+ return 0;
38864477
3887
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
3888
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4478
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4479
+ GFP_KERNEL_ACCOUNT);
38894480 if (new_bus) {
3890
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4481
+ memcpy(new_bus, bus, struct_size(bus, range, i));
38914482 new_bus->dev_count--;
38924483 memcpy(new_bus->range + i, bus->range + i + 1,
3893
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
3894
- } else {
4484
+ flex_array_size(new_bus, range, new_bus->dev_count - i));
4485
+ }
4486
+
4487
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4488
+ synchronize_srcu_expedited(&kvm->srcu);
4489
+
4490
+ /* Destroy the old bus _after_ installing the (null) bus. */
4491
+ if (!new_bus) {
38954492 pr_err("kvm: failed to shrink bus, removing it completely\n");
38964493 for (j = 0; j < bus->dev_count; j++) {
38974494 if (j == i)
....@@ -3900,10 +4497,8 @@
39004497 }
39014498 }
39024499
3903
- rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
3904
- synchronize_srcu_expedited(&kvm->srcu);
39054500 kfree(bus);
3906
- return;
4501
+ return new_bus ? 0 : -ENOMEM;
39074502 }
39084503
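kvm_io_bus_unregister_dev() now publishes the shrunken (or NULL) bus with rcu_assign_pointer() and waits out an SRCU grace period before destroying the old array, and it returns -ENOMEM, after tearing the whole bus down, when the smaller copy cannot be allocated. A compact sketch of the publish-then-free ordering the change enforces (kernel context, caller holds kvm->slots_lock as the comments above require):

/* Sketch: swap an SRCU-protected bus pointer, then free the old copy only
 * after readers of the previous value are guaranteed to have finished. */
static void replace_bus_sketch(struct kvm *kvm, int bus_idx,
			       struct kvm_io_bus *new_bus)
{
	struct kvm_io_bus *old_bus = kvm_get_bus(kvm, bus_idx);

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);		/* wait out readers */
	kfree(old_bus);					/* now safe to destroy */
}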
39094504 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
....@@ -3948,8 +4543,9 @@
39484543 return -ENOENT;
39494544
39504545 if (simple_attr_open(inode, file, get,
3951
- stat_data->mode & S_IWUGO ? set : NULL,
3952
- fmt)) {
4546
+ KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4547
+ ? set : NULL,
4548
+ fmt)) {
39534549 kvm_put_kvm(stat_data->kvm);
39544550 return -ENOMEM;
39554551 }
....@@ -3968,105 +4564,111 @@
39684564 return 0;
39694565 }
39704566
3971
-static int vm_stat_get_per_vm(void *data, u64 *val)
4567
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
39724568 {
3973
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3974
-
3975
- *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
4569
+ *val = *(ulong *)((void *)kvm + offset);
39764570
39774571 return 0;
39784572 }
39794573
3980
-static int vm_stat_clear_per_vm(void *data, u64 val)
4574
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
39814575 {
3982
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3983
-
3984
- if (val)
3985
- return -EINVAL;
3986
-
3987
- *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
4576
+ *(ulong *)((void *)kvm + offset) = 0;
39884577
39894578 return 0;
39904579 }
39914580
3992
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
3993
-{
3994
- __simple_attr_check_format("%llu\n", 0ull);
3995
- return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
3996
- vm_stat_clear_per_vm, "%llu\n");
3997
-}
3998
-
3999
-static const struct file_operations vm_stat_get_per_vm_fops = {
4000
- .owner = THIS_MODULE,
4001
- .open = vm_stat_get_per_vm_open,
4002
- .release = kvm_debugfs_release,
4003
- .read = simple_attr_read,
4004
- .write = simple_attr_write,
4005
- .llseek = no_llseek,
4006
-};
4007
-
4008
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
4581
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
40094582 {
40104583 int i;
4011
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40124584 struct kvm_vcpu *vcpu;
40134585
40144586 *val = 0;
40154587
4016
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4017
- *val += *(u64 *)((void *)vcpu + stat_data->offset);
4588
+ kvm_for_each_vcpu(i, vcpu, kvm)
4589
+ *val += *(u64 *)((void *)vcpu + offset);
40184590
40194591 return 0;
40204592 }
40214593
4022
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
4594
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
40234595 {
40244596 int i;
4025
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40264597 struct kvm_vcpu *vcpu;
4598
+
4599
+ kvm_for_each_vcpu(i, vcpu, kvm)
4600
+ *(u64 *)((void *)vcpu + offset) = 0;
4601
+
4602
+ return 0;
4603
+}
4604
+
4605
+static int kvm_stat_data_get(void *data, u64 *val)
4606
+{
4607
+ int r = -EFAULT;
4608
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4609
+
4610
+ switch (stat_data->dbgfs_item->kind) {
4611
+ case KVM_STAT_VM:
4612
+ r = kvm_get_stat_per_vm(stat_data->kvm,
4613
+ stat_data->dbgfs_item->offset, val);
4614
+ break;
4615
+ case KVM_STAT_VCPU:
4616
+ r = kvm_get_stat_per_vcpu(stat_data->kvm,
4617
+ stat_data->dbgfs_item->offset, val);
4618
+ break;
4619
+ }
4620
+
4621
+ return r;
4622
+}
4623
+
4624
+static int kvm_stat_data_clear(void *data, u64 val)
4625
+{
4626
+ int r = -EFAULT;
4627
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40274628
40284629 if (val)
40294630 return -EINVAL;
40304631
4031
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4032
- *(u64 *)((void *)vcpu + stat_data->offset) = 0;
4632
+ switch (stat_data->dbgfs_item->kind) {
4633
+ case KVM_STAT_VM:
4634
+ r = kvm_clear_stat_per_vm(stat_data->kvm,
4635
+ stat_data->dbgfs_item->offset);
4636
+ break;
4637
+ case KVM_STAT_VCPU:
4638
+ r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4639
+ stat_data->dbgfs_item->offset);
4640
+ break;
4641
+ }
40334642
4034
- return 0;
4643
+ return r;
40354644 }
40364645
4037
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
4646
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
40384647 {
40394648 __simple_attr_check_format("%llu\n", 0ull);
4040
- return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
4041
- vcpu_stat_clear_per_vm, "%llu\n");
4649
+ return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4650
+ kvm_stat_data_clear, "%llu\n");
40424651 }
40434652
4044
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
4045
- .owner = THIS_MODULE,
4046
- .open = vcpu_stat_get_per_vm_open,
4653
+static const struct file_operations stat_fops_per_vm = {
4654
+ .owner = THIS_MODULE,
4655
+ .open = kvm_stat_data_open,
40474656 .release = kvm_debugfs_release,
4048
- .read = simple_attr_read,
4049
- .write = simple_attr_write,
4050
- .llseek = no_llseek,
4051
-};
4052
-
4053
-static const struct file_operations *stat_fops_per_vm[] = {
4054
- [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
4055
- [KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
4657
+ .read = simple_attr_read,
4658
+ .write = simple_attr_write,
4659
+ .llseek = no_llseek,
40564660 };
40574661
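The debugfs rework above replaces the per-kind file_operations pair with a single stat_fops_per_vm whose get/clear callbacks dispatch on stat_data->dbgfs_item->kind and read the counter straight out of struct kvm or struct kvm_vcpu at a recorded byte offset. A standalone illustration of that offset-based access (the struct here is a stand-in, not the kernel layout):

#include <stddef.h>
#include <stdio.h>

struct demo_vm {			/* stand-in for struct kvm's stat fields */
	unsigned long remote_tlb_flush;
	unsigned long mmu_cache_miss;
};

static unsigned long read_stat(const void *vm, size_t offset)
{
	return *(const unsigned long *)((const char *)vm + offset);
}

int main(void)
{
	struct demo_vm vm = { .remote_tlb_flush = 7, .mmu_cache_miss = 42 };

	printf("%lu\n", read_stat(&vm, offsetof(struct demo_vm, mmu_cache_miss)));
	return 0;
}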
40584662 static int vm_stat_get(void *_offset, u64 *val)
40594663 {
40604664 unsigned offset = (long)_offset;
40614665 struct kvm *kvm;
4062
- struct kvm_stat_data stat_tmp = {.offset = offset};
40634666 u64 tmp_val;
40644667
40654668 *val = 0;
40664669 mutex_lock(&kvm_lock);
40674670 list_for_each_entry(kvm, &vm_list, vm_list) {
4068
- stat_tmp.kvm = kvm;
4069
- vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4671
+ kvm_get_stat_per_vm(kvm, offset, &tmp_val);
40704672 *val += tmp_val;
40714673 }
40724674 mutex_unlock(&kvm_lock);
....@@ -4077,15 +4679,13 @@
40774679 {
40784680 unsigned offset = (long)_offset;
40794681 struct kvm *kvm;
4080
- struct kvm_stat_data stat_tmp = {.offset = offset};
40814682
40824683 if (val)
40834684 return -EINVAL;
40844685
40854686 mutex_lock(&kvm_lock);
40864687 list_for_each_entry(kvm, &vm_list, vm_list) {
4087
- stat_tmp.kvm = kvm;
4088
- vm_stat_clear_per_vm((void *)&stat_tmp, 0);
4688
+ kvm_clear_stat_per_vm(kvm, offset);
40894689 }
40904690 mutex_unlock(&kvm_lock);
40914691
....@@ -4098,14 +4698,12 @@
40984698 {
40994699 unsigned offset = (long)_offset;
41004700 struct kvm *kvm;
4101
- struct kvm_stat_data stat_tmp = {.offset = offset};
41024701 u64 tmp_val;
41034702
41044703 *val = 0;
41054704 mutex_lock(&kvm_lock);
41064705 list_for_each_entry(kvm, &vm_list, vm_list) {
4107
- stat_tmp.kvm = kvm;
4108
- vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4706
+ kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
41094707 *val += tmp_val;
41104708 }
41114709 mutex_unlock(&kvm_lock);
....@@ -4116,15 +4714,13 @@
41164714 {
41174715 unsigned offset = (long)_offset;
41184716 struct kvm *kvm;
4119
- struct kvm_stat_data stat_tmp = {.offset = offset};
41204717
41214718 if (val)
41224719 return -EINVAL;
41234720
41244721 mutex_lock(&kvm_lock);
41254722 list_for_each_entry(kvm, &vm_list, vm_list) {
4126
- stat_tmp.kvm = kvm;
4127
- vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
4723
+ kvm_clear_stat_per_vcpu(kvm, offset);
41284724 }
41294725 mutex_unlock(&kvm_lock);
41304726
....@@ -4158,7 +4754,7 @@
41584754 active = kvm_active_vms;
41594755 mutex_unlock(&kvm_lock);
41604756
4161
- env = kzalloc(sizeof(*env), GFP_KERNEL);
4757
+ env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
41624758 if (!env)
41634759 return;
41644760
....@@ -4173,8 +4769,8 @@
41734769 }
41744770 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
41754771
4176
- if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4177
- char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
4772
+ if (kvm->debugfs_dentry) {
4773
+ char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
41784774
41794775 if (p) {
41804776 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
....@@ -4197,9 +4793,8 @@
41974793
41984794 kvm_debugfs_num_entries = 0;
41994795 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4200
- int mode = p->mode ? p->mode : 0644;
4201
- debugfs_create_file(p->name, mode, kvm_debugfs_dir,
4202
- (void *)(long)p->offset,
4796
+ debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4797
+ kvm_debugfs_dir, (void *)(long)p->offset,
42034798 stat_fops[p->kind]);
42044799 }
42054800 }
....@@ -4214,7 +4809,9 @@
42144809 static void kvm_resume(void)
42154810 {
42164811 if (kvm_usage_count) {
4217
- WARN_ON(raw_spin_is_locked(&kvm_count_lock));
4812
+#ifdef CONFIG_LOCKDEP
4813
+ WARN_ON(lockdep_is_held(&kvm_count_lock));
4814
+#endif
42184815 hardware_enable_nolock(NULL);
42194816 }
42204817 }
....@@ -4234,11 +4831,11 @@
42344831 {
42354832 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42364833
4237
- if (vcpu->preempted)
4238
- vcpu->preempted = false;
4834
+ WRITE_ONCE(vcpu->preempted, false);
4835
+ WRITE_ONCE(vcpu->ready, false);
42394836
4837
+ __this_cpu_write(kvm_running_vcpu, vcpu);
42404838 kvm_arch_sched_in(vcpu, cpu);
4241
-
42424839 kvm_arch_vcpu_load(vcpu, cpu);
42434840 }
42444841
....@@ -4247,14 +4844,59 @@
42474844 {
42484845 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42494846
4250
- if (current->state == TASK_RUNNING)
4251
- vcpu->preempted = true;
4847
+ if (current->state == TASK_RUNNING) {
4848
+ WRITE_ONCE(vcpu->preempted, true);
4849
+ WRITE_ONCE(vcpu->ready, true);
4850
+ }
42524851 kvm_arch_vcpu_put(vcpu);
4852
+ __this_cpu_write(kvm_running_vcpu, NULL);
4853
+}
4854
+
4855
+/**
4856
+ * kvm_get_running_vcpu - get the vcpu running on the current CPU.
4857
+ *
4858
+ * We can disable preemption locally around accessing the per-CPU variable,
4859
+ * and use the resolved vcpu pointer after enabling preemption again,
4860
+ * because even if the current thread is migrated to another CPU, reading
4861
+ * the per-CPU value later will give us the same value as we update the
4862
+ * per-CPU variable in the preempt notifier handlers.
4863
+ */
4864
+struct kvm_vcpu *kvm_get_running_vcpu(void)
4865
+{
4866
+ struct kvm_vcpu *vcpu;
4867
+
4868
+ preempt_disable();
4869
+ vcpu = __this_cpu_read(kvm_running_vcpu);
4870
+ preempt_enable();
4871
+
4872
+ return vcpu;
4873
+}
4874
+EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
4875
+
4876
+/**
4877
+ * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
4878
+ */
4879
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
4880
+{
4881
+ return &kvm_running_vcpu;
4882
+}
4883
+
4884
+struct kvm_cpu_compat_check {
4885
+ void *opaque;
4886
+ int *ret;
4887
+};
4888
+
4889
+static void check_processor_compat(void *data)
4890
+{
4891
+ struct kvm_cpu_compat_check *c = data;
4892
+
4893
+ *c->ret = kvm_arch_check_processor_compat(c->opaque);
42534894 }
42544895
42554896 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
42564897 struct module *module)
42574898 {
4899
+ struct kvm_cpu_compat_check c;
42584900 int r;
42594901 int cpu;
42604902
....@@ -4278,16 +4920,16 @@
42784920 goto out_free_0;
42794921 }
42804922
4281
- r = kvm_arch_hardware_setup();
4923
+ r = kvm_arch_hardware_setup(opaque);
42824924 if (r < 0)
4283
- goto out_free_0a;
4925
+ goto out_free_1;
42844926
4927
+ c.ret = &r;
4928
+ c.opaque = opaque;
42854929 for_each_online_cpu(cpu) {
4286
- smp_call_function_single(cpu,
4287
- kvm_arch_check_processor_compat,
4288
- &r, 1);
4930
+ smp_call_function_single(cpu, check_processor_compat, &c, 1);
42894931 if (r < 0)
4290
- goto out_free_1;
4932
+ goto out_free_2;
42914933 }
42924934
42934935 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
....@@ -4344,9 +4986,8 @@
43444986 unregister_reboot_notifier(&kvm_reboot_notifier);
43454987 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
43464988 out_free_2:
4347
-out_free_1:
43484989 kvm_arch_hardware_unsetup();
4349
-out_free_0a:
4990
+out_free_1:
43504991 free_cpumask_var(cpus_hardware_enabled);
43514992 out_free_0:
43524993 kvm_irqfd_exit();