2024-01-31 f70575805708cabdedea7498aaa3f710fde4d920
kernel/virt/kvm/kvm_main.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Kernel-based Virtual Machine driver for Linux
34 *
....@@ -10,10 +11,6 @@
1011 * Authors:
1112 * Avi Kivity <avi@qumranet.com>
1213 * Yaniv Kamay <yaniv@qumranet.com>
13
- *
14
- * This work is licensed under the terms of the GNU GPL, version 2. See
15
- * the COPYING file in the top-level directory.
16
- *
1714 */
1815
1916 #include <kvm/iodev.h>
....@@ -51,13 +48,13 @@
5148 #include <linux/slab.h>
5249 #include <linux/sort.h>
5350 #include <linux/bsearch.h>
54
-#include <linux/kthread.h>
5551 #include <linux/io.h>
52
+#include <linux/lockdep.h>
53
+#include <linux/kthread.h>
5654
5755 #include <asm/processor.h>
5856 #include <asm/ioctl.h>
5957 #include <linux/uaccess.h>
60
-#include <asm/pgtable.h>
6158
6259 #include "coalesced_mmio.h"
6360 #include "async_pf.h"
....@@ -82,6 +79,11 @@
8279 module_param(halt_poll_ns_grow, uint, 0644);
8380 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
8481
82
+/* The start value to grow halt_poll_ns from */
83
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
84
+module_param(halt_poll_ns_grow_start, uint, 0644);
85
+EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
86
+
8587 /* Default resets per-vcpu halt_poll_ns . */
8688 unsigned int halt_poll_ns_shrink;
8789 module_param(halt_poll_ns_shrink, uint, 0644);
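
The new halt_poll_ns_grow_start parameter gives the halt-polling grow path a
floor to start from, so a window that has shrunk to zero does not stay at zero
after multiplication. The fragment below is only an illustration of how such a
start value is typically applied; the function name and the max parameter are
invented for the sketch, and it is not the in-kernel grow helper itself.

/* Illustrative sketch: how a grow step can consume halt_poll_ns_grow_start. */
static unsigned int grow_halt_poll_ns_sketch(unsigned int cur,
                                             unsigned int grow,
                                             unsigned int grow_start,
                                             unsigned int max)
{
        unsigned int val = cur;

        if (!grow)                /* growing disabled via halt_poll_ns_grow=0 */
                return cur;

        val *= grow;              /* 0 * grow == 0, hence the explicit floor */
        if (val < grow_start)
                val = grow_start; /* first grow lands at 10us by default */
        if (val > max)
                val = max;

        return val;
}
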
....@@ -101,16 +103,18 @@
101103 static int kvm_usage_count;
102104 static atomic_t hardware_enable_failed;
103105
104
-struct kmem_cache *kvm_vcpu_cache;
105
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
106
+static struct kmem_cache *kvm_vcpu_cache;
106107
107108 static __read_mostly struct preempt_ops kvm_preempt_ops;
109
+static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
108110
109111 struct dentry *kvm_debugfs_dir;
110112 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
111113
112114 static int kvm_debugfs_num_entries;
113
-static const struct file_operations *stat_fops_per_vm[];
115
+static const struct file_operations stat_fops_per_vm;
116
+
117
+static struct file_operations kvm_chardev_ops;
114118
115119 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116120 unsigned long arg);
....@@ -119,21 +123,30 @@
119123 unsigned long arg);
120124 #define KVM_COMPAT(c) .compat_ioctl = (c)
121125 #else
126
+/*
127
+ * For architectures that don't implement a compat infrastructure,
128
+ * adopt a double line of defense:
129
+ * - Prevent a compat task from opening /dev/kvm
130
+ * - If the open has been done by a 64bit task, and the KVM fd
131
+ * passed to a compat task, let the ioctls fail.
132
+ */
122133 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
123134 unsigned long arg) { return -EINVAL; }
124
-#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl
135
+
136
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
137
+{
138
+ return is_compat_task() ? -ENODEV : 0;
139
+}
140
+#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
141
+ .open = kvm_no_compat_open
125142 #endif
126143 static int hardware_enable_all(void);
127144 static void hardware_disable_all(void);
128145
129146 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
130147
131
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
132
-
133148 __visible bool kvm_rebooting;
134149 EXPORT_SYMBOL_GPL(kvm_rebooting);
135
-
136
-static bool largepages_enabled = true;
137150
138151 #define KVM_EVENT_CREATE_VM 0
139152 #define KVM_EVENT_DESTROY_VM 1
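
On architectures without KVM compat support the two new hooks act as a double
line of defense: kvm_no_compat_open() refuses a 32-bit opener outright, and
kvm_no_compat_ioctl() still fails the ioctls if a 64-bit task opened the fd and
then handed it to a compat task. The abbreviated definition below shows how the
KVM_COMPAT() macro wires both handlers into a file_operations instance; the
name example_chardev_ops is made up and the real chardev fops in this file has
more fields, so treat it as a sketch.

/* Sketch: with !CONFIG_KVM_COMPAT, KVM_COMPAT() expands to the two
 * "no compat" handlers, so one definition covers both configurations. */
static struct file_operations example_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_dev_ioctl),      /* .compat_ioctl (+ .open gate) */
};
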
....@@ -141,8 +154,14 @@
141154 static unsigned long long kvm_createvm_count;
142155 static unsigned long long kvm_active_vms;
143156
157
+static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
158
+
144159 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
145160 unsigned long start, unsigned long end)
161
+{
162
+}
163
+
164
+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
146165 {
147166 }
148167
....@@ -175,12 +194,24 @@
175194 return true;
176195 }
177196
197
+bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
198
+{
199
+ struct page *page = pfn_to_page(pfn);
200
+
201
+ if (!PageTransCompoundMap(page))
202
+ return false;
203
+
204
+ return is_transparent_hugepage(compound_head(page));
205
+}
206
+
178207 /*
179208 * Switches to specified vcpu, until a matching vcpu_put()
180209 */
181210 void vcpu_load(struct kvm_vcpu *vcpu)
182211 {
183212 int cpu = get_cpu();
213
+
214
+ __this_cpu_write(kvm_running_vcpu, vcpu);
184215 preempt_notifier_register(&vcpu->preempt_notifier);
185216 kvm_arch_vcpu_load(vcpu, cpu);
186217 put_cpu();
....@@ -192,6 +223,7 @@
192223 preempt_disable();
193224 kvm_arch_vcpu_put(vcpu);
194225 preempt_notifier_unregister(&vcpu->preempt_notifier);
226
+ __this_cpu_write(kvm_running_vcpu, NULL);
195227 preempt_enable();
196228 }
197229 EXPORT_SYMBOL_GPL(vcpu_put);
....@@ -218,9 +250,13 @@
218250 {
219251 }
220252
221
-static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
253
+static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
222254 {
223
- if (unlikely(!cpus))
255
+ const struct cpumask *cpus;
256
+
257
+ if (likely(cpumask_available(tmp)))
258
+ cpus = tmp;
259
+ else
224260 cpus = cpu_online_mask;
225261
226262 if (cpumask_empty(cpus))
....@@ -230,28 +266,57 @@
230266 return true;
231267 }
232268
269
+static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
270
+ unsigned int req, cpumask_var_t tmp,
271
+ int current_cpu)
272
+{
273
+ int cpu;
274
+
275
+ kvm_make_request(req, vcpu);
276
+
277
+ if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
278
+ return;
279
+
280
+ /*
281
+ * tmp can be "unavailable" if cpumasks are allocated off stack as
282
+ * allocation of the mask is deliberately not fatal and is handled by
283
+ * falling back to kicking all online CPUs.
284
+ */
285
+ if (!cpumask_available(tmp))
286
+ return;
287
+
288
+ /*
289
+ * Note, the vCPU could get migrated to a different pCPU at any point
290
+ * after kvm_request_needs_ipi(), which could result in sending an IPI
291
+ * to the previous pCPU. But, that's OK because the purpose of the IPI
292
+ * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
293
+ * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
294
+ * after this point is also OK, as the requirement is only that KVM wait
295
+ * for vCPUs that were reading SPTEs _before_ any changes were
296
+ * finalized. See kvm_vcpu_kick() for more details on handling requests.
297
+ */
298
+ if (kvm_request_needs_ipi(vcpu, req)) {
299
+ cpu = READ_ONCE(vcpu->cpu);
300
+ if (cpu != -1 && cpu != current_cpu)
301
+ __cpumask_set_cpu(cpu, tmp);
302
+ }
303
+}
304
+
233305 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
306
+ struct kvm_vcpu *except,
234307 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
235308 {
236
- int i, cpu, me;
237309 struct kvm_vcpu *vcpu;
310
+ int i, me;
238311 bool called;
239312
240313 me = get_cpu();
241314
242
- kvm_for_each_vcpu(i, vcpu, kvm) {
243
- if (!test_bit(i, vcpu_bitmap))
315
+ for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
316
+ vcpu = kvm_get_vcpu(kvm, i);
317
+ if (!vcpu || vcpu == except)
244318 continue;
245
-
246
- kvm_make_request(req, vcpu);
247
- cpu = vcpu->cpu;
248
-
249
- if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
250
- continue;
251
-
252
- if (tmp != NULL && cpu != -1 && cpu != me &&
253
- kvm_request_needs_ipi(vcpu, req))
254
- __cpumask_set_cpu(cpu, tmp);
319
+ kvm_make_vcpu_request(kvm, vcpu, req, tmp, me);
255320 }
256321
257322 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
....@@ -260,19 +325,34 @@
260325 return called;
261326 }
262327
328
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
329
+ struct kvm_vcpu *except)
330
+{
331
+ struct kvm_vcpu *vcpu;
332
+ struct cpumask *cpus;
333
+ bool called;
334
+ int i, me;
335
+
336
+ me = get_cpu();
337
+
338
+ cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
339
+ cpumask_clear(cpus);
340
+
341
+ kvm_for_each_vcpu(i, vcpu, kvm) {
342
+ if (vcpu == except)
343
+ continue;
344
+ kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
345
+ }
346
+
347
+ called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
348
+ put_cpu();
349
+
350
+ return called;
351
+}
352
+
263353 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
264354 {
265
- cpumask_var_t cpus;
266
- bool called;
267
- static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
268
- = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};
269
-
270
- zalloc_cpumask_var(&cpus, GFP_ATOMIC);
271
-
272
- called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
273
-
274
- free_cpumask_var(cpus);
275
- return called;
355
+ return kvm_make_all_cpus_request_except(kvm, req, NULL);
276356 }
277357
278358 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
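
kvm_make_all_cpus_request() is now a thin wrapper around
kvm_make_all_cpus_request_except() with except == NULL, built on the per-CPU
cpu_kick_mask instead of an on-stack cpumask. As a usage illustration, a remote
TLB flush is the canonical caller: post the request on every vCPU and count a
remote flush only if somebody actually had to be kicked. The helper below is a
simplified sketch, not the exact kvm_flush_remote_tlbs() in this file (the real
one also consults an architecture fast path first).

/* Simplified caller sketch (arch-specific fast path omitted). */
static void flush_remote_tlbs_sketch(struct kvm *kvm)
{
        /*
         * The request helper returns true only if an IPI was sent, i.e. some
         * vCPU was in guest mode (or reading SPTEs) and had to be forced out.
         */
        if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
}
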
....@@ -308,57 +388,102 @@
308388 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
309389 }
310390
311
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
391
+static void kvm_flush_shadow_all(struct kvm *kvm)
312392 {
313
- struct page *page;
314
- int r;
393
+ kvm_arch_flush_shadow_all(kvm);
394
+ kvm_arch_guest_memory_reclaimed(kvm);
395
+}
315396
397
+#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
398
+static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
399
+ gfp_t gfp_flags)
400
+{
401
+ gfp_flags |= mc->gfp_zero;
402
+
403
+ if (mc->kmem_cache)
404
+ return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
405
+ else
406
+ return (void *)__get_free_page(gfp_flags);
407
+}
408
+
409
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
410
+{
411
+ void *obj;
412
+
413
+ if (mc->nobjs >= min)
414
+ return 0;
415
+ while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
416
+ obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
417
+ if (!obj)
418
+ return mc->nobjs >= min ? 0 : -ENOMEM;
419
+ mc->objects[mc->nobjs++] = obj;
420
+ }
421
+ return 0;
422
+}
423
+
424
+int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
425
+{
426
+ return mc->nobjs;
427
+}
428
+
429
+void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
430
+{
431
+ while (mc->nobjs) {
432
+ if (mc->kmem_cache)
433
+ kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
434
+ else
435
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
436
+ }
437
+}
438
+
439
+void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
440
+{
441
+ void *p;
442
+
443
+ if (WARN_ON(!mc->nobjs))
444
+ p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
445
+ else
446
+ p = mc->objects[--mc->nobjs];
447
+ BUG_ON(!p);
448
+ return p;
449
+}
450
+#endif
451
+
452
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
453
+{
316454 mutex_init(&vcpu->mutex);
317455 vcpu->cpu = -1;
318456 vcpu->kvm = kvm;
319457 vcpu->vcpu_id = id;
320458 vcpu->pid = NULL;
321
- init_swait_queue_head(&vcpu->wq);
459
+ rcuwait_init(&vcpu->wait);
322460 kvm_async_pf_vcpu_init(vcpu);
323461
324462 vcpu->pre_pcpu = -1;
325463 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
326464
327
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
328
- if (!page) {
329
- r = -ENOMEM;
330
- goto fail;
331
- }
332
- vcpu->run = page_address(page);
333
-
334465 kvm_vcpu_set_in_spin_loop(vcpu, false);
335466 kvm_vcpu_set_dy_eligible(vcpu, false);
336467 vcpu->preempted = false;
337
-
338
- r = kvm_arch_vcpu_init(vcpu);
339
- if (r < 0)
340
- goto fail_free_run;
341
- return 0;
342
-
343
-fail_free_run:
344
- free_page((unsigned long)vcpu->run);
345
-fail:
346
- return r;
468
+ vcpu->ready = false;
469
+ preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
347470 }
348
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
349471
350
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
472
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
351473 {
474
+ kvm_arch_vcpu_destroy(vcpu);
475
+
352476 /*
353
- * no need for rcu_read_lock as VCPU_RUN is the only place that
354
- * will change the vcpu->pid pointer and on uninit all file
355
- * descriptors are already gone.
477
+ * No need for rcu_read_lock as VCPU_RUN is the only place that changes
478
+ * the vcpu->pid pointer, and at destruction time all file descriptors
479
+ * are already gone.
356480 */
357481 put_pid(rcu_dereference_protected(vcpu->pid, 1));
358
- kvm_arch_vcpu_uninit(vcpu);
482
+
359483 free_page((unsigned long)vcpu->run);
484
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
360485 }
361
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
486
+EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
362487
363488 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
364489 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
....@@ -389,16 +514,16 @@
389514 idx = srcu_read_lock(&kvm->srcu);
390515 spin_lock(&kvm->mmu_lock);
391516 kvm->mmu_notifier_seq++;
392
- kvm_set_spte_hva(kvm, address, pte);
517
+
518
+ if (kvm_set_spte_hva(kvm, address, pte))
519
+ kvm_flush_remote_tlbs(kvm);
520
+
393521 spin_unlock(&kvm->mmu_lock);
394522 srcu_read_unlock(&kvm->srcu, idx);
395523 }
396524
397525 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
398
- struct mm_struct *mm,
399
- unsigned long start,
400
- unsigned long end,
401
- bool blockable)
526
+ const struct mmu_notifier_range *range)
402527 {
403528 struct kvm *kvm = mmu_notifier_to_kvm(mn);
404529 int need_tlb_flush = 0, idx;
....@@ -411,21 +536,21 @@
411536 * count is also read inside the mmu_lock critical section.
412537 */
413538 kvm->mmu_notifier_count++;
414
- need_tlb_flush = kvm_unmap_hva_range(kvm, start, end, blockable);
539
+ need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
540
+ range->flags);
415541 /* we've to flush the tlb before the pages can be freed */
416542 if (need_tlb_flush || kvm->tlbs_dirty)
417543 kvm_flush_remote_tlbs(kvm);
418544
419545 spin_unlock(&kvm->mmu_lock);
546
+ kvm_arch_guest_memory_reclaimed(kvm);
420547 srcu_read_unlock(&kvm->srcu, idx);
421548
422549 return 0;
423550 }
424551
425552 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
426
- struct mm_struct *mm,
427
- unsigned long start,
428
- unsigned long end)
553
+ const struct mmu_notifier_range *range)
429554 {
430555 struct kvm *kvm = mmu_notifier_to_kvm(mn);
431556
....@@ -522,12 +647,11 @@
522647 int idx;
523648
524649 idx = srcu_read_lock(&kvm->srcu);
525
- kvm_arch_flush_shadow_all(kvm);
650
+ kvm_flush_shadow_all(kvm);
526651 srcu_read_unlock(&kvm->srcu, idx);
527652 }
528653
529654 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
530
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
531655 .invalidate_range = kvm_mmu_notifier_invalidate_range,
532656 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
533657 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
....@@ -558,12 +682,12 @@
558682 int i;
559683 struct kvm_memslots *slots;
560684
561
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
685
+ slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
562686 if (!slots)
563687 return NULL;
564688
565689 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
566
- slots->id_to_index[i] = slots->memslots[i].id = i;
690
+ slots->id_to_index[i] = -1;
567691
568692 return slots;
569693 }
....@@ -577,18 +701,14 @@
577701 memslot->dirty_bitmap = NULL;
578702 }
579703
580
-/*
581
- * Free any memory in @free but not in @dont.
582
- */
583
-static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
584
- struct kvm_memory_slot *dont)
704
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
585705 {
586
- if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
587
- kvm_destroy_dirty_bitmap(free);
706
+ kvm_destroy_dirty_bitmap(slot);
588707
589
- kvm_arch_free_memslot(kvm, free, dont);
708
+ kvm_arch_free_memslot(kvm, slot);
590709
591
- free->npages = 0;
710
+ slot->flags = 0;
711
+ slot->npages = 0;
592712 }
593713
594714 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
....@@ -599,7 +719,7 @@
599719 return;
600720
601721 kvm_for_each_memslot(memslot, slots)
602
- kvm_free_memslot(kvm, memslot, NULL);
722
+ kvm_free_memslot(kvm, memslot);
603723
604724 kvfree(slots);
605725 }
....@@ -622,6 +742,8 @@
622742
623743 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
624744 {
745
+ static DEFINE_MUTEX(kvm_debugfs_lock);
746
+ struct dentry *dent;
625747 char dir_name[ITOA_MAX_LEN * 2];
626748 struct kvm_stat_data *stat_data;
627749 struct kvm_stats_debugfs_item *p;
....@@ -630,25 +752,37 @@
630752 return 0;
631753
632754 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
633
- kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
755
+ mutex_lock(&kvm_debugfs_lock);
756
+ dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
757
+ if (dent) {
758
+ pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
759
+ dput(dent);
760
+ mutex_unlock(&kvm_debugfs_lock);
761
+ return 0;
762
+ }
763
+ dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
764
+ mutex_unlock(&kvm_debugfs_lock);
765
+ if (IS_ERR(dent))
766
+ return 0;
634767
768
+ kvm->debugfs_dentry = dent;
635769 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
636770 sizeof(*kvm->debugfs_stat_data),
637
- GFP_KERNEL);
771
+ GFP_KERNEL_ACCOUNT);
638772 if (!kvm->debugfs_stat_data)
639773 return -ENOMEM;
640774
641775 for (p = debugfs_entries; p->name; p++) {
642
- stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
776
+ stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
643777 if (!stat_data)
644778 return -ENOMEM;
645779
646780 stat_data->kvm = kvm;
647
- stat_data->offset = p->offset;
648
- stat_data->mode = p->mode ? p->mode : 0644;
781
+ stat_data->dbgfs_item = p;
649782 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
650
- debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
651
- stat_data, stat_fops_per_vm[p->kind]);
783
+ debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
784
+ kvm->debugfs_dentry, stat_data,
785
+ &stat_fops_per_vm);
652786 }
653787 return 0;
654788 }
....@@ -672,8 +806,9 @@
672806
673807 static struct kvm *kvm_create_vm(unsigned long type)
674808 {
675
- int r, i;
676809 struct kvm *kvm = kvm_arch_alloc_vm();
810
+ int r = -ENOMEM;
811
+ int i;
677812
678813 if (!kvm)
679814 return ERR_PTR(-ENOMEM);
....@@ -685,12 +820,38 @@
685820 mutex_init(&kvm->lock);
686821 mutex_init(&kvm->irq_lock);
687822 mutex_init(&kvm->slots_lock);
688
- refcount_set(&kvm->users_count, 1);
689823 INIT_LIST_HEAD(&kvm->devices);
824
+
825
+ BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
826
+
827
+ if (init_srcu_struct(&kvm->srcu))
828
+ goto out_err_no_srcu;
829
+ if (init_srcu_struct(&kvm->irq_srcu))
830
+ goto out_err_no_irq_srcu;
831
+
832
+ refcount_set(&kvm->users_count, 1);
833
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
834
+ struct kvm_memslots *slots = kvm_alloc_memslots();
835
+
836
+ if (!slots)
837
+ goto out_err_no_arch_destroy_vm;
838
+ /* Generations must be different for each address space. */
839
+ slots->generation = i;
840
+ rcu_assign_pointer(kvm->memslots[i], slots);
841
+ }
842
+
843
+ for (i = 0; i < KVM_NR_BUSES; i++) {
844
+ rcu_assign_pointer(kvm->buses[i],
845
+ kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
846
+ if (!kvm->buses[i])
847
+ goto out_err_no_arch_destroy_vm;
848
+ }
849
+
850
+ kvm->max_halt_poll_ns = halt_poll_ns;
690851
691852 r = kvm_arch_init_vm(kvm, type);
692853 if (r)
693
- goto out_err_no_disable;
854
+ goto out_err_no_arch_destroy_vm;
694855
695856 r = hardware_enable_all();
696857 if (r)
....@@ -699,33 +860,6 @@
699860 #ifdef CONFIG_HAVE_KVM_IRQFD
700861 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
701862 #endif
702
-
703
- BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
704
-
705
- r = -ENOMEM;
706
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
707
- struct kvm_memslots *slots = kvm_alloc_memslots();
708
- if (!slots)
709
- goto out_err_no_srcu;
710
- /*
711
- * Generations must be different for each address space.
712
- * Init kvm generation close to the maximum to easily test the
713
- * code of handling generation number wrap-around.
714
- */
715
- slots->generation = i * 2 - 150;
716
- rcu_assign_pointer(kvm->memslots[i], slots);
717
- }
718
-
719
- if (init_srcu_struct(&kvm->srcu))
720
- goto out_err_no_srcu;
721
- if (init_srcu_struct(&kvm->irq_srcu))
722
- goto out_err_no_irq_srcu;
723
- for (i = 0; i < KVM_NR_BUSES; i++) {
724
- rcu_assign_pointer(kvm->buses[i],
725
- kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
726
- if (!kvm->buses[i])
727
- goto out_err_no_mmu_notifier;
728
- }
729863
730864 r = kvm_init_mmu_notifier(kvm);
731865 if (r)
....@@ -741,6 +875,16 @@
741875
742876 preempt_notifier_inc();
743877
878
+ /*
879
+ * When the fd passed to this ioctl() is opened it pins the module,
880
+ * but try_module_get() also prevents getting a reference if the module
881
+ * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
882
+ */
883
+ if (!try_module_get(kvm_chardev_ops.owner)) {
884
+ r = -ENODEV;
885
+ goto out_err;
886
+ }
887
+
744888 return kvm;
745889
746890 out_err:
....@@ -749,17 +893,19 @@
749893 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
750894 #endif
751895 out_err_no_mmu_notifier:
752
- cleanup_srcu_struct(&kvm->irq_srcu);
753
-out_err_no_irq_srcu:
754
- cleanup_srcu_struct(&kvm->srcu);
755
-out_err_no_srcu:
756896 hardware_disable_all();
757897 out_err_no_disable:
758
- refcount_set(&kvm->users_count, 0);
898
+ kvm_arch_destroy_vm(kvm);
899
+out_err_no_arch_destroy_vm:
900
+ WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
759901 for (i = 0; i < KVM_NR_BUSES; i++)
760902 kfree(kvm_get_bus(kvm, i));
761903 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
762904 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
905
+ cleanup_srcu_struct(&kvm->irq_srcu);
906
+out_err_no_irq_srcu:
907
+ cleanup_srcu_struct(&kvm->srcu);
908
+out_err_no_srcu:
763909 kvm_arch_free_vm(kvm);
764910 mmdrop(current->mm);
765911 return ERR_PTR(r);
....@@ -805,7 +951,7 @@
805951 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
806952 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
807953 #else
808
- kvm_arch_flush_shadow_all(kvm);
954
+ kvm_flush_shadow_all(kvm);
809955 #endif
810956 kvm_arch_destroy_vm(kvm);
811957 kvm_destroy_devices(kvm);
....@@ -817,6 +963,7 @@
817963 preempt_notifier_dec();
818964 hardware_disable_all();
819965 mmdrop(mm);
966
+ module_put(kvm_chardev_ops.owner);
820967 }
821968
822969 void kvm_get_kvm(struct kvm *kvm)
....@@ -832,6 +979,18 @@
832979 }
833980 EXPORT_SYMBOL_GPL(kvm_put_kvm);
834981
982
+/*
983
+ * Used to put a reference that was taken on behalf of an object associated
984
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
985
+ * of the new file descriptor fails and the reference cannot be transferred to
986
+ * its final owner. In such cases, the caller is still actively using @kvm and
987
+ * will fail miserably if the refcount unexpectedly hits zero.
988
+ */
989
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
990
+{
991
+ WARN_ON(refcount_dec_and_test(&kvm->users_count));
992
+}
993
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
835994
836995 static int kvm_vm_release(struct inode *inode, struct file *filp)
837996 {
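
kvm_put_kvm_no_destroy() exists for one situation: a reference was taken on
behalf of a file descriptor that never gets installed (for example a vCPU or
device creation failing late), and the caller still holds and uses kvm, so the
refcount must not be allowed to hit zero there. A hedged sketch of that caller
pattern follows; the fd name and the child_fops argument are invented.

/* Sketch of the fd-installation error path this helper is meant for. */
static int create_child_fd_sketch(struct kvm *kvm,
                                  const struct file_operations *child_fops)
{
        int fd;

        kvm_get_kvm(kvm);                       /* reference owned by the new fd */
        fd = anon_inode_getfd("kvm-child", child_fops, kvm,
                              O_RDWR | O_CLOEXEC);
        if (fd < 0)
                kvm_put_kvm_no_destroy(kvm);    /* fd never existed; kvm lives on */

        return fd;
}
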
....@@ -845,13 +1004,13 @@
8451004
8461005 /*
8471006 * Allocation size is twice as large as the actual dirty bitmap size.
848
- * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
1007
+ * See kvm_vm_ioctl_get_dirty_log() why this is needed.
8491008 */
850
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
1009
+static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
8511010 {
8521011 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
8531012
854
- memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
1013
+ memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
8551014 if (!memslot->dirty_bitmap)
8561015 return -ENOMEM;
8571016
....@@ -859,58 +1018,165 @@
8591018 }
8601019
8611020 /*
862
- * Insert memslot and re-sort memslots based on their GFN,
863
- * so binary search could be used to lookup GFN.
864
- * Sorting algorithm takes advantage of having initially
865
- * sorted array and known changed memslot position.
1021
+ * Delete a memslot by decrementing the number of used slots and shifting all
1022
+ * other entries in the array forward one spot.
8661023 */
867
-static void update_memslots(struct kvm_memslots *slots,
868
- struct kvm_memory_slot *new)
1024
+static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1025
+ struct kvm_memory_slot *memslot)
8691026 {
870
- int id = new->id;
871
- int i = slots->id_to_index[id];
8721027 struct kvm_memory_slot *mslots = slots->memslots;
1028
+ int i;
8731029
874
- WARN_ON(mslots[i].id != id);
875
- if (!new->npages) {
876
- WARN_ON(!mslots[i].npages);
877
- if (mslots[i].npages)
878
- slots->used_slots--;
879
- } else {
880
- if (!mslots[i].npages)
881
- slots->used_slots++;
882
- }
1030
+ if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1031
+ return;
8831032
884
- while (i < KVM_MEM_SLOTS_NUM - 1 &&
885
- new->base_gfn <= mslots[i + 1].base_gfn) {
886
- if (!mslots[i + 1].npages)
887
- break;
1033
+ slots->used_slots--;
1034
+
1035
+ if (atomic_read(&slots->lru_slot) >= slots->used_slots)
1036
+ atomic_set(&slots->lru_slot, 0);
1037
+
1038
+ for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
8881039 mslots[i] = mslots[i + 1];
8891040 slots->id_to_index[mslots[i].id] = i;
890
- i++;
8911041 }
1042
+ mslots[i] = *memslot;
1043
+ slots->id_to_index[memslot->id] = -1;
1044
+}
1045
+
1046
+/*
1047
+ * "Insert" a new memslot by incrementing the number of used slots. Returns
1048
+ * the new slot's initial index into the memslots array.
1049
+ */
1050
+static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1051
+{
1052
+ return slots->used_slots++;
1053
+}
1054
+
1055
+/*
1056
+ * Move a changed memslot backwards in the array by shifting existing slots
1057
+ * with a higher GFN toward the front of the array. Note, the changed memslot
1058
+ * itself is not preserved in the array, i.e. not swapped at this time, only
1059
+ * its new index into the array is tracked. Returns the changed memslot's
1060
+ * current index into the memslots array.
1061
+ */
1062
+static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1063
+ struct kvm_memory_slot *memslot)
1064
+{
1065
+ struct kvm_memory_slot *mslots = slots->memslots;
1066
+ int i;
1067
+
1068
+ if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1069
+ WARN_ON_ONCE(!slots->used_slots))
1070
+ return -1;
8921071
8931072 /*
894
- * The ">=" is needed when creating a slot with base_gfn == 0,
895
- * so that it moves before all those with base_gfn == npages == 0.
896
- *
897
- * On the other hand, if new->npages is zero, the above loop has
898
- * already left i pointing to the beginning of the empty part of
899
- * mslots, and the ">=" would move the hole backwards in this
900
- * case---which is wrong. So skip the loop when deleting a slot.
1073
+ * Move the target memslot backward in the array by shifting existing
1074
+ * memslots with a higher GFN (than the target memslot) towards the
1075
+ * front of the array.
9011076 */
902
- if (new->npages) {
903
- while (i > 0 &&
904
- new->base_gfn >= mslots[i - 1].base_gfn) {
905
- mslots[i] = mslots[i - 1];
906
- slots->id_to_index[mslots[i].id] = i;
907
- i--;
908
- }
909
- } else
910
- WARN_ON_ONCE(i != slots->used_slots);
1077
+ for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1078
+ if (memslot->base_gfn > mslots[i + 1].base_gfn)
1079
+ break;
9111080
912
- mslots[i] = *new;
913
- slots->id_to_index[mslots[i].id] = i;
1081
+ WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1082
+
1083
+ /* Shift the next memslot forward one and update its index. */
1084
+ mslots[i] = mslots[i + 1];
1085
+ slots->id_to_index[mslots[i].id] = i;
1086
+ }
1087
+ return i;
1088
+}
1089
+
1090
+/*
1091
+ * Move a changed memslot forwards in the array by shifting existing slots with
1092
+ * a lower GFN toward the back of the array. Note, the changed memslot itself
1093
+ * is not preserved in the array, i.e. not swapped at this time, only its new
1094
+ * index into the array is tracked. Returns the changed memslot's final index
1095
+ * into the memslots array.
1096
+ */
1097
+static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1098
+ struct kvm_memory_slot *memslot,
1099
+ int start)
1100
+{
1101
+ struct kvm_memory_slot *mslots = slots->memslots;
1102
+ int i;
1103
+
1104
+ for (i = start; i > 0; i--) {
1105
+ if (memslot->base_gfn < mslots[i - 1].base_gfn)
1106
+ break;
1107
+
1108
+ WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1109
+
1110
+ /* Shift the next memslot back one and update its index. */
1111
+ mslots[i] = mslots[i - 1];
1112
+ slots->id_to_index[mslots[i].id] = i;
1113
+ }
1114
+ return i;
1115
+}
1116
+
1117
+/*
1118
+ * Re-sort memslots based on their GFN to account for an added, deleted, or
1119
+ * moved memslot. Sorting memslots by GFN allows using a binary search during
1120
+ * memslot lookup.
1121
+ *
1122
+ * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry
1123
+ * at memslots[0] has the highest GFN.
1124
+ *
1125
+ * The sorting algorithm takes advantage of having initially sorted memslots
1126
+ * and knowing the position of the changed memslot. Sorting is also optimized
1127
+ * by not swapping the updated memslot and instead only shifting other memslots
1128
+ * and tracking the new index for the updated memslot. Only once its final
1129
+ * index is known is the updated memslot copied into its position in the array.
1130
+ *
1131
+ * - When deleting a memslot, the deleted memslot simply needs to be moved to
1132
+ * the end of the array.
1133
+ *
1134
+ * - When creating a memslot, the algorithm "inserts" the new memslot at the
1135
+ * end of the array and then moves it forward to its correct location.
1136
+ *
1137
+ * - When moving a memslot, the algorithm first moves the updated memslot
1138
+ * backward to handle the scenario where the memslot's GFN was changed to a
1139
+ * lower value. update_memslots() then falls through and runs the same flow
1140
+ * as creating a memslot to move the memslot forward to handle the scenario
1141
+ * where its GFN was changed to a higher value.
1142
+ *
1143
+ * Note, slots are sorted from highest->lowest instead of lowest->highest for
1144
+ * historical reasons. Originally, invalid memslots were denoted by having
1145
+ * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1146
+ * to the end of the array. The current algorithm uses dedicated logic to
1147
+ * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1148
+ *
1149
+ * The other historical motivation for highest->lowest was to improve the
1150
+ * performance of memslot lookup. KVM originally used a linear search starting
1151
+ * at memslots[0]. On x86, the largest memslot usually has one of the highest,
1152
+ * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1153
+ * single memslot above the 4gb boundary. As the largest memslot is also the
1154
+ * most likely to be referenced, sorting it to the front of the array was
1155
+ * advantageous. The current binary search starts from the middle of the array
1156
+ * and uses an LRU pointer to improve performance for all memslots and GFNs.
1157
+ */
1158
+static void update_memslots(struct kvm_memslots *slots,
1159
+ struct kvm_memory_slot *memslot,
1160
+ enum kvm_mr_change change)
1161
+{
1162
+ int i;
1163
+
1164
+ if (change == KVM_MR_DELETE) {
1165
+ kvm_memslot_delete(slots, memslot);
1166
+ } else {
1167
+ if (change == KVM_MR_CREATE)
1168
+ i = kvm_memslot_insert_back(slots);
1169
+ else
1170
+ i = kvm_memslot_move_backward(slots, memslot);
1171
+ i = kvm_memslot_move_forward(slots, memslot, i);
1172
+
1173
+ /*
1174
+ * Copy the memslot to its new position in memslots and update
1175
+ * its index accordingly.
1176
+ */
1177
+ slots->memslots[i] = *memslot;
1178
+ slots->id_to_index[memslot->id] = i;
1179
+ }
9141180 }
9151181
9161182 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
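
The comment block above is easiest to follow with a tiny worked example. The
standalone program below (plain userspace C, not kernel code) models the
highest-to-lowest GFN array and the "shift, don't swap" insertion: adding a
slot at GFN 0x8000 shifts the lower-GFN slot back one position, and the new
slot is copied in only once its final index is known.

#include <stdio.h>

/* Toy model of kvm_memslots::memslots[], sorted from highest to lowest GFN. */
struct toy_slot { int id; unsigned long base_gfn; unsigned long npages; };

/* CREATE path in miniature: append at the back, then move forward. */
static int toy_insert(struct toy_slot *slots, int used, struct toy_slot new)
{
        int i;

        for (i = used; i > 0 && new.base_gfn >= slots[i - 1].base_gfn; i--)
                slots[i] = slots[i - 1];        /* shift lower-GFN slots back */
        slots[i] = new;                         /* copy in at the final index */
        return used + 1;
}

int main(void)
{
        struct toy_slot slots[4] = { { 0, 0x100000, 0x1000 }, { 1, 0x0, 0x100 } };
        int used = 2, i;

        used = toy_insert(slots, used, (struct toy_slot){ 2, 0x8000, 0x800 });
        for (i = 0; i < used; i++)
                printf("memslots[%d]: id=%d base_gfn=0x%lx\n",
                       i, slots[i].id, slots[i].base_gfn);
        /* prints ids 0, 2, 1, i.e. GFNs 0x100000, 0x8000, 0x0 */
        return 0;
}
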
....@@ -931,36 +1197,148 @@
9311197 int as_id, struct kvm_memslots *slots)
9321198 {
9331199 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
934
- u64 gen;
1200
+ u64 gen = old_memslots->generation;
9351201
936
- /*
937
- * Set the low bit in the generation, which disables SPTE caching
938
- * until the end of synchronize_srcu_expedited.
939
- */
940
- WARN_ON(old_memslots->generation & 1);
941
- slots->generation = old_memslots->generation + 1;
1202
+ WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1203
+ slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
9421204
9431205 rcu_assign_pointer(kvm->memslots[as_id], slots);
9441206 synchronize_srcu_expedited(&kvm->srcu);
9451207
9461208 /*
947
- * Increment the new memslot generation a second time. This prevents
948
- * vm exits that race with memslot updates from caching a memslot
949
- * generation that will (potentially) be valid forever.
950
- *
1209
+ * Increment the new memslot generation a second time, dropping the
1210
+ * update in-progress flag and incrementing the generation based on
1211
+ * the number of address spaces. This provides a unique and easily
1212
+ * identifiable generation number while the memslots are in flux.
1213
+ */
1214
+ gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1215
+
1216
+ /*
9511217 * Generations must be unique even across address spaces. We do not need
9521218 * a global counter for that, instead the generation space is evenly split
9531219 * across address spaces. For example, with two address spaces, address
954
- * space 0 will use generations 0, 4, 8, ... while * address space 1 will
955
- * use generations 2, 6, 10, 14, ...
1220
+ * space 0 will use generations 0, 2, 4, ... while address space 1 will
1221
+ * use generations 1, 3, 5, ...
9561222 */
957
- gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
1223
+ gen += KVM_ADDRESS_SPACE_NUM;
9581224
9591225 kvm_arch_memslots_updated(kvm, gen);
9601226
9611227 slots->generation = gen;
9621228
9631229 return old_memslots;
1230
+}
1231
+
1232
+/*
1233
+ * Note, at a minimum, the current number of used slots must be allocated, even
1234
+ * when deleting a memslot, as we need a complete duplicate of the memslots for
1235
+ * use when invalidating a memslot prior to deleting/moving the memslot.
1236
+ */
1237
+static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1238
+ enum kvm_mr_change change)
1239
+{
1240
+ struct kvm_memslots *slots;
1241
+ size_t old_size, new_size;
1242
+
1243
+ old_size = sizeof(struct kvm_memslots) +
1244
+ (sizeof(struct kvm_memory_slot) * old->used_slots);
1245
+
1246
+ if (change == KVM_MR_CREATE)
1247
+ new_size = old_size + sizeof(struct kvm_memory_slot);
1248
+ else
1249
+ new_size = old_size;
1250
+
1251
+ slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1252
+ if (likely(slots))
1253
+ memcpy(slots, old, old_size);
1254
+
1255
+ return slots;
1256
+}
1257
+
1258
+static int kvm_set_memslot(struct kvm *kvm,
1259
+ const struct kvm_userspace_memory_region *mem,
1260
+ struct kvm_memory_slot *old,
1261
+ struct kvm_memory_slot *new, int as_id,
1262
+ enum kvm_mr_change change)
1263
+{
1264
+ struct kvm_memory_slot *slot;
1265
+ struct kvm_memslots *slots;
1266
+ int r;
1267
+
1268
+ slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1269
+ if (!slots)
1270
+ return -ENOMEM;
1271
+
1272
+ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1273
+ /*
1274
+ * Note, the INVALID flag needs to be in the appropriate entry
1275
+ * in the freshly allocated memslots, not in @old or @new.
1276
+ */
1277
+ slot = id_to_memslot(slots, old->id);
1278
+ slot->flags |= KVM_MEMSLOT_INVALID;
1279
+
1280
+ /*
1281
+ * We can re-use the old memslots, the only difference from the
1282
+ * newly installed memslots is the invalid flag, which will get
1283
+ * dropped by update_memslots anyway. We'll also revert to the
1284
+ * old memslots if preparing the new memory region fails.
1285
+ */
1286
+ slots = install_new_memslots(kvm, as_id, slots);
1287
+
1288
+ /* From this point no new shadow pages pointing to a deleted,
1289
+ * or moved, memslot will be created.
1290
+ *
1291
+ * validation of sp->gfn happens in:
1292
+ * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1293
+ * - kvm_is_visible_gfn (mmu_check_root)
1294
+ */
1295
+ kvm_arch_flush_shadow_memslot(kvm, slot);
1296
+ kvm_arch_guest_memory_reclaimed(kvm);
1297
+ }
1298
+
1299
+ r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1300
+ if (r)
1301
+ goto out_slots;
1302
+
1303
+ update_memslots(slots, new, change);
1304
+ slots = install_new_memslots(kvm, as_id, slots);
1305
+
1306
+ kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1307
+
1308
+ kvfree(slots);
1309
+ return 0;
1310
+
1311
+out_slots:
1312
+ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1313
+ slots = install_new_memslots(kvm, as_id, slots);
1314
+ kvfree(slots);
1315
+ return r;
1316
+}
1317
+
1318
+static int kvm_delete_memslot(struct kvm *kvm,
1319
+ const struct kvm_userspace_memory_region *mem,
1320
+ struct kvm_memory_slot *old, int as_id)
1321
+{
1322
+ struct kvm_memory_slot new;
1323
+ int r;
1324
+
1325
+ if (!old->npages)
1326
+ return -EINVAL;
1327
+
1328
+ memset(&new, 0, sizeof(new));
1329
+ new.id = old->id;
1330
+ /*
1331
+ * This is only for debugging purpose; it should never be referenced
1332
+ * for a removed memslot.
1333
+ */
1334
+ new.as_id = as_id;
1335
+
1336
+ r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1337
+ if (r)
1338
+ return r;
1339
+
1340
+ kvm_free_memslot(kvm, old);
1341
+ return 0;
9641342 }
9651343
9661344 /*
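
To make the generation handling in install_new_memslots() concrete: while an
update is in flight readers see the old generation with the update-in-progress
flag OR'd in, and the final install drops the flag and steps the counter by
KVM_ADDRESS_SPACE_NUM, so with two address spaces, space 0 walks 0, 2, 4, ...
and space 1 walks 1, 3, 5, .... The small userspace program below simulates
just that arithmetic; the bit chosen for the flag is a placeholder, not the
value defined in kvm_host.h.

#include <stdio.h>
#include <stdint.h>

#define GEN_UPDATE_IN_PROGRESS  (1ULL << 63)    /* placeholder for the real flag */
#define ADDRESS_SPACE_NUM       2

static uint64_t install_new_memslots_sim(uint64_t old_gen)
{
        /* What readers can observe while the SRCU swap is in progress. */
        uint64_t transient = old_gen | GEN_UPDATE_IN_PROGRESS;

        /* Final value: clear the flag, step by the number of address spaces. */
        return (transient & ~GEN_UPDATE_IN_PROGRESS) + ADDRESS_SPACE_NUM;
}

int main(void)
{
        uint64_t gen = 0;       /* address space 0 starts at generation 0 */
        int i;

        for (i = 0; i < 3; i++) {
                gen = install_new_memslots_sim(gen);
                printf("generation after update %d: %llu\n",
                       i + 1, (unsigned long long)gen); /* 2, 4, 6 */
        }
        return 0;
}
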
....@@ -974,163 +1352,120 @@
9741352 int __kvm_set_memory_region(struct kvm *kvm,
9751353 const struct kvm_userspace_memory_region *mem)
9761354 {
977
- int r;
978
- gfn_t base_gfn;
979
- unsigned long npages;
980
- struct kvm_memory_slot *slot;
9811355 struct kvm_memory_slot old, new;
982
- struct kvm_memslots *slots = NULL, *old_memslots;
983
- int as_id, id;
1356
+ struct kvm_memory_slot *tmp;
9841357 enum kvm_mr_change change;
1358
+ int as_id, id;
1359
+ int r;
9851360
9861361 r = check_memory_region_flags(mem);
9871362 if (r)
988
- goto out;
1363
+ return r;
9891364
990
- r = -EINVAL;
9911365 as_id = mem->slot >> 16;
9921366 id = (u16)mem->slot;
9931367
9941368 /* General sanity checks */
995
- if (mem->memory_size & (PAGE_SIZE - 1))
996
- goto out;
1369
+ if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1370
+ (mem->memory_size != (unsigned long)mem->memory_size))
1371
+ return -EINVAL;
9971372 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
998
- goto out;
1373
+ return -EINVAL;
9991374 /* We can read the guest memory with __xxx_user() later on. */
1000
- if ((id < KVM_USER_MEM_SLOTS) &&
1001
- ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1002
- !access_ok(VERIFY_WRITE,
1003
- (void __user *)(unsigned long)mem->userspace_addr,
1004
- mem->memory_size)))
1005
- goto out;
1375
+ if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1376
+ (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1377
+ !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1378
+ mem->memory_size))
1379
+ return -EINVAL;
10061380 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1007
- goto out;
1381
+ return -EINVAL;
10081382 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1009
- goto out;
1383
+ return -EINVAL;
10101384
1011
- slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1012
- base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1013
- npages = mem->memory_size >> PAGE_SHIFT;
1014
-
1015
- if (npages > KVM_MEM_MAX_NR_PAGES)
1016
- goto out;
1017
-
1018
- new = old = *slot;
1019
-
1020
- new.id = id;
1021
- new.base_gfn = base_gfn;
1022
- new.npages = npages;
1023
- new.flags = mem->flags;
1024
-
1025
- if (npages) {
1026
- if (!old.npages)
1027
- change = KVM_MR_CREATE;
1028
- else { /* Modify an existing slot. */
1029
- if ((mem->userspace_addr != old.userspace_addr) ||
1030
- (npages != old.npages) ||
1031
- ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1032
- goto out;
1033
-
1034
- if (base_gfn != old.base_gfn)
1035
- change = KVM_MR_MOVE;
1036
- else if (new.flags != old.flags)
1037
- change = KVM_MR_FLAGS_ONLY;
1038
- else { /* Nothing to change. */
1039
- r = 0;
1040
- goto out;
1041
- }
1042
- }
1385
+ /*
1386
+ * Make a full copy of the old memslot, the pointer will become stale
1387
+ * when the memslots are re-sorted by update_memslots(), and the old
1388
+ * memslot needs to be referenced after calling update_memslots(), e.g.
1389
+ * to free its resources and for arch specific behavior.
1390
+ */
1391
+ tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1392
+ if (tmp) {
1393
+ old = *tmp;
1394
+ tmp = NULL;
10431395 } else {
1044
- if (!old.npages)
1045
- goto out;
1396
+ memset(&old, 0, sizeof(old));
1397
+ old.id = id;
1398
+ }
10461399
1047
- change = KVM_MR_DELETE;
1048
- new.base_gfn = 0;
1049
- new.flags = 0;
1400
+ if (!mem->memory_size)
1401
+ return kvm_delete_memslot(kvm, mem, &old, as_id);
1402
+
1403
+ new.as_id = as_id;
1404
+ new.id = id;
1405
+ new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1406
+ new.npages = mem->memory_size >> PAGE_SHIFT;
1407
+ new.flags = mem->flags;
1408
+ new.userspace_addr = mem->userspace_addr;
1409
+
1410
+ if (new.npages > KVM_MEM_MAX_NR_PAGES)
1411
+ return -EINVAL;
1412
+
1413
+ if (!old.npages) {
1414
+ change = KVM_MR_CREATE;
1415
+ new.dirty_bitmap = NULL;
1416
+ memset(&new.arch, 0, sizeof(new.arch));
1417
+ } else { /* Modify an existing slot. */
1418
+ if ((new.userspace_addr != old.userspace_addr) ||
1419
+ (new.npages != old.npages) ||
1420
+ ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1421
+ return -EINVAL;
1422
+
1423
+ if (new.base_gfn != old.base_gfn)
1424
+ change = KVM_MR_MOVE;
1425
+ else if (new.flags != old.flags)
1426
+ change = KVM_MR_FLAGS_ONLY;
1427
+ else /* Nothing to change. */
1428
+ return 0;
1429
+
1430
+ /* Copy dirty_bitmap and arch from the current memslot. */
1431
+ new.dirty_bitmap = old.dirty_bitmap;
1432
+ memcpy(&new.arch, &old.arch, sizeof(new.arch));
10501433 }
10511434
10521435 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
10531436 /* Check for overlaps */
1054
- r = -EEXIST;
1055
- kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
1056
- if (slot->id == id)
1437
+ kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1438
+ if (tmp->id == id)
10571439 continue;
1058
- if (!((base_gfn + npages <= slot->base_gfn) ||
1059
- (base_gfn >= slot->base_gfn + slot->npages)))
1060
- goto out;
1440
+ if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1441
+ (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1442
+ return -EEXIST;
10611443 }
10621444 }
10631445
1064
- /* Free page dirty bitmap if unneeded */
1446
+ /* Allocate/free page dirty bitmap as needed */
10651447 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
10661448 new.dirty_bitmap = NULL;
1449
+ else if (!new.dirty_bitmap) {
1450
+ r = kvm_alloc_dirty_bitmap(&new);
1451
+ if (r)
1452
+ return r;
10671453
1068
- r = -ENOMEM;
1069
- if (change == KVM_MR_CREATE) {
1070
- new.userspace_addr = mem->userspace_addr;
1071
-
1072
- if (kvm_arch_create_memslot(kvm, &new, npages))
1073
- goto out_free;
1454
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1455
+ bitmap_set(new.dirty_bitmap, 0, new.npages);
10741456 }
10751457
1076
- /* Allocate page dirty bitmap if needed */
1077
- if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1078
- if (kvm_create_dirty_bitmap(&new) < 0)
1079
- goto out_free;
1080
- }
1081
-
1082
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
1083
- if (!slots)
1084
- goto out_free;
1085
- memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
1086
-
1087
- if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
1088
- slot = id_to_memslot(slots, id);
1089
- slot->flags |= KVM_MEMSLOT_INVALID;
1090
-
1091
- old_memslots = install_new_memslots(kvm, as_id, slots);
1092
-
1093
- /* From this point no new shadow pages pointing to a deleted,
1094
- * or moved, memslot will be created.
1095
- *
1096
- * validation of sp->gfn happens in:
1097
- * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1098
- * - kvm_is_visible_gfn (mmu_check_roots)
1099
- */
1100
- kvm_arch_flush_shadow_memslot(kvm, slot);
1101
-
1102
- /*
1103
- * We can re-use the old_memslots from above, the only difference
1104
- * from the currently installed memslots is the invalid flag. This
1105
- * will get overwritten by update_memslots anyway.
1106
- */
1107
- slots = old_memslots;
1108
- }
1109
-
1110
- r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
1458
+ r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
11111459 if (r)
1112
- goto out_slots;
1460
+ goto out_bitmap;
11131461
1114
- /* actual memory is freed via old in kvm_free_memslot below */
1115
- if (change == KVM_MR_DELETE) {
1116
- new.dirty_bitmap = NULL;
1117
- memset(&new.arch, 0, sizeof(new.arch));
1118
- }
1119
-
1120
- update_memslots(slots, &new);
1121
- old_memslots = install_new_memslots(kvm, as_id, slots);
1122
-
1123
- kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
1124
-
1125
- kvm_free_memslot(kvm, &old, &new);
1126
- kvfree(old_memslots);
1462
+ if (old.dirty_bitmap && !new.dirty_bitmap)
1463
+ kvm_destroy_dirty_bitmap(&old);
11271464 return 0;
11281465
1129
-out_slots:
1130
- kvfree(slots);
1131
-out_free:
1132
- kvm_free_memslot(kvm, &new, &old);
1133
-out:
1466
+out_bitmap:
1467
+ if (new.dirty_bitmap && !old.dirty_bitmap)
1468
+ kvm_destroy_dirty_bitmap(&new);
11341469 return r;
11351470 }
11361471 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
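
For reference, this is the ioctl that userspace drives against
__kvm_set_memory_region(). The standalone fragment below registers anonymous
memory as guest slot 0 with dirty logging enabled; mmap and ioctl error
handling is deliberately elided, so treat it as a usage sketch rather than
production code.

#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Register @size bytes of anonymous memory as guest RAM at GPA 0. */
static int register_guest_ram(int vm_fd, size_t size)
{
        void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .flags = KVM_MEM_LOG_DIRTY_PAGES,       /* enable dirty tracking */
                .guest_phys_addr = 0,
                .memory_size = size,
                .userspace_addr = (unsigned long)mem,
        };

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
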
....@@ -1156,14 +1491,24 @@
11561491 return kvm_set_memory_region(kvm, mem);
11571492 }
11581493
1159
-int kvm_get_dirty_log(struct kvm *kvm,
1160
- struct kvm_dirty_log *log, int *is_dirty)
1494
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1495
+/**
1496
+ * kvm_get_dirty_log - get a snapshot of dirty pages
1497
+ * @kvm: pointer to kvm instance
1498
+ * @log: slot id and address to which we copy the log
1499
+ * @is_dirty: set to '1' if any dirty pages were found
1500
+ * @memslot: set to the associated memslot, always valid on success
1501
+ */
1502
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1503
+ int *is_dirty, struct kvm_memory_slot **memslot)
11611504 {
11621505 struct kvm_memslots *slots;
1163
- struct kvm_memory_slot *memslot;
11641506 int i, as_id, id;
11651507 unsigned long n;
11661508 unsigned long any = 0;
1509
+
1510
+ *memslot = NULL;
1511
+ *is_dirty = 0;
11671512
11681513 as_id = log->slot >> 16;
11691514 id = (u16)log->slot;
....@@ -1171,16 +1516,18 @@
11711516 return -EINVAL;
11721517
11731518 slots = __kvm_memslots(kvm, as_id);
1174
- memslot = id_to_memslot(slots, id);
1175
- if (!memslot->dirty_bitmap)
1519
+ *memslot = id_to_memslot(slots, id);
1520
+ if (!(*memslot) || !(*memslot)->dirty_bitmap)
11761521 return -ENOENT;
11771522
1178
- n = kvm_dirty_bitmap_bytes(memslot);
1523
+ kvm_arch_sync_dirty_log(kvm, *memslot);
1524
+
1525
+ n = kvm_dirty_bitmap_bytes(*memslot);
11791526
11801527 for (i = 0; !any && i < n/sizeof(long); ++i)
1181
- any = memslot->dirty_bitmap[i];
1528
+ any = (*memslot)->dirty_bitmap[i];
11821529
1183
- if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1530
+ if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
11841531 return -EFAULT;
11851532
11861533 if (any)
....@@ -1189,13 +1536,12 @@
11891536 }
11901537 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
11911538
1192
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1539
+#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
11931540 /**
1194
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
1195
- * are dirty write protect them for next write.
1541
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1542
+ * and reenable dirty page tracking for the corresponding pages.
11961543 * @kvm: pointer to kvm instance
11971544 * @log: slot id and address to which we copy the log
1198
- * @is_dirty: flag set if any page is dirty
11991545 *
12001546 * We need to keep it in mind that VCPU threads can write to the bitmap
12011547 * concurrently. So, to avoid losing track of dirty pages we keep the
....@@ -1212,8 +1558,7 @@
12121558 * exiting to userspace will be logged for the next call.
12131559 *
12141560 */
1215
-int kvm_get_dirty_log_protect(struct kvm *kvm,
1216
- struct kvm_dirty_log *log, bool *is_dirty)
1561
+static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
12171562 {
12181563 struct kvm_memslots *slots;
12191564 struct kvm_memory_slot *memslot;
....@@ -1221,6 +1566,7 @@
12211566 unsigned long n;
12221567 unsigned long *dirty_bitmap;
12231568 unsigned long *dirty_bitmap_buffer;
1569
+ bool flush;
12241570
12251571 as_id = log->slot >> 16;
12261572 id = (u16)log->slot;
....@@ -1229,55 +1575,180 @@
12291575
12301576 slots = __kvm_memslots(kvm, as_id);
12311577 memslot = id_to_memslot(slots, id);
1232
-
1233
- dirty_bitmap = memslot->dirty_bitmap;
1234
- if (!dirty_bitmap)
1578
+ if (!memslot || !memslot->dirty_bitmap)
12351579 return -ENOENT;
12361580
1581
+ dirty_bitmap = memslot->dirty_bitmap;
1582
+
1583
+ kvm_arch_sync_dirty_log(kvm, memslot);
1584
+
12371585 n = kvm_dirty_bitmap_bytes(memslot);
1586
+ flush = false;
1587
+ if (kvm->manual_dirty_log_protect) {
1588
+ /*
1589
+ * Unlike kvm_get_dirty_log, we always return false in *flush,
1590
+ * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
1591
+ * is some code duplication between this function and
1592
+ * kvm_get_dirty_log, but hopefully all architecture
1593
+ * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
1594
+ * can be eliminated.
1595
+ */
1596
+ dirty_bitmap_buffer = dirty_bitmap;
1597
+ } else {
1598
+ dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1599
+ memset(dirty_bitmap_buffer, 0, n);
12381600
1239
- dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1240
- memset(dirty_bitmap_buffer, 0, n);
1601
+ spin_lock(&kvm->mmu_lock);
1602
+ for (i = 0; i < n / sizeof(long); i++) {
1603
+ unsigned long mask;
1604
+ gfn_t offset;
12411605
1242
- spin_lock(&kvm->mmu_lock);
1243
- *is_dirty = false;
1244
- for (i = 0; i < n / sizeof(long); i++) {
1245
- unsigned long mask;
1246
- gfn_t offset;
1606
+ if (!dirty_bitmap[i])
1607
+ continue;
12471608
1248
- if (!dirty_bitmap[i])
1249
- continue;
1609
+ flush = true;
1610
+ mask = xchg(&dirty_bitmap[i], 0);
1611
+ dirty_bitmap_buffer[i] = mask;
12501612
1251
- *is_dirty = true;
1252
-
1253
- mask = xchg(&dirty_bitmap[i], 0);
1254
- dirty_bitmap_buffer[i] = mask;
1255
-
1256
- if (mask) {
12571613 offset = i * BITS_PER_LONG;
12581614 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
12591615 offset, mask);
12601616 }
1617
+ spin_unlock(&kvm->mmu_lock);
12611618 }
12621619
1263
- spin_unlock(&kvm->mmu_lock);
1620
+ if (flush)
1621
+ kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1622
+
12641623 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
12651624 return -EFAULT;
12661625 return 0;
12671626 }
1268
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1269
-#endif
12701627
1271
-bool kvm_largepages_enabled(void)
1628
+
1629
+/**
1630
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
1631
+ * @kvm: kvm instance
1632
+ * @log: slot id and address to which we copy the log
1633
+ *
1634
+ * Steps 1-4 below provide general overview of dirty page logging. See
1635
+ * kvm_get_dirty_log_protect() function description for additional details.
1636
+ *
1637
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
1638
+ * always flush the TLB (step 4) even if previous step failed and the dirty
1639
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
1640
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
1641
+ * writes will be marked dirty for next log read.
1642
+ *
1643
+ * 1. Take a snapshot of the bit and clear it if needed.
1644
+ * 2. Write protect the corresponding page.
1645
+ * 3. Copy the snapshot to the userspace.
1646
+ * 4. Flush TLB's if needed.
1647
+ */
1648
+static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1649
+ struct kvm_dirty_log *log)
12721650 {
1273
- return largepages_enabled;
1651
+ int r;
1652
+
1653
+ mutex_lock(&kvm->slots_lock);
1654
+
1655
+ r = kvm_get_dirty_log_protect(kvm, log);
1656
+
1657
+ mutex_unlock(&kvm->slots_lock);
1658
+ return r;
12741659 }
12751660
1276
-void kvm_disable_largepages(void)
1661
+/**
1662
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1663
+ * and reenable dirty page tracking for the corresponding pages.
1664
+ * @kvm: pointer to kvm instance
1665
+ * @log: slot id and address from which to fetch the bitmap of dirty pages
1666
+ */
1667
+static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1668
+ struct kvm_clear_dirty_log *log)
12771669 {
1278
- largepages_enabled = false;
1670
+ struct kvm_memslots *slots;
1671
+ struct kvm_memory_slot *memslot;
1672
+ int as_id, id;
1673
+ gfn_t offset;
1674
+ unsigned long i, n;
1675
+ unsigned long *dirty_bitmap;
1676
+ unsigned long *dirty_bitmap_buffer;
1677
+ bool flush;
1678
+
1679
+ as_id = log->slot >> 16;
1680
+ id = (u16)log->slot;
1681
+ if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1682
+ return -EINVAL;
1683
+
1684
+ if (log->first_page & 63)
1685
+ return -EINVAL;
1686
+
1687
+ slots = __kvm_memslots(kvm, as_id);
1688
+ memslot = id_to_memslot(slots, id);
1689
+ if (!memslot || !memslot->dirty_bitmap)
1690
+ return -ENOENT;
1691
+
1692
+ dirty_bitmap = memslot->dirty_bitmap;
1693
+
1694
+ n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
1695
+
1696
+ if (log->first_page > memslot->npages ||
1697
+ log->num_pages > memslot->npages - log->first_page ||
1698
+ (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
1699
+ return -EINVAL;
1700
+
1701
+ kvm_arch_sync_dirty_log(kvm, memslot);
1702
+
1703
+ flush = false;
1704
+ dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1705
+ if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1706
+ return -EFAULT;
1707
+
1708
+ spin_lock(&kvm->mmu_lock);
1709
+ for (offset = log->first_page, i = offset / BITS_PER_LONG,
1710
+ n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
1711
+ i++, offset += BITS_PER_LONG) {
1712
+ unsigned long mask = *dirty_bitmap_buffer++;
1713
+ atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1714
+ if (!mask)
1715
+ continue;
1716
+
1717
+ mask &= atomic_long_fetch_andnot(mask, p);
1718
+
1719
+ /*
1720
+ * mask contains the bits that really have been cleared. This
1721
+ * never includes any bits beyond the length of the memslot (if
1722
+ * the length is not aligned to 64 pages), therefore it is not
1723
+ * a problem if userspace sets them in log->dirty_bitmap.
1724
+ */
1725
+ if (mask) {
1726
+ flush = true;
1727
+ kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1728
+ offset, mask);
1729
+ }
1730
+ }
1731
+ spin_unlock(&kvm->mmu_lock);
1732
+
1733
+ if (flush)
1734
+ kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1735
+
1736
+ return 0;
12791737 }
1280
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
1738
+
1739
+static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
1740
+ struct kvm_clear_dirty_log *log)
1741
+{
1742
+ int r;
1743
+
1744
+ mutex_lock(&kvm->slots_lock);
1745
+
1746
+ r = kvm_clear_dirty_log_protect(kvm, log);
1747
+
1748
+ mutex_unlock(&kvm->slots_lock);
1749
+ return r;
1750
+}
1751
+#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
12811752
12821753 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
12831754 {
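
With the manual protect capability (KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2) enabled,
the two functions above split dirty tracking into a fetch step and an explicit
clear/re-protect step. The userspace sketch below shows that two-step flow;
error paths and the actual copying of dirty pages are elided, and the helper
name is invented.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch the dirty bitmap for @slot, then clear/re-protect what was read. */
static int harvest_dirty_pages_sketch(int vm_fd, __u32 slot, __u32 npages)
{
        size_t bytes = ((size_t)npages + 63) / 64 * 8;  /* one bit per page */
        void *bitmap = calloc(1, bytes);
        struct kvm_dirty_log get = { .slot = slot, .dirty_bitmap = bitmap };
        struct kvm_clear_dirty_log clear = {
                .slot = slot,
                .first_page = 0,
                .num_pages = npages,    /* 64-aligned unless it ends the slot */
                .dirty_bitmap = bitmap,
        };
        int ret;

        ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get);

        /* ... migrate/copy the pages whose bits are set in bitmap ... */

        if (!ret)
                ret = ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);

        free(bitmap);
        return ret;
}
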
....@@ -1294,13 +1765,17 @@
12941765 {
12951766 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
12961767
1297
- if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
1298
- memslot->flags & KVM_MEMSLOT_INVALID)
1299
- return false;
1300
-
1301
- return true;
1768
+ return kvm_is_visible_memslot(memslot);
13021769 }
13031770 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1771
+
1772
+bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1773
+{
1774
+ struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1775
+
1776
+ return kvm_is_visible_memslot(memslot);
1777
+}
1778
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
13041779
13051780 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
13061781 {
....@@ -1313,7 +1788,7 @@
13131788 if (kvm_is_error_hva(addr))
13141789 return PAGE_SIZE;
13151790
1316
- down_read(&current->mm->mmap_sem);
1791
+ mmap_read_lock(current->mm);
13171792 vma = find_vma(current->mm, addr);
13181793 if (!vma)
13191794 goto out;
....@@ -1321,7 +1796,7 @@
13211796 size = vma_kernel_pagesize(vma);
13221797
13231798 out:
1324
- up_read(&current->mm->mmap_sem);
1799
+ mmap_read_unlock(current->mm);
13251800
13261801 return size;
13271802 }
....@@ -1372,8 +1847,12 @@
13721847 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
13731848
13741849 /*
1375
- * If writable is set to false, the hva returned by this function is only
1376
- * allowed to be read.
1850
+ * Return the hva of a @gfn and the R/W attribute if possible.
1851
+ *
1852
+ * @slot: the kvm_memory_slot which contains @gfn
1853
+ * @gfn: the gfn to be translated
1854
+ * @writable: used to return the read/write attribute of the @slot if the hva
1855
+ * is valid and @writable is not NULL
13771856 */
13781857 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
13791858 gfn_t gfn, bool *writable)
....@@ -1411,13 +1890,12 @@
14111890 /*
14121891 * The fast path to get the writable pfn which will be stored in @pfn,
14131892 * true indicates success, otherwise false is returned. It's also the
1414
- * only part that runs if we can are in atomic context.
1893
+ * only part that runs if we are in atomic context.
14151894 */
14161895 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
14171896 bool *writable, kvm_pfn_t *pfn)
14181897 {
14191898 struct page *page[1];
1420
- int npages;
14211899
14221900 /*
14231901 * Fast pin a writable pfn only if it is a write fault request
....@@ -1427,8 +1905,7 @@
14271905 if (!(write_fault || writable))
14281906 return false;
14291907
1430
- npages = __get_user_pages_fast(addr, 1, 1, page);
1431
- if (npages == 1) {
1908
+ if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
14321909 *pfn = page_to_pfn(page[0]);
14331910
14341911 if (writable)
....@@ -1468,7 +1945,7 @@
14681945 if (unlikely(!write_fault) && writable) {
14691946 struct page *wpage;
14701947
1471
- if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
1948
+ if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
14721949 *writable = true;
14731950 put_page(page);
14741951 page = wpage;
....@@ -1506,14 +1983,14 @@
15061983 spinlock_t *ptl;
15071984 int r;
15081985
1509
- r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
1986
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
15101987 if (r) {
15111988 /*
15121989 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
15131990 * not call the fault handler, so do it here.
15141991 */
15151992 bool unlocked = false;
1516
- r = fixup_user_fault(current, current->mm, addr,
1993
+ r = fixup_user_fault(current->mm, addr,
15171994 (write_fault ? FAULT_FLAG_WRITE : 0),
15181995 &unlocked);
15191996 if (unlocked)
....@@ -1521,7 +1998,7 @@
15211998 if (r)
15221999 return r;
15232000
1524
- r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
2001
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
15252002 if (r)
15262003 return r;
15272004 }
....@@ -1596,7 +2073,7 @@
15962073 if (npages == 1)
15972074 return pfn;
15982075
1599
- down_read(&current->mm->mmap_sem);
2076
+ mmap_read_lock(current->mm);
16002077 if (npages == -EHWPOISON ||
16012078 (!async && check_user_page_hwpoison(addr))) {
16022079 pfn = KVM_PFN_ERR_HWPOISON;
....@@ -1620,7 +2097,7 @@
16202097 pfn = KVM_PFN_ERR_FAULT;
16212098 }
16222099 exit:
1623
- up_read(&current->mm->mmap_sem);
2100
+ mmap_read_unlock(current->mm);
16242101 return pfn;
16252102 }
16262103
....@@ -1673,12 +2150,6 @@
16732150 }
16742151 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
16752152
1676
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1677
-{
1678
- return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1679
-}
1680
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1681
-
16822153 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
16832154 {
16842155 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
....@@ -1710,7 +2181,7 @@
17102181 if (entry < nr_pages)
17112182 return 0;
17122183
1713
- return __get_user_pages_fast(addr, nr_pages, 1, pages);
2184
+ return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
17142185 }
17152186 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
17162187
....@@ -1924,20 +2395,28 @@
19242395 }
19252396 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
19262397
2398
+static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
2399
+{
2400
+ if (!pfn_valid(pfn))
2401
+ return false;
2402
+
2403
+ /*
2404
+ * Per page-flags.h, pages tagged PG_reserved "should in general not be
2405
+ * touched (e.g. set dirty) except by its owner".
2406
+ */
2407
+ return !PageReserved(pfn_to_page(pfn));
2408
+}
2409
+
19272410 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
19282411 {
1929
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
1930
- struct page *page = pfn_to_page(pfn);
1931
-
1932
- if (!PageReserved(page))
1933
- SetPageDirty(page);
1934
- }
2412
+ if (kvm_is_ad_tracked_pfn(pfn))
2413
+ SetPageDirty(pfn_to_page(pfn));
19352414 }
19362415 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
19372416
19382417 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
19392418 {
1940
- if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2419
+ if (kvm_is_ad_tracked_pfn(pfn))
19412420 mark_page_accessed(pfn_to_page(pfn));
19422421 }
19432422 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
....@@ -2047,17 +2526,6 @@
20472526 return 0;
20482527 }
20492528
2050
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
2051
- unsigned long len)
2052
-{
2053
- gfn_t gfn = gpa >> PAGE_SHIFT;
2054
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2055
- int offset = offset_in_page(gpa);
2056
-
2057
- return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2058
-}
2059
-EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
2060
-
20612529 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
20622530 void *data, unsigned long len)
20632531 {
....@@ -2155,30 +2623,34 @@
21552623 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
21562624 gfn_t nr_pages_avail;
21572625
2158
- ghc->gpa = gpa;
2626
+ /* Update ghc->generation before performing any error checks. */
21592627 ghc->generation = slots->generation;
2160
- ghc->len = len;
2161
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2162
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
2163
- if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
2164
- ghc->hva += offset;
2165
- } else {
2166
- /*
2167
- * If the requested region crosses two memslots, we still
2168
- * verify that the entire region is valid here.
2169
- */
2170
- while (start_gfn <= end_gfn) {
2171
- nr_pages_avail = 0;
2172
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2173
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2174
- &nr_pages_avail);
2175
- if (kvm_is_error_hva(ghc->hva))
2176
- return -EFAULT;
2177
- start_gfn += nr_pages_avail;
2178
- }
2179
- /* Use the slow path for cross page reads and writes. */
2180
- ghc->memslot = NULL;
2628
+
2629
+ if (start_gfn > end_gfn) {
2630
+ ghc->hva = KVM_HVA_ERR_BAD;
2631
+ return -EINVAL;
21812632 }
2633
+
2634
+ /*
2635
+ * If the requested region crosses two memslots, we still
2636
+ * verify that the entire region is valid here.
2637
+ */
2638
+ for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2639
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2640
+ ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2641
+ &nr_pages_avail);
2642
+ if (kvm_is_error_hva(ghc->hva))
2643
+ return -EFAULT;
2644
+ }
2645
+
2646
+ /* Use the slow path for cross page reads and writes. */
2647
+ if (nr_pages_needed == 1)
2648
+ ghc->hva += offset;
2649
+ else
2650
+ ghc->memslot = NULL;
2651
+
2652
+ ghc->gpa = gpa;
2653
+ ghc->len = len;
21822654 return 0;
21832655 }
21842656
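For reference, a typical consumer of this cache initializes it once per guest address and then goes through the *_cached() accessors. A hypothetical kernel-side sketch (illustrative only, not part of this patch; the shared-page layout and both wrappers are invented, only the two kvm_* calls are existing API):

/* Illustrative sketch, assumes <linux/kvm_host.h>.  The structure and the
 * hypothetical_* helpers are made up for illustration. */
struct hypothetical_shared_page {
        u64 seq;
        u64 flags;
};

static int hypothetical_setup(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                              gpa_t gpa)
{
        /* Validates the whole range and caches memslot + hva up front. */
        return kvm_gfn_to_hva_cache_init(kvm, ghc, gpa,
                                         sizeof(struct hypothetical_shared_page));
}

static int hypothetical_publish_seq(struct kvm *kvm,
                                    struct gfn_to_hva_cache *ghc, u64 seq)
{
        /*
         * Normally a plain __copy_to_user() on the cached hva; the cache is
         * re-initialized transparently (see kvm_write_guest_offset_cached
         * below) when slots->generation has changed.
         */
        return kvm_write_guest_offset_cached(kvm, ghc, &seq,
                        offsetof(struct hypothetical_shared_page, seq),
                        sizeof(seq));
}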
....@@ -2198,10 +2670,13 @@
21982670 int r;
21992671 gpa_t gpa = ghc->gpa + offset;
22002672
2201
- BUG_ON(len + offset > ghc->len);
2673
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2674
+ return -EINVAL;
22022675
2203
- if (slots->generation != ghc->generation)
2204
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2676
+ if (slots->generation != ghc->generation) {
2677
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2678
+ return -EFAULT;
2679
+ }
22052680
22062681 if (kvm_is_error_hva(ghc->hva))
22072682 return -EFAULT;
....@@ -2225,28 +2700,40 @@
22252700 }
22262701 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
22272702
2228
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2229
- void *data, unsigned long len)
2703
+int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2704
+ void *data, unsigned int offset,
2705
+ unsigned long len)
22302706 {
22312707 struct kvm_memslots *slots = kvm_memslots(kvm);
22322708 int r;
2709
+ gpa_t gpa = ghc->gpa + offset;
22332710
2234
- BUG_ON(len > ghc->len);
2711
+ if (WARN_ON_ONCE(len + offset > ghc->len))
2712
+ return -EINVAL;
22352713
2236
- if (slots->generation != ghc->generation)
2237
- __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2714
+ if (slots->generation != ghc->generation) {
2715
+ if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2716
+ return -EFAULT;
2717
+ }
22382718
22392719 if (kvm_is_error_hva(ghc->hva))
22402720 return -EFAULT;
22412721
22422722 if (unlikely(!ghc->memslot))
2243
- return kvm_read_guest(kvm, ghc->gpa, data, len);
2723
+ return kvm_read_guest(kvm, gpa, data, len);
22442724
2245
- r = __copy_from_user(data, (void __user *)ghc->hva, len);
2725
+ r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
22462726 if (r)
22472727 return -EFAULT;
22482728
22492729 return 0;
2730
+}
2731
+EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2732
+
2733
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2734
+ void *data, unsigned long len)
2735
+{
2736
+ return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
22502737 }
22512738 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
22522739
....@@ -2277,8 +2764,7 @@
22772764 }
22782765 EXPORT_SYMBOL_GPL(kvm_clear_guest);
22792766
2280
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2281
- gfn_t gfn)
2767
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn)
22822768 {
22832769 if (memslot && memslot->dirty_bitmap) {
22842770 unsigned long rel_gfn = gfn - memslot->base_gfn;
....@@ -2286,6 +2772,7 @@
22862772 set_bit_le(rel_gfn, memslot->dirty_bitmap);
22872773 }
22882774 }
2775
+EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
22892776
22902777 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
22912778 {
....@@ -2330,33 +2817,40 @@
23302817
23312818 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
23322819 {
2333
- unsigned int old, val, grow;
2820
+ unsigned int old, val, grow, grow_start;
23342821
23352822 old = val = vcpu->halt_poll_ns;
2823
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23362824 grow = READ_ONCE(halt_poll_ns_grow);
2337
- /* 10us base */
2338
- if (val == 0 && grow)
2339
- val = 10000;
2340
- else
2341
- val *= grow;
2825
+ if (!grow)
2826
+ goto out;
23422827
2343
- if (val > halt_poll_ns)
2344
- val = halt_poll_ns;
2828
+ val *= grow;
2829
+ if (val < grow_start)
2830
+ val = grow_start;
2831
+
2832
+ if (val > vcpu->kvm->max_halt_poll_ns)
2833
+ val = vcpu->kvm->max_halt_poll_ns;
23452834
23462835 vcpu->halt_poll_ns = val;
2836
+out:
23472837 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
23482838 }
23492839
23502840 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
23512841 {
2352
- unsigned int old, val, shrink;
2842
+ unsigned int old, val, shrink, grow_start;
23532843
23542844 old = val = vcpu->halt_poll_ns;
23552845 shrink = READ_ONCE(halt_poll_ns_shrink);
2846
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
23562847 if (shrink == 0)
23572848 val = 0;
23582849 else
23592850 val /= shrink;
2851
+
2852
+ if (val < grow_start)
2853
+ val = 0;
23602854
23612855 vcpu->halt_poll_ns = val;
23622856 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
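The grow path above no longer special-cases a zero starting value: it always multiplies, then clamps to the new grow_start floor and the per-VM max_halt_poll_ns ceiling. A standalone restatement of the same arithmetic (illustrative only; the parameters are passed in explicitly instead of being read from the module and VM state):

/* Illustrative re-statement of the growth rule, not part of the patch.
 * E.g. with grow = 2, grow_start = 10000 and a cap of 500000:
 *   0 -> 10000 -> 20000 -> 40000 -> ... -> 320000 -> 500000 (clamped). */
static unsigned int next_halt_poll_ns(unsigned int val, unsigned int grow,
                                      unsigned int grow_start,
                                      unsigned int max_halt_poll_ns)
{
        if (!grow)
                return val;             /* growing disabled, leave unchanged */

        val *= grow;
        if (val < grow_start)
                val = grow_start;       /* first step lands on grow_start */
        if (val > max_halt_poll_ns)
                val = max_halt_poll_ns;

        return val;
}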
....@@ -2382,18 +2876,28 @@
23822876 return ret;
23832877 }
23842878
2879
+static inline void
2880
+update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2881
+{
2882
+ if (waited)
2883
+ vcpu->stat.halt_poll_fail_ns += poll_ns;
2884
+ else
2885
+ vcpu->stat.halt_poll_success_ns += poll_ns;
2886
+}
2887
+
23852888 /*
23862889 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
23872890 */
23882891 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
23892892 {
2390
- ktime_t start, cur;
2391
- DECLARE_SWAITQUEUE(wait);
2893
+ ktime_t start, cur, poll_end;
23922894 bool waited = false;
23932895 u64 block_ns;
23942896
2395
- start = cur = ktime_get();
2396
- if (vcpu->halt_poll_ns) {
2897
+ kvm_arch_vcpu_blocking(vcpu);
2898
+
2899
+ start = cur = poll_end = ktime_get();
2900
+ if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
23972901 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
23982902
23992903 ++vcpu->stat.halt_attempted_poll;
....@@ -2408,14 +2912,14 @@
24082912 ++vcpu->stat.halt_poll_invalid;
24092913 goto out;
24102914 }
2411
- cur = ktime_get();
2412
- } while (single_task_running() && ktime_before(cur, stop));
2915
+ poll_end = cur = ktime_get();
2916
+ } while (single_task_running() && !need_resched() &&
2917
+ ktime_before(cur, stop));
24132918 }
24142919
2415
- kvm_arch_vcpu_blocking(vcpu);
2416
-
2920
+ prepare_to_rcuwait(&vcpu->wait);
24172921 for (;;) {
2418
- prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2922
+ set_current_state(TASK_INTERRUPTIBLE);
24192923
24202924 if (kvm_vcpu_check_block(vcpu) < 0)
24212925 break;
....@@ -2423,28 +2927,33 @@
24232927 waited = true;
24242928 schedule();
24252929 }
2426
-
2427
- finish_swait(&vcpu->wq, &wait);
2930
+ finish_rcuwait(&vcpu->wait);
24282931 cur = ktime_get();
2429
-
2430
- kvm_arch_vcpu_unblocking(vcpu);
24312932 out:
2933
+ kvm_arch_vcpu_unblocking(vcpu);
24322934 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
24332935
2434
- if (!vcpu_valid_wakeup(vcpu))
2435
- shrink_halt_poll_ns(vcpu);
2436
- else if (halt_poll_ns) {
2437
- if (block_ns <= vcpu->halt_poll_ns)
2438
- ;
2439
- /* we had a long block, shrink polling */
2440
- else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
2936
+ update_halt_poll_stats(
2937
+ vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
2938
+
2939
+ if (!kvm_arch_no_poll(vcpu)) {
2940
+ if (!vcpu_valid_wakeup(vcpu)) {
24412941 shrink_halt_poll_ns(vcpu);
2442
- /* we had a short halt and our poll time is too small */
2443
- else if (vcpu->halt_poll_ns < halt_poll_ns &&
2444
- block_ns < halt_poll_ns)
2445
- grow_halt_poll_ns(vcpu);
2446
- } else
2447
- vcpu->halt_poll_ns = 0;
2942
+ } else if (vcpu->kvm->max_halt_poll_ns) {
2943
+ if (block_ns <= vcpu->halt_poll_ns)
2944
+ ;
2945
+ /* we had a long block, shrink polling */
2946
+ else if (vcpu->halt_poll_ns &&
2947
+ block_ns > vcpu->kvm->max_halt_poll_ns)
2948
+ shrink_halt_poll_ns(vcpu);
2949
+ /* we had a short halt and our poll time is too small */
2950
+ else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
2951
+ block_ns < vcpu->kvm->max_halt_poll_ns)
2952
+ grow_halt_poll_ns(vcpu);
2953
+ } else {
2954
+ vcpu->halt_poll_ns = 0;
2955
+ }
2956
+ }
24482957
24492958 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
24502959 kvm_arch_vcpu_block_finish(vcpu);
....@@ -2453,11 +2962,11 @@
24532962
24542963 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
24552964 {
2456
- struct swait_queue_head *wqp;
2965
+ struct rcuwait *waitp;
24572966
2458
- wqp = kvm_arch_vcpu_wq(vcpu);
2459
- if (swq_has_sleeper(wqp)) {
2460
- swake_up_one(wqp);
2967
+ waitp = kvm_arch_vcpu_get_wait(vcpu);
2968
+ if (rcuwait_wake_up(waitp)) {
2969
+ WRITE_ONCE(vcpu->ready, true);
24612970 ++vcpu->stat.halt_wakeup;
24622971 return true;
24632972 }
....@@ -2472,16 +2981,24 @@
24722981 */
24732982 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
24742983 {
2475
- int me;
2476
- int cpu = vcpu->cpu;
2984
+ int me, cpu;
24772985
24782986 if (kvm_vcpu_wake_up(vcpu))
24792987 return;
24802988
2989
+ /*
2990
+ * Note, the vCPU could get migrated to a different pCPU at any point
2991
+ * after kvm_arch_vcpu_should_kick(), which could result in sending an
2992
+ * IPI to the previous pCPU. But, that's ok because the purpose of the
2993
+ * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
2994
+ * vCPU also requires it to leave IN_GUEST_MODE.
2995
+ */
24812996 me = get_cpu();
2482
- if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
2483
- if (kvm_arch_vcpu_should_kick(vcpu))
2997
+ if (kvm_arch_vcpu_should_kick(vcpu)) {
2998
+ cpu = READ_ONCE(vcpu->cpu);
2999
+ if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
24843000 smp_send_reschedule(cpu);
3001
+ }
24853002 put_cpu();
24863003 }
24873004 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
....@@ -2513,7 +3030,7 @@
25133030 *
25143031 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
25153032 * (preempted lock holder), indicated by @in_spin_loop.
2516
- * Set at the beiginning and cleared at the end of interception/PLE handler.
3033
+ * Set at the beginning and cleared at the end of interception/PLE handler.
25173034 *
25183035 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
25193036 * chance last time (mostly it has become eligible now since we have probably
....@@ -2594,13 +3111,15 @@
25943111 continue;
25953112 } else if (pass && i > last_boosted_vcpu)
25963113 break;
2597
- if (!READ_ONCE(vcpu->preempted))
3114
+ if (!READ_ONCE(vcpu->ready))
25983115 continue;
25993116 if (vcpu == me)
26003117 continue;
2601
- if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
3118
+ if (rcuwait_active(&vcpu->wait) &&
3119
+ !vcpu_dy_runnable(vcpu))
26023120 continue;
2603
- if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
3121
+ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3122
+ !kvm_arch_vcpu_in_kernel(vcpu))
26043123 continue;
26053124 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
26063125 continue;
....@@ -2659,7 +3178,6 @@
26593178 {
26603179 struct kvm_vcpu *vcpu = filp->private_data;
26613180
2662
- debugfs_remove_recursive(vcpu->debugfs_dentry);
26633181 kvm_put_kvm(vcpu->kvm);
26643182 return 0;
26653183 }
....@@ -2683,30 +3201,21 @@
26833201 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
26843202 }
26853203
2686
-static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3204
+static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
26873205 {
3206
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3207
+ struct dentry *debugfs_dentry;
26883208 char dir_name[ITOA_MAX_LEN * 2];
2689
- int ret;
2690
-
2691
- if (!kvm_arch_has_vcpu_debugfs())
2692
- return 0;
26933209
26943210 if (!debugfs_initialized())
2695
- return 0;
3211
+ return;
26963212
26973213 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
2698
- vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
2699
- vcpu->kvm->debugfs_dentry);
2700
- if (!vcpu->debugfs_dentry)
2701
- return -ENOMEM;
3214
+ debugfs_dentry = debugfs_create_dir(dir_name,
3215
+ vcpu->kvm->debugfs_dentry);
27023216
2703
- ret = kvm_arch_create_vcpu_debugfs(vcpu);
2704
- if (ret < 0) {
2705
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2706
- return ret;
2707
- }
2708
-
2709
- return 0;
3217
+ kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3218
+#endif
27103219 }
27113220
27123221 /*
....@@ -2716,6 +3225,7 @@
27163225 {
27173226 int r;
27183227 struct kvm_vcpu *vcpu;
3228
+ struct page *page;
27193229
27203230 if (id >= KVM_MAX_VCPU_ID)
27213231 return -EINVAL;
....@@ -2729,21 +3239,29 @@
27293239 kvm->created_vcpus++;
27303240 mutex_unlock(&kvm->lock);
27313241
2732
- vcpu = kvm_arch_vcpu_create(kvm, id);
2733
- if (IS_ERR(vcpu)) {
2734
- r = PTR_ERR(vcpu);
3242
+ r = kvm_arch_vcpu_precreate(kvm, id);
3243
+ if (r)
3244
+ goto vcpu_decrement;
3245
+
3246
+ vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3247
+ if (!vcpu) {
3248
+ r = -ENOMEM;
27353249 goto vcpu_decrement;
27363250 }
27373251
2738
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
3252
+ BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3253
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3254
+ if (!page) {
3255
+ r = -ENOMEM;
3256
+ goto vcpu_free;
3257
+ }
3258
+ vcpu->run = page_address(page);
27393259
2740
- r = kvm_arch_vcpu_setup(vcpu);
2741
- if (r)
2742
- goto vcpu_destroy;
3260
+ kvm_vcpu_init(vcpu, kvm, id);
27433261
2744
- r = kvm_create_vcpu_debugfs(vcpu);
3262
+ r = kvm_arch_vcpu_create(vcpu);
27453263 if (r)
2746
- goto vcpu_destroy;
3264
+ goto vcpu_free_run_page;
27473265
27483266 mutex_lock(&kvm->lock);
27493267 if (kvm_get_vcpu_by_id(kvm, id)) {
....@@ -2758,7 +3276,7 @@
27583276 kvm_get_kvm(kvm);
27593277 r = create_vcpu_fd(vcpu);
27603278 if (r < 0) {
2761
- kvm_put_kvm(kvm);
3279
+ kvm_put_kvm_no_destroy(kvm);
27623280 goto unlock_vcpu_destroy;
27633281 }
27643282
....@@ -2773,13 +3291,16 @@
27733291
27743292 mutex_unlock(&kvm->lock);
27753293 kvm_arch_vcpu_postcreate(vcpu);
3294
+ kvm_create_vcpu_debugfs(vcpu);
27763295 return r;
27773296
27783297 unlock_vcpu_destroy:
27793298 mutex_unlock(&kvm->lock);
2780
- debugfs_remove_recursive(vcpu->debugfs_dentry);
2781
-vcpu_destroy:
27823299 kvm_arch_vcpu_destroy(vcpu);
3300
+vcpu_free_run_page:
3301
+ free_page((unsigned long)vcpu->run);
3302
+vcpu_free:
3303
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
27833304 vcpu_decrement:
27843305 mutex_lock(&kvm->lock);
27853306 kvm->created_vcpus--;
....@@ -2807,7 +3328,7 @@
28073328 struct kvm_fpu *fpu = NULL;
28083329 struct kvm_sregs *kvm_sregs = NULL;
28093330
2810
- if (vcpu->kvm->mm != current->mm)
3331
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
28113332 return -EIO;
28123333
28133334 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
....@@ -2844,7 +3365,7 @@
28443365 synchronize_rcu();
28453366 put_pid(oldpid);
28463367 }
2847
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
3368
+ r = kvm_arch_vcpu_ioctl_run(vcpu);
28483369 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
28493370 break;
28503371 }
....@@ -2852,7 +3373,7 @@
28523373 struct kvm_regs *kvm_regs;
28533374
28543375 r = -ENOMEM;
2855
- kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
3376
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
28563377 if (!kvm_regs)
28573378 goto out;
28583379 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
....@@ -2869,7 +3390,6 @@
28693390 case KVM_SET_REGS: {
28703391 struct kvm_regs *kvm_regs;
28713392
2872
- r = -ENOMEM;
28733393 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
28743394 if (IS_ERR(kvm_regs)) {
28753395 r = PTR_ERR(kvm_regs);
....@@ -2880,7 +3400,8 @@
28803400 break;
28813401 }
28823402 case KVM_GET_SREGS: {
2883
- kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
3403
+ kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3404
+ GFP_KERNEL_ACCOUNT);
28843405 r = -ENOMEM;
28853406 if (!kvm_sregs)
28863407 goto out;
....@@ -2972,7 +3493,7 @@
29723493 break;
29733494 }
29743495 case KVM_GET_FPU: {
2975
- fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
3496
+ fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
29763497 r = -ENOMEM;
29773498 if (!fpu)
29783499 goto out;
....@@ -3013,7 +3534,7 @@
30133534 void __user *argp = compat_ptr(arg);
30143535 int r;
30153536
3016
- if (vcpu->kvm->mm != current->mm)
3537
+ if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
30173538 return -EIO;
30183539
30193540 switch (ioctl) {
....@@ -3031,7 +3552,8 @@
30313552 if (kvm_sigmask.len != sizeof(compat_sigset_t))
30323553 goto out;
30333554 r = -EFAULT;
3034
- if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
3555
+ if (get_compat_sigset(&sigset,
3556
+ (compat_sigset_t __user *)sigmask_arg->sigset))
30353557 goto out;
30363558 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
30373559 } else
....@@ -3046,6 +3568,16 @@
30463568 return r;
30473569 }
30483570 #endif
3571
+
3572
+static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3573
+{
3574
+ struct kvm_device *dev = filp->private_data;
3575
+
3576
+ if (dev->ops->mmap)
3577
+ return dev->ops->mmap(dev, vma);
3578
+
3579
+ return -ENODEV;
3580
+}
30493581
30503582 static int kvm_device_ioctl_attr(struct kvm_device *dev,
30513583 int (*accessor)(struct kvm_device *dev,
....@@ -3068,7 +3600,7 @@
30683600 {
30693601 struct kvm_device *dev = filp->private_data;
30703602
3071
- if (dev->kvm->mm != current->mm)
3603
+ if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
30723604 return -EIO;
30733605
30743606 switch (ioctl) {
....@@ -3091,6 +3623,13 @@
30913623 struct kvm_device *dev = filp->private_data;
30923624 struct kvm *kvm = dev->kvm;
30933625
3626
+ if (dev->ops->release) {
3627
+ mutex_lock(&kvm->lock);
3628
+ list_del(&dev->vm_node);
3629
+ dev->ops->release(dev);
3630
+ mutex_unlock(&kvm->lock);
3631
+ }
3632
+
30943633 kvm_put_kvm(kvm);
30953634 return 0;
30963635 }
....@@ -3099,6 +3638,7 @@
30993638 .unlocked_ioctl = kvm_device_ioctl,
31003639 .release = kvm_device_release,
31013640 KVM_COMPAT(kvm_device_ioctl),
3641
+ .mmap = kvm_device_mmap,
31023642 };
31033643
31043644 struct kvm_device *kvm_device_from_filp(struct file *filp)
....@@ -3109,14 +3649,14 @@
31093649 return filp->private_data;
31103650 }
31113651
3112
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3652
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
31133653 #ifdef CONFIG_KVM_MPIC
31143654 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
31153655 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
31163656 #endif
31173657 };
31183658
3119
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
3659
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
31203660 {
31213661 if (type >= ARRAY_SIZE(kvm_device_ops_table))
31223662 return -ENOSPC;
....@@ -3137,7 +3677,7 @@
31373677 static int kvm_ioctl_create_device(struct kvm *kvm,
31383678 struct kvm_create_device *cd)
31393679 {
3140
- struct kvm_device_ops *ops = NULL;
3680
+ const struct kvm_device_ops *ops = NULL;
31413681 struct kvm_device *dev;
31423682 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
31433683 int type;
....@@ -3154,7 +3694,7 @@
31543694 if (test)
31553695 return 0;
31563696
3157
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
3697
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
31583698 if (!dev)
31593699 return -ENOMEM;
31603700
....@@ -3177,11 +3717,14 @@
31773717 kvm_get_kvm(kvm);
31783718 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
31793719 if (ret < 0) {
3180
- kvm_put_kvm(kvm);
3720
+ kvm_put_kvm_no_destroy(kvm);
31813721 mutex_lock(&kvm->lock);
31823722 list_del(&dev->vm_node);
3723
+ if (ops->release)
3724
+ ops->release(dev);
31833725 mutex_unlock(&kvm->lock);
3184
- ops->destroy(dev);
3726
+ if (ops->destroy)
3727
+ ops->destroy(dev);
31853728 return ret;
31863729 }
31873730
....@@ -3205,10 +3748,18 @@
32053748 #endif
32063749 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
32073750 case KVM_CAP_CHECK_EXTENSION_VM:
3751
+ case KVM_CAP_ENABLE_CAP_VM:
3752
+ case KVM_CAP_HALT_POLL:
32083753 return 1;
32093754 #ifdef CONFIG_KVM_MMIO
32103755 case KVM_CAP_COALESCED_MMIO:
32113756 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3757
+ case KVM_CAP_COALESCED_PIO:
3758
+ return 1;
3759
+#endif
3760
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3761
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3762
+ return KVM_DIRTY_LOG_MANUAL_CAPS;
32123763 #endif
32133764 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
32143765 case KVM_CAP_IRQ_ROUTING:
....@@ -3218,10 +3769,47 @@
32183769 case KVM_CAP_MULTI_ADDRESS_SPACE:
32193770 return KVM_ADDRESS_SPACE_NUM;
32203771 #endif
3772
+ case KVM_CAP_NR_MEMSLOTS:
3773
+ return KVM_USER_MEM_SLOTS;
32213774 default:
32223775 break;
32233776 }
32243777 return kvm_vm_ioctl_check_extension(kvm, arg);
3778
+}
3779
+
3780
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3781
+ struct kvm_enable_cap *cap)
3782
+{
3783
+ return -EINVAL;
3784
+}
3785
+
3786
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3787
+ struct kvm_enable_cap *cap)
3788
+{
3789
+ switch (cap->cap) {
3790
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3791
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3792
+ u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3793
+
3794
+ if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3795
+ allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3796
+
3797
+ if (cap->flags || (cap->args[0] & ~allowed_options))
3798
+ return -EINVAL;
3799
+ kvm->manual_dirty_log_protect = cap->args[0];
3800
+ return 0;
3801
+ }
3802
+#endif
3803
+ case KVM_CAP_HALT_POLL: {
3804
+ if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3805
+ return -EINVAL;
3806
+
3807
+ kvm->max_halt_poll_ns = cap->args[0];
3808
+ return 0;
3809
+ }
3810
+ default:
3811
+ return kvm_vm_ioctl_enable_cap(kvm, cap);
3812
+ }
32253813 }
32263814
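The generic handler above lets userspace cap per-VM halt polling at runtime through KVM_ENABLE_CAP. A minimal userspace sketch (illustrative only, not part of this patch; vm_fd is an assumed, already-created VM descriptor):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative sketch: cap halt polling for this VM at 100us.  The value
 * must fit in an unsigned int, matching the check in the handler above. */
static int set_halt_poll_cap(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_HALT_POLL,
                .args[0] = 100000,      /* max_halt_poll_ns, in nanoseconds */
        };

        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_HALT_POLL) <= 0)
                return -1;              /* capability not offered by this kernel */

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}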
32273815 static long kvm_vm_ioctl(struct file *filp,
....@@ -3231,12 +3819,21 @@
32313819 void __user *argp = (void __user *)arg;
32323820 int r;
32333821
3234
- if (kvm->mm != current->mm)
3822
+ if (kvm->mm != current->mm || kvm->vm_bugged)
32353823 return -EIO;
32363824 switch (ioctl) {
32373825 case KVM_CREATE_VCPU:
32383826 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
32393827 break;
3828
+ case KVM_ENABLE_CAP: {
3829
+ struct kvm_enable_cap cap;
3830
+
3831
+ r = -EFAULT;
3832
+ if (copy_from_user(&cap, argp, sizeof(cap)))
3833
+ goto out;
3834
+ r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3835
+ break;
3836
+ }
32403837 case KVM_SET_USER_MEMORY_REGION: {
32413838 struct kvm_userspace_memory_region kvm_userspace_mem;
32423839
....@@ -3257,6 +3854,17 @@
32573854 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
32583855 break;
32593856 }
3857
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3858
+ case KVM_CLEAR_DIRTY_LOG: {
3859
+ struct kvm_clear_dirty_log log;
3860
+
3861
+ r = -EFAULT;
3862
+ if (copy_from_user(&log, argp, sizeof(log)))
3863
+ goto out;
3864
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3865
+ break;
3866
+ }
3867
+#endif
32603868 #ifdef CONFIG_KVM_MMIO
32613869 case KVM_REGISTER_COALESCED_MMIO: {
32623870 struct kvm_coalesced_mmio_zone zone;
....@@ -3347,21 +3955,18 @@
33473955 if (routing.flags)
33483956 goto out;
33493957 if (routing.nr) {
3350
- r = -ENOMEM;
3351
- entries = vmalloc(array_size(sizeof(*entries),
3352
- routing.nr));
3353
- if (!entries)
3354
- goto out;
3355
- r = -EFAULT;
33563958 urouting = argp;
3357
- if (copy_from_user(entries, urouting->entries,
3358
- routing.nr * sizeof(*entries)))
3359
- goto out_free_irq_routing;
3959
+ entries = vmemdup_user(urouting->entries,
3960
+ array_size(sizeof(*entries),
3961
+ routing.nr));
3962
+ if (IS_ERR(entries)) {
3963
+ r = PTR_ERR(entries);
3964
+ goto out;
3965
+ }
33603966 }
33613967 r = kvm_set_irq_routing(kvm, entries, routing.nr,
33623968 routing.flags);
3363
-out_free_irq_routing:
3364
- vfree(entries);
3969
+ kvfree(entries);
33653970 break;
33663971 }
33673972 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
....@@ -3403,15 +4008,54 @@
34034008 };
34044009 };
34054010
4011
+struct compat_kvm_clear_dirty_log {
4012
+ __u32 slot;
4013
+ __u32 num_pages;
4014
+ __u64 first_page;
4015
+ union {
4016
+ compat_uptr_t dirty_bitmap; /* one bit per page */
4017
+ __u64 padding2;
4018
+ };
4019
+};
4020
+
4021
+long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
4022
+ unsigned long arg)
4023
+{
4024
+ return -ENOTTY;
4025
+}
4026
+
34064027 static long kvm_vm_compat_ioctl(struct file *filp,
34074028 unsigned int ioctl, unsigned long arg)
34084029 {
34094030 struct kvm *kvm = filp->private_data;
34104031 int r;
34114032
3412
- if (kvm->mm != current->mm)
4033
+ if (kvm->mm != current->mm || kvm->vm_bugged)
34134034 return -EIO;
4035
+
4036
+ r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
4037
+ if (r != -ENOTTY)
4038
+ return r;
4039
+
34144040 switch (ioctl) {
4041
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4042
+ case KVM_CLEAR_DIRTY_LOG: {
4043
+ struct compat_kvm_clear_dirty_log compat_log;
4044
+ struct kvm_clear_dirty_log log;
4045
+
4046
+ if (copy_from_user(&compat_log, (void __user *)arg,
4047
+ sizeof(compat_log)))
4048
+ return -EFAULT;
4049
+ log.slot = compat_log.slot;
4050
+ log.num_pages = compat_log.num_pages;
4051
+ log.first_page = compat_log.first_page;
4052
+ log.padding2 = compat_log.padding2;
4053
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4054
+
4055
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4056
+ break;
4057
+ }
4058
+#endif
34154059 case KVM_GET_DIRTY_LOG: {
34164060 struct compat_kvm_dirty_log compat_log;
34174061 struct kvm_dirty_log log;
....@@ -3749,6 +4393,7 @@
37494393 r = __kvm_io_bus_write(vcpu, bus, &range, val);
37504394 return r < 0 ? r : 0;
37514395 }
4396
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
37524397
37534398 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
37544399 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
....@@ -3799,7 +4444,6 @@
37994444
38004445 return -EOPNOTSUPP;
38014446 }
3802
-EXPORT_SYMBOL_GPL(kvm_io_bus_write);
38034447
38044448 /* kvm_io_bus_read - called under kvm->slots_lock */
38054449 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
....@@ -3821,7 +4465,6 @@
38214465 return r < 0 ? r : 0;
38224466 }
38234467
3824
-
38254468 /* Caller must hold slots_lock. */
38264469 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
38274470 int len, struct kvm_io_device *dev)
....@@ -3838,8 +4481,8 @@
38384481 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
38394482 return -ENOSPC;
38404483
3841
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
3842
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4484
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4485
+ GFP_KERNEL_ACCOUNT);
38434486 if (!new_bus)
38444487 return -ENOMEM;
38454488
....@@ -3866,15 +4509,15 @@
38664509 }
38674510
38684511 /* Caller must hold slots_lock. */
3869
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3870
- struct kvm_io_device *dev)
4512
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4513
+ struct kvm_io_device *dev)
38714514 {
38724515 int i, j;
38734516 struct kvm_io_bus *new_bus, *bus;
38744517
38754518 bus = kvm_get_bus(kvm, bus_idx);
38764519 if (!bus)
3877
- return;
4520
+ return 0;
38784521
38794522 for (i = 0; i < bus->dev_count; i++)
38804523 if (bus->range[i].dev == dev) {
....@@ -3882,16 +4525,22 @@
38824525 }
38834526
38844527 if (i == bus->dev_count)
3885
- return;
4528
+ return 0;
38864529
3887
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
3888
- sizeof(struct kvm_io_range)), GFP_KERNEL);
4530
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4531
+ GFP_KERNEL_ACCOUNT);
38894532 if (new_bus) {
3890
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4533
+ memcpy(new_bus, bus, struct_size(bus, range, i));
38914534 new_bus->dev_count--;
38924535 memcpy(new_bus->range + i, bus->range + i + 1,
3893
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
3894
- } else {
4536
+ flex_array_size(new_bus, range, new_bus->dev_count - i));
4537
+ }
4538
+
4539
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4540
+ synchronize_srcu_expedited(&kvm->srcu);
4541
+
4542
+ /* Destroy the old bus _after_ installing the (null) bus. */
4543
+ if (!new_bus) {
38954544 pr_err("kvm: failed to shrink bus, removing it completely\n");
38964545 for (j = 0; j < bus->dev_count; j++) {
38974546 if (j == i)
....@@ -3900,10 +4549,8 @@
39004549 }
39014550 }
39024551
3903
- rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
3904
- synchronize_srcu_expedited(&kvm->srcu);
39054552 kfree(bus);
3906
- return;
4553
+ return new_bus ? 0 : -ENOMEM;
39074554 }
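The rework above is careful about ordering: the shrunk (or NULL) bus is published and an SRCU grace period elapses before the old bus and, in the failure case, its remaining devices are torn down. The same publish-then-reclaim pattern in isolation (illustrative sketch with a hypothetical demo_bus pointer; only the RCU/SRCU primitives are real API):

/* Illustrative sketch, not part of the patch; assumes <linux/srcu.h>,
 * <linux/rcupdate.h> and <linux/slab.h>.  demo_bus/demo_bus_ptr are made up. */
struct demo_bus { int dev_count; };
static struct demo_bus __rcu *demo_bus_ptr;

static void demo_replace_bus(struct srcu_struct *srcu, struct demo_bus *new_bus)
{
        struct demo_bus *old_bus = rcu_dereference_protected(demo_bus_ptr, 1);

        rcu_assign_pointer(demo_bus_ptr, new_bus);      /* publish, may be NULL */
        synchronize_srcu_expedited(srcu);               /* wait out SRCU readers */
        kfree(old_bus);                                 /* old copy now unreachable */
}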
39084555
39094556 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
....@@ -3948,8 +4595,9 @@
39484595 return -ENOENT;
39494596
39504597 if (simple_attr_open(inode, file, get,
3951
- stat_data->mode & S_IWUGO ? set : NULL,
3952
- fmt)) {
4598
+ KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4599
+ ? set : NULL,
4600
+ fmt)) {
39534601 kvm_put_kvm(stat_data->kvm);
39544602 return -ENOMEM;
39554603 }
....@@ -3968,105 +4616,111 @@
39684616 return 0;
39694617 }
39704618
3971
-static int vm_stat_get_per_vm(void *data, u64 *val)
4619
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
39724620 {
3973
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3974
-
3975
- *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
4621
+ *val = *(ulong *)((void *)kvm + offset);
39764622
39774623 return 0;
39784624 }
39794625
3980
-static int vm_stat_clear_per_vm(void *data, u64 val)
4626
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
39814627 {
3982
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3983
-
3984
- if (val)
3985
- return -EINVAL;
3986
-
3987
- *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
4628
+ *(ulong *)((void *)kvm + offset) = 0;
39884629
39894630 return 0;
39904631 }
39914632
3992
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
3993
-{
3994
- __simple_attr_check_format("%llu\n", 0ull);
3995
- return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
3996
- vm_stat_clear_per_vm, "%llu\n");
3997
-}
3998
-
3999
-static const struct file_operations vm_stat_get_per_vm_fops = {
4000
- .owner = THIS_MODULE,
4001
- .open = vm_stat_get_per_vm_open,
4002
- .release = kvm_debugfs_release,
4003
- .read = simple_attr_read,
4004
- .write = simple_attr_write,
4005
- .llseek = no_llseek,
4006
-};
4007
-
4008
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
4633
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
40094634 {
40104635 int i;
4011
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40124636 struct kvm_vcpu *vcpu;
40134637
40144638 *val = 0;
40154639
4016
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4017
- *val += *(u64 *)((void *)vcpu + stat_data->offset);
4640
+ kvm_for_each_vcpu(i, vcpu, kvm)
4641
+ *val += *(u64 *)((void *)vcpu + offset);
40184642
40194643 return 0;
40204644 }
40214645
4022
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
4646
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
40234647 {
40244648 int i;
4025
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40264649 struct kvm_vcpu *vcpu;
4650
+
4651
+ kvm_for_each_vcpu(i, vcpu, kvm)
4652
+ *(u64 *)((void *)vcpu + offset) = 0;
4653
+
4654
+ return 0;
4655
+}
4656
+
4657
+static int kvm_stat_data_get(void *data, u64 *val)
4658
+{
4659
+ int r = -EFAULT;
4660
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4661
+
4662
+ switch (stat_data->dbgfs_item->kind) {
4663
+ case KVM_STAT_VM:
4664
+ r = kvm_get_stat_per_vm(stat_data->kvm,
4665
+ stat_data->dbgfs_item->offset, val);
4666
+ break;
4667
+ case KVM_STAT_VCPU:
4668
+ r = kvm_get_stat_per_vcpu(stat_data->kvm,
4669
+ stat_data->dbgfs_item->offset, val);
4670
+ break;
4671
+ }
4672
+
4673
+ return r;
4674
+}
4675
+
4676
+static int kvm_stat_data_clear(void *data, u64 val)
4677
+{
4678
+ int r = -EFAULT;
4679
+ struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
40274680
40284681 if (val)
40294682 return -EINVAL;
40304683
4031
- kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4032
- *(u64 *)((void *)vcpu + stat_data->offset) = 0;
4684
+ switch (stat_data->dbgfs_item->kind) {
4685
+ case KVM_STAT_VM:
4686
+ r = kvm_clear_stat_per_vm(stat_data->kvm,
4687
+ stat_data->dbgfs_item->offset);
4688
+ break;
4689
+ case KVM_STAT_VCPU:
4690
+ r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4691
+ stat_data->dbgfs_item->offset);
4692
+ break;
4693
+ }
40334694
4034
- return 0;
4695
+ return r;
40354696 }
40364697
4037
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
4698
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
40384699 {
40394700 __simple_attr_check_format("%llu\n", 0ull);
4040
- return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
4041
- vcpu_stat_clear_per_vm, "%llu\n");
4701
+ return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4702
+ kvm_stat_data_clear, "%llu\n");
40424703 }
40434704
4044
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
4045
- .owner = THIS_MODULE,
4046
- .open = vcpu_stat_get_per_vm_open,
4705
+static const struct file_operations stat_fops_per_vm = {
4706
+ .owner = THIS_MODULE,
4707
+ .open = kvm_stat_data_open,
40474708 .release = kvm_debugfs_release,
4048
- .read = simple_attr_read,
4049
- .write = simple_attr_write,
4050
- .llseek = no_llseek,
4051
-};
4052
-
4053
-static const struct file_operations *stat_fops_per_vm[] = {
4054
- [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
4055
- [KVM_STAT_VM] = &vm_stat_get_per_vm_fops,
4709
+ .read = simple_attr_read,
4710
+ .write = simple_attr_write,
4711
+ .llseek = no_llseek,
40564712 };
40574713
40584714 static int vm_stat_get(void *_offset, u64 *val)
40594715 {
40604716 unsigned offset = (long)_offset;
40614717 struct kvm *kvm;
4062
- struct kvm_stat_data stat_tmp = {.offset = offset};
40634718 u64 tmp_val;
40644719
40654720 *val = 0;
40664721 mutex_lock(&kvm_lock);
40674722 list_for_each_entry(kvm, &vm_list, vm_list) {
4068
- stat_tmp.kvm = kvm;
4069
- vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4723
+ kvm_get_stat_per_vm(kvm, offset, &tmp_val);
40704724 *val += tmp_val;
40714725 }
40724726 mutex_unlock(&kvm_lock);
....@@ -4077,15 +4731,13 @@
40774731 {
40784732 unsigned offset = (long)_offset;
40794733 struct kvm *kvm;
4080
- struct kvm_stat_data stat_tmp = {.offset = offset};
40814734
40824735 if (val)
40834736 return -EINVAL;
40844737
40854738 mutex_lock(&kvm_lock);
40864739 list_for_each_entry(kvm, &vm_list, vm_list) {
4087
- stat_tmp.kvm = kvm;
4088
- vm_stat_clear_per_vm((void *)&stat_tmp, 0);
4740
+ kvm_clear_stat_per_vm(kvm, offset);
40894741 }
40904742 mutex_unlock(&kvm_lock);
40914743
....@@ -4098,14 +4750,12 @@
40984750 {
40994751 unsigned offset = (long)_offset;
41004752 struct kvm *kvm;
4101
- struct kvm_stat_data stat_tmp = {.offset = offset};
41024753 u64 tmp_val;
41034754
41044755 *val = 0;
41054756 mutex_lock(&kvm_lock);
41064757 list_for_each_entry(kvm, &vm_list, vm_list) {
4107
- stat_tmp.kvm = kvm;
4108
- vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4758
+ kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
41094759 *val += tmp_val;
41104760 }
41114761 mutex_unlock(&kvm_lock);
....@@ -4116,15 +4766,13 @@
41164766 {
41174767 unsigned offset = (long)_offset;
41184768 struct kvm *kvm;
4119
- struct kvm_stat_data stat_tmp = {.offset = offset};
41204769
41214770 if (val)
41224771 return -EINVAL;
41234772
41244773 mutex_lock(&kvm_lock);
41254774 list_for_each_entry(kvm, &vm_list, vm_list) {
4126
- stat_tmp.kvm = kvm;
4127
- vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
4775
+ kvm_clear_stat_per_vcpu(kvm, offset);
41284776 }
41294777 mutex_unlock(&kvm_lock);
41304778
....@@ -4158,7 +4806,7 @@
41584806 active = kvm_active_vms;
41594807 mutex_unlock(&kvm_lock);
41604808
4161
- env = kzalloc(sizeof(*env), GFP_KERNEL);
4809
+ env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
41624810 if (!env)
41634811 return;
41644812
....@@ -4173,8 +4821,8 @@
41734821 }
41744822 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
41754823
4176
- if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4177
- char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
4824
+ if (kvm->debugfs_dentry) {
4825
+ char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
41784826
41794827 if (p) {
41804828 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
....@@ -4197,9 +4845,8 @@
41974845
41984846 kvm_debugfs_num_entries = 0;
41994847 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4200
- int mode = p->mode ? p->mode : 0644;
4201
- debugfs_create_file(p->name, mode, kvm_debugfs_dir,
4202
- (void *)(long)p->offset,
4848
+ debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4849
+ kvm_debugfs_dir, (void *)(long)p->offset,
42034850 stat_fops[p->kind]);
42044851 }
42054852 }
....@@ -4214,7 +4861,9 @@
42144861 static void kvm_resume(void)
42154862 {
42164863 if (kvm_usage_count) {
4217
- WARN_ON(raw_spin_is_locked(&kvm_count_lock));
4864
+#ifdef CONFIG_LOCKDEP
4865
+ WARN_ON(lockdep_is_held(&kvm_count_lock));
4866
+#endif
42184867 hardware_enable_nolock(NULL);
42194868 }
42204869 }
....@@ -4234,11 +4883,11 @@
42344883 {
42354884 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42364885
4237
- if (vcpu->preempted)
4238
- vcpu->preempted = false;
4886
+ WRITE_ONCE(vcpu->preempted, false);
4887
+ WRITE_ONCE(vcpu->ready, false);
42394888
4889
+ __this_cpu_write(kvm_running_vcpu, vcpu);
42404890 kvm_arch_sched_in(vcpu, cpu);
4241
-
42424891 kvm_arch_vcpu_load(vcpu, cpu);
42434892 }
42444893
....@@ -4247,14 +4896,59 @@
42474896 {
42484897 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
42494898
4250
- if (current->state == TASK_RUNNING)
4251
- vcpu->preempted = true;
4899
+ if (current->state == TASK_RUNNING) {
4900
+ WRITE_ONCE(vcpu->preempted, true);
4901
+ WRITE_ONCE(vcpu->ready, true);
4902
+ }
42524903 kvm_arch_vcpu_put(vcpu);
4904
+ __this_cpu_write(kvm_running_vcpu, NULL);
4905
+}
4906
+
4907
+/**
4908
+ * kvm_get_running_vcpu - get the vcpu running on the current CPU.
4909
+ *
4910
+ * We can disable preemption locally around accessing the per-CPU variable,
4911
+ * and use the resolved vcpu pointer after enabling preemption again,
4912
+ * because even if the current thread is migrated to another CPU, reading
4913
+ * the per-CPU value later will give us the same value as we update the
4914
+ * per-CPU variable in the preempt notifier handlers.
4915
+ */
4916
+struct kvm_vcpu *kvm_get_running_vcpu(void)
4917
+{
4918
+ struct kvm_vcpu *vcpu;
4919
+
4920
+ preempt_disable();
4921
+ vcpu = __this_cpu_read(kvm_running_vcpu);
4922
+ preempt_enable();
4923
+
4924
+ return vcpu;
4925
+}
4926
+EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
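kvm_get_running_vcpu() gives arch code a way to resolve the vCPU loaded on the current CPU without threading a vcpu pointer through every callback, e.g. from an interrupt or notifier context. A hypothetical caller (illustrative only; the handler name is made up, kvm_get_running_vcpu() and vcpu->vcpu_id are the real API):

/* Illustrative sketch, not part of the patch. */
static void hypothetical_event_handler(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

        /* NULL when this CPU is not currently running a vCPU. */
        if (!vcpu)
                return;

        pr_debug("event hit while vCPU %d was loaded\n", vcpu->vcpu_id);
}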
4927
+
4928
+/**
4929
+ * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
4930
+ */
4931
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
4932
+{
4933
+ return &kvm_running_vcpu;
4934
+}
4935
+
4936
+struct kvm_cpu_compat_check {
4937
+ void *opaque;
4938
+ int *ret;
4939
+};
4940
+
4941
+static void check_processor_compat(void *data)
4942
+{
4943
+ struct kvm_cpu_compat_check *c = data;
4944
+
4945
+ *c->ret = kvm_arch_check_processor_compat(c->opaque);
42534946 }
42544947
42554948 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
42564949 struct module *module)
42574950 {
4951
+ struct kvm_cpu_compat_check c;
42584952 int r;
42594953 int cpu;
42604954
....@@ -4278,16 +4972,16 @@
42784972 goto out_free_0;
42794973 }
42804974
4281
- r = kvm_arch_hardware_setup();
4975
+ r = kvm_arch_hardware_setup(opaque);
42824976 if (r < 0)
4283
- goto out_free_0a;
4977
+ goto out_free_1;
42844978
4979
+ c.ret = &r;
4980
+ c.opaque = opaque;
42854981 for_each_online_cpu(cpu) {
4286
- smp_call_function_single(cpu,
4287
- kvm_arch_check_processor_compat,
4288
- &r, 1);
4982
+ smp_call_function_single(cpu, check_processor_compat, &c, 1);
42894983 if (r < 0)
4290
- goto out_free_1;
4984
+ goto out_free_2;
42914985 }
42924986
42934987 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
....@@ -4310,19 +5004,21 @@
43105004 goto out_free_3;
43115005 }
43125006
5007
+ for_each_possible_cpu(cpu) {
5008
+ if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5009
+ GFP_KERNEL, cpu_to_node(cpu))) {
5010
+ r = -ENOMEM;
5011
+ goto out_free_4;
5012
+ }
5013
+ }
5014
+
43135015 r = kvm_async_pf_init();
43145016 if (r)
4315
- goto out_free;
5017
+ goto out_free_4;
43165018
43175019 kvm_chardev_ops.owner = module;
43185020 kvm_vm_fops.owner = module;
43195021 kvm_vcpu_fops.owner = module;
4320
-
4321
- r = misc_register(&kvm_dev);
4322
- if (r) {
4323
- pr_err("kvm: misc device register failed\n");
4324
- goto out_unreg;
4325
- }
43265022
43275023 register_syscore_ops(&kvm_syscore_ops);
43285024
....@@ -4332,21 +5028,35 @@
43325028 kvm_init_debug();
43335029
43345030 r = kvm_vfio_ops_init();
4335
- WARN_ON(r);
5031
+ if (WARN_ON_ONCE(r))
5032
+ goto err_vfio;
5033
+
5034
+ /*
5035
+ * Registration _must_ be the very last thing done, as this exposes
5036
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
5037
+ */
5038
+ r = misc_register(&kvm_dev);
5039
+ if (r) {
5040
+ pr_err("kvm: misc device register failed\n");
5041
+ goto err_register;
5042
+ }
43365043
43375044 return 0;
43385045
4339
-out_unreg:
5046
+err_register:
5047
+ kvm_vfio_ops_exit();
5048
+err_vfio:
43405049 kvm_async_pf_deinit();
4341
-out_free:
5050
+out_free_4:
5051
+ for_each_possible_cpu(cpu)
5052
+ free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
43425053 kmem_cache_destroy(kvm_vcpu_cache);
43435054 out_free_3:
43445055 unregister_reboot_notifier(&kvm_reboot_notifier);
43455056 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
43465057 out_free_2:
4347
-out_free_1:
43485058 kvm_arch_hardware_unsetup();
4349
-out_free_0a:
5059
+out_free_1:
43505060 free_cpumask_var(cpus_hardware_enabled);
43515061 out_free_0:
43525062 kvm_irqfd_exit();
....@@ -4359,8 +5069,18 @@
43595069
43605070 void kvm_exit(void)
43615071 {
4362
- debugfs_remove_recursive(kvm_debugfs_dir);
5072
+ int cpu;
5073
+
5074
+ /*
5075
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
5076
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
5077
+ * to KVM while the module is being stopped.
5078
+ */
43635079 misc_deregister(&kvm_dev);
5080
+
5081
+ debugfs_remove_recursive(kvm_debugfs_dir);
5082
+ for_each_possible_cpu(cpu)
5083
+ free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
43645084 kmem_cache_destroy(kvm_vcpu_cache);
43655085 kvm_async_pf_deinit();
43665086 unregister_syscore_ops(&kvm_syscore_ops);