2024-01-31 f70575805708cabdedea7498aaa3f710fde4d920
kernel/virt/kvm/kvm_main.c
@@ -154,6 +154,8 @@
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
+static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
+
 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                                                    unsigned long start, unsigned long end)
 {
@@ -248,9 +250,13 @@
 {
 }
 
-static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
+static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
 {
-        if (unlikely(!cpus))
+        const struct cpumask *cpus;
+
+        if (likely(cpumask_available(tmp)))
+                cpus = tmp;
+        else
                 cpus = cpu_online_mask;
 
         if (cpumask_empty(cpus))
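
Note: kvm_kick_many_cpus() now takes the scratch cpumask_var_t itself and falls back to cpu_online_mask when the mask was never allocated. Below is a minimal sketch of the cpumask_var_t contract this relies on, using only the stock <linux/cpumask.h> API; the helper name and callback are invented for illustration and are not part of the patch. With CONFIG_CPUMASK_OFFSTACK=y the type is a pointer that must be allocated (so cpumask_available() can return false); otherwise it is an embedded array and allocation/availability always succeed.

/* Illustrative only: example_with_scratch_mask() and fn are not in the patch. */
#include <linux/cpumask.h>
#include <linux/gfp.h>

static void example_with_scratch_mask(void (*fn)(const struct cpumask *))
{
        cpumask_var_t tmp;
        const struct cpumask *cpus;

        /* Allocation can only fail when CONFIG_CPUMASK_OFFSTACK=y. */
        if (likely(zalloc_cpumask_var(&tmp, GFP_KERNEL)))
                cpus = tmp;
        else
                cpus = cpu_online_mask;  /* same fallback as kvm_kick_many_cpus() */

        fn(cpus);

        /* No-op without OFFSTACK, kfree(NULL)-safe if allocation failed. */
        free_cpumask_var(tmp);
}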
@@ -260,30 +266,57 @@
         return true;
 }
 
+static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                                  unsigned int req, cpumask_var_t tmp,
+                                  int current_cpu)
+{
+        int cpu;
+
+        kvm_make_request(req, vcpu);
+
+        if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+                return;
+
+        /*
+         * tmp can be "unavailable" if cpumasks are allocated off stack as
+         * allocation of the mask is deliberately not fatal and is handled by
+         * falling back to kicking all online CPUs.
+         */
+        if (!cpumask_available(tmp))
+                return;
+
+        /*
+         * Note, the vCPU could get migrated to a different pCPU at any point
+         * after kvm_request_needs_ipi(), which could result in sending an IPI
+         * to the previous pCPU.  But, that's OK because the purpose of the IPI
+         * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
+         * satisfied if the vCPU migrates.  Entering READING_SHADOW_PAGE_TABLES
+         * after this point is also OK, as the requirement is only that KVM wait
+         * for vCPUs that were reading SPTEs _before_ any changes were
+         * finalized.  See kvm_vcpu_kick() for more details on handling requests.
+         */
+        if (kvm_request_needs_ipi(vcpu, req)) {
+                cpu = READ_ONCE(vcpu->cpu);
+                if (cpu != -1 && cpu != current_cpu)
+                        __cpumask_set_cpu(cpu, tmp);
+        }
+}
+
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
                                  struct kvm_vcpu *except,
                                  unsigned long *vcpu_bitmap, cpumask_var_t tmp)
 {
-        int i, cpu, me;
         struct kvm_vcpu *vcpu;
+        int i, me;
         bool called;
 
         me = get_cpu();
 
-        kvm_for_each_vcpu(i, vcpu, kvm) {
-                if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
-                    vcpu == except)
+        for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
+                vcpu = kvm_get_vcpu(kvm, i);
+                if (!vcpu || vcpu == except)
                         continue;
-
-                kvm_make_request(req, vcpu);
-                cpu = vcpu->cpu;
-
-                if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
-                        continue;
-
-                if (tmp != NULL && cpu != -1 && cpu != me &&
-                    kvm_request_needs_ipi(vcpu, req))
-                        __cpumask_set_cpu(cpu, tmp);
+                kvm_make_vcpu_request(kvm, vcpu, req, tmp, me);
         }
 
         called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
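
Note: the per-vCPU work is now factored into kvm_make_vcpu_request(), which preserves the original ordering: set the request bit, attempt a wakeup, and only then record the target pCPU for an IPI. For context, here is a hedged sketch of the consumer side that the kick/IPI is aimed at, assuming only the generic request helpers from <linux/kvm_host.h>; handle_tlb_flush() is a placeholder, not a real KVM function.

#include <linux/kvm_host.h>

static void handle_tlb_flush(struct kvm_vcpu *vcpu) { }  /* placeholder */

/*
 * Before (re)entering the guest, the vCPU's run loop tests pending request
 * bits; kvm_check_request() returns true and clears the bit if it was set.
 */
static void example_service_requests(struct kvm_vcpu *vcpu)
{
        if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                handle_tlb_flush(vcpu);
}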
@@ -295,14 +328,25 @@
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
                                       struct kvm_vcpu *except)
 {
-        cpumask_var_t cpus;
+        struct kvm_vcpu *vcpu;
+        struct cpumask *cpus;
         bool called;
+        int i, me;
 
-        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+        me = get_cpu();
 
-        called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
+        cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
+        cpumask_clear(cpus);
 
-        free_cpumask_var(cpus);
+        kvm_for_each_vcpu(i, vcpu, kvm) {
+                if (vcpu == except)
+                        continue;
+                kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
+        }
+
+        called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
+        put_cpu();
+
         return called;
 }
 
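
Note: kvm_make_all_cpus_request_except() no longer does a GFP_ATOMIC zalloc_cpumask_var() per request; it borrows the pre-allocated per-CPU cpu_kick_mask instead. This only works because get_cpu() disables preemption, so the task cannot migrate off the CPU whose mask it is using until put_cpu(). A generic sketch of that idiom, assuming a DEFINE_PER_CPU(cpumask_var_t, ...) that was allocated at init; the function name is illustrative.

static void example_borrow_percpu_mask(void)
{
        struct cpumask *cpus;

        get_cpu();                                       /* disable preemption */
        cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);  /* this CPU's scratch mask */
        cpumask_clear(cpus);                             /* drop bits from the last user */

        /* ... populate cpus and act on it, e.g. smp_call_function_many() ... */

        put_cpu();                                       /* mask may be reused after this */
}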
@@ -2937,16 +2981,24 @@
  */
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-        int me;
-        int cpu = vcpu->cpu;
+        int me, cpu;
 
         if (kvm_vcpu_wake_up(vcpu))
                 return;
 
+        /*
+         * Note, the vCPU could get migrated to a different pCPU at any point
+         * after kvm_arch_vcpu_should_kick(), which could result in sending an
+         * IPI to the previous pCPU.  But, that's ok because the purpose of the
+         * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
+         * vCPU also requires it to leave IN_GUEST_MODE.
+         */
         me = get_cpu();
-        if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-                if (kvm_arch_vcpu_should_kick(vcpu))
+        if (kvm_arch_vcpu_should_kick(vcpu)) {
+                cpu = READ_ONCE(vcpu->cpu);
+                if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
                         smp_send_reschedule(cpu);
+        }
         put_cpu();
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
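
Note: kvm_vcpu_kick() now asks kvm_arch_vcpu_should_kick() first and only then samples vcpu->cpu with READ_ONCE(), since the field can change concurrently; as the added comment explains, an IPI to a stale pCPU is benign. A short sketch of the usual producer-side pairing this serves, using the generic KVM_REQ_TLB_FLUSH bit purely for illustration; the wrapper name is hypothetical.

#include <linux/kvm_host.h>

/* Publish a request, then make sure the vCPU leaves guest mode to see it. */
static void example_request_and_kick(struct kvm_vcpu *vcpu)
{
        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
        kvm_vcpu_kick(vcpu);
}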
@@ -4952,19 +5004,21 @@
                 goto out_free_3;
         }
 
+        for_each_possible_cpu(cpu) {
+                if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
+                                            GFP_KERNEL, cpu_to_node(cpu))) {
+                        r = -ENOMEM;
+                        goto out_free_4;
+                }
+        }
+
         r = kvm_async_pf_init();
         if (r)
-                goto out_free;
+                goto out_free_4;
 
         kvm_chardev_ops.owner = module;
         kvm_vm_fops.owner = module;
         kvm_vcpu_fops.owner = module;
-
-        r = misc_register(&kvm_dev);
-        if (r) {
-                pr_err("kvm: misc device register failed\n");
-                goto out_unreg;
-        }
 
         register_syscore_ops(&kvm_syscore_ops);
 
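
Note: kvm_init() now allocates one kick mask per possible CPU, on that CPU's memory node; like the other cpumask_var_t allocators, alloc_cpumask_var_node() is a constant-true no-op when CONFIG_CPUMASK_OFFSTACK is not set. The unwind path (out_free_4, next hunk) can then blindly free every possible CPU's entry, because never-allocated slots are zero-initialized per-CPU data and free_cpumask_var() tolerates that. A small sketch of that cleanup, mirroring the patch; the function name is made up.

static void example_free_all_kick_masks(void)
{
        int cpu;

        /* Safe on partial failure: freeing a NULL (or static) mask is a no-op. */
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
}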
@@ -4974,13 +5028,28 @@
         kvm_init_debug();
 
         r = kvm_vfio_ops_init();
-        WARN_ON(r);
+        if (WARN_ON_ONCE(r))
+                goto err_vfio;
+
+        /*
+         * Registration _must_ be the very last thing done, as this exposes
+         * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+         */
+        r = misc_register(&kvm_dev);
+        if (r) {
+                pr_err("kvm: misc device register failed\n");
+                goto err_register;
+        }
 
         return 0;
 
-out_unreg:
+err_register:
+        kvm_vfio_ops_exit();
+err_vfio:
         kvm_async_pf_deinit();
-out_free:
+out_free_4:
+        for_each_possible_cpu(cpu)
+                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
         kmem_cache_destroy(kvm_vcpu_cache);
 out_free_3:
         unregister_reboot_notifier(&kvm_reboot_notifier);
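
Note: registering /dev/kvm is deliberately moved to the very last step so userspace cannot open the device before the rest of kvm_init() has finished, and the new err_register/err_vfio labels unwind strictly in reverse order of setup. A generic sketch of that init/unwind idiom; every name here is a placeholder, not a KVM function.

static int setup_backend(void)       { return 0; }  /* placeholder */
static void teardown_backend(void)   { }             /* placeholder */
static int expose_to_userspace(void) { return 0; }  /* placeholder, must be last */

static int example_init(void)
{
        int r;

        r = setup_backend();
        if (r)
                return r;

        /* Expose the user-visible interface only once everything else is ready. */
        r = expose_to_userspace();
        if (r)
                goto err_backend;

        return 0;

err_backend:
        teardown_backend();
        return r;
}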
@@ -5000,8 +5069,18 @@
 
 void kvm_exit(void)
 {
-        debugfs_remove_recursive(kvm_debugfs_dir);
+        int cpu;
+
+        /*
+         * Note, unregistering /dev/kvm doesn't strictly need to come first,
+         * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+         * to KVM while the module is being stopped.
+         */
         misc_deregister(&kvm_dev);
+
+        debugfs_remove_recursive(kvm_debugfs_dir);
+        for_each_possible_cpu(cpu)
+                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
         kmem_cache_destroy(kvm_vcpu_cache);
         kvm_async_pf_deinit();
         unregister_syscore_ops(&kvm_syscore_ops);
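
Note: the comment added to kvm_exit() relies on the chardev's fops owner: opening /dev/kvm goes through fops_get(), i.e. try_module_get() on the owning module, so no new references to KVM can be taken while the module is going away. A minimal sketch of the registration side that makes this work; the device name and ioctl handler are invented for illustration, while the .owner/.fops wiring mirrors what kvm_init() sets up.

#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>

static long example_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
        return -ENOTTY;  /* placeholder */
}

static const struct file_operations example_fops = {
        .owner          = THIS_MODULE,   /* what fops_get() pins on open() */
        .unlocked_ioctl = example_ioctl,
};

static struct miscdevice example_dev = {
        .minor = MISC_DYNAMIC_MINOR,
        .name  = "example",
        .fops  = &example_fops,
};

/* misc_register(&example_dev) would expose /dev/example to userspace. */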