// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <asm-generic/errno-base.h>
#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
#include "../../arch/x86/kvm/cachepc/track.h"

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

#include "../../arch/x86/kvm/cachepc/kvm.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
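
/*
 * Illustrative example (behaviour is implemented by the grow/shrink helpers
 * later in this file; the numbers are only a sample): with
 * halt_poll_ns_grow = 2 and halt_poll_ns_grow_start = 10000, a vCPU's polling
 * window grows 0 -> 10000 -> 20000 -> 40000 ns each time KVM decides to grow
 * it, bounded by the per-VM maximum.  With halt_poll_ns_shrink = 0 (the
 * default), an unsuccessful poll resets the window to 0 instead of dividing it.
 */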

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static struct file_operations kvm_chardev_ops;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_flush, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu == except)
			continue;
		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	void *obj;

	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif
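
/*
 * Illustrative usage sketch (not from this file): callers typically fill the
 * cache from a sleepable context and then consume it while holding mmu_lock,
 * where blocking allocations are not allowed, e.g.:
 *
 *	r = kvm_mmu_topup_memory_cache(&cache, min_objects);
 *	if (r)
 *		return r;
 *	...
 *	KVM_MMU_LOCK(kvm);
 *	obj = kvm_mmu_memory_cache_alloc(&cache);
 *	...
 *	KVM_MMU_UNLOCK(kvm);
 *
 * "cache" and "min_objects" stand in for an arch-owned
 * struct kvm_mmu_memory_cache and its required minimum fill level.
 */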

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);
	kvm_dirty_ring_free(&vcpu->dirty_ring);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
			     unsigned long end);

struct kvm_hva_range {
	unsigned long start;
	unsigned long end;
	pte_t pte;
	hva_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	      \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							      \
	     node = interval_tree_iter_next(node, start, last))	      \

static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
						  const struct kvm_hva_range *range)
{
	bool ret = false, locked = false;
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return 0;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max(range->start, slot->userspace_addr);
			hva_end = min(range->end, slot->userspace_addr +
						  (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.pte = range->pte;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!locked) {
				locked = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm, range->start, range->end);
				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && ret)
		kvm_flush_remote_tlbs(kvm);

	if (locked)
		KVM_MMU_UNLOCK(kvm);

	srcu_read_unlock(&kvm->srcu, idx);

	/* The notifiers are averse to booleans. :-( */
	return (int)ret;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						pte_t pte,
						hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= pte,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= __pte(0),
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_notifier_count is zero, then no in-progress invalidations,
	 * including this one, found a relevant memslot at start(); rechecking
	 * memslots here is unnecessary.  Note, a false positive (count elevated
	 * by a different invalidation) is sub-optimal but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_notifier_count))
		return;

	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
}

void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	if (likely(kvm->mmu_notifier_count == 1)) {
		kvm->mmu_notifier_range_start = start;
		kvm->mmu_notifier_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns. Keep things simple and just find the minimal range
		 * which includes the current and new ranges. As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_notifier_range_start =
			min(kvm->mmu_notifier_range_start, start);
		kvm->mmu_notifier_range_end =
			max(kvm->mmu_notifier_range_end, end);
	}
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= kvm_unmap_gfn_range,
		.on_lock	= kvm_inc_notifier_count,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_notifier_count
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
					  hva_range.may_block);

	__kvm_handle_hva_range(kvm, &hva_range);

	return 0;
}

void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_dec_notifier_count,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	wake = (--kvm->mn_active_invalidate_count == 0);
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct hlist_node *idnode;
	struct kvm_memory_slot *memslot;
	int bkt;

	/*
	 * The same memslot objects live in both active and inactive sets,
	 * arbitrarily free using index '1' so the second invocation of this
	 * function isn't operating over a structure with dangling pointers
	 * (even though this function isn't actually touching them).
	 */
	if (!slots->node_idx)
		return;

	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
		kvm_free_memslot(kvm, memslot);
}

static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
	case KVM_STATS_TYPE_INSTANT:
		return 0444;
	case KVM_STATS_TYPE_CUMULATIVE:
	case KVM_STATS_TYPE_PEAK:
	default:
		return 0644;
	}
}


static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (IS_ERR(kvm->debugfs_dentry))
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
	static DEFINE_MUTEX(kvm_debugfs_lock);
	struct dentry *dent;
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	const struct _kvm_stats_desc *pdesc;
	int i, ret;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
	mutex_lock(&kvm_debugfs_lock);
	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
	if (dent) {
		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
		dput(dent);
		mutex_unlock(&kvm_debugfs_lock);
		return 0;
	}
	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
	mutex_unlock(&kvm_debugfs_lock);
	if (IS_ERR(dent))
		return 0;

	kvm->debugfs_dentry = dent;
	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		return -ENOMEM;

	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
		pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VM;
		kvm->debugfs_stat_data[i] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
		pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VCPU;
		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	ret = kvm_arch_create_vm_debugfs(kvm);
	if (ret) {
		kvm_destroy_vm_debugfs(kvm);
		return i;
	}

	return 0;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
 * be setup already, so we can create arch-specific debugfs entries under it.
 * Cleanup is done automatically in kvm_destroy_vm_debugfs() recursively, so
 * a per-arch destroy interface is not needed.
 */
int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
	return 0;
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	struct kvm_memslots *slots;
	int r = -ENOMEM;
	int i, j;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	KVM_MMU_LOCK_INIT(kvm);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	mutex_init(&kvm->slots_arch_lock);
	spin_lock_init(&kvm->mn_invalidate_lock);
	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
	xa_init(&kvm->vcpu_array);

	INIT_LIST_HEAD(&kvm->gpc_list);
	spin_lock_init(&kvm->gpc_lock);

	INIT_LIST_HEAD(&kvm->devices);
	kvm->max_vcpus = KVM_MAX_VCPUS;

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	/*
	 * Force subsequent debugfs file creations to fail if the VM directory
	 * is not created (by kvm_create_vm_debugfs()).
	 */
	kvm->debugfs_dentry = ERR_PTR(-ENOENT);

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for (j = 0; j < 2; j++) {
			slots = &kvm->__memslots[i][j];

			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
			slots->hva_tree = RB_ROOT_CACHED;
			slots->gfn_tree = RB_ROOT;
			hash_init(slots->id_hash);
			slots->node_idx = j;

			/* Generations must be different for each address space. */
			slots->generation = i;
		}

		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	kvm->max_halt_poll_ns = halt_poll_ns;

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();
	kvm_init_pm_notifier(kvm);

	/*
	 * When the fd passed to this ioctl() is opened it pins the module,
	 * but try_module_get() also prevents getting a reference if the module
	 * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
	 */
	if (!try_module_get(kvm_chardev_ops.owner)) {
		r = -ENODEV;
		goto out_err;
	}

	return kvm;

out_err:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_destroy_pm_notifier(kvm);
	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
	/*
	 * At this point, pending calls to invalidate_range_start()
	 * have completed but no more MMU notifiers will run, so
	 * mn_active_invalidate_count may remain unbalanced.
	 * No threads can be waiting in install_new_memslots as the
	 * last reference on KVM has been dropped, but freeing
	 * memslots would deadlock without this manual intervention.
	 */
	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
	kvm->mn_active_invalidate_count = 0;
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
	}
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
	module_put(kvm_chardev_ops.owner);

	if (main_vm == kvm)
		main_vm = NULL;
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Make sure the vm is not during destruction, which is a safe version of
 * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
	return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}

static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
	int node_idx_inactive = active->node_idx ^ 1;

	return &kvm->__memslots[as_id][node_idx_inactive];
}

/*
 * Helper to get the address space ID when one of memslot pointers may be NULL.
 * This also serves as a sanity check that at least one of the pointers is
 * non-NULL, and that their address space IDs don't diverge.
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
				  struct kvm_memory_slot *b)
{
	if (WARN_ON_ONCE(!a && !b))
		return 0;

	if (!a)
		return b->as_id;
	if (!b)
		return a->as_id;

	WARN_ON_ONCE(a->as_id != b->as_id);
	return a->as_id;
}

static void kvm_insert_gfn_node(struct kvm_memslots *slots,
				struct kvm_memory_slot *slot)
{
	struct rb_root *gfn_tree = &slots->gfn_tree;
	struct rb_node **node, *parent;
	int idx = slots->node_idx;

	parent = NULL;
	for (node = &gfn_tree->rb_node; *node; ) {
		struct kvm_memory_slot *tmp;

		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
		parent = *node;
		if (slot->base_gfn < tmp->base_gfn)
			node = &(*node)->rb_left;
		else if (slot->base_gfn > tmp->base_gfn)
			node = &(*node)->rb_right;
		else
			BUG();
	}

	rb_link_node(&slot->gfn_node[idx], parent, node);
	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
}

static void kvm_erase_gfn_node(struct kvm_memslots *slots,
			       struct kvm_memory_slot *slot)
{
	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
}

static void kvm_replace_gfn_node(struct kvm_memslots *slots,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int idx = slots->node_idx;

	WARN_ON_ONCE(old->base_gfn != new->base_gfn);

	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
			&slots->gfn_tree);
}

/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * If @new is non-NULL its hva_node[slots_idx] range has to be set
 * appropriately.
 */
static void kvm_replace_memslot(struct kvm *kvm,
				struct kvm_memory_slot *old,
				struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
	int idx = slots->node_idx;

	if (old) {
		hash_del(&old->id_node[idx]);
		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

		if ((long)old == atomic_long_read(&slots->last_used_slot))
			atomic_long_set(&slots->last_used_slot, (long)new);

		if (!new) {
			kvm_erase_gfn_node(slots, old);
			return;
		}
	}

	/*
	 * Initialize @new's hva range.  Do this even when replacing an @old
	 * slot, kvm_copy_memslot() deliberately does not touch node data.
	 */
	new->hva_node[idx].start = new->userspace_addr;
	new->hva_node[idx].last = new->userspace_addr +
				  (new->npages << PAGE_SHIFT) - 1;

	/*
	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
	 * hva_node needs to be swapped with remove+insert even though hva can't
	 * change when replacing an existing slot.
	 */
	hash_add(slots->id_hash, &new->id_node[idx], new->id);
	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

	/*
	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
	 * switch the node in the gfn tree instead of removing the old and
	 * inserting the new as two separate operations. Replacement is a
	 * single O(1) operation versus two O(log(n)) operations for
	 * remove+insert.
	 */
	if (old && old->base_gfn == new->base_gfn) {
		kvm_replace_gfn_node(slots, old, new);
	} else {
		if (old)
			kvm_erase_gfn_node(slots, old);
		kvm_insert_gfn_node(slots, new);
	}
}

static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);

	/* Grab the generation from the active memslots. */
	u64 gen = __kvm_memslots(kvm, as_id)->generation;

	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Do not store the new memslots while there are invalidations in
	 * progress, otherwise the locking in invalidate_range_start and
	 * invalidate_range_end will be unbalanced.
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
	while (kvm->mn_active_invalidate_count) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&kvm->mn_invalidate_lock);
		schedule();
		spin_lock(&kvm->mn_invalidate_lock);
	}
	finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
	rcu_assign_pointer(kvm->memslots[as_id], slots);
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Acquired in kvm_set_memslot. Must be released before synchronize
	 * SRCU below in order to avoid deadlock with another thread
	 * acquiring the slots_arch_lock in an srcu critical section.
	 */
	mutex_unlock(&kvm->slots_arch_lock);

	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not need
	 * a global counter for that, instead the generation space is evenly split
	 * across address spaces.  For example, with two address spaces, address
	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
	 * use generations 1, 3, 5, ...
	 */
	gen += KVM_ADDRESS_SPACE_NUM;

	kvm_arch_memslots_updated(kvm, gen);

	slots->generation = gen;
}

static int kvm_prepare_memory_region(struct kvm *kvm,
				     const struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
	int r;

	/*
	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
	 * will be freed on "commit".  If logging is enabled in both old and
	 * new, reuse the existing bitmap.  If logging is enabled only in the
	 * new and KVM isn't using a ring buffer, allocate and initialize a
	 * new bitmap.
	 */
	if (change != KVM_MR_DELETE) {
		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
			new->dirty_bitmap = NULL;
		else if (old && old->dirty_bitmap)
			new->dirty_bitmap = old->dirty_bitmap;
		else if (!kvm->dirty_ring_size) {
			r = kvm_alloc_dirty_bitmap(new);
			if (r)
				return r;

			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
				bitmap_set(new->dirty_bitmap, 0, new->npages);
		}
	}

	r = kvm_arch_prepare_memory_region(kvm, old, new, change);

	/* Free the bitmap on failure if it was allocated above. */
	if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
		kvm_destroy_dirty_bitmap(new);

	return r;
}

static void kvm_commit_memory_region(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     const struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
	/*
	 * Update the total number of memslot pages before calling the arch
	 * hook so that architectures can consume the result directly.
	 */
	if (change == KVM_MR_DELETE)
		kvm->nr_memslot_pages -= old->npages;
	else if (change == KVM_MR_CREATE)
		kvm->nr_memslot_pages += new->npages;

	kvm_arch_commit_memory_region(kvm, old, new, change);

	switch (change) {
	case KVM_MR_CREATE:
		/* Nothing more to do. */
		break;
	case KVM_MR_DELETE:
		/* Free the old memslot and all its metadata. */
		kvm_free_memslot(kvm, old);
		break;
	case KVM_MR_MOVE:
	case KVM_MR_FLAGS_ONLY:
		/*
		 * Free the dirty bitmap as needed; the below check encompasses
		 * both the flags and whether a ring buffer is being used.
		 */
		if (old->dirty_bitmap && !new->dirty_bitmap)
			kvm_destroy_dirty_bitmap(old);

		/*
		 * The final quirk.  Free the detached, old slot, but only its
		 * memory, not any metadata.  Metadata, including arch specific
		 * data, may be reused by @new.
		 */
		kfree(old);
		break;
	default:
		BUG();
	}
}

/*
 * Activate @new, which must be installed in the inactive slots by the caller,
 * by swapping the active slots and then propagating @new to @old once @old is
 * unreachable and can be safely modified.
 *
 * With NULL @old this simply adds @new to @active (while swapping the sets).
 * With NULL @new this simply removes @old from @active and frees it
 * (while also swapping the sets).
 */
static void kvm_activate_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);

	kvm_swap_active_memslots(kvm, as_id);

	/* Propagate the new memslot to the now inactive memslots. */
	kvm_replace_memslot(kvm, old, new);
}

static void kvm_copy_memslot(struct kvm_memory_slot *dest,
			     const struct kvm_memory_slot *src)
{
	dest->base_gfn = src->base_gfn;
	dest->npages = src->npages;
	dest->dirty_bitmap = src->dirty_bitmap;
	dest->arch = src->arch;
	dest->userspace_addr = src->userspace_addr;
	dest->flags = src->flags;
	dest->id = src->id;
	dest->as_id = src->as_id;
}

static void kvm_invalidate_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Mark the current slot INVALID.  As with all memslot modifications,
	 * this must be done on an unreachable slot to avoid modifying the
	 * current slot in the active tree.
	 */
	kvm_copy_memslot(invalid_slot, old);
	invalid_slot->flags |= KVM_MEMSLOT_INVALID;
	kvm_replace_memslot(kvm, old, invalid_slot);

	/*
	 * Activate the slot that is now marked INVALID, but don't propagate
	 * the slot to the now inactive slots. The slot is either going to be
	 * deleted or recreated as a new slot.
	 */
	kvm_swap_active_memslots(kvm, old->as_id);

	/*
	 * From this point no new shadow pages pointing to a deleted, or moved,
	 * memslot will be created.  Validation of sp->gfn happens in:
	 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
	 *	- kvm_is_visible_gfn (mmu_check_root)
	 */
	kvm_arch_flush_shadow_memslot(kvm, old);

	/* Was released by kvm_swap_active_memslots, reacquire. */
	mutex_lock(&kvm->slots_arch_lock);

	/*
	 * Copy the arch-specific field of the newly-installed slot back to the
	 * old slot as the arch data could have changed between releasing
	 * slots_arch_lock in install_new_memslots() and re-acquiring the lock
	 * above.  Writers are required to retrieve memslots *after* acquiring
	 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
	 */
	old->arch = invalid_slot->arch;
}

static void kvm_create_memslot(struct kvm *kvm,
			       struct kvm_memory_slot *new)
{
	/* Add the new memslot to the inactive set and activate. */
	kvm_replace_memslot(kvm, NULL, new);
	kvm_activate_memslot(kvm, NULL, new);
}

static void kvm_delete_memslot(struct kvm *kvm,
			       struct kvm_memory_slot *old,
			       struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Remove the old memslot (in the inactive memslots) by passing NULL as
	 * the "new" slot, and for the invalid version in the active slots.
	 */
	kvm_replace_memslot(kvm, old, NULL);
	kvm_activate_memslot(kvm, invalid_slot, NULL);
}

static void kvm_move_memslot(struct kvm *kvm,
			     struct kvm_memory_slot *old,
			     struct kvm_memory_slot *new,
			     struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Replace the old memslot in the inactive slots, and then swap slots
	 * and replace the current INVALID with the new as well.
	 */
	kvm_replace_memslot(kvm, old, new);
	kvm_activate_memslot(kvm, invalid_slot, new);
}

static void kvm_update_flags_memslot(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new)
{
	/*
	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
	 * an intermediate step.  Instead, the old memslot is simply replaced
	 * with a new, updated copy in both memslot sets.
	 */
	kvm_replace_memslot(kvm, old, new);
	kvm_activate_memslot(kvm, old, new);
}

static int kvm_set_memslot(struct kvm *kvm,
			   struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new,
			   enum kvm_mr_change change)
{
	struct kvm_memory_slot *invalid_slot;
	int r;

	/*
	 * Released in kvm_swap_active_memslots.
	 *
	 * Must be held from before the current memslots are copied until
	 * after the new memslots are installed with rcu_assign_pointer,
	 * then released before the synchronize srcu in kvm_swap_active_memslots.
	 *
	 * When modifying memslots outside of the slots_lock, must be held
	 * before reading the pointer to the current memslots until after all
	 * changes to those memslots are complete.
	 *
	 * These rules ensure that installing new memslots does not lose
	 * changes made to the previous memslots.
	 */
	mutex_lock(&kvm->slots_arch_lock);

	/*
	 * Invalidate the old slot if it's being deleted or moved.  This is
	 * done prior to actually deleting/moving the memslot to allow vCPUs to
	 * continue running by ensuring there are no mappings or shadow pages
	 * for the memslot when it is deleted/moved.  Without pre-invalidation
	 * (and without a lock), a window would exist between effecting the
	 * delete/move and committing the changes in arch code where KVM or a
	 * guest could access a non-existent memslot.
	 *
	 * Modifications are done on a temporary, unreachable slot.  The old
	 * slot needs to be preserved in case a later step fails and the
	 * invalidation needs to be reverted.
	 */
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
		if (!invalid_slot) {
			mutex_unlock(&kvm->slots_arch_lock);
			return -ENOMEM;
		}
		kvm_invalidate_memslot(kvm, old, invalid_slot);
	}

	r = kvm_prepare_memory_region(kvm, old, new, change);
	if (r) {
		/*
		 * For DELETE/MOVE, revert the above INVALID change.  No
		 * modifications required since the original slot was preserved
		 * in the inactive slots.  Changing the active memslots also
		 * releases slots_arch_lock.
		 */
		if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
			kvm_activate_memslot(kvm, invalid_slot, old);
			kfree(invalid_slot);
		} else {
			mutex_unlock(&kvm->slots_arch_lock);
		}
		return r;
	}

	/*
	 * For DELETE and MOVE, the working slot is now active as the INVALID
	 * version of the old slot.  MOVE is particularly special as it reuses
	 * the old slot and returns a copy of the old slot (in working_slot).
	 * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
	 * old slot is detached but otherwise preserved.
	 */
	if (change == KVM_MR_CREATE)
		kvm_create_memslot(kvm, new);
	else if (change == KVM_MR_DELETE)
		kvm_delete_memslot(kvm, old, invalid_slot);
	else if (change == KVM_MR_MOVE)
		kvm_move_memslot(kvm, old, new, invalid_slot);
	else if (change == KVM_MR_FLAGS_ONLY)
		kvm_update_flags_memslot(kvm, old, new);
	else
		BUG();

	/* Free the temporary INVALID slot used for DELETE and MOVE. */
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
		kfree(invalid_slot);

	/*
	 * No need to refresh new->arch, changes after dropping slots_arch_lock
	 * will directly hit the final, active memslot.  Architectures are
	 * responsible for knowing that new->arch may be stale.
	 */
	kvm_commit_memory_region(kvm, old, new, change);

	return 0;
}

static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
				      gfn_t start, gfn_t end)
{
	struct kvm_memslot_iter iter;

	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
		if (iter.slot->id != id)
			return true;
	}

	return false;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	struct kvm_memory_slot *old, *new;
	struct kvm_memslots *slots;
	enum kvm_mr_change change;
	unsigned long npages;
	gfn_t base_gfn;
	int as_id, id;
	int r;

	r = check_memory_region_flags(mem);
	if (r)
		return r;

	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
	    (mem->memory_size != (unsigned long)mem->memory_size))
		return -EINVAL;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		return -EINVAL;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size))
		return -EINVAL;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		return -EINVAL;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		return -EINVAL;
	if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);

	/*
	 * Note, the old memslot (and the pointer itself!) may be invalidated
	 * and/or destroyed by kvm_set_memslot().
	 */
	old = id_to_memslot(slots, id);

	if (!mem->memory_size) {
		if (!old || !old->npages)
			return -EINVAL;

		if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
			return -EIO;

		return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
	}

	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
	npages = (mem->memory_size >> PAGE_SHIFT);

	if (!old || !old->npages) {
		change = KVM_MR_CREATE;

		/*
		 * To simplify KVM internals, the total number of pages across
		 * all memslots must fit in an unsigned long.
		 */
		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
			return -EINVAL;
	} else { /* Modify an existing slot. */
		if ((mem->userspace_addr != old->userspace_addr) ||
		    (npages != old->npages) ||
		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
			return -EINVAL;

		if (base_gfn != old->base_gfn)
			change = KVM_MR_MOVE;
		else if (mem->flags != old->flags)
			change = KVM_MR_FLAGS_ONLY;
		else /* Nothing to change. */
			return 0;
	}

	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
		return -EEXIST;

	/* Allocate a slot that will persist in the memslot. */
	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
	if (!new)
		return -ENOMEM;

	new->as_id = as_id;
	new->id = id;
	new->base_gfn = base_gfn;
	new->npages = npages;
	new->flags = mem->flags;
	new->userspace_addr = mem->userspace_addr;

	r = kvm_set_memslot(kvm, old, new, change);
	if (r)
		kfree(new);
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  const struct kvm_userspace_memory_region *mem)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
					  struct kvm_userspace_memory_region *mem)
{
	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	return kvm_set_memory_region(kvm, mem);
}
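
/*
 * Illustrative userspace sketch (not part of this file): a VMM configures a
 * slot by filling struct kvm_userspace_memory_region and issuing the
 * KVM_SET_USER_MEMORY_REGION ioctl on the VM fd, e.g.:
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot = 0,
 *		.flags = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0x100000,
 *		.memory_size = mem_size,            // page-aligned
 *		.userspace_addr = (__u64)host_mem,  // page-aligned mmap()ed buffer
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 * "mem_size", "host_mem" and "vm_fd" are placeholders.  Setting memory_size
 * to 0 for an existing slot deletes it (KVM_MR_DELETE above).
 */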
*/ 1925 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); 1926 if (!new) 1927 return -ENOMEM; 1928 1929 new->as_id = as_id; 1930 new->id = id; 1931 new->base_gfn = base_gfn; 1932 new->npages = npages; 1933 new->flags = mem->flags; 1934 new->userspace_addr = mem->userspace_addr; 1935 1936 r = kvm_set_memslot(kvm, old, new, change); 1937 if (r) 1938 kfree(new); 1939 return r; 1940} 1941EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1942 1943int kvm_set_memory_region(struct kvm *kvm, 1944 const struct kvm_userspace_memory_region *mem) 1945{ 1946 int r; 1947 1948 mutex_lock(&kvm->slots_lock); 1949 r = __kvm_set_memory_region(kvm, mem); 1950 mutex_unlock(&kvm->slots_lock); 1951 return r; 1952} 1953EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1954 1955static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1956 struct kvm_userspace_memory_region *mem) 1957{ 1958 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1959 return -EINVAL; 1960 1961 return kvm_set_memory_region(kvm, mem); 1962} 1963 1964#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1965/** 1966 * kvm_get_dirty_log - get a snapshot of dirty pages 1967 * @kvm: pointer to kvm instance 1968 * @log: slot id and address to which we copy the log 1969 * @is_dirty: set to '1' if any dirty pages were found 1970 * @memslot: set to the associated memslot, always valid on success 1971 */ 1972int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, 1973 int *is_dirty, struct kvm_memory_slot **memslot) 1974{ 1975 struct kvm_memslots *slots; 1976 int i, as_id, id; 1977 unsigned long n; 1978 unsigned long any = 0; 1979 1980 /* Dirty ring tracking is exclusive to dirty log tracking */ 1981 if (kvm->dirty_ring_size) 1982 return -ENXIO; 1983 1984 *memslot = NULL; 1985 *is_dirty = 0; 1986 1987 as_id = log->slot >> 16; 1988 id = (u16)log->slot; 1989 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1990 return -EINVAL; 1991 1992 slots = __kvm_memslots(kvm, as_id); 1993 *memslot = id_to_memslot(slots, id); 1994 if (!(*memslot) || !(*memslot)->dirty_bitmap) 1995 return -ENOENT; 1996 1997 kvm_arch_sync_dirty_log(kvm, *memslot); 1998 1999 n = kvm_dirty_bitmap_bytes(*memslot); 2000 2001 for (i = 0; !any && i < n/sizeof(long); ++i) 2002 any = (*memslot)->dirty_bitmap[i]; 2003 2004 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) 2005 return -EFAULT; 2006 2007 if (any) 2008 *is_dirty = 1; 2009 return 0; 2010} 2011EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 2012 2013#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 2014/** 2015 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 2016 * and reenable dirty page tracking for the corresponding pages. 2017 * @kvm: pointer to kvm instance 2018 * @log: slot id and address to which we copy the log 2019 * 2020 * We need to keep it in mind that VCPU threads can write to the bitmap 2021 * concurrently. So, to avoid losing track of dirty pages we keep the 2022 * following order: 2023 * 2024 * 1. Take a snapshot of the bit and clear it if needed. 2025 * 2. Write protect the corresponding page. 2026 * 3. Copy the snapshot to the userspace. 2027 * 4. Upon return caller flushes TLB's if needed. 2028 * 2029 * Between 2 and 4, the guest may write to the page using the remaining TLB 2030 * entry. This is not a problem because the page is reported dirty using 2031 * the snapshot taken before and step 4 ensures that writes done after 2032 * exiting to userspace will be logged for the next call. 
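 *
 * A rough sketch of the !manual_dirty_log_protect path in the function body
 * below, per bitmap word (names as in the code, compressed for illustration):
 *
 *	mask = xchg(&dirty_bitmap[i], 0);           step 1: snapshot and clear
 *	dirty_bitmap_buffer[i] = mask;
 *	kvm_arch_mmu_enable_log_dirty_pt_masked();  step 2: re-write-protect
 *
 * followed by the remote TLB flush (if anything was cleared) and the
 * copy_to_user() of the snapshot buffer.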
2033 * 2034 */ 2035static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) 2036{ 2037 struct kvm_memslots *slots; 2038 struct kvm_memory_slot *memslot; 2039 int i, as_id, id; 2040 unsigned long n; 2041 unsigned long *dirty_bitmap; 2042 unsigned long *dirty_bitmap_buffer; 2043 bool flush; 2044 2045 /* Dirty ring tracking is exclusive to dirty log tracking */ 2046 if (kvm->dirty_ring_size) 2047 return -ENXIO; 2048 2049 as_id = log->slot >> 16; 2050 id = (u16)log->slot; 2051 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 2052 return -EINVAL; 2053 2054 slots = __kvm_memslots(kvm, as_id); 2055 memslot = id_to_memslot(slots, id); 2056 if (!memslot || !memslot->dirty_bitmap) 2057 return -ENOENT; 2058 2059 dirty_bitmap = memslot->dirty_bitmap; 2060 2061 kvm_arch_sync_dirty_log(kvm, memslot); 2062 2063 n = kvm_dirty_bitmap_bytes(memslot); 2064 flush = false; 2065 if (kvm->manual_dirty_log_protect) { 2066 /* 2067 * Unlike kvm_get_dirty_log, we always return false in *flush, 2068 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 2069 * is some code duplication between this function and 2070 * kvm_get_dirty_log, but hopefully all architecture 2071 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 2072 * can be eliminated. 2073 */ 2074 dirty_bitmap_buffer = dirty_bitmap; 2075 } else { 2076 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 2077 memset(dirty_bitmap_buffer, 0, n); 2078 2079 KVM_MMU_LOCK(kvm); 2080 for (i = 0; i < n / sizeof(long); i++) { 2081 unsigned long mask; 2082 gfn_t offset; 2083 2084 if (!dirty_bitmap[i]) 2085 continue; 2086 2087 flush = true; 2088 mask = xchg(&dirty_bitmap[i], 0); 2089 dirty_bitmap_buffer[i] = mask; 2090 2091 offset = i * BITS_PER_LONG; 2092 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 2093 offset, mask); 2094 } 2095 KVM_MMU_UNLOCK(kvm); 2096 } 2097 2098 if (flush) 2099 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 2100 2101 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 2102 return -EFAULT; 2103 return 0; 2104} 2105 2106 2107/** 2108 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot 2109 * @kvm: kvm instance 2110 * @log: slot id and address to which we copy the log 2111 * 2112 * Steps 1-4 below provide general overview of dirty page logging. See 2113 * kvm_get_dirty_log_protect() function description for additional details. 2114 * 2115 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we 2116 * always flush the TLB (step 4) even if previous step failed and the dirty 2117 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API 2118 * does not preclude user space subsequent dirty log read. Flushing TLB ensures 2119 * writes will be marked dirty for next log read. 2120 * 2121 * 1. Take a snapshot of the bit and clear it if needed. 2122 * 2. Write protect the corresponding page. 2123 * 3. Copy the snapshot to the userspace. 2124 * 4. Flush TLB's if needed. 2125 */ 2126static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2127 struct kvm_dirty_log *log) 2128{ 2129 int r; 2130 2131 mutex_lock(&kvm->slots_lock); 2132 2133 r = kvm_get_dirty_log_protect(kvm, log); 2134 2135 mutex_unlock(&kvm->slots_lock); 2136 return r; 2137} 2138 2139/** 2140 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 2141 * and reenable dirty page tracking for the corresponding pages. 
2142 * @kvm: pointer to kvm instance 2143 * @log: slot id and address from which to fetch the bitmap of dirty pages 2144 */ 2145static int kvm_clear_dirty_log_protect(struct kvm *kvm, 2146 struct kvm_clear_dirty_log *log) 2147{ 2148 struct kvm_memslots *slots; 2149 struct kvm_memory_slot *memslot; 2150 int as_id, id; 2151 gfn_t offset; 2152 unsigned long i, n; 2153 unsigned long *dirty_bitmap; 2154 unsigned long *dirty_bitmap_buffer; 2155 bool flush; 2156 2157 /* Dirty ring tracking is exclusive to dirty log tracking */ 2158 if (kvm->dirty_ring_size) 2159 return -ENXIO; 2160 2161 as_id = log->slot >> 16; 2162 id = (u16)log->slot; 2163 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 2164 return -EINVAL; 2165 2166 if (log->first_page & 63) 2167 return -EINVAL; 2168 2169 slots = __kvm_memslots(kvm, as_id); 2170 memslot = id_to_memslot(slots, id); 2171 if (!memslot || !memslot->dirty_bitmap) 2172 return -ENOENT; 2173 2174 dirty_bitmap = memslot->dirty_bitmap; 2175 2176 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 2177 2178 if (log->first_page > memslot->npages || 2179 log->num_pages > memslot->npages - log->first_page || 2180 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 2181 return -EINVAL; 2182 2183 kvm_arch_sync_dirty_log(kvm, memslot); 2184 2185 flush = false; 2186 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 2187 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 2188 return -EFAULT; 2189 2190 KVM_MMU_LOCK(kvm); 2191 for (offset = log->first_page, i = offset / BITS_PER_LONG, 2192 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 2193 i++, offset += BITS_PER_LONG) { 2194 unsigned long mask = *dirty_bitmap_buffer++; 2195 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 2196 if (!mask) 2197 continue; 2198 2199 mask &= atomic_long_fetch_andnot(mask, p); 2200 2201 /* 2202 * mask contains the bits that really have been cleared. This 2203 * never includes any bits beyond the length of the memslot (if 2204 * the length is not aligned to 64 pages), therefore it is not 2205 * a problem if userspace sets them in log->dirty_bitmap. 2206 */ 2207 if (mask) { 2208 flush = true; 2209 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 2210 offset, mask); 2211 } 2212 } 2213 KVM_MMU_UNLOCK(kvm); 2214 2215 if (flush) 2216 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 2217 2218 return 0; 2219} 2220 2221static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, 2222 struct kvm_clear_dirty_log *log) 2223{ 2224 int r; 2225 2226 mutex_lock(&kvm->slots_lock); 2227 2228 r = kvm_clear_dirty_log_protect(kvm, log); 2229 2230 mutex_unlock(&kvm->slots_lock); 2231 return r; 2232} 2233#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 2234 2235struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 2236{ 2237 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 2238} 2239EXPORT_SYMBOL_GPL(gfn_to_memslot); 2240 2241struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) 2242{ 2243 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); 2244 u64 gen = slots->generation; 2245 struct kvm_memory_slot *slot; 2246 2247 /* 2248 * This also protects against using a memslot from a different address space, 2249 * since different address spaces have different generation numbers. 
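 * (For example, on x86 the SMM address space is a separate set of memslots
 * with its own generation, so a last_used_slot cached for the regular
 * address space is never handed back for an SMM lookup.)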
2250 */ 2251 if (unlikely(gen != vcpu->last_used_slot_gen)) { 2252 vcpu->last_used_slot = NULL; 2253 vcpu->last_used_slot_gen = gen; 2254 } 2255 2256 slot = try_get_memslot(vcpu->last_used_slot, gfn); 2257 if (slot) 2258 return slot; 2259 2260 /* 2261 * Fall back to searching all memslots. We purposely use 2262 * search_memslots() instead of __gfn_to_memslot() to avoid 2263 * thrashing the VM-wide last_used_slot in kvm_memslots. 2264 */ 2265 slot = search_memslots(slots, gfn, false); 2266 if (slot) { 2267 vcpu->last_used_slot = slot; 2268 return slot; 2269 } 2270 2271 return NULL; 2272} 2273 2274bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 2275{ 2276 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 2277 2278 return kvm_is_visible_memslot(memslot); 2279} 2280EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 2281 2282bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 2283{ 2284 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2285 2286 return kvm_is_visible_memslot(memslot); 2287} 2288EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); 2289 2290unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) 2291{ 2292 struct vm_area_struct *vma; 2293 unsigned long addr, size; 2294 2295 size = PAGE_SIZE; 2296 2297 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); 2298 if (kvm_is_error_hva(addr)) 2299 return PAGE_SIZE; 2300 2301 mmap_read_lock(current->mm); 2302 vma = find_vma(current->mm, addr); 2303 if (!vma) 2304 goto out; 2305 2306 size = vma_kernel_pagesize(vma); 2307 2308out: 2309 mmap_read_unlock(current->mm); 2310 2311 return size; 2312} 2313 2314static bool memslot_is_readonly(const struct kvm_memory_slot *slot) 2315{ 2316 return slot->flags & KVM_MEM_READONLY; 2317} 2318 2319static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, 2320 gfn_t *nr_pages, bool write) 2321{ 2322 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 2323 return KVM_HVA_ERR_BAD; 2324 2325 if (memslot_is_readonly(slot) && write) 2326 return KVM_HVA_ERR_RO_BAD; 2327 2328 if (nr_pages) 2329 *nr_pages = slot->npages - (gfn - slot->base_gfn); 2330 2331 return __gfn_to_hva_memslot(slot, gfn); 2332} 2333 2334static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 2335 gfn_t *nr_pages) 2336{ 2337 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 2338} 2339 2340unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 2341 gfn_t gfn) 2342{ 2343 return gfn_to_hva_many(slot, gfn, NULL); 2344} 2345EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 2346 2347unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 2348{ 2349 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 2350} 2351EXPORT_SYMBOL_GPL(gfn_to_hva); 2352 2353unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) 2354{ 2355 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); 2356} 2357EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); 2358 2359/* 2360 * Return the hva of a @gfn and the R/W attribute if possible. 
2361 * 2362 * @slot: the kvm_memory_slot which contains @gfn 2363 * @gfn: the gfn to be translated 2364 * @writable: used to return the read/write attribute of the @slot if the hva 2365 * is valid and @writable is not NULL 2366 */ 2367unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 2368 gfn_t gfn, bool *writable) 2369{ 2370 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 2371 2372 if (!kvm_is_error_hva(hva) && writable) 2373 *writable = !memslot_is_readonly(slot); 2374 2375 return hva; 2376} 2377 2378unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 2379{ 2380 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2381 2382 return gfn_to_hva_memslot_prot(slot, gfn, writable); 2383} 2384 2385unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 2386{ 2387 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2388 2389 return gfn_to_hva_memslot_prot(slot, gfn, writable); 2390} 2391 2392static inline int check_user_page_hwpoison(unsigned long addr) 2393{ 2394 int rc, flags = FOLL_HWPOISON | FOLL_WRITE; 2395 2396 rc = get_user_pages(addr, 1, flags, NULL, NULL); 2397 return rc == -EHWPOISON; 2398} 2399 2400/* 2401 * The fast path to get the writable pfn which will be stored in @pfn, 2402 * true indicates success, otherwise false is returned. It's also the 2403 * only part that runs if we can in atomic context. 2404 */ 2405static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 2406 bool *writable, kvm_pfn_t *pfn) 2407{ 2408 struct page *page[1]; 2409 2410 /* 2411 * Fast pin a writable pfn only if it is a write fault request 2412 * or the caller allows to map a writable pfn for a read fault 2413 * request. 2414 */ 2415 if (!(write_fault || writable)) 2416 return false; 2417 2418 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { 2419 *pfn = page_to_pfn(page[0]); 2420 2421 if (writable) 2422 *writable = true; 2423 return true; 2424 } 2425 2426 return false; 2427} 2428 2429/* 2430 * The slow path to get the pfn of the specified host virtual address, 2431 * 1 indicates success, -errno is returned if error is detected. 
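 * (The return value is simply the get_user_pages_unlocked() result; the
 * caller in hva_to_pfn() treats anything other than exactly one pinned page
 * as failure, with -EHWPOISON handled specially.)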
2432 */ 2433static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 2434 bool *writable, kvm_pfn_t *pfn) 2435{ 2436 unsigned int flags = FOLL_HWPOISON; 2437 struct page *page; 2438 int npages = 0; 2439 2440 might_sleep(); 2441 2442 if (writable) 2443 *writable = write_fault; 2444 2445 if (write_fault) 2446 flags |= FOLL_WRITE; 2447 if (async) 2448 flags |= FOLL_NOWAIT; 2449 2450 npages = get_user_pages_unlocked(addr, 1, &page, flags); 2451 if (npages != 1) 2452 return npages; 2453 2454 /* map read fault as writable if possible */ 2455 if (unlikely(!write_fault) && writable) { 2456 struct page *wpage; 2457 2458 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { 2459 *writable = true; 2460 put_page(page); 2461 page = wpage; 2462 } 2463 } 2464 *pfn = page_to_pfn(page); 2465 return npages; 2466} 2467 2468static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 2469{ 2470 if (unlikely(!(vma->vm_flags & VM_READ))) 2471 return false; 2472 2473 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 2474 return false; 2475 2476 return true; 2477} 2478 2479static int kvm_try_get_pfn(kvm_pfn_t pfn) 2480{ 2481 if (kvm_is_reserved_pfn(pfn)) 2482 return 1; 2483 return get_page_unless_zero(pfn_to_page(pfn)); 2484} 2485 2486static int hva_to_pfn_remapped(struct vm_area_struct *vma, 2487 unsigned long addr, bool write_fault, 2488 bool *writable, kvm_pfn_t *p_pfn) 2489{ 2490 kvm_pfn_t pfn; 2491 pte_t *ptep; 2492 spinlock_t *ptl; 2493 int r; 2494 2495 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); 2496 if (r) { 2497 /* 2498 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 2499 * not call the fault handler, so do it here. 2500 */ 2501 bool unlocked = false; 2502 r = fixup_user_fault(current->mm, addr, 2503 (write_fault ? FAULT_FLAG_WRITE : 0), 2504 &unlocked); 2505 if (unlocked) 2506 return -EAGAIN; 2507 if (r) 2508 return r; 2509 2510 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); 2511 if (r) 2512 return r; 2513 } 2514 2515 if (write_fault && !pte_write(*ptep)) { 2516 pfn = KVM_PFN_ERR_RO_FAULT; 2517 goto out; 2518 } 2519 2520 if (writable) 2521 *writable = pte_write(*ptep); 2522 pfn = pte_pfn(*ptep); 2523 2524 /* 2525 * Get a reference here because callers of *hva_to_pfn* and 2526 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 2527 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 2528 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will 2529 * simply do nothing for reserved pfns. 2530 * 2531 * Whoever called remap_pfn_range is also going to call e.g. 2532 * unmap_mapping_range before the underlying pages are freed, 2533 * causing a call to our MMU notifier. 2534 * 2535 * Certain IO or PFNMAP mappings can be backed with valid 2536 * struct pages, but be allocated without refcounting e.g., 2537 * tail pages of non-compound higher order allocations, which 2538 * would then underflow the refcount when the caller does the 2539 * required put_page. Don't allow those pages here. 2540 */ 2541 if (!kvm_try_get_pfn(pfn)) 2542 r = -EFAULT; 2543 2544out: 2545 pte_unmap_unlock(ptep, ptl); 2546 *p_pfn = pfn; 2547 2548 return r; 2549} 2550 2551/* 2552 * Pin guest page in memory and return its pfn. 
2553 * @addr: host virtual address which maps memory to the guest 2554 * @atomic: if true, the function must not sleep; only the fast path is tried 2555 * @async: whether this function needs to wait for IO to complete if the 2556 * host page is not in memory 2557 * @write_fault: whether we should get a writable host page 2558 * @writable: whether it allows to map a writable host page for !@write_fault 2559 * 2560 * The function will map a writable host page for these two cases: 2561 * 1): @write_fault = true 2562 * 2): @write_fault = false && @writable, @writable will tell the caller 2563 * whether the mapping is writable. 2564 */ 2565kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 2566 bool write_fault, bool *writable) 2567{ 2568 struct vm_area_struct *vma; 2569 kvm_pfn_t pfn = 0; 2570 int npages, r; 2571 2572 /* we can do it either atomically or asynchronously, not both */ 2573 BUG_ON(atomic && async); 2574 2575 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) 2576 return pfn; 2577 2578 if (atomic) 2579 return KVM_PFN_ERR_FAULT; 2580 2581 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 2582 if (npages == 1) 2583 return pfn; 2584 2585 mmap_read_lock(current->mm); 2586 if (npages == -EHWPOISON || 2587 (!async && check_user_page_hwpoison(addr))) { 2588 pfn = KVM_PFN_ERR_HWPOISON; 2589 goto exit; 2590 } 2591 2592retry: 2593 vma = vma_lookup(current->mm, addr); 2594 2595 if (vma == NULL) 2596 pfn = KVM_PFN_ERR_FAULT; 2597 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { 2598 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); 2599 if (r == -EAGAIN) 2600 goto retry; 2601 if (r < 0) 2602 pfn = KVM_PFN_ERR_FAULT; 2603 } else { 2604 if (async && vma_is_valid(vma, write_fault)) 2605 *async = true; 2606 pfn = KVM_PFN_ERR_FAULT; 2607 } 2608exit: 2609 mmap_read_unlock(current->mm); 2610 return pfn; 2611} 2612 2613kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, 2614 bool atomic, bool *async, bool write_fault, 2615 bool *writable, hva_t *hva) 2616{ 2617 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 2618 2619 if (hva) 2620 *hva = addr; 2621 2622 if (addr == KVM_HVA_ERR_RO_BAD) { 2623 if (writable) 2624 *writable = false; 2625 return KVM_PFN_ERR_RO_FAULT; 2626 } 2627 2628 if (kvm_is_error_hva(addr)) { 2629 if (writable) 2630 *writable = false; 2631 return KVM_PFN_NOSLOT; 2632 } 2633 2634 /* Do not map writable pfn in the readonly memslot.
*/ 2635 if (writable && memslot_is_readonly(slot)) { 2636 *writable = false; 2637 writable = NULL; 2638 } 2639 2640 return hva_to_pfn(addr, atomic, async, write_fault, 2641 writable); 2642} 2643EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 2644 2645kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 2646 bool *writable) 2647{ 2648 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 2649 write_fault, writable, NULL); 2650} 2651EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 2652 2653kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) 2654{ 2655 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); 2656} 2657EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 2658 2659kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) 2660{ 2661 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); 2662} 2663EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 2664 2665kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 2666{ 2667 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2668} 2669EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 2670 2671kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 2672{ 2673 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 2674} 2675EXPORT_SYMBOL_GPL(gfn_to_pfn); 2676 2677kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 2678{ 2679 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2680} 2681EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 2682 2683int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2684 struct page **pages, int nr_pages) 2685{ 2686 unsigned long addr; 2687 gfn_t entry = 0; 2688 2689 addr = gfn_to_hva_many(slot, gfn, &entry); 2690 if (kvm_is_error_hva(addr)) 2691 return -1; 2692 2693 if (entry < nr_pages) 2694 return 0; 2695 2696 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); 2697} 2698EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 2699 2700static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 2701{ 2702 if (is_error_noslot_pfn(pfn)) 2703 return KVM_ERR_PTR_BAD_PAGE; 2704 2705 if (kvm_is_reserved_pfn(pfn)) { 2706 WARN_ON(1); 2707 return KVM_ERR_PTR_BAD_PAGE; 2708 } 2709 2710 return pfn_to_page(pfn); 2711} 2712 2713struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 2714{ 2715 kvm_pfn_t pfn; 2716 2717 pfn = gfn_to_pfn(kvm, gfn); 2718 2719 return kvm_pfn_to_page(pfn); 2720} 2721EXPORT_SYMBOL_GPL(gfn_to_page); 2722 2723void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) 2724{ 2725 if (pfn == 0) 2726 return; 2727 2728 if (dirty) 2729 kvm_release_pfn_dirty(pfn); 2730 else 2731 kvm_release_pfn_clean(pfn); 2732} 2733 2734int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 2735{ 2736 kvm_pfn_t pfn; 2737 void *hva = NULL; 2738 struct page *page = KVM_UNMAPPED_PAGE; 2739 2740 if (!map) 2741 return -EINVAL; 2742 2743 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2744 if (is_error_noslot_pfn(pfn)) 2745 return -EINVAL; 2746 2747 if (pfn_valid(pfn)) { 2748 page = pfn_to_page(pfn); 2749 hva = kmap(page); 2750#ifdef CONFIG_HAS_IOMEM 2751 } else { 2752 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 2753#endif 2754 } 2755 2756 if (!hva) 2757 return -EFAULT; 2758 2759 map->page = page; 2760 map->hva = hva; 2761 map->pfn = pfn; 2762 map->gfn = gfn; 2763 2764 return 0; 2765} 2766EXPORT_SYMBOL_GPL(kvm_vcpu_map); 2767 2768void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 2769{ 2770 if (!map) 2771 return; 2772 2773 if (!map->hva) 2774 
return; 2775 2776 if (map->page != KVM_UNMAPPED_PAGE) 2777 kunmap(map->page); 2778#ifdef CONFIG_HAS_IOMEM 2779 else 2780 memunmap(map->hva); 2781#endif 2782 2783 if (dirty) 2784 kvm_vcpu_mark_page_dirty(vcpu, map->gfn); 2785 2786 kvm_release_pfn(map->pfn, dirty); 2787 2788 map->hva = NULL; 2789 map->page = NULL; 2790} 2791EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 2792 2793struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2794{ 2795 kvm_pfn_t pfn; 2796 2797 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 2798 2799 return kvm_pfn_to_page(pfn); 2800} 2801EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 2802 2803void kvm_release_page_clean(struct page *page) 2804{ 2805 WARN_ON(is_error_page(page)); 2806 2807 kvm_release_pfn_clean(page_to_pfn(page)); 2808} 2809EXPORT_SYMBOL_GPL(kvm_release_page_clean); 2810 2811void kvm_release_pfn_clean(kvm_pfn_t pfn) 2812{ 2813 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 2814 put_page(pfn_to_page(pfn)); 2815} 2816EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 2817 2818void kvm_release_page_dirty(struct page *page) 2819{ 2820 WARN_ON(is_error_page(page)); 2821 2822 kvm_release_pfn_dirty(page_to_pfn(page)); 2823} 2824EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2825 2826void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2827{ 2828 kvm_set_pfn_dirty(pfn); 2829 kvm_release_pfn_clean(pfn); 2830} 2831EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2832 2833void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2834{ 2835 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2836 SetPageDirty(pfn_to_page(pfn)); 2837} 2838EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2839 2840void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2841{ 2842 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2843 mark_page_accessed(pfn_to_page(pfn)); 2844} 2845EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2846 2847static int next_segment(unsigned long len, int offset) 2848{ 2849 if (len > PAGE_SIZE - offset) 2850 return PAGE_SIZE - offset; 2851 else 2852 return len; 2853} 2854 2855static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2856 void *data, int offset, int len) 2857{ 2858 int r; 2859 unsigned long addr; 2860 2861 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2862 if (kvm_is_error_hva(addr)) 2863 return -EFAULT; 2864 r = __copy_from_user(data, (void __user *)addr + offset, len); 2865 if (r) 2866 return -EFAULT; 2867 return 0; 2868} 2869 2870int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2871 int len) 2872{ 2873 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2874 2875 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2876} 2877EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2878 2879int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2880 int offset, int len) 2881{ 2882 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2883 2884 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2885} 2886EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2887 2888int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2889{ 2890 gfn_t gfn = gpa >> PAGE_SHIFT; 2891 int seg; 2892 int offset = offset_in_page(gpa); 2893 int ret; 2894 2895 while ((seg = next_segment(len, offset)) != 0) { 2896 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2897 if (ret < 0) 2898 return ret; 2899 offset = 0; 2900 len -= seg; 2901 data += seg; 2902 ++gfn; 2903 } 2904 return 0; 2905} 2906EXPORT_SYMBOL_GPL(kvm_read_guest); 2907 2908int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 
2909{ 2910 gfn_t gfn = gpa >> PAGE_SHIFT; 2911 int seg; 2912 int offset = offset_in_page(gpa); 2913 int ret; 2914 2915 while ((seg = next_segment(len, offset)) != 0) { 2916 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2917 if (ret < 0) 2918 return ret; 2919 offset = 0; 2920 len -= seg; 2921 data += seg; 2922 ++gfn; 2923 } 2924 return 0; 2925} 2926EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2927 2928static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2929 void *data, int offset, unsigned long len) 2930{ 2931 int r; 2932 unsigned long addr; 2933 2934 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2935 if (kvm_is_error_hva(addr)) 2936 return -EFAULT; 2937 pagefault_disable(); 2938 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2939 pagefault_enable(); 2940 if (r) 2941 return -EFAULT; 2942 return 0; 2943} 2944 2945int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2946 void *data, unsigned long len) 2947{ 2948 gfn_t gfn = gpa >> PAGE_SHIFT; 2949 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2950 int offset = offset_in_page(gpa); 2951 2952 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2953} 2954EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2955 2956static int __kvm_write_guest_page(struct kvm *kvm, 2957 struct kvm_memory_slot *memslot, gfn_t gfn, 2958 const void *data, int offset, int len) 2959{ 2960 int r; 2961 unsigned long addr; 2962 2963 addr = gfn_to_hva_memslot(memslot, gfn); 2964 if (kvm_is_error_hva(addr)) 2965 return -EFAULT; 2966 r = __copy_to_user((void __user *)addr + offset, data, len); 2967 if (r) 2968 return -EFAULT; 2969 mark_page_dirty_in_slot(kvm, memslot, gfn); 2970 return 0; 2971} 2972 2973int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2974 const void *data, int offset, int len) 2975{ 2976 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2977 2978 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); 2979} 2980EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2981 2982int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2983 const void *data, int offset, int len) 2984{ 2985 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2986 2987 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); 2988} 2989EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2990 2991int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2992 unsigned long len) 2993{ 2994 gfn_t gfn = gpa >> PAGE_SHIFT; 2995 int seg; 2996 int offset = offset_in_page(gpa); 2997 int ret; 2998 2999 while ((seg = next_segment(len, offset)) != 0) { 3000 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 3001 if (ret < 0) 3002 return ret; 3003 offset = 0; 3004 len -= seg; 3005 data += seg; 3006 ++gfn; 3007 } 3008 return 0; 3009} 3010EXPORT_SYMBOL_GPL(kvm_write_guest); 3011 3012int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 3013 unsigned long len) 3014{ 3015 gfn_t gfn = gpa >> PAGE_SHIFT; 3016 int seg; 3017 int offset = offset_in_page(gpa); 3018 int ret; 3019 3020 while ((seg = next_segment(len, offset)) != 0) { 3021 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 3022 if (ret < 0) 3023 return ret; 3024 offset = 0; 3025 len -= seg; 3026 data += seg; 3027 ++gfn; 3028 } 3029 return 0; 3030} 3031EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 3032 3033static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 3034 struct gfn_to_hva_cache *ghc, 3035 gpa_t gpa, unsigned long len) 3036{ 3037 int offset 
= offset_in_page(gpa); 3038 gfn_t start_gfn = gpa >> PAGE_SHIFT; 3039 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 3040 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 3041 gfn_t nr_pages_avail; 3042 3043 /* Update ghc->generation before performing any error checks. */ 3044 ghc->generation = slots->generation; 3045 3046 if (start_gfn > end_gfn) { 3047 ghc->hva = KVM_HVA_ERR_BAD; 3048 return -EINVAL; 3049 } 3050 3051 /* 3052 * If the requested region crosses two memslots, we still 3053 * verify that the entire region is valid here. 3054 */ 3055 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 3056 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 3057 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 3058 &nr_pages_avail); 3059 if (kvm_is_error_hva(ghc->hva)) 3060 return -EFAULT; 3061 } 3062 3063 /* Use the slow path for cross page reads and writes. */ 3064 if (nr_pages_needed == 1) 3065 ghc->hva += offset; 3066 else 3067 ghc->memslot = NULL; 3068 3069 ghc->gpa = gpa; 3070 ghc->len = len; 3071 return 0; 3072} 3073 3074int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 3075 gpa_t gpa, unsigned long len) 3076{ 3077 struct kvm_memslots *slots = kvm_memslots(kvm); 3078 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 3079} 3080EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 3081 3082int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 3083 void *data, unsigned int offset, 3084 unsigned long len) 3085{ 3086 struct kvm_memslots *slots = kvm_memslots(kvm); 3087 int r; 3088 gpa_t gpa = ghc->gpa + offset; 3089 3090 if (WARN_ON_ONCE(len + offset > ghc->len)) 3091 return -EINVAL; 3092 3093 if (slots->generation != ghc->generation) { 3094 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 3095 return -EFAULT; 3096 } 3097 3098 if (kvm_is_error_hva(ghc->hva)) 3099 return -EFAULT; 3100 3101 if (unlikely(!ghc->memslot)) 3102 return kvm_write_guest(kvm, gpa, data, len); 3103 3104 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 3105 if (r) 3106 return -EFAULT; 3107 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); 3108 3109 return 0; 3110} 3111EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 3112 3113int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 3114 void *data, unsigned long len) 3115{ 3116 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 3117} 3118EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 3119 3120int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 3121 void *data, unsigned int offset, 3122 unsigned long len) 3123{ 3124 struct kvm_memslots *slots = kvm_memslots(kvm); 3125 int r; 3126 gpa_t gpa = ghc->gpa + offset; 3127 3128 if (WARN_ON_ONCE(len + offset > ghc->len)) 3129 return -EINVAL; 3130 3131 if (slots->generation != ghc->generation) { 3132 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 3133 return -EFAULT; 3134 } 3135 3136 if (kvm_is_error_hva(ghc->hva)) 3137 return -EFAULT; 3138 3139 if (unlikely(!ghc->memslot)) 3140 return kvm_read_guest(kvm, gpa, data, len); 3141 3142 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); 3143 if (r) 3144 return -EFAULT; 3145 3146 return 0; 3147} 3148EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); 3149 3150int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 3151 void *data, unsigned long len) 3152{ 3153 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); 3154} 3155EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 3156 
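/*
 * Illustrative use of the gfn_to_hva_cache helpers above (hypothetical
 * caller; gpa and val are placeholders): initialize the cache once for a
 * fixed guest structure, then use the *_cached() variants on the hot path,
 * which only redo the gfn->hva translation when the memslot generation
 * changes:
 *
 *	struct gfn_to_hva_cache ghc;
 *	u64 val = 0;
 *
 *	if (!kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 */
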
3157int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 3158{ 3159 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3160 gfn_t gfn = gpa >> PAGE_SHIFT; 3161 int seg; 3162 int offset = offset_in_page(gpa); 3163 int ret; 3164 3165 while ((seg = next_segment(len, offset)) != 0) { 3166 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); 3167 if (ret < 0) 3168 return ret; 3169 offset = 0; 3170 len -= seg; 3171 ++gfn; 3172 } 3173 return 0; 3174} 3175EXPORT_SYMBOL_GPL(kvm_clear_guest); 3176 3177void mark_page_dirty_in_slot(struct kvm *kvm, 3178 const struct kvm_memory_slot *memslot, 3179 gfn_t gfn) 3180{ 3181 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 3182 3183#ifdef CONFIG_HAVE_KVM_DIRTY_RING 3184 if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) 3185 return; 3186#endif 3187 3188 if (memslot && kvm_slot_dirty_track_enabled(memslot)) { 3189 unsigned long rel_gfn = gfn - memslot->base_gfn; 3190 u32 slot = (memslot->as_id << 16) | memslot->id; 3191 3192 if (kvm->dirty_ring_size) 3193 kvm_dirty_ring_push(&vcpu->dirty_ring, 3194 slot, rel_gfn); 3195 else 3196 set_bit_le(rel_gfn, memslot->dirty_bitmap); 3197 } 3198} 3199EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); 3200 3201void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 3202{ 3203 struct kvm_memory_slot *memslot; 3204 3205 memslot = gfn_to_memslot(kvm, gfn); 3206 mark_page_dirty_in_slot(kvm, memslot, gfn); 3207} 3208EXPORT_SYMBOL_GPL(mark_page_dirty); 3209 3210void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 3211{ 3212 struct kvm_memory_slot *memslot; 3213 3214 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 3215 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); 3216} 3217EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 3218 3219void kvm_sigset_activate(struct kvm_vcpu *vcpu) 3220{ 3221 if (!vcpu->sigset_active) 3222 return; 3223 3224 /* 3225 * This does a lockless modification of ->real_blocked, which is fine 3226 * because only current can change ->real_blocked and all readers of 3227 * ->real_blocked don't care as long as ->real_blocked is always a subset 3228 * of ->blocked. 
3229 */ 3230 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 3231} 3232 3233void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 3234{ 3235 if (!vcpu->sigset_active) 3236 return; 3237 3238 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 3239 sigemptyset(&current->real_blocked); 3240} 3241 3242static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 3243{ 3244 unsigned int old, val, grow, grow_start; 3245 3246 old = val = vcpu->halt_poll_ns; 3247 grow_start = READ_ONCE(halt_poll_ns_grow_start); 3248 grow = READ_ONCE(halt_poll_ns_grow); 3249 if (!grow) 3250 goto out; 3251 3252 val *= grow; 3253 if (val < grow_start) 3254 val = grow_start; 3255 3256 if (val > vcpu->kvm->max_halt_poll_ns) 3257 val = vcpu->kvm->max_halt_poll_ns; 3258 3259 vcpu->halt_poll_ns = val; 3260out: 3261 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 3262} 3263 3264static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 3265{ 3266 unsigned int old, val, shrink, grow_start; 3267 3268 old = val = vcpu->halt_poll_ns; 3269 shrink = READ_ONCE(halt_poll_ns_shrink); 3270 grow_start = READ_ONCE(halt_poll_ns_grow_start); 3271 if (shrink == 0) 3272 val = 0; 3273 else 3274 val /= shrink; 3275 3276 if (val < grow_start) 3277 val = 0; 3278 3279 vcpu->halt_poll_ns = val; 3280 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 3281} 3282 3283static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 3284{ 3285 int ret = -EINTR; 3286 int idx = srcu_read_lock(&vcpu->kvm->srcu); 3287 3288 if (kvm_arch_vcpu_runnable(vcpu)) { 3289 kvm_make_request(KVM_REQ_UNHALT, vcpu); 3290 goto out; 3291 } 3292 if (kvm_cpu_has_pending_timer(vcpu)) 3293 goto out; 3294 if (signal_pending(current)) 3295 goto out; 3296 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) 3297 goto out; 3298 3299 ret = 0; 3300out: 3301 srcu_read_unlock(&vcpu->kvm->srcu, idx); 3302 return ret; 3303} 3304 3305/* 3306 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is 3307 * pending. This is mostly used when halting a vCPU, but may also be used 3308 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI. 3309 */ 3310bool kvm_vcpu_block(struct kvm_vcpu *vcpu) 3311{ 3312 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); 3313 bool waited = false; 3314 3315 vcpu->stat.generic.blocking = 1; 3316 3317 preempt_disable(); 3318 kvm_arch_vcpu_blocking(vcpu); 3319 prepare_to_rcuwait(wait); 3320 preempt_enable(); 3321 3322 for (;;) { 3323 set_current_state(TASK_INTERRUPTIBLE); 3324 3325 if (kvm_vcpu_check_block(vcpu) < 0) 3326 break; 3327 3328 waited = true; 3329 schedule(); 3330 } 3331 3332 preempt_disable(); 3333 finish_rcuwait(wait); 3334 kvm_arch_vcpu_unblocking(vcpu); 3335 preempt_enable(); 3336 3337 vcpu->stat.generic.blocking = 0; 3338 3339 return waited; 3340} 3341 3342static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start, 3343 ktime_t end, bool success) 3344{ 3345 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic; 3346 u64 poll_ns = ktime_to_ns(ktime_sub(end, start)); 3347 3348 ++vcpu->stat.generic.halt_attempted_poll; 3349 3350 if (success) { 3351 ++vcpu->stat.generic.halt_successful_poll; 3352 3353 if (!vcpu_valid_wakeup(vcpu)) 3354 ++vcpu->stat.generic.halt_poll_invalid; 3355 3356 stats->halt_poll_success_ns += poll_ns; 3357 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns); 3358 } else { 3359 stats->halt_poll_fail_ns += poll_ns; 3360 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns); 3361 } 3362} 3363 3364/* 3365 * Emulate a vCPU halt condition, e.g. 
HLT on x86, WFI on arm, etc... If halt 3366 * polling is enabled, busy wait for a short time before blocking to avoid the 3367 * expensive block+unblock sequence if a wake event arrives soon after the vCPU 3368 * is halted. 3369 */ 3370void kvm_vcpu_halt(struct kvm_vcpu *vcpu) 3371{ 3372 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu); 3373 bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns; 3374 ktime_t start, cur, poll_end; 3375 bool waited = false; 3376 u64 halt_ns; 3377 3378 start = cur = poll_end = ktime_get(); 3379 if (do_halt_poll) { 3380 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); 3381 3382 do { 3383 /* 3384 * This sets KVM_REQ_UNHALT if an interrupt 3385 * arrives. 3386 */ 3387 if (kvm_vcpu_check_block(vcpu) < 0) 3388 goto out; 3389 cpu_relax(); 3390 poll_end = cur = ktime_get(); 3391 } while (kvm_vcpu_can_poll(cur, stop)); 3392 } 3393 3394 waited = kvm_vcpu_block(vcpu); 3395 3396 cur = ktime_get(); 3397 if (waited) { 3398 vcpu->stat.generic.halt_wait_ns += 3399 ktime_to_ns(cur) - ktime_to_ns(poll_end); 3400 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, 3401 ktime_to_ns(cur) - ktime_to_ns(poll_end)); 3402 } 3403out: 3404 /* The total time the vCPU was "halted", including polling time. */ 3405 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start); 3406 3407 /* 3408 * Note, halt-polling is considered successful so long as the vCPU was 3409 * never actually scheduled out, i.e. even if the wake event arrived 3410 * after of the halt-polling loop itself, but before the full wait. 3411 */ 3412 if (do_halt_poll) 3413 update_halt_poll_stats(vcpu, start, poll_end, !waited); 3414 3415 if (halt_poll_allowed) { 3416 if (!vcpu_valid_wakeup(vcpu)) { 3417 shrink_halt_poll_ns(vcpu); 3418 } else if (vcpu->kvm->max_halt_poll_ns) { 3419 if (halt_ns <= vcpu->halt_poll_ns) 3420 ; 3421 /* we had a long block, shrink polling */ 3422 else if (vcpu->halt_poll_ns && 3423 halt_ns > vcpu->kvm->max_halt_poll_ns) 3424 shrink_halt_poll_ns(vcpu); 3425 /* we had a short halt and our poll time is too small */ 3426 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && 3427 halt_ns < vcpu->kvm->max_halt_poll_ns) 3428 grow_halt_poll_ns(vcpu); 3429 } else { 3430 vcpu->halt_poll_ns = 0; 3431 } 3432 } 3433 3434 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); 3435} 3436EXPORT_SYMBOL_GPL(kvm_vcpu_halt); 3437 3438bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 3439{ 3440 if (__kvm_vcpu_wake_up(vcpu)) { 3441 WRITE_ONCE(vcpu->ready, true); 3442 ++vcpu->stat.generic.halt_wakeup; 3443 return true; 3444 } 3445 3446 return false; 3447} 3448EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 3449 3450#ifndef CONFIG_S390 3451/* 3452 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 3453 */ 3454void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 3455{ 3456 int me, cpu; 3457 3458 if (kvm_vcpu_wake_up(vcpu)) 3459 return; 3460 3461 me = get_cpu(); 3462 /* 3463 * The only state change done outside the vcpu mutex is IN_GUEST_MODE 3464 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should 3465 * kick" check does not need atomic operations if kvm_vcpu_kick is used 3466 * within the vCPU thread itself. 3467 */ 3468 if (vcpu == __this_cpu_read(kvm_running_vcpu)) { 3469 if (vcpu->mode == IN_GUEST_MODE) 3470 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE); 3471 goto out; 3472 } 3473 3474 /* 3475 * Note, the vCPU could get migrated to a different pCPU at any point 3476 * after kvm_arch_vcpu_should_kick(), which could result in sending an 3477 * IPI to the previous pCPU. 
But, that's ok because the purpose of the 3478 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the 3479 * vCPU also requires it to leave IN_GUEST_MODE. 3480 */ 3481 if (kvm_arch_vcpu_should_kick(vcpu)) { 3482 cpu = READ_ONCE(vcpu->cpu); 3483 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 3484 smp_send_reschedule(cpu); 3485 } 3486out: 3487 put_cpu(); 3488} 3489EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 3490#endif /* !CONFIG_S390 */ 3491 3492int kvm_vcpu_yield_to(struct kvm_vcpu *target) 3493{ 3494 struct pid *pid; 3495 struct task_struct *task = NULL; 3496 int ret = 0; 3497 3498 rcu_read_lock(); 3499 pid = rcu_dereference(target->pid); 3500 if (pid) 3501 task = get_pid_task(pid, PIDTYPE_PID); 3502 rcu_read_unlock(); 3503 if (!task) 3504 return ret; 3505 ret = yield_to(task, 1); 3506 put_task_struct(task); 3507 3508 return ret; 3509} 3510EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 3511 3512/* 3513 * Helper that checks whether a VCPU is eligible for directed yield. 3514 * Most eligible candidate to yield is decided by following heuristics: 3515 * 3516 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 3517 * (preempted lock holder), indicated by @in_spin_loop. 3518 * Set at the beginning and cleared at the end of interception/PLE handler. 3519 * 3520 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 3521 * chance last time (mostly it has become eligible now since we have probably 3522 * yielded to lockholder in last iteration. This is done by toggling 3523 * @dy_eligible each time a VCPU checked for eligibility.) 3524 * 3525 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 3526 * to preempted lock-holder could result in wrong VCPU selection and CPU 3527 * burning. Giving priority for a potential lock-holder increases lock 3528 * progress. 3529 * 3530 * Since algorithm is based on heuristics, accessing another VCPU data without 3531 * locking does not harm. It may result in trying to yield to same VCPU, fail 3532 * and continue with next VCPU and so on. 3533 */ 3534static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 3535{ 3536#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 3537 bool eligible; 3538 3539 eligible = !vcpu->spin_loop.in_spin_loop || 3540 vcpu->spin_loop.dy_eligible; 3541 3542 if (vcpu->spin_loop.in_spin_loop) 3543 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 3544 3545 return eligible; 3546#else 3547 return true; 3548#endif 3549} 3550 3551/* 3552 * Unlike kvm_arch_vcpu_runnable, this function is called outside 3553 * a vcpu_load/vcpu_put pair. However, for most architectures 3554 * kvm_arch_vcpu_runnable does not require vcpu_load. 
3555 */ 3556bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 3557{ 3558 return kvm_arch_vcpu_runnable(vcpu); 3559} 3560 3561static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 3562{ 3563 if (kvm_arch_dy_runnable(vcpu)) 3564 return true; 3565 3566#ifdef CONFIG_KVM_ASYNC_PF 3567 if (!list_empty_careful(&vcpu->async_pf.done)) 3568 return true; 3569#endif 3570 3571 return false; 3572} 3573 3574bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 3575{ 3576 return false; 3577} 3578 3579void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 3580{ 3581 struct kvm *kvm = me->kvm; 3582 struct kvm_vcpu *vcpu; 3583 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 3584 unsigned long i; 3585 int yielded = 0; 3586 int try = 3; 3587 int pass; 3588 3589 kvm_vcpu_set_in_spin_loop(me, true); 3590 /* 3591 * We boost the priority of a VCPU that is runnable but not 3592 * currently running, because it got preempted by something 3593 * else and called schedule in __vcpu_run. Hopefully that 3594 * VCPU is holding the lock that we need and will release it. 3595 * We approximate round-robin by starting at the last boosted VCPU. 3596 */ 3597 for (pass = 0; pass < 2 && !yielded && try; pass++) { 3598 kvm_for_each_vcpu(i, vcpu, kvm) { 3599 if (!pass && i <= last_boosted_vcpu) { 3600 i = last_boosted_vcpu; 3601 continue; 3602 } else if (pass && i > last_boosted_vcpu) 3603 break; 3604 if (!READ_ONCE(vcpu->ready)) 3605 continue; 3606 if (vcpu == me) 3607 continue; 3608 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) 3609 continue; 3610 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 3611 !kvm_arch_dy_has_pending_interrupt(vcpu) && 3612 !kvm_arch_vcpu_in_kernel(vcpu)) 3613 continue; 3614 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 3615 continue; 3616 3617 yielded = kvm_vcpu_yield_to(vcpu); 3618 if (yielded > 0) { 3619 kvm->last_boosted_vcpu = i; 3620 break; 3621 } else if (yielded < 0) { 3622 try--; 3623 if (!try) 3624 break; 3625 } 3626 } 3627 } 3628 kvm_vcpu_set_in_spin_loop(me, false); 3629 3630 /* Ensure vcpu is not eligible during next spinloop */ 3631 kvm_vcpu_set_dy_eligible(me, false); 3632} 3633EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 3634 3635static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) 3636{ 3637#ifdef CONFIG_HAVE_KVM_DIRTY_RING 3638 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && 3639 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + 3640 kvm->dirty_ring_size / PAGE_SIZE); 3641#else 3642 return false; 3643#endif 3644} 3645 3646static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 3647{ 3648 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 3649 struct page *page; 3650 3651 if (vmf->pgoff == 0) 3652 page = virt_to_page(vcpu->run); 3653#ifdef CONFIG_X86 3654 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 3655 page = virt_to_page(vcpu->arch.pio_data); 3656#endif 3657#ifdef CONFIG_KVM_MMIO 3658 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 3659 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 3660#endif 3661 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) 3662 page = kvm_dirty_ring_get_page( 3663 &vcpu->dirty_ring, 3664 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET); 3665 else 3666 return kvm_arch_vcpu_fault(vcpu, vmf); 3667 get_page(page); 3668 vmf->page = page; 3669 return 0; 3670} 3671 3672static const struct vm_operations_struct kvm_vcpu_vm_ops = { 3673 .fault = kvm_vcpu_fault, 3674}; 3675 3676static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 3677{ 3678 struct kvm_vcpu *vcpu = 
file->private_data; 3679 unsigned long pages = vma_pages(vma); 3680 3681 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || 3682 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && 3683 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) 3684 return -EINVAL; 3685 3686 vma->vm_ops = &kvm_vcpu_vm_ops; 3687 return 0; 3688} 3689 3690static int kvm_vcpu_release(struct inode *inode, struct file *filp) 3691{ 3692 struct kvm_vcpu *vcpu = filp->private_data; 3693 3694 kvm_put_kvm(vcpu->kvm); 3695 return 0; 3696} 3697 3698static const struct file_operations kvm_vcpu_fops = { 3699 .release = kvm_vcpu_release, 3700 .unlocked_ioctl = kvm_vcpu_ioctl, 3701 .mmap = kvm_vcpu_mmap, 3702 .llseek = noop_llseek, 3703 KVM_COMPAT(kvm_vcpu_compat_ioctl), 3704}; 3705 3706/* 3707 * Allocates an inode for the vcpu. 3708 */ 3709static int create_vcpu_fd(struct kvm_vcpu *vcpu) 3710{ 3711 char name[8 + 1 + ITOA_MAX_LEN + 1]; 3712 3713 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 3714 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 3715} 3716 3717static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 3718{ 3719#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 3720 struct dentry *debugfs_dentry; 3721 char dir_name[ITOA_MAX_LEN * 2]; 3722 3723 if (!debugfs_initialized()) 3724 return; 3725 3726 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 3727 debugfs_dentry = debugfs_create_dir(dir_name, 3728 vcpu->kvm->debugfs_dentry); 3729 3730 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); 3731#endif 3732} 3733 3734/* 3735 * Creates some virtual cpus. Good luck creating more than one. 3736 */ 3737static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 3738{ 3739 int r; 3740 struct kvm_vcpu *vcpu; 3741 struct page *page; 3742 3743 if (id >= KVM_MAX_VCPU_IDS) 3744 return -EINVAL; 3745 3746 mutex_lock(&kvm->lock); 3747 if (kvm->created_vcpus >= kvm->max_vcpus) { 3748 mutex_unlock(&kvm->lock); 3749 return -EINVAL; 3750 } 3751 3752 kvm->created_vcpus++; 3753 mutex_unlock(&kvm->lock); 3754 3755 r = kvm_arch_vcpu_precreate(kvm, id); 3756 if (r) 3757 goto vcpu_decrement; 3758 3759 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 3760 if (!vcpu) { 3761 r = -ENOMEM; 3762 goto vcpu_decrement; 3763 } 3764 3765 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 3766 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 3767 if (!page) { 3768 r = -ENOMEM; 3769 goto vcpu_free; 3770 } 3771 vcpu->run = page_address(page); 3772 3773 kvm_vcpu_init(vcpu, kvm, id); 3774 3775 r = kvm_arch_vcpu_create(vcpu); 3776 if (r) 3777 goto vcpu_free_run_page; 3778 3779 if (kvm->dirty_ring_size) { 3780 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, 3781 id, kvm->dirty_ring_size); 3782 if (r) 3783 goto arch_vcpu_destroy; 3784 } 3785 3786 mutex_lock(&kvm->lock); 3787 if (kvm_get_vcpu_by_id(kvm, id)) { 3788 r = -EEXIST; 3789 goto unlock_vcpu_destroy; 3790 } 3791 3792 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 3793 r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); 3794 BUG_ON(r == -EBUSY); 3795 if (r) 3796 goto unlock_vcpu_destroy; 3797 3798 /* Fill the stats id string for the vcpu */ 3799 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", 3800 task_pid_nr(current), id); 3801 3802 /* Now it's all set up, let userspace reach it */ 3803 kvm_get_kvm(kvm); 3804 r = create_vcpu_fd(vcpu); 3805 if (r < 0) { 3806 xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); 3807 kvm_put_kvm_no_destroy(kvm); 3808 goto unlock_vcpu_destroy; 3809 } 3810 3811 /* 
3812 * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu 3813 * pointer before kvm->online_vcpu's incremented value. 3814 */ 3815 smp_wmb(); 3816 atomic_inc(&kvm->online_vcpus); 3817 3818 mutex_unlock(&kvm->lock); 3819 kvm_arch_vcpu_postcreate(vcpu); 3820 kvm_create_vcpu_debugfs(vcpu); 3821 return r; 3822 3823unlock_vcpu_destroy: 3824 mutex_unlock(&kvm->lock); 3825 kvm_dirty_ring_free(&vcpu->dirty_ring); 3826arch_vcpu_destroy: 3827 kvm_arch_vcpu_destroy(vcpu); 3828vcpu_free_run_page: 3829 free_page((unsigned long)vcpu->run); 3830vcpu_free: 3831 kmem_cache_free(kvm_vcpu_cache, vcpu); 3832vcpu_decrement: 3833 mutex_lock(&kvm->lock); 3834 kvm->created_vcpus--; 3835 mutex_unlock(&kvm->lock); 3836 return r; 3837} 3838 3839static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 3840{ 3841 if (sigset) { 3842 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3843 vcpu->sigset_active = 1; 3844 vcpu->sigset = *sigset; 3845 } else 3846 vcpu->sigset_active = 0; 3847 return 0; 3848} 3849 3850static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer, 3851 size_t size, loff_t *offset) 3852{ 3853 struct kvm_vcpu *vcpu = file->private_data; 3854 3855 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header, 3856 &kvm_vcpu_stats_desc[0], &vcpu->stat, 3857 sizeof(vcpu->stat), user_buffer, size, offset); 3858} 3859 3860static const struct file_operations kvm_vcpu_stats_fops = { 3861 .read = kvm_vcpu_stats_read, 3862 .llseek = noop_llseek, 3863}; 3864 3865static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) 3866{ 3867 int fd; 3868 struct file *file; 3869 char name[15 + ITOA_MAX_LEN + 1]; 3870 3871 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id); 3872 3873 fd = get_unused_fd_flags(O_CLOEXEC); 3874 if (fd < 0) 3875 return fd; 3876 3877 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); 3878 if (IS_ERR(file)) { 3879 put_unused_fd(fd); 3880 return PTR_ERR(file); 3881 } 3882 file->f_mode |= FMODE_PREAD; 3883 fd_install(fd, file); 3884 3885 return fd; 3886} 3887 3888static long kvm_vcpu_ioctl(struct file *filp, 3889 unsigned int ioctl, unsigned long arg) 3890{ 3891 struct kvm_vcpu *vcpu = filp->private_data; 3892 void __user *argp = (void __user *)arg; 3893 int r; 3894 struct kvm_fpu *fpu = NULL; 3895 struct kvm_sregs *kvm_sregs = NULL; 3896 3897 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) 3898 return -EIO; 3899 3900 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 3901 return -EINVAL; 3902 3903 /* 3904 * Some architectures have vcpu ioctls that are asynchronous to vcpu 3905 * execution; mutex_lock() would break them. 3906 */ 3907 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 3908 if (r != -ENOIOCTLCMD) 3909 return r; 3910 3911 if (mutex_lock_killable(&vcpu->mutex)) 3912 return -EINTR; 3913 switch (ioctl) { 3914 case KVM_RUN: { 3915 struct pid *oldpid; 3916 r = -EINVAL; 3917 if (arg) 3918 goto out; 3919 oldpid = rcu_access_pointer(vcpu->pid); 3920 if (unlikely(oldpid != task_pid(current))) { 3921 /* The thread running this VCPU changed. 
*/ 3922 struct pid *newpid; 3923 3924 r = kvm_arch_vcpu_run_pid_change(vcpu); 3925 if (r) 3926 break; 3927 3928 newpid = get_task_pid(current, PIDTYPE_PID); 3929 rcu_assign_pointer(vcpu->pid, newpid); 3930 if (oldpid) 3931 synchronize_rcu(); 3932 put_pid(oldpid); 3933 } 3934 r = kvm_arch_vcpu_ioctl_run(vcpu); 3935 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 3936 break; 3937 } 3938 case KVM_GET_REGS: { 3939 struct kvm_regs *kvm_regs; 3940 3941 r = -ENOMEM; 3942 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 3943 if (!kvm_regs) 3944 goto out; 3945 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 3946 if (r) 3947 goto out_free1; 3948 r = -EFAULT; 3949 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 3950 goto out_free1; 3951 r = 0; 3952out_free1: 3953 kfree(kvm_regs); 3954 break; 3955 } 3956 case KVM_SET_REGS: { 3957 struct kvm_regs *kvm_regs; 3958 3959 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 3960 if (IS_ERR(kvm_regs)) { 3961 r = PTR_ERR(kvm_regs); 3962 goto out; 3963 } 3964 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 3965 kfree(kvm_regs); 3966 break; 3967 } 3968 case KVM_GET_SREGS: { 3969 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 3970 GFP_KERNEL_ACCOUNT); 3971 r = -ENOMEM; 3972 if (!kvm_sregs) 3973 goto out; 3974 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 3975 if (r) 3976 goto out; 3977 r = -EFAULT; 3978 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 3979 goto out; 3980 r = 0; 3981 break; 3982 } 3983 case KVM_SET_SREGS: { 3984 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 3985 if (IS_ERR(kvm_sregs)) { 3986 r = PTR_ERR(kvm_sregs); 3987 kvm_sregs = NULL; 3988 goto out; 3989 } 3990 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 3991 break; 3992 } 3993 case KVM_GET_MP_STATE: { 3994 struct kvm_mp_state mp_state; 3995 3996 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 3997 if (r) 3998 goto out; 3999 r = -EFAULT; 4000 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 4001 goto out; 4002 r = 0; 4003 break; 4004 } 4005 case KVM_SET_MP_STATE: { 4006 struct kvm_mp_state mp_state; 4007 4008 r = -EFAULT; 4009 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 4010 goto out; 4011 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 4012 break; 4013 } 4014 case KVM_TRANSLATE: { 4015 struct kvm_translation tr; 4016 4017 r = -EFAULT; 4018 if (copy_from_user(&tr, argp, sizeof(tr))) 4019 goto out; 4020 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 4021 if (r) 4022 goto out; 4023 r = -EFAULT; 4024 if (copy_to_user(argp, &tr, sizeof(tr))) 4025 goto out; 4026 r = 0; 4027 break; 4028 } 4029 case KVM_SET_GUEST_DEBUG: { 4030 struct kvm_guest_debug dbg; 4031 4032 r = -EFAULT; 4033 if (copy_from_user(&dbg, argp, sizeof(dbg))) 4034 goto out; 4035 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 4036 break; 4037 } 4038 case KVM_SET_SIGNAL_MASK: { 4039 struct kvm_signal_mask __user *sigmask_arg = argp; 4040 struct kvm_signal_mask kvm_sigmask; 4041 sigset_t sigset, *p; 4042 4043 p = NULL; 4044 if (argp) { 4045 r = -EFAULT; 4046 if (copy_from_user(&kvm_sigmask, argp, 4047 sizeof(kvm_sigmask))) 4048 goto out; 4049 r = -EINVAL; 4050 if (kvm_sigmask.len != sizeof(sigset)) 4051 goto out; 4052 r = -EFAULT; 4053 if (copy_from_user(&sigset, sigmask_arg->sigset, 4054 sizeof(sigset))) 4055 goto out; 4056 p = &sigset; 4057 } 4058 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 4059 break; 4060 } 4061 case KVM_GET_FPU: { 4062 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 4063 r = -ENOMEM; 4064 if (!fpu) 4065 goto out; 
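		/*
		 * Have the architecture fill the freshly zeroed kvm_fpu and
		 * copy it out to userspace; the buffer is freed on the common
		 * exit path below.
		 */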
4066 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 4067 if (r) 4068 goto out; 4069 r = -EFAULT; 4070 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 4071 goto out; 4072 r = 0; 4073 break; 4074 } 4075 case KVM_SET_FPU: { 4076 fpu = memdup_user(argp, sizeof(*fpu)); 4077 if (IS_ERR(fpu)) { 4078 r = PTR_ERR(fpu); 4079 fpu = NULL; 4080 goto out; 4081 } 4082 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 4083 break; 4084 } 4085 case KVM_GET_STATS_FD: { 4086 r = kvm_vcpu_ioctl_get_stats_fd(vcpu); 4087 break; 4088 } 4089 default: 4090 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 4091 } 4092out: 4093 mutex_unlock(&vcpu->mutex); 4094 kfree(fpu); 4095 kfree(kvm_sregs); 4096 return r; 4097} 4098 4099#ifdef CONFIG_KVM_COMPAT 4100static long kvm_vcpu_compat_ioctl(struct file *filp, 4101 unsigned int ioctl, unsigned long arg) 4102{ 4103 struct kvm_vcpu *vcpu = filp->private_data; 4104 void __user *argp = compat_ptr(arg); 4105 int r; 4106 4107 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) 4108 return -EIO; 4109 4110 switch (ioctl) { 4111 case KVM_SET_SIGNAL_MASK: { 4112 struct kvm_signal_mask __user *sigmask_arg = argp; 4113 struct kvm_signal_mask kvm_sigmask; 4114 sigset_t sigset; 4115 4116 if (argp) { 4117 r = -EFAULT; 4118 if (copy_from_user(&kvm_sigmask, argp, 4119 sizeof(kvm_sigmask))) 4120 goto out; 4121 r = -EINVAL; 4122 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 4123 goto out; 4124 r = -EFAULT; 4125 if (get_compat_sigset(&sigset, 4126 (compat_sigset_t __user *)sigmask_arg->sigset)) 4127 goto out; 4128 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 4129 } else 4130 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 4131 break; 4132 } 4133 default: 4134 r = kvm_vcpu_ioctl(filp, ioctl, arg); 4135 } 4136 4137out: 4138 return r; 4139} 4140#endif 4141 4142static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 4143{ 4144 struct kvm_device *dev = filp->private_data; 4145 4146 if (dev->ops->mmap) 4147 return dev->ops->mmap(dev, vma); 4148 4149 return -ENODEV; 4150} 4151 4152static int kvm_device_ioctl_attr(struct kvm_device *dev, 4153 int (*accessor)(struct kvm_device *dev, 4154 struct kvm_device_attr *attr), 4155 unsigned long arg) 4156{ 4157 struct kvm_device_attr attr; 4158 4159 if (!accessor) 4160 return -EPERM; 4161 4162 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 4163 return -EFAULT; 4164 4165 return accessor(dev, &attr); 4166} 4167 4168static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 4169 unsigned long arg) 4170{ 4171 struct kvm_device *dev = filp->private_data; 4172 4173 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead) 4174 return -EIO; 4175 4176 switch (ioctl) { 4177 case KVM_SET_DEVICE_ATTR: 4178 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 4179 case KVM_GET_DEVICE_ATTR: 4180 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 4181 case KVM_HAS_DEVICE_ATTR: 4182 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 4183 default: 4184 if (dev->ops->ioctl) 4185 return dev->ops->ioctl(dev, ioctl, arg); 4186 4187 return -ENOTTY; 4188 } 4189} 4190 4191static int kvm_device_release(struct inode *inode, struct file *filp) 4192{ 4193 struct kvm_device *dev = filp->private_data; 4194 struct kvm *kvm = dev->kvm; 4195 4196 if (dev->ops->release) { 4197 mutex_lock(&kvm->lock); 4198 list_del(&dev->vm_node); 4199 dev->ops->release(dev); 4200 mutex_unlock(&kvm->lock); 4201 } 4202 4203 kvm_put_kvm(kvm); 4204 return 0; 4205} 4206 4207static const struct file_operations kvm_device_fops = { 4208 .unlocked_ioctl = 
kvm_device_ioctl, 4209 .release = kvm_device_release, 4210 KVM_COMPAT(kvm_device_ioctl), 4211 .mmap = kvm_device_mmap, 4212}; 4213 4214struct kvm_device *kvm_device_from_filp(struct file *filp) 4215{ 4216 if (filp->f_op != &kvm_device_fops) 4217 return NULL; 4218 4219 return filp->private_data; 4220} 4221 4222static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 4223#ifdef CONFIG_KVM_MPIC 4224 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 4225 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 4226#endif 4227}; 4228 4229int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 4230{ 4231 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 4232 return -ENOSPC; 4233 4234 if (kvm_device_ops_table[type] != NULL) 4235 return -EEXIST; 4236 4237 kvm_device_ops_table[type] = ops; 4238 return 0; 4239} 4240 4241void kvm_unregister_device_ops(u32 type) 4242{ 4243 if (kvm_device_ops_table[type] != NULL) 4244 kvm_device_ops_table[type] = NULL; 4245} 4246 4247static int kvm_ioctl_create_device(struct kvm *kvm, 4248 struct kvm_create_device *cd) 4249{ 4250 const struct kvm_device_ops *ops = NULL; 4251 struct kvm_device *dev; 4252 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 4253 int type; 4254 int ret; 4255 4256 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 4257 return -ENODEV; 4258 4259 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 4260 ops = kvm_device_ops_table[type]; 4261 if (ops == NULL) 4262 return -ENODEV; 4263 4264 if (test) 4265 return 0; 4266 4267 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 4268 if (!dev) 4269 return -ENOMEM; 4270 4271 dev->ops = ops; 4272 dev->kvm = kvm; 4273 4274 mutex_lock(&kvm->lock); 4275 ret = ops->create(dev, type); 4276 if (ret < 0) { 4277 mutex_unlock(&kvm->lock); 4278 kfree(dev); 4279 return ret; 4280 } 4281 list_add(&dev->vm_node, &kvm->devices); 4282 mutex_unlock(&kvm->lock); 4283 4284 if (ops->init) 4285 ops->init(dev); 4286 4287 kvm_get_kvm(kvm); 4288 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 4289 if (ret < 0) { 4290 kvm_put_kvm_no_destroy(kvm); 4291 mutex_lock(&kvm->lock); 4292 list_del(&dev->vm_node); 4293 if (ops->release) 4294 ops->release(dev); 4295 mutex_unlock(&kvm->lock); 4296 if (ops->destroy) 4297 ops->destroy(dev); 4298 return ret; 4299 } 4300 4301 cd->fd = ret; 4302 return 0; 4303} 4304 4305static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 4306{ 4307 switch (arg) { 4308 case KVM_CAP_USER_MEMORY: 4309 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 4310 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 4311 case KVM_CAP_INTERNAL_ERROR_DATA: 4312#ifdef CONFIG_HAVE_KVM_MSI 4313 case KVM_CAP_SIGNAL_MSI: 4314#endif 4315#ifdef CONFIG_HAVE_KVM_IRQFD 4316 case KVM_CAP_IRQFD: 4317 case KVM_CAP_IRQFD_RESAMPLE: 4318#endif 4319 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 4320 case KVM_CAP_CHECK_EXTENSION_VM: 4321 case KVM_CAP_ENABLE_CAP_VM: 4322 case KVM_CAP_HALT_POLL: 4323 return 1; 4324#ifdef CONFIG_KVM_MMIO 4325 case KVM_CAP_COALESCED_MMIO: 4326 return KVM_COALESCED_MMIO_PAGE_OFFSET; 4327 case KVM_CAP_COALESCED_PIO: 4328 return 1; 4329#endif 4330#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 4331 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 4332 return KVM_DIRTY_LOG_MANUAL_CAPS; 4333#endif 4334#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 4335 case KVM_CAP_IRQ_ROUTING: 4336 return KVM_MAX_IRQ_ROUTES; 4337#endif 4338#if KVM_ADDRESS_SPACE_NUM > 1 4339 case KVM_CAP_MULTI_ADDRESS_SPACE: 4340 return KVM_ADDRESS_SPACE_NUM; 4341#endif 4342 case KVM_CAP_NR_MEMSLOTS: 4343 return 
KVM_USER_MEM_SLOTS; 4344 case KVM_CAP_DIRTY_LOG_RING: 4345#ifdef CONFIG_HAVE_KVM_DIRTY_RING 4346 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); 4347#else 4348 return 0; 4349#endif 4350 case KVM_CAP_BINARY_STATS_FD: 4351 case KVM_CAP_SYSTEM_EVENT_DATA: 4352 return 1; 4353 default: 4354 break; 4355 } 4356 return kvm_vm_ioctl_check_extension(kvm, arg); 4357} 4358 4359static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) 4360{ 4361 int r; 4362 4363 if (!KVM_DIRTY_LOG_PAGE_OFFSET) 4364 return -EINVAL; 4365 4366 /* the size should be power of 2 */ 4367 if (!size || (size & (size - 1))) 4368 return -EINVAL; 4369 4370 /* Should be bigger to keep the reserved entries, or a page */ 4371 if (size < kvm_dirty_ring_get_rsvd_entries() * 4372 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) 4373 return -EINVAL; 4374 4375 if (size > KVM_DIRTY_RING_MAX_ENTRIES * 4376 sizeof(struct kvm_dirty_gfn)) 4377 return -E2BIG; 4378 4379 /* We only allow it to set once */ 4380 if (kvm->dirty_ring_size) 4381 return -EINVAL; 4382 4383 mutex_lock(&kvm->lock); 4384 4385 if (kvm->created_vcpus) { 4386 /* We don't allow to change this value after vcpu created */ 4387 r = -EINVAL; 4388 } else { 4389 kvm->dirty_ring_size = size; 4390 r = 0; 4391 } 4392 4393 mutex_unlock(&kvm->lock); 4394 return r; 4395} 4396 4397static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) 4398{ 4399 unsigned long i; 4400 struct kvm_vcpu *vcpu; 4401 int cleared = 0; 4402 4403 if (!kvm->dirty_ring_size) 4404 return -EINVAL; 4405 4406 mutex_lock(&kvm->slots_lock); 4407 4408 kvm_for_each_vcpu(i, vcpu, kvm) 4409 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring); 4410 4411 mutex_unlock(&kvm->slots_lock); 4412 4413 if (cleared) 4414 kvm_flush_remote_tlbs(kvm); 4415 4416 return cleared; 4417} 4418 4419int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 4420 struct kvm_enable_cap *cap) 4421{ 4422 return -EINVAL; 4423} 4424 4425static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 4426 struct kvm_enable_cap *cap) 4427{ 4428 switch (cap->cap) { 4429#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 4430 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { 4431 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; 4432 4433 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) 4434 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; 4435 4436 if (cap->flags || (cap->args[0] & ~allowed_options)) 4437 return -EINVAL; 4438 kvm->manual_dirty_log_protect = cap->args[0]; 4439 return 0; 4440 } 4441#endif 4442 case KVM_CAP_HALT_POLL: { 4443 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) 4444 return -EINVAL; 4445 4446 kvm->max_halt_poll_ns = cap->args[0]; 4447 return 0; 4448 } 4449 case KVM_CAP_DIRTY_LOG_RING: 4450 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); 4451 default: 4452 return kvm_vm_ioctl_enable_cap(kvm, cap); 4453 } 4454} 4455 4456static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer, 4457 size_t size, loff_t *offset) 4458{ 4459 struct kvm *kvm = file->private_data; 4460 4461 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header, 4462 &kvm_vm_stats_desc[0], &kvm->stat, 4463 sizeof(kvm->stat), user_buffer, size, offset); 4464} 4465 4466static const struct file_operations kvm_vm_stats_fops = { 4467 .read = kvm_vm_stats_read, 4468 .llseek = noop_llseek, 4469}; 4470 4471static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) 4472{ 4473 int fd; 4474 struct file *file; 4475 4476 fd = get_unused_fd_flags(O_CLOEXEC); 4477 if (fd < 0) 4478 return fd; 
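	/*
	 * Back the reserved fd with a read-only anonymous file whose
	 * ->read() streams the binary VM stats; drop the fd reservation
	 * again if the file cannot be created.
	 */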
4479 4480 file = anon_inode_getfile("kvm-vm-stats", 4481 &kvm_vm_stats_fops, kvm, O_RDONLY); 4482 if (IS_ERR(file)) { 4483 put_unused_fd(fd); 4484 return PTR_ERR(file); 4485 } 4486 file->f_mode |= FMODE_PREAD; 4487 fd_install(fd, file); 4488 4489 return fd; 4490} 4491 4492static long kvm_vm_ioctl(struct file *filp, 4493 unsigned int ioctl, unsigned long arg) 4494{ 4495 struct kvm *kvm = filp->private_data; 4496 void __user *argp = (void __user *)arg; 4497 int r; 4498 4499 if ((ioctl != KVM_MEMORY_ENCRYPT_OP && kvm->mm != current->mm) || kvm->vm_dead) 4500 return -EIO; 4501 switch (ioctl) { 4502 case KVM_CREATE_VCPU: 4503 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 4504 break; 4505 case KVM_ENABLE_CAP: { 4506 struct kvm_enable_cap cap; 4507 4508 r = -EFAULT; 4509 if (copy_from_user(&cap, argp, sizeof(cap))) 4510 goto out; 4511 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 4512 break; 4513 } 4514 case KVM_SET_USER_MEMORY_REGION: { 4515 struct kvm_userspace_memory_region kvm_userspace_mem; 4516 4517 r = -EFAULT; 4518 if (copy_from_user(&kvm_userspace_mem, argp, 4519 sizeof(kvm_userspace_mem))) 4520 goto out; 4521 4522 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 4523 break; 4524 } 4525 case KVM_GET_DIRTY_LOG: { 4526 struct kvm_dirty_log log; 4527 4528 r = -EFAULT; 4529 if (copy_from_user(&log, argp, sizeof(log))) 4530 goto out; 4531 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 4532 break; 4533 } 4534#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 4535 case KVM_CLEAR_DIRTY_LOG: { 4536 struct kvm_clear_dirty_log log; 4537 4538 r = -EFAULT; 4539 if (copy_from_user(&log, argp, sizeof(log))) 4540 goto out; 4541 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 4542 break; 4543 } 4544#endif 4545#ifdef CONFIG_KVM_MMIO 4546 case KVM_REGISTER_COALESCED_MMIO: { 4547 struct kvm_coalesced_mmio_zone zone; 4548 4549 r = -EFAULT; 4550 if (copy_from_user(&zone, argp, sizeof(zone))) 4551 goto out; 4552 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 4553 break; 4554 } 4555 case KVM_UNREGISTER_COALESCED_MMIO: { 4556 struct kvm_coalesced_mmio_zone zone; 4557 4558 r = -EFAULT; 4559 if (copy_from_user(&zone, argp, sizeof(zone))) 4560 goto out; 4561 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 4562 break; 4563 } 4564#endif 4565 case KVM_IRQFD: { 4566 struct kvm_irqfd data; 4567 4568 r = -EFAULT; 4569 if (copy_from_user(&data, argp, sizeof(data))) 4570 goto out; 4571 r = kvm_irqfd(kvm, &data); 4572 break; 4573 } 4574 case KVM_IOEVENTFD: { 4575 struct kvm_ioeventfd data; 4576 4577 r = -EFAULT; 4578 if (copy_from_user(&data, argp, sizeof(data))) 4579 goto out; 4580 r = kvm_ioeventfd(kvm, &data); 4581 break; 4582 } 4583#ifdef CONFIG_HAVE_KVM_MSI 4584 case KVM_SIGNAL_MSI: { 4585 struct kvm_msi msi; 4586 4587 r = -EFAULT; 4588 if (copy_from_user(&msi, argp, sizeof(msi))) 4589 goto out; 4590 r = kvm_send_userspace_msi(kvm, &msi); 4591 break; 4592 } 4593#endif 4594#ifdef __KVM_HAVE_IRQ_LINE 4595 case KVM_IRQ_LINE_STATUS: 4596 case KVM_IRQ_LINE: { 4597 struct kvm_irq_level irq_event; 4598 4599 r = -EFAULT; 4600 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 4601 goto out; 4602 4603 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 4604 ioctl == KVM_IRQ_LINE_STATUS); 4605 if (r) 4606 goto out; 4607 4608 r = -EFAULT; 4609 if (ioctl == KVM_IRQ_LINE_STATUS) { 4610 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 4611 goto out; 4612 } 4613 4614 r = 0; 4615 break; 4616 } 4617#endif 4618#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 4619 case KVM_SET_GSI_ROUTING: { 4620 struct kvm_irq_routing routing; 4621 
struct kvm_irq_routing __user *urouting; 4622 struct kvm_irq_routing_entry *entries = NULL; 4623 4624 r = -EFAULT; 4625 if (copy_from_user(&routing, argp, sizeof(routing))) 4626 goto out; 4627 r = -EINVAL; 4628 if (!kvm_arch_can_set_irq_routing(kvm)) 4629 goto out; 4630 if (routing.nr > KVM_MAX_IRQ_ROUTES) 4631 goto out; 4632 if (routing.flags) 4633 goto out; 4634 if (routing.nr) { 4635 urouting = argp; 4636 entries = vmemdup_user(urouting->entries, 4637 array_size(sizeof(*entries), 4638 routing.nr)); 4639 if (IS_ERR(entries)) { 4640 r = PTR_ERR(entries); 4641 goto out; 4642 } 4643 } 4644 r = kvm_set_irq_routing(kvm, entries, routing.nr, 4645 routing.flags); 4646 kvfree(entries); 4647 break; 4648 } 4649#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 4650 case KVM_CREATE_DEVICE: { 4651 struct kvm_create_device cd; 4652 4653 r = -EFAULT; 4654 if (copy_from_user(&cd, argp, sizeof(cd))) 4655 goto out; 4656 4657 r = kvm_ioctl_create_device(kvm, &cd); 4658 if (r) 4659 goto out; 4660 4661 r = -EFAULT; 4662 if (copy_to_user(argp, &cd, sizeof(cd))) 4663 goto out; 4664 4665 r = 0; 4666 break; 4667 } 4668 case KVM_CHECK_EXTENSION: 4669 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 4670 break; 4671 case KVM_RESET_DIRTY_RINGS: 4672 r = kvm_vm_ioctl_reset_dirty_pages(kvm); 4673 break; 4674 case KVM_GET_STATS_FD: 4675 r = kvm_vm_ioctl_get_stats_fd(kvm); 4676 break; 4677 default: 4678 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 4679 } 4680out: 4681 return r; 4682} 4683 4684#ifdef CONFIG_KVM_COMPAT 4685struct compat_kvm_dirty_log { 4686 __u32 slot; 4687 __u32 padding1; 4688 union { 4689 compat_uptr_t dirty_bitmap; /* one bit per page */ 4690 __u64 padding2; 4691 }; 4692}; 4693 4694struct compat_kvm_clear_dirty_log { 4695 __u32 slot; 4696 __u32 num_pages; 4697 __u64 first_page; 4698 union { 4699 compat_uptr_t dirty_bitmap; /* one bit per page */ 4700 __u64 padding2; 4701 }; 4702}; 4703 4704static long kvm_vm_compat_ioctl(struct file *filp, 4705 unsigned int ioctl, unsigned long arg) 4706{ 4707 struct kvm *kvm = filp->private_data; 4708 int r; 4709 4710 if (kvm->mm != current->mm || kvm->vm_dead) 4711 return -EIO; 4712 switch (ioctl) { 4713#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 4714 case KVM_CLEAR_DIRTY_LOG: { 4715 struct compat_kvm_clear_dirty_log compat_log; 4716 struct kvm_clear_dirty_log log; 4717 4718 if (copy_from_user(&compat_log, (void __user *)arg, 4719 sizeof(compat_log))) 4720 return -EFAULT; 4721 log.slot = compat_log.slot; 4722 log.num_pages = compat_log.num_pages; 4723 log.first_page = compat_log.first_page; 4724 log.padding2 = compat_log.padding2; 4725 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 4726 4727 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 4728 break; 4729 } 4730#endif 4731 case KVM_GET_DIRTY_LOG: { 4732 struct compat_kvm_dirty_log compat_log; 4733 struct kvm_dirty_log log; 4734 4735 if (copy_from_user(&compat_log, (void __user *)arg, 4736 sizeof(compat_log))) 4737 return -EFAULT; 4738 log.slot = compat_log.slot; 4739 log.padding1 = compat_log.padding1; 4740 log.padding2 = compat_log.padding2; 4741 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 4742 4743 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 4744 break; 4745 } 4746 default: 4747 r = kvm_vm_ioctl(filp, ioctl, arg); 4748 } 4749 return r; 4750} 4751#endif 4752 4753static const struct file_operations kvm_vm_fops = { 4754 .release = kvm_vm_release, 4755 .unlocked_ioctl = kvm_vm_ioctl, 4756 .llseek = noop_llseek, 4757 KVM_COMPAT(kvm_vm_compat_ioctl), 4758}; 4759 4760bool file_is_kvm(struct file *file) 4761{ 
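	/* A KVM VM fd is identified purely by its file_operations. */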
4762 return file && file->f_op == &kvm_vm_fops; 4763} 4764EXPORT_SYMBOL_GPL(file_is_kvm); 4765 4766static int kvm_dev_ioctl_create_vm(unsigned long type) 4767{ 4768 int r; 4769 struct kvm *kvm; 4770 struct file *file; 4771 4772 kvm = kvm_create_vm(type); 4773 if (IS_ERR(kvm)) 4774 return PTR_ERR(kvm); 4775#ifdef CONFIG_KVM_MMIO 4776 r = kvm_coalesced_mmio_init(kvm); 4777 if (r < 0) 4778 goto put_kvm; 4779#endif 4780 r = get_unused_fd_flags(O_CLOEXEC); 4781 if (r < 0) 4782 goto put_kvm; 4783 4784 snprintf(kvm->stats_id, sizeof(kvm->stats_id), 4785 "kvm-%d", task_pid_nr(current)); 4786 4787 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 4788 if (IS_ERR(file)) { 4789 put_unused_fd(r); 4790 r = PTR_ERR(file); 4791 goto put_kvm; 4792 } 4793 4794 /* 4795 * Don't call kvm_put_kvm anymore at this point; file->f_op is 4796 * already set, with ->release() being kvm_vm_release(). In error 4797 * cases it will be called by the final fput(file) and will take 4798 * care of doing kvm_put_kvm(kvm). 4799 */ 4800 if (kvm_create_vm_debugfs(kvm, r) < 0) { 4801 put_unused_fd(r); 4802 fput(file); 4803 return -ENOMEM; 4804 } 4805 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 4806 4807 fd_install(r, file); 4808 4809 main_vm = kvm; 4810 4811 return r; 4812 4813put_kvm: 4814 kvm_put_kvm(kvm); 4815 return r; 4816} 4817 4818static long kvm_dev_ioctl(struct file *filp, 4819 unsigned int ioctl, unsigned long arg) 4820{ 4821 long r = -EINVAL; 4822 4823 switch (ioctl) { 4824 case KVM_GET_API_VERSION: 4825 if (arg) 4826 goto out; 4827 r = KVM_API_VERSION; 4828 break; 4829 case KVM_CREATE_VM: 4830 r = kvm_dev_ioctl_create_vm(arg); 4831 break; 4832 case KVM_CHECK_EXTENSION: 4833 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 4834 break; 4835 case KVM_GET_VCPU_MMAP_SIZE: 4836 if (arg) 4837 goto out; 4838 r = PAGE_SIZE; /* struct kvm_run */ 4839#ifdef CONFIG_X86 4840 r += PAGE_SIZE; /* pio data page */ 4841#endif 4842#ifdef CONFIG_KVM_MMIO 4843 r += PAGE_SIZE; /* coalesced mmio ring page */ 4844#endif 4845 break; 4846 case KVM_TRACE_ENABLE: 4847 case KVM_TRACE_PAUSE: 4848 case KVM_TRACE_DISABLE: 4849 r = -EOPNOTSUPP; 4850 break; 4851 default: 4852 return cpc_kvm_ioctl(filp, ioctl, arg); 4853 } 4854out: 4855 return r; 4856} 4857 4858static struct file_operations kvm_chardev_ops = { 4859 .unlocked_ioctl = kvm_dev_ioctl, 4860 .llseek = noop_llseek, 4861 KVM_COMPAT(kvm_dev_ioctl), 4862}; 4863 4864static struct miscdevice kvm_dev = { 4865 KVM_MINOR, 4866 "kvm", 4867 &kvm_chardev_ops, 4868}; 4869 4870static void hardware_enable_nolock(void *junk) 4871{ 4872 int cpu = raw_smp_processor_id(); 4873 int r; 4874 4875 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4876 return; 4877 4878 cpumask_set_cpu(cpu, cpus_hardware_enabled); 4879 4880 r = kvm_arch_hardware_enable(); 4881 4882 if (r) { 4883 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4884 atomic_inc(&hardware_enable_failed); 4885 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 4886 } 4887} 4888 4889static int kvm_starting_cpu(unsigned int cpu) 4890{ 4891 raw_spin_lock(&kvm_count_lock); 4892 if (kvm_usage_count) 4893 hardware_enable_nolock(NULL); 4894 raw_spin_unlock(&kvm_count_lock); 4895 return 0; 4896} 4897 4898static void hardware_disable_nolock(void *junk) 4899{ 4900 int cpu = raw_smp_processor_id(); 4901 4902 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4903 return; 4904 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4905 kvm_arch_hardware_disable(); 4906} 4907 4908static int kvm_dying_cpu(unsigned int cpu) 4909{ 4910 
raw_spin_lock(&kvm_count_lock); 4911 if (kvm_usage_count) 4912 hardware_disable_nolock(NULL); 4913 raw_spin_unlock(&kvm_count_lock); 4914 return 0; 4915} 4916 4917static void hardware_disable_all_nolock(void) 4918{ 4919 BUG_ON(!kvm_usage_count); 4920 4921 kvm_usage_count--; 4922 if (!kvm_usage_count) 4923 on_each_cpu(hardware_disable_nolock, NULL, 1); 4924} 4925 4926static void hardware_disable_all(void) 4927{ 4928 raw_spin_lock(&kvm_count_lock); 4929 hardware_disable_all_nolock(); 4930 raw_spin_unlock(&kvm_count_lock); 4931} 4932 4933static int hardware_enable_all(void) 4934{ 4935 int r = 0; 4936 4937 raw_spin_lock(&kvm_count_lock); 4938 4939 kvm_usage_count++; 4940 if (kvm_usage_count == 1) { 4941 atomic_set(&hardware_enable_failed, 0); 4942 on_each_cpu(hardware_enable_nolock, NULL, 1); 4943 4944 if (atomic_read(&hardware_enable_failed)) { 4945 hardware_disable_all_nolock(); 4946 r = -EBUSY; 4947 } 4948 } 4949 4950 raw_spin_unlock(&kvm_count_lock); 4951 4952 return r; 4953} 4954 4955static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 4956 void *v) 4957{ 4958 /* 4959 * Some (well, at least mine) BIOSes hang on reboot if 4960 * in vmx root mode. 4961 * 4962 * And Intel TXT required VMX off for all cpu when system shutdown. 4963 */ 4964 pr_info("kvm: exiting hardware virtualization\n"); 4965 kvm_rebooting = true; 4966 on_each_cpu(hardware_disable_nolock, NULL, 1); 4967 return NOTIFY_OK; 4968} 4969 4970static struct notifier_block kvm_reboot_notifier = { 4971 .notifier_call = kvm_reboot, 4972 .priority = 0, 4973}; 4974 4975static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 4976{ 4977 int i; 4978 4979 for (i = 0; i < bus->dev_count; i++) { 4980 struct kvm_io_device *pos = bus->range[i].dev; 4981 4982 kvm_iodevice_destructor(pos); 4983 } 4984 kfree(bus); 4985} 4986 4987static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 4988 const struct kvm_io_range *r2) 4989{ 4990 gpa_t addr1 = r1->addr; 4991 gpa_t addr2 = r2->addr; 4992 4993 if (addr1 < addr2) 4994 return -1; 4995 4996 /* If r2->len == 0, match the exact address. If r2->len != 0, 4997 * accept any overlapping write. Any order is acceptable for 4998 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 4999 * we process all of them. 
5000 */ 5001 if (r2->len) { 5002 addr1 += r1->len; 5003 addr2 += r2->len; 5004 } 5005 5006 if (addr1 > addr2) 5007 return 1; 5008 5009 return 0; 5010} 5011 5012static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 5013{ 5014 return kvm_io_bus_cmp(p1, p2); 5015} 5016 5017static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 5018 gpa_t addr, int len) 5019{ 5020 struct kvm_io_range *range, key; 5021 int off; 5022 5023 key = (struct kvm_io_range) { 5024 .addr = addr, 5025 .len = len, 5026 }; 5027 5028 range = bsearch(&key, bus->range, bus->dev_count, 5029 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 5030 if (range == NULL) 5031 return -ENOENT; 5032 5033 off = range - bus->range; 5034 5035 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 5036 off--; 5037 5038 return off; 5039} 5040 5041static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 5042 struct kvm_io_range *range, const void *val) 5043{ 5044 int idx; 5045 5046 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 5047 if (idx < 0) 5048 return -EOPNOTSUPP; 5049 5050 while (idx < bus->dev_count && 5051 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 5052 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 5053 range->len, val)) 5054 return idx; 5055 idx++; 5056 } 5057 5058 return -EOPNOTSUPP; 5059} 5060 5061/* kvm_io_bus_write - called under kvm->slots_lock */ 5062int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 5063 int len, const void *val) 5064{ 5065 struct kvm_io_bus *bus; 5066 struct kvm_io_range range; 5067 int r; 5068 5069 range = (struct kvm_io_range) { 5070 .addr = addr, 5071 .len = len, 5072 }; 5073 5074 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 5075 if (!bus) 5076 return -ENOMEM; 5077 r = __kvm_io_bus_write(vcpu, bus, &range, val); 5078 return r < 0 ? r : 0; 5079} 5080EXPORT_SYMBOL_GPL(kvm_io_bus_write); 5081 5082/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 5083int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 5084 gpa_t addr, int len, const void *val, long cookie) 5085{ 5086 struct kvm_io_bus *bus; 5087 struct kvm_io_range range; 5088 5089 range = (struct kvm_io_range) { 5090 .addr = addr, 5091 .len = len, 5092 }; 5093 5094 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 5095 if (!bus) 5096 return -ENOMEM; 5097 5098 /* First try the device referenced by cookie. */ 5099 if ((cookie >= 0) && (cookie < bus->dev_count) && 5100 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 5101 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 5102 val)) 5103 return cookie; 5104 5105 /* 5106 * cookie contained garbage; fall back to search and return the 5107 * correct cookie value. 
5108 */ 5109 return __kvm_io_bus_write(vcpu, bus, &range, val); 5110} 5111 5112static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 5113 struct kvm_io_range *range, void *val) 5114{ 5115 int idx; 5116 5117 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 5118 if (idx < 0) 5119 return -EOPNOTSUPP; 5120 5121 while (idx < bus->dev_count && 5122 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 5123 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 5124 range->len, val)) 5125 return idx; 5126 idx++; 5127 } 5128 5129 return -EOPNOTSUPP; 5130} 5131 5132/* kvm_io_bus_read - called under kvm->slots_lock */ 5133int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 5134 int len, void *val) 5135{ 5136 struct kvm_io_bus *bus; 5137 struct kvm_io_range range; 5138 int r; 5139 5140 range = (struct kvm_io_range) { 5141 .addr = addr, 5142 .len = len, 5143 }; 5144 5145 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 5146 if (!bus) 5147 return -ENOMEM; 5148 r = __kvm_io_bus_read(vcpu, bus, &range, val); 5149 return r < 0 ? r : 0; 5150} 5151 5152/* Caller must hold slots_lock. */ 5153int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 5154 int len, struct kvm_io_device *dev) 5155{ 5156 int i; 5157 struct kvm_io_bus *new_bus, *bus; 5158 struct kvm_io_range range; 5159 5160 bus = kvm_get_bus(kvm, bus_idx); 5161 if (!bus) 5162 return -ENOMEM; 5163 5164 /* exclude ioeventfd which is limited by maximum fd */ 5165 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 5166 return -ENOSPC; 5167 5168 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 5169 GFP_KERNEL_ACCOUNT); 5170 if (!new_bus) 5171 return -ENOMEM; 5172 5173 range = (struct kvm_io_range) { 5174 .addr = addr, 5175 .len = len, 5176 .dev = dev, 5177 }; 5178 5179 for (i = 0; i < bus->dev_count; i++) 5180 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 5181 break; 5182 5183 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 5184 new_bus->dev_count++; 5185 new_bus->range[i] = range; 5186 memcpy(new_bus->range + i + 1, bus->range + i, 5187 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 5188 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 5189 synchronize_srcu_expedited(&kvm->srcu); 5190 kfree(bus); 5191 5192 return 0; 5193} 5194 5195int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 5196 struct kvm_io_device *dev) 5197{ 5198 int i, j; 5199 struct kvm_io_bus *new_bus, *bus; 5200 5201 lockdep_assert_held(&kvm->slots_lock); 5202 5203 bus = kvm_get_bus(kvm, bus_idx); 5204 if (!bus) 5205 return 0; 5206 5207 for (i = 0; i < bus->dev_count; i++) { 5208 if (bus->range[i].dev == dev) { 5209 break; 5210 } 5211 } 5212 5213 if (i == bus->dev_count) 5214 return 0; 5215 5216 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 5217 GFP_KERNEL_ACCOUNT); 5218 if (new_bus) { 5219 memcpy(new_bus, bus, struct_size(bus, range, i)); 5220 new_bus->dev_count--; 5221 memcpy(new_bus->range + i, bus->range + i + 1, 5222 flex_array_size(new_bus, range, new_bus->dev_count - i)); 5223 } 5224 5225 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 5226 synchronize_srcu_expedited(&kvm->srcu); 5227 5228 /* Destroy the old bus _after_ installing the (null) bus. 
*/ 5229 if (!new_bus) { 5230 pr_err("kvm: failed to shrink bus, removing it completely\n"); 5231 for (j = 0; j < bus->dev_count; j++) { 5232 if (j == i) 5233 continue; 5234 kvm_iodevice_destructor(bus->range[j].dev); 5235 } 5236 } 5237 5238 kfree(bus); 5239 return new_bus ? 0 : -ENOMEM; 5240} 5241 5242struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 5243 gpa_t addr) 5244{ 5245 struct kvm_io_bus *bus; 5246 int dev_idx, srcu_idx; 5247 struct kvm_io_device *iodev = NULL; 5248 5249 srcu_idx = srcu_read_lock(&kvm->srcu); 5250 5251 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 5252 if (!bus) 5253 goto out_unlock; 5254 5255 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 5256 if (dev_idx < 0) 5257 goto out_unlock; 5258 5259 iodev = bus->range[dev_idx].dev; 5260 5261out_unlock: 5262 srcu_read_unlock(&kvm->srcu, srcu_idx); 5263 5264 return iodev; 5265} 5266EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 5267 5268static int kvm_debugfs_open(struct inode *inode, struct file *file, 5269 int (*get)(void *, u64 *), int (*set)(void *, u64), 5270 const char *fmt) 5271{ 5272 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 5273 inode->i_private; 5274 5275 /* 5276 * The debugfs files are a reference to the kvm struct which 5277 * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe 5278 * avoids the race between open and the removal of the debugfs directory. 5279 */ 5280 if (!kvm_get_kvm_safe(stat_data->kvm)) 5281 return -ENOENT; 5282 5283 if (simple_attr_open(inode, file, get, 5284 kvm_stats_debugfs_mode(stat_data->desc) & 0222 5285 ? set : NULL, 5286 fmt)) { 5287 kvm_put_kvm(stat_data->kvm); 5288 return -ENOMEM; 5289 } 5290 5291 return 0; 5292} 5293 5294static int kvm_debugfs_release(struct inode *inode, struct file *file) 5295{ 5296 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 5297 inode->i_private; 5298 5299 simple_attr_release(inode, file); 5300 kvm_put_kvm(stat_data->kvm); 5301 5302 return 0; 5303} 5304 5305static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) 5306{ 5307 *val = *(u64 *)((void *)(&kvm->stat) + offset); 5308 5309 return 0; 5310} 5311 5312static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) 5313{ 5314 *(u64 *)((void *)(&kvm->stat) + offset) = 0; 5315 5316 return 0; 5317} 5318 5319static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) 5320{ 5321 unsigned long i; 5322 struct kvm_vcpu *vcpu; 5323 5324 *val = 0; 5325 5326 kvm_for_each_vcpu(i, vcpu, kvm) 5327 *val += *(u64 *)((void *)(&vcpu->stat) + offset); 5328 5329 return 0; 5330} 5331 5332static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) 5333{ 5334 unsigned long i; 5335 struct kvm_vcpu *vcpu; 5336 5337 kvm_for_each_vcpu(i, vcpu, kvm) 5338 *(u64 *)((void *)(&vcpu->stat) + offset) = 0; 5339 5340 return 0; 5341} 5342 5343static int kvm_stat_data_get(void *data, u64 *val) 5344{ 5345 int r = -EFAULT; 5346 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 5347 5348 switch (stat_data->kind) { 5349 case KVM_STAT_VM: 5350 r = kvm_get_stat_per_vm(stat_data->kvm, 5351 stat_data->desc->desc.offset, val); 5352 break; 5353 case KVM_STAT_VCPU: 5354 r = kvm_get_stat_per_vcpu(stat_data->kvm, 5355 stat_data->desc->desc.offset, val); 5356 break; 5357 } 5358 5359 return r; 5360} 5361 5362static int kvm_stat_data_clear(void *data, u64 val) 5363{ 5364 int r = -EFAULT; 5365 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 5366 5367 if (val) 5368 return -EINVAL; 5369 5370 switch (stat_data->kind) { 
5371 case KVM_STAT_VM: 5372 r = kvm_clear_stat_per_vm(stat_data->kvm, 5373 stat_data->desc->desc.offset); 5374 break; 5375 case KVM_STAT_VCPU: 5376 r = kvm_clear_stat_per_vcpu(stat_data->kvm, 5377 stat_data->desc->desc.offset); 5378 break; 5379 } 5380 5381 return r; 5382} 5383 5384static int kvm_stat_data_open(struct inode *inode, struct file *file) 5385{ 5386 __simple_attr_check_format("%llu\n", 0ull); 5387 return kvm_debugfs_open(inode, file, kvm_stat_data_get, 5388 kvm_stat_data_clear, "%llu\n"); 5389} 5390 5391static const struct file_operations stat_fops_per_vm = { 5392 .owner = THIS_MODULE, 5393 .open = kvm_stat_data_open, 5394 .release = kvm_debugfs_release, 5395 .read = simple_attr_read, 5396 .write = simple_attr_write, 5397 .llseek = no_llseek, 5398}; 5399 5400static int vm_stat_get(void *_offset, u64 *val) 5401{ 5402 unsigned offset = (long)_offset; 5403 struct kvm *kvm; 5404 u64 tmp_val; 5405 5406 *val = 0; 5407 mutex_lock(&kvm_lock); 5408 list_for_each_entry(kvm, &vm_list, vm_list) { 5409 kvm_get_stat_per_vm(kvm, offset, &tmp_val); 5410 *val += tmp_val; 5411 } 5412 mutex_unlock(&kvm_lock); 5413 return 0; 5414} 5415 5416static int vm_stat_clear(void *_offset, u64 val) 5417{ 5418 unsigned offset = (long)_offset; 5419 struct kvm *kvm; 5420 5421 if (val) 5422 return -EINVAL; 5423 5424 mutex_lock(&kvm_lock); 5425 list_for_each_entry(kvm, &vm_list, vm_list) { 5426 kvm_clear_stat_per_vm(kvm, offset); 5427 } 5428 mutex_unlock(&kvm_lock); 5429 5430 return 0; 5431} 5432 5433DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 5434DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n"); 5435 5436static int vcpu_stat_get(void *_offset, u64 *val) 5437{ 5438 unsigned offset = (long)_offset; 5439 struct kvm *kvm; 5440 u64 tmp_val; 5441 5442 *val = 0; 5443 mutex_lock(&kvm_lock); 5444 list_for_each_entry(kvm, &vm_list, vm_list) { 5445 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); 5446 *val += tmp_val; 5447 } 5448 mutex_unlock(&kvm_lock); 5449 return 0; 5450} 5451 5452static int vcpu_stat_clear(void *_offset, u64 val) 5453{ 5454 unsigned offset = (long)_offset; 5455 struct kvm *kvm; 5456 5457 if (val) 5458 return -EINVAL; 5459 5460 mutex_lock(&kvm_lock); 5461 list_for_each_entry(kvm, &vm_list, vm_list) { 5462 kvm_clear_stat_per_vcpu(kvm, offset); 5463 } 5464 mutex_unlock(&kvm_lock); 5465 5466 return 0; 5467} 5468 5469DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 5470 "%llu\n"); 5471DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n"); 5472 5473static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 5474{ 5475 struct kobj_uevent_env *env; 5476 unsigned long long created, active; 5477 5478 if (!kvm_dev.this_device || !kvm) 5479 return; 5480 5481 mutex_lock(&kvm_lock); 5482 if (type == KVM_EVENT_CREATE_VM) { 5483 kvm_createvm_count++; 5484 kvm_active_vms++; 5485 } else if (type == KVM_EVENT_DESTROY_VM) { 5486 kvm_active_vms--; 5487 } 5488 created = kvm_createvm_count; 5489 active = kvm_active_vms; 5490 mutex_unlock(&kvm_lock); 5491 5492 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); 5493 if (!env) 5494 return; 5495 5496 add_uevent_var(env, "CREATED=%llu", created); 5497 add_uevent_var(env, "COUNT=%llu", active); 5498 5499 if (type == KVM_EVENT_CREATE_VM) { 5500 add_uevent_var(env, "EVENT=create"); 5501 kvm->userspace_pid = task_pid_nr(current); 5502 } else if (type == KVM_EVENT_DESTROY_VM) { 5503 add_uevent_var(env, "EVENT=destroy"); 5504 } 5505 add_uevent_var(env, "PID=%d", 
kvm->userspace_pid); 5506 5507 if (!IS_ERR(kvm->debugfs_dentry)) { 5508 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); 5509 5510 if (p) { 5511 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 5512 if (!IS_ERR(tmp)) 5513 add_uevent_var(env, "STATS_PATH=%s", tmp); 5514 kfree(p); 5515 } 5516 } 5517 /* no need for checks, since we are adding at most only 5 keys */ 5518 env->envp[env->envp_idx++] = NULL; 5519 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 5520 kfree(env); 5521} 5522 5523static void kvm_init_debug(void) 5524{ 5525 const struct file_operations *fops; 5526 const struct _kvm_stats_desc *pdesc; 5527 int i; 5528 5529 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 5530 5531 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { 5532 pdesc = &kvm_vm_stats_desc[i]; 5533 if (kvm_stats_debugfs_mode(pdesc) & 0222) 5534 fops = &vm_stat_fops; 5535 else 5536 fops = &vm_stat_readonly_fops; 5537 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), 5538 kvm_debugfs_dir, 5539 (void *)(long)pdesc->desc.offset, fops); 5540 } 5541 5542 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { 5543 pdesc = &kvm_vcpu_stats_desc[i]; 5544 if (kvm_stats_debugfs_mode(pdesc) & 0222) 5545 fops = &vcpu_stat_fops; 5546 else 5547 fops = &vcpu_stat_readonly_fops; 5548 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), 5549 kvm_debugfs_dir, 5550 (void *)(long)pdesc->desc.offset, fops); 5551 } 5552} 5553 5554static int kvm_suspend(void) 5555{ 5556 if (kvm_usage_count) 5557 hardware_disable_nolock(NULL); 5558 return 0; 5559} 5560 5561static void kvm_resume(void) 5562{ 5563 if (kvm_usage_count) { 5564 lockdep_assert_not_held(&kvm_count_lock); 5565 hardware_enable_nolock(NULL); 5566 } 5567} 5568 5569static struct syscore_ops kvm_syscore_ops = { 5570 .suspend = kvm_suspend, 5571 .resume = kvm_resume, 5572}; 5573 5574static inline 5575struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 5576{ 5577 return container_of(pn, struct kvm_vcpu, preempt_notifier); 5578} 5579 5580static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 5581{ 5582 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 5583 5584 WRITE_ONCE(vcpu->preempted, false); 5585 WRITE_ONCE(vcpu->ready, false); 5586 5587 __this_cpu_write(kvm_running_vcpu, vcpu); 5588 kvm_arch_sched_in(vcpu, cpu); 5589 kvm_arch_vcpu_load(vcpu, cpu); 5590} 5591 5592static void kvm_sched_out(struct preempt_notifier *pn, 5593 struct task_struct *next) 5594{ 5595 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 5596 5597 if (current->on_rq) { 5598 WRITE_ONCE(vcpu->preempted, true); 5599 WRITE_ONCE(vcpu->ready, true); 5600 } 5601 kvm_arch_vcpu_put(vcpu); 5602 __this_cpu_write(kvm_running_vcpu, NULL); 5603} 5604 5605/** 5606 * kvm_get_running_vcpu - get the vcpu running on the current CPU. 5607 * 5608 * We can disable preemption locally around accessing the per-CPU variable, 5609 * and use the resolved vcpu pointer after enabling preemption again, 5610 * because even if the current thread is migrated to another CPU, reading 5611 * the per-CPU value later will give us the same value as we update the 5612 * per-CPU variable in the preempt notifier handlers. 
5613 */ 5614struct kvm_vcpu *kvm_get_running_vcpu(void) 5615{ 5616 struct kvm_vcpu *vcpu; 5617 5618 preempt_disable(); 5619 vcpu = __this_cpu_read(kvm_running_vcpu); 5620 preempt_enable(); 5621 5622 return vcpu; 5623} 5624EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); 5625 5626/** 5627 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 5628 */ 5629struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) 5630{ 5631 return &kvm_running_vcpu; 5632} 5633 5634#ifdef CONFIG_GUEST_PERF_EVENTS 5635static unsigned int kvm_guest_state(void) 5636{ 5637 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 5638 unsigned int state; 5639 5640 if (!kvm_arch_pmi_in_guest(vcpu)) 5641 return 0; 5642 5643 state = PERF_GUEST_ACTIVE; 5644 if (!kvm_arch_vcpu_in_kernel(vcpu)) 5645 state |= PERF_GUEST_USER; 5646 5647 return state; 5648} 5649 5650static unsigned long kvm_guest_get_ip(void) 5651{ 5652 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 5653 5654 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */ 5655 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu))) 5656 return 0; 5657 5658 return kvm_arch_vcpu_get_ip(vcpu); 5659} 5660 5661static struct perf_guest_info_callbacks kvm_guest_cbs = { 5662 .state = kvm_guest_state, 5663 .get_ip = kvm_guest_get_ip, 5664 .handle_intel_pt_intr = NULL, 5665}; 5666 5667void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) 5668{ 5669 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; 5670 perf_register_guest_info_callbacks(&kvm_guest_cbs); 5671} 5672void kvm_unregister_perf_callbacks(void) 5673{ 5674 perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 5675} 5676#endif 5677 5678struct kvm_cpu_compat_check { 5679 void *opaque; 5680 int *ret; 5681}; 5682 5683static void check_processor_compat(void *data) 5684{ 5685 struct kvm_cpu_compat_check *c = data; 5686 5687 *c->ret = kvm_arch_check_processor_compat(c->opaque); 5688} 5689 5690int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 5691 struct module *module) 5692{ 5693 struct kvm_cpu_compat_check c; 5694 int r; 5695 int cpu; 5696 5697 r = kvm_arch_init(opaque); 5698 if (r) 5699 goto out_fail; 5700 5701 /* 5702 * kvm_arch_init makes sure there's at most one caller 5703 * for architectures that support multiple implementations, 5704 * like intel and amd on x86. 5705 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 5706 * conflicts in case kvm is already setup for another implementation. 5707 */ 5708 r = kvm_irqfd_init(); 5709 if (r) 5710 goto out_irqfd; 5711 5712 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 5713 r = -ENOMEM; 5714 goto out_free_0; 5715 } 5716 5717 r = kvm_arch_hardware_setup(opaque); 5718 if (r < 0) 5719 goto out_free_1; 5720 5721 c.ret = &r; 5722 c.opaque = opaque; 5723 for_each_online_cpu(cpu) { 5724 smp_call_function_single(cpu, check_processor_compat, &c, 1); 5725 if (r < 0) 5726 goto out_free_2; 5727 } 5728 5729 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 5730 kvm_starting_cpu, kvm_dying_cpu); 5731 if (r) 5732 goto out_free_2; 5733 register_reboot_notifier(&kvm_reboot_notifier); 5734 5735 /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ 5736 if (!vcpu_align) 5737 vcpu_align = __alignof__(struct kvm_vcpu); 5738 kvm_vcpu_cache = 5739 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, 5740 SLAB_ACCOUNT, 5741 offsetof(struct kvm_vcpu, arch), 5742 offsetofend(struct kvm_vcpu, stats_id) 5743 - offsetof(struct kvm_vcpu, arch), 5744 NULL); 5745 if (!kvm_vcpu_cache) { 5746 r = -ENOMEM; 5747 goto out_free_3; 5748 } 5749 5750 for_each_possible_cpu(cpu) { 5751 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), 5752 GFP_KERNEL, cpu_to_node(cpu))) { 5753 r = -ENOMEM; 5754 goto out_free_4; 5755 } 5756 } 5757 5758 r = kvm_async_pf_init(); 5759 if (r) 5760 goto out_free_5; 5761 5762 kvm_chardev_ops.owner = module; 5763 5764 r = misc_register(&kvm_dev); 5765 if (r) { 5766 pr_err("kvm: misc device register failed\n"); 5767 goto out_unreg; 5768 } 5769 5770 register_syscore_ops(&kvm_syscore_ops); 5771 5772 kvm_preempt_ops.sched_in = kvm_sched_in; 5773 kvm_preempt_ops.sched_out = kvm_sched_out; 5774 5775 kvm_init_debug(); 5776 5777 r = kvm_vfio_ops_init(); 5778 WARN_ON(r); 5779 5780 cpc_kvm_init(); 5781 5782 return 0; 5783 5784out_unreg: 5785 kvm_async_pf_deinit(); 5786out_free_5: 5787 for_each_possible_cpu(cpu) 5788 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 5789out_free_4: 5790 kmem_cache_destroy(kvm_vcpu_cache); 5791out_free_3: 5792 unregister_reboot_notifier(&kvm_reboot_notifier); 5793 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 5794out_free_2: 5795 kvm_arch_hardware_unsetup(); 5796out_free_1: 5797 free_cpumask_var(cpus_hardware_enabled); 5798out_free_0: 5799 kvm_irqfd_exit(); 5800out_irqfd: 5801 kvm_arch_exit(); 5802out_fail: 5803 return r; 5804} 5805EXPORT_SYMBOL_GPL(kvm_init); 5806 5807void kvm_exit(void) 5808{ 5809 int cpu; 5810 5811 cpc_kvm_exit(); 5812 5813 debugfs_remove_recursive(kvm_debugfs_dir); 5814 misc_deregister(&kvm_dev); 5815 for_each_possible_cpu(cpu) 5816 free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); 5817 kmem_cache_destroy(kvm_vcpu_cache); 5818 kvm_async_pf_deinit(); 5819 unregister_syscore_ops(&kvm_syscore_ops); 5820 unregister_reboot_notifier(&kvm_reboot_notifier); 5821 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 5822 on_each_cpu(hardware_disable_nolock, NULL, 1); 5823 kvm_arch_hardware_unsetup(); 5824 kvm_arch_exit(); 5825 kvm_irqfd_exit(); 5826 free_cpumask_var(cpus_hardware_enabled); 5827 kvm_vfio_ops_exit(); 5828} 5829EXPORT_SYMBOL_GPL(kvm_exit); 5830 5831struct kvm_vm_worker_thread_context { 5832 struct kvm *kvm; 5833 struct task_struct *parent; 5834 struct completion init_done; 5835 kvm_vm_thread_fn_t thread_fn; 5836 uintptr_t data; 5837 int err; 5838}; 5839 5840static int kvm_vm_worker_thread(void *context) 5841{ 5842 /* 5843 * The init_context is allocated on the stack of the parent thread, so 5844 * we have to locally copy anything that is needed beyond initialization 5845 */ 5846 struct kvm_vm_worker_thread_context *init_context = context; 5847 struct task_struct *parent; 5848 struct kvm *kvm = init_context->kvm; 5849 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; 5850 uintptr_t data = init_context->data; 5851 int err; 5852 5853 err = kthread_park(current); 5854 /* kthread_park(current) is never supposed to return an error */ 5855 WARN_ON(err != 0); 5856 if (err) 5857 goto init_complete; 5858 5859 err = cgroup_attach_task_all(init_context->parent, current); 5860 if (err) { 5861 kvm_err("%s: cgroup_attach_task_all failed with err %d\n", 5862 __func__, err); 5863 goto init_complete; 5864 } 5865 5866 set_user_nice(current, task_nice(init_context->parent)); 5867 
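	/*
	 * Success and error paths meet here: report the result to the
	 * spawner and stop touching init_context, which lives on the
	 * parent's stack.
	 */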
5868init_complete: 5869 init_context->err = err; 5870 complete(&init_context->init_done); 5871 init_context = NULL; 5872 5873 if (err) 5874 goto out; 5875 5876 /* Wait to be woken up by the spawner before proceeding. */ 5877 kthread_parkme(); 5878 5879 if (!kthread_should_stop()) 5880 err = thread_fn(kvm, data); 5881 5882out: 5883 /* 5884 * Move kthread back to its original cgroup to prevent it lingering in 5885 * the cgroup of the VM process, after the latter finishes its 5886 * execution. 5887 * 5888 * kthread_stop() waits on the 'exited' completion condition which is 5889 * set in exit_mm(), via mm_release(), in do_exit(). However, the 5890 * kthread is removed from the cgroup in the cgroup_exit() which is 5891 * called after the exit_mm(). This causes the kthread_stop() to return 5892 * before the kthread actually quits the cgroup. 5893 */ 5894 rcu_read_lock(); 5895 parent = rcu_dereference(current->real_parent); 5896 get_task_struct(parent); 5897 rcu_read_unlock(); 5898 cgroup_attach_task_all(parent, current); 5899 put_task_struct(parent); 5900 5901 return err; 5902} 5903 5904int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, 5905 uintptr_t data, const char *name, 5906 struct task_struct **thread_ptr) 5907{ 5908 struct kvm_vm_worker_thread_context init_context = {}; 5909 struct task_struct *thread; 5910 5911 *thread_ptr = NULL; 5912 init_context.kvm = kvm; 5913 init_context.parent = current; 5914 init_context.thread_fn = thread_fn; 5915 init_context.data = data; 5916 init_completion(&init_context.init_done); 5917 5918 thread = kthread_run(kvm_vm_worker_thread, &init_context, 5919 "%s-%d", name, task_pid_nr(current)); 5920 if (IS_ERR(thread)) 5921 return PTR_ERR(thread); 5922 5923 /* kthread_run is never supposed to return NULL */ 5924 WARN_ON(thread == NULL); 5925 5926 wait_for_completion(&init_context.init_done); 5927 5928 if (!init_context.err) 5929 *thread_ptr = thread; 5930 5931 return init_context.err; 5932}
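
/*
 * Illustrative sketch (not part of the original file): the typical
 * userspace sequence that exercises the ioctls implemented above.
 * Assumes <fcntl.h>, <sys/ioctl.h>, <sys/mman.h> and <linux/kvm.h>;
 * guest memory setup, register state and error handling are omitted.
 *
 *	int kvm  = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int vm   = ioctl(kvm, KVM_CREATE_VM, 0);        // kvm_dev_ioctl_create_vm()
 *	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);       // kvm_vm_ioctl_create_vcpu()
 *
 *	long sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu, 0); // kvm_vcpu_mmap()
 *
 *	ioctl(vcpu, KVM_RUN, 0);                        // returns with run->exit_reason set
 *
 *	int stats = ioctl(vm, KVM_GET_STATS_FD, 0);     // binary stats via kvm_vm_stats_fops
 */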