book3s_64_mmu_hv.c (53508B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * 4 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 5 */ 6 7#include <linux/types.h> 8#include <linux/string.h> 9#include <linux/kvm.h> 10#include <linux/kvm_host.h> 11#include <linux/highmem.h> 12#include <linux/gfp.h> 13#include <linux/slab.h> 14#include <linux/hugetlb.h> 15#include <linux/vmalloc.h> 16#include <linux/srcu.h> 17#include <linux/anon_inodes.h> 18#include <linux/file.h> 19#include <linux/debugfs.h> 20 21#include <asm/kvm_ppc.h> 22#include <asm/kvm_book3s.h> 23#include <asm/book3s/64/mmu-hash.h> 24#include <asm/hvcall.h> 25#include <asm/synch.h> 26#include <asm/ppc-opcode.h> 27#include <asm/cputable.h> 28#include <asm/pte-walk.h> 29 30#include "book3s.h" 31#include "trace_hv.h" 32 33//#define DEBUG_RESIZE_HPT 1 34 35#ifdef DEBUG_RESIZE_HPT 36#define resize_hpt_debug(resize, ...) \ 37 do { \ 38 printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ 39 printk(__VA_ARGS__); \ 40 } while (0) 41#else 42#define resize_hpt_debug(resize, ...) \ 43 do { } while (0) 44#endif 45 46static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 47 long pte_index, unsigned long pteh, 48 unsigned long ptel, unsigned long *pte_idx_ret); 49 50struct kvm_resize_hpt { 51 /* These fields read-only after init */ 52 struct kvm *kvm; 53 struct work_struct work; 54 u32 order; 55 56 /* These fields protected by kvm->arch.mmu_setup_lock */ 57 58 /* Possible values and their usage: 59 * <0 an error occurred during allocation, 60 * -EBUSY allocation is in the progress, 61 * 0 allocation made successfully. 62 */ 63 int error; 64 65 /* Private to the work thread, until error != -EBUSY, 66 * then protected by kvm->arch.mmu_setup_lock. 67 */ 68 struct kvm_hpt_info hpt; 69}; 70 71int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) 72{ 73 unsigned long hpt = 0; 74 int cma = 0; 75 struct page *page = NULL; 76 struct revmap_entry *rev; 77 unsigned long npte; 78 79 if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) 80 return -EINVAL; 81 82 page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); 83 if (page) { 84 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 85 memset((void *)hpt, 0, (1ul << order)); 86 cma = 1; 87 } 88 89 if (!hpt) 90 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL 91 |__GFP_NOWARN, order - PAGE_SHIFT); 92 93 if (!hpt) 94 return -ENOMEM; 95 96 /* HPTEs are 2**4 bytes long */ 97 npte = 1ul << (order - 4); 98 99 /* Allocate reverse map array */ 100 rev = vmalloc(array_size(npte, sizeof(struct revmap_entry))); 101 if (!rev) { 102 if (cma) 103 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); 104 else 105 free_pages(hpt, order - PAGE_SHIFT); 106 return -ENOMEM; 107 } 108 109 info->order = order; 110 info->virt = hpt; 111 info->cma = cma; 112 info->rev = rev; 113 114 return 0; 115} 116 117void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) 118{ 119 atomic64_set(&kvm->arch.mmio_update, 0); 120 kvm->arch.hpt = *info; 121 kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); 122 123 pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n", 124 info->virt, (long)info->order, kvm->arch.lpid); 125} 126 127long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) 128{ 129 long err = -EBUSY; 130 struct kvm_hpt_info info; 131 132 mutex_lock(&kvm->arch.mmu_setup_lock); 133 if (kvm->arch.mmu_ready) { 134 kvm->arch.mmu_ready = 0; 135 /* order mmu_ready vs. vcpus_running */ 136 smp_mb(); 137 if (atomic_read(&kvm->arch.vcpus_running)) { 138 kvm->arch.mmu_ready = 1; 139 goto out; 140 } 141 } 142 if (kvm_is_radix(kvm)) { 143 err = kvmppc_switch_mmu_to_hpt(kvm); 144 if (err) 145 goto out; 146 } 147 148 if (kvm->arch.hpt.order == order) { 149 /* We already have a suitable HPT */ 150 151 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 152 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); 153 /* 154 * Reset all the reverse-mapping chains for all memslots 155 */ 156 kvmppc_rmap_reset(kvm); 157 err = 0; 158 goto out; 159 } 160 161 if (kvm->arch.hpt.virt) { 162 kvmppc_free_hpt(&kvm->arch.hpt); 163 kvmppc_rmap_reset(kvm); 164 } 165 166 err = kvmppc_allocate_hpt(&info, order); 167 if (err < 0) 168 goto out; 169 kvmppc_set_hpt(kvm, &info); 170 171out: 172 if (err == 0) 173 /* Ensure that each vcpu will flush its TLB on next entry. */ 174 cpumask_setall(&kvm->arch.need_tlb_flush); 175 176 mutex_unlock(&kvm->arch.mmu_setup_lock); 177 return err; 178} 179 180void kvmppc_free_hpt(struct kvm_hpt_info *info) 181{ 182 vfree(info->rev); 183 info->rev = NULL; 184 if (info->cma) 185 kvm_free_hpt_cma(virt_to_page(info->virt), 186 1 << (info->order - PAGE_SHIFT)); 187 else if (info->virt) 188 free_pages(info->virt, info->order - PAGE_SHIFT); 189 info->virt = 0; 190 info->order = 0; 191} 192 193/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 194static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) 195{ 196 return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; 197} 198 199/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ 200static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) 201{ 202 return (pgsize == 0x10000) ? 0x1000 : 0; 203} 204 205void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, 206 unsigned long porder) 207{ 208 unsigned long i; 209 unsigned long npages; 210 unsigned long hp_v, hp_r; 211 unsigned long addr, hash; 212 unsigned long psize; 213 unsigned long hp0, hp1; 214 unsigned long idx_ret; 215 long ret; 216 struct kvm *kvm = vcpu->kvm; 217 218 psize = 1ul << porder; 219 npages = memslot->npages >> (porder - PAGE_SHIFT); 220 221 /* VRMA can't be > 1TB */ 222 if (npages > 1ul << (40 - porder)) 223 npages = 1ul << (40 - porder); 224 /* Can't use more than 1 HPTE per HPTEG */ 225 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) 226 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; 227 228 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 229 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 230 hp1 = hpte1_pgsize_encoding(psize) | 231 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; 232 233 for (i = 0; i < npages; ++i) { 234 addr = i << porder; 235 /* can't use hpt_hash since va > 64 bits */ 236 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) 237 & kvmppc_hpt_mask(&kvm->arch.hpt); 238 /* 239 * We assume that the hash table is empty and no 240 * vcpus are using it at this stage. Since we create 241 * at most one HPTE per HPTEG, we just assume entry 7 242 * is available and use it. 243 */ 244 hash = (hash << 3) + 7; 245 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 246 hp_r = hp1 | addr; 247 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 248 &idx_ret); 249 if (ret != H_SUCCESS) { 250 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 251 addr, ret); 252 break; 253 } 254 } 255} 256 257int kvmppc_mmu_hv_init(void) 258{ 259 unsigned long nr_lpids; 260 261 if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) 262 return -EINVAL; 263 264 if (cpu_has_feature(CPU_FTR_HVMODE)) { 265 if (WARN_ON(mfspr(SPRN_LPID) != 0)) 266 return -EINVAL; 267 nr_lpids = 1UL << mmu_lpid_bits; 268 } else { 269 nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT; 270 } 271 272 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 273 /* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */ 274 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 275 WARN_ON(nr_lpids != 1UL << 12); 276 else 277 WARN_ON(nr_lpids != 1UL << 10); 278 279 /* 280 * Reserve the last implemented LPID use in partition 281 * switching for POWER7 and POWER8. 282 */ 283 nr_lpids -= 1; 284 } 285 286 kvmppc_init_lpid(nr_lpids); 287 288 return 0; 289} 290 291static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 292 long pte_index, unsigned long pteh, 293 unsigned long ptel, unsigned long *pte_idx_ret) 294{ 295 long ret; 296 297 preempt_disable(); 298 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 299 kvm->mm->pgd, false, pte_idx_ret); 300 preempt_enable(); 301 if (ret == H_TOO_HARD) { 302 /* this can't happen */ 303 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); 304 ret = H_RESOURCE; /* or something */ 305 } 306 return ret; 307 308} 309 310static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 311 gva_t eaddr) 312{ 313 u64 mask; 314 int i; 315 316 for (i = 0; i < vcpu->arch.slb_nr; i++) { 317 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) 318 continue; 319 320 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) 321 mask = ESID_MASK_1T; 322 else 323 mask = ESID_MASK; 324 325 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) 326 return &vcpu->arch.slb[i]; 327 } 328 return NULL; 329} 330 331static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, 332 unsigned long ea) 333{ 334 unsigned long ra_mask; 335 336 ra_mask = kvmppc_actual_pgsz(v, r) - 1; 337 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 338} 339 340static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, 341 struct kvmppc_pte *gpte, bool data, bool iswrite) 342{ 343 struct kvm *kvm = vcpu->kvm; 344 struct kvmppc_slb *slbe; 345 unsigned long slb_v; 346 unsigned long pp, key; 347 unsigned long v, orig_v, gr; 348 __be64 *hptep; 349 long int index; 350 int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); 351 352 if (kvm_is_radix(vcpu->kvm)) 353 return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); 354 355 /* Get SLB entry */ 356 if (virtmode) { 357 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 358 if (!slbe) 359 return -EINVAL; 360 slb_v = slbe->origv; 361 } else { 362 /* real mode access */ 363 slb_v = vcpu->kvm->arch.vrma_slb_v; 364 } 365 366 preempt_disable(); 367 /* Find the HPTE in the hash table */ 368 index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, 369 HPTE_V_VALID | HPTE_V_ABSENT); 370 if (index < 0) { 371 preempt_enable(); 372 return -ENOENT; 373 } 374 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 375 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 376 if (cpu_has_feature(CPU_FTR_ARCH_300)) 377 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 378 gr = kvm->arch.hpt.rev[index].guest_rpte; 379 380 unlock_hpte(hptep, orig_v); 381 preempt_enable(); 382 383 gpte->eaddr = eaddr; 384 gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); 385 386 /* Get PP bits and key for permission check */ 387 pp = gr & (HPTE_R_PP0 | HPTE_R_PP); 388 key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; 389 key &= slb_v; 390 391 /* Calculate permissions */ 392 gpte->may_read = hpte_read_permission(pp, key); 393 gpte->may_write = hpte_write_permission(pp, key); 394 gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); 395 396 /* Storage key permission check for POWER7 */ 397 if (data && virtmode) { 398 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); 399 if (amrfield & 1) 400 gpte->may_read = 0; 401 if (amrfield & 2) 402 gpte->may_write = 0; 403 } 404 405 /* Get the guest physical address */ 406 gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); 407 return 0; 408} 409 410/* 411 * Quick test for whether an instruction is a load or a store. 412 * If the instruction is a load or a store, then this will indicate 413 * which it is, at least on server processors. (Embedded processors 414 * have some external PID instructions that don't follow the rule 415 * embodied here.) If the instruction isn't a load or store, then 416 * this doesn't return anything useful. 417 */ 418static int instruction_is_store(unsigned int instr) 419{ 420 unsigned int mask; 421 422 mask = 0x10000000; 423 if ((instr & 0xfc000000) == 0x7c000000) 424 mask = 0x100; /* major opcode 31 */ 425 return (instr & mask) != 0; 426} 427 428int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu, 429 unsigned long gpa, gva_t ea, int is_store) 430{ 431 u32 last_inst; 432 433 /* 434 * Fast path - check if the guest physical address corresponds to a 435 * device on the FAST_MMIO_BUS, if so we can avoid loading the 436 * instruction all together, then we can just handle it and return. 437 */ 438 if (is_store) { 439 int idx, ret; 440 441 idx = srcu_read_lock(&vcpu->kvm->srcu); 442 ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0, 443 NULL); 444 srcu_read_unlock(&vcpu->kvm->srcu, idx); 445 if (!ret) { 446 kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); 447 return RESUME_GUEST; 448 } 449 } 450 451 /* 452 * If we fail, we just return to the guest and try executing it again. 453 */ 454 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 455 EMULATE_DONE) 456 return RESUME_GUEST; 457 458 /* 459 * WARNING: We do not know for sure whether the instruction we just 460 * read from memory is the same that caused the fault in the first 461 * place. If the instruction we read is neither an load or a store, 462 * then it can't access memory, so we don't need to worry about 463 * enforcing access permissions. So, assuming it is a load or 464 * store, we just check that its direction (load or store) is 465 * consistent with the original fault, since that's what we 466 * checked the access permissions against. If there is a mismatch 467 * we just return and retry the instruction. 468 */ 469 470 if (instruction_is_store(last_inst) != !!is_store) 471 return RESUME_GUEST; 472 473 /* 474 * Emulated accesses are emulated by looking at the hash for 475 * translation once, then performing the access later. The 476 * translation could be invalidated in the meantime in which 477 * point performing the subsequent memory access on the old 478 * physical address could possibly be a security hole for the 479 * guest (but not the host). 480 * 481 * This is less of an issue for MMIO stores since they aren't 482 * globally visible. It could be an issue for MMIO loads to 483 * a certain extent but we'll ignore it for now. 484 */ 485 486 vcpu->arch.paddr_accessed = gpa; 487 vcpu->arch.vaddr_accessed = ea; 488 return kvmppc_emulate_mmio(vcpu); 489} 490 491int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, 492 unsigned long ea, unsigned long dsisr) 493{ 494 struct kvm *kvm = vcpu->kvm; 495 unsigned long hpte[3], r; 496 unsigned long hnow_v, hnow_r; 497 __be64 *hptep; 498 unsigned long mmu_seq, psize, pte_size; 499 unsigned long gpa_base, gfn_base; 500 unsigned long gpa, gfn, hva, pfn, hpa; 501 struct kvm_memory_slot *memslot; 502 unsigned long *rmap; 503 struct revmap_entry *rev; 504 struct page *page; 505 long index, ret; 506 bool is_ci; 507 bool writing, write_ok; 508 unsigned int shift; 509 unsigned long rcbits; 510 long mmio_update; 511 pte_t pte, *ptep; 512 513 if (kvm_is_radix(kvm)) 514 return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr); 515 516 /* 517 * Real-mode code has already searched the HPT and found the 518 * entry we're interested in. Lock the entry and check that 519 * it hasn't changed. If it has, just return and re-execute the 520 * instruction. 521 */ 522 if (ea != vcpu->arch.pgfault_addr) 523 return RESUME_GUEST; 524 525 if (vcpu->arch.pgfault_cache) { 526 mmio_update = atomic64_read(&kvm->arch.mmio_update); 527 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 528 r = vcpu->arch.pgfault_cache->rpte; 529 psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], 530 r); 531 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 532 gfn_base = gpa_base >> PAGE_SHIFT; 533 gpa = gpa_base | (ea & (psize - 1)); 534 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, 535 dsisr & DSISR_ISSTORE); 536 } 537 } 538 index = vcpu->arch.pgfault_index; 539 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); 540 rev = &kvm->arch.hpt.rev[index]; 541 preempt_disable(); 542 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 543 cpu_relax(); 544 hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 545 hpte[1] = be64_to_cpu(hptep[1]); 546 hpte[2] = r = rev->guest_rpte; 547 unlock_hpte(hptep, hpte[0]); 548 preempt_enable(); 549 550 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 551 hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]); 552 hpte[1] = hpte_new_to_old_r(hpte[1]); 553 } 554 if (hpte[0] != vcpu->arch.pgfault_hpte[0] || 555 hpte[1] != vcpu->arch.pgfault_hpte[1]) 556 return RESUME_GUEST; 557 558 /* Translate the logical address and get the page */ 559 psize = kvmppc_actual_pgsz(hpte[0], r); 560 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 561 gfn_base = gpa_base >> PAGE_SHIFT; 562 gpa = gpa_base | (ea & (psize - 1)); 563 gfn = gpa >> PAGE_SHIFT; 564 memslot = gfn_to_memslot(kvm, gfn); 565 566 trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr); 567 568 /* No memslot means it's an emulated MMIO region */ 569 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 570 return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, 571 dsisr & DSISR_ISSTORE); 572 573 /* 574 * This should never happen, because of the slot_is_aligned() 575 * check in kvmppc_do_h_enter(). 576 */ 577 if (gfn_base < memslot->base_gfn) 578 return -EFAULT; 579 580 /* used to check for invalidations in progress */ 581 mmu_seq = kvm->mmu_notifier_seq; 582 smp_rmb(); 583 584 ret = -EFAULT; 585 page = NULL; 586 writing = (dsisr & DSISR_ISSTORE) != 0; 587 /* If writing != 0, then the HPTE must allow writing, if we get here */ 588 write_ok = writing; 589 hva = gfn_to_hva_memslot(memslot, gfn); 590 591 /* 592 * Do a fast check first, since __gfn_to_pfn_memslot doesn't 593 * do it with !atomic && !async, which is how we call it. 594 * We always ask for write permission since the common case 595 * is that the page is writable. 596 */ 597 if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { 598 write_ok = true; 599 } else { 600 /* Call KVM generic code to do the slow-path check */ 601 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 602 writing, &write_ok, NULL); 603 if (is_error_noslot_pfn(pfn)) 604 return -EFAULT; 605 page = NULL; 606 if (pfn_valid(pfn)) { 607 page = pfn_to_page(pfn); 608 if (PageReserved(page)) 609 page = NULL; 610 } 611 } 612 613 /* 614 * Read the PTE from the process' radix tree and use that 615 * so we get the shift and attribute bits. 616 */ 617 spin_lock(&kvm->mmu_lock); 618 ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift); 619 pte = __pte(0); 620 if (ptep) 621 pte = READ_ONCE(*ptep); 622 spin_unlock(&kvm->mmu_lock); 623 /* 624 * If the PTE disappeared temporarily due to a THP 625 * collapse, just return and let the guest try again. 626 */ 627 if (!pte_present(pte)) { 628 if (page) 629 put_page(page); 630 return RESUME_GUEST; 631 } 632 hpa = pte_pfn(pte) << PAGE_SHIFT; 633 pte_size = PAGE_SIZE; 634 if (shift) 635 pte_size = 1ul << shift; 636 is_ci = pte_ci(pte); 637 638 if (psize > pte_size) 639 goto out_put; 640 if (pte_size > psize) 641 hpa |= hva & (pte_size - psize); 642 643 /* Check WIMG vs. the actual page we're accessing */ 644 if (!hpte_cache_flags_ok(r, is_ci)) { 645 if (is_ci) 646 goto out_put; 647 /* 648 * Allow guest to map emulated device memory as 649 * uncacheable, but actually make it cacheable. 650 */ 651 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; 652 } 653 654 /* 655 * Set the HPTE to point to hpa. 656 * Since the hpa is at PAGE_SIZE granularity, make sure we 657 * don't mask out lower-order bits if psize < PAGE_SIZE. 658 */ 659 if (psize < PAGE_SIZE) 660 psize = PAGE_SIZE; 661 r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa; 662 if (hpte_is_writable(r) && !write_ok) 663 r = hpte_make_readonly(r); 664 ret = RESUME_GUEST; 665 preempt_disable(); 666 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 667 cpu_relax(); 668 hnow_v = be64_to_cpu(hptep[0]); 669 hnow_r = be64_to_cpu(hptep[1]); 670 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 671 hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); 672 hnow_r = hpte_new_to_old_r(hnow_r); 673 } 674 675 /* 676 * If the HPT is being resized, don't update the HPTE, 677 * instead let the guest retry after the resize operation is complete. 678 * The synchronization for mmu_ready test vs. set is provided 679 * by the HPTE lock. 680 */ 681 if (!kvm->arch.mmu_ready) 682 goto out_unlock; 683 684 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 685 rev->guest_rpte != hpte[2]) 686 /* HPTE has been changed under us; let the guest retry */ 687 goto out_unlock; 688 hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 689 690 /* Always put the HPTE in the rmap chain for the page base address */ 691 rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; 692 lock_rmap(rmap); 693 694 /* Check if we might have been invalidated; let the guest retry if so */ 695 ret = RESUME_GUEST; 696 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 697 unlock_rmap(rmap); 698 goto out_unlock; 699 } 700 701 /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ 702 rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; 703 r &= rcbits | ~(HPTE_R_R | HPTE_R_C); 704 705 if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { 706 /* HPTE was previously valid, so we need to invalidate it */ 707 unlock_rmap(rmap); 708 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 709 kvmppc_invalidate_hpte(kvm, hptep, index); 710 /* don't lose previous R and C bits */ 711 r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 712 } else { 713 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); 714 } 715 716 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 717 r = hpte_old_to_new_r(hpte[0], r); 718 hpte[0] = hpte_old_to_new_v(hpte[0]); 719 } 720 hptep[1] = cpu_to_be64(r); 721 eieio(); 722 __unlock_hpte(hptep, hpte[0]); 723 asm volatile("ptesync" : : : "memory"); 724 preempt_enable(); 725 if (page && hpte_is_writable(r)) 726 set_page_dirty_lock(page); 727 728 out_put: 729 trace_kvm_page_fault_exit(vcpu, hpte, ret); 730 731 if (page) 732 put_page(page); 733 return ret; 734 735 out_unlock: 736 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 737 preempt_enable(); 738 goto out_put; 739} 740 741void kvmppc_rmap_reset(struct kvm *kvm) 742{ 743 struct kvm_memslots *slots; 744 struct kvm_memory_slot *memslot; 745 int srcu_idx, bkt; 746 747 srcu_idx = srcu_read_lock(&kvm->srcu); 748 slots = kvm_memslots(kvm); 749 kvm_for_each_memslot(memslot, bkt, slots) { 750 /* Mutual exclusion with kvm_unmap_hva_range etc. */ 751 spin_lock(&kvm->mmu_lock); 752 /* 753 * This assumes it is acceptable to lose reference and 754 * change bits across a reset. 755 */ 756 memset(memslot->arch.rmap, 0, 757 memslot->npages * sizeof(*memslot->arch.rmap)); 758 spin_unlock(&kvm->mmu_lock); 759 } 760 srcu_read_unlock(&kvm->srcu, srcu_idx); 761} 762 763/* Must be called with both HPTE and rmap locked */ 764static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 765 struct kvm_memory_slot *memslot, 766 unsigned long *rmapp, unsigned long gfn) 767{ 768 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 769 struct revmap_entry *rev = kvm->arch.hpt.rev; 770 unsigned long j, h; 771 unsigned long ptel, psize, rcbits; 772 773 j = rev[i].forw; 774 if (j == i) { 775 /* chain is now empty */ 776 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); 777 } else { 778 /* remove i from chain */ 779 h = rev[i].back; 780 rev[h].forw = j; 781 rev[j].back = h; 782 rev[i].forw = rev[i].back = i; 783 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; 784 } 785 786 /* Now check and modify the HPTE */ 787 ptel = rev[i].guest_rpte; 788 psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); 789 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 790 hpte_rpn(ptel, psize) == gfn) { 791 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 792 kvmppc_invalidate_hpte(kvm, hptep, i); 793 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); 794 /* Harvest R and C */ 795 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 796 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 797 if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) 798 kvmppc_update_dirty_map(memslot, gfn, psize); 799 if (rcbits & ~rev[i].guest_rpte) { 800 rev[i].guest_rpte = ptel | rcbits; 801 note_hpte_modification(kvm, &rev[i]); 802 } 803 } 804} 805 806static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 807 unsigned long gfn) 808{ 809 unsigned long i; 810 __be64 *hptep; 811 unsigned long *rmapp; 812 813 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 814 for (;;) { 815 lock_rmap(rmapp); 816 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 817 unlock_rmap(rmapp); 818 break; 819 } 820 821 /* 822 * To avoid an ABBA deadlock with the HPTE lock bit, 823 * we can't spin on the HPTE lock while holding the 824 * rmap chain lock. 825 */ 826 i = *rmapp & KVMPPC_RMAP_INDEX; 827 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 828 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 829 /* unlock rmap before spinning on the HPTE lock */ 830 unlock_rmap(rmapp); 831 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 832 cpu_relax(); 833 continue; 834 } 835 836 kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); 837 unlock_rmap(rmapp); 838 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 839 } 840} 841 842bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) 843{ 844 gfn_t gfn; 845 846 if (kvm_is_radix(kvm)) { 847 for (gfn = range->start; gfn < range->end; gfn++) 848 kvm_unmap_radix(kvm, range->slot, gfn); 849 } else { 850 for (gfn = range->start; gfn < range->end; gfn++) 851 kvm_unmap_rmapp(kvm, range->slot, gfn); 852 } 853 854 return false; 855} 856 857void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 858 struct kvm_memory_slot *memslot) 859{ 860 unsigned long gfn; 861 unsigned long n; 862 unsigned long *rmapp; 863 864 gfn = memslot->base_gfn; 865 rmapp = memslot->arch.rmap; 866 if (kvm_is_radix(kvm)) { 867 kvmppc_radix_flush_memslot(kvm, memslot); 868 return; 869 } 870 871 for (n = memslot->npages; n; --n, ++gfn) { 872 /* 873 * Testing the present bit without locking is OK because 874 * the memslot has been marked invalid already, and hence 875 * no new HPTEs referencing this page can be created, 876 * thus the present bit can't go from 0 to 1. 877 */ 878 if (*rmapp & KVMPPC_RMAP_PRESENT) 879 kvm_unmap_rmapp(kvm, memslot, gfn); 880 ++rmapp; 881 } 882} 883 884static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 885 unsigned long gfn) 886{ 887 struct revmap_entry *rev = kvm->arch.hpt.rev; 888 unsigned long head, i, j; 889 __be64 *hptep; 890 bool ret = false; 891 unsigned long *rmapp; 892 893 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 894 retry: 895 lock_rmap(rmapp); 896 if (*rmapp & KVMPPC_RMAP_REFERENCED) { 897 *rmapp &= ~KVMPPC_RMAP_REFERENCED; 898 ret = true; 899 } 900 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 901 unlock_rmap(rmapp); 902 return ret; 903 } 904 905 i = head = *rmapp & KVMPPC_RMAP_INDEX; 906 do { 907 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 908 j = rev[i].forw; 909 910 /* If this HPTE isn't referenced, ignore it */ 911 if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) 912 continue; 913 914 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 915 /* unlock rmap before spinning on the HPTE lock */ 916 unlock_rmap(rmapp); 917 while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) 918 cpu_relax(); 919 goto retry; 920 } 921 922 /* Now check and modify the HPTE */ 923 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 924 (be64_to_cpu(hptep[1]) & HPTE_R_R)) { 925 kvmppc_clear_ref_hpte(kvm, hptep, i); 926 if (!(rev[i].guest_rpte & HPTE_R_R)) { 927 rev[i].guest_rpte |= HPTE_R_R; 928 note_hpte_modification(kvm, &rev[i]); 929 } 930 ret = true; 931 } 932 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 933 } while ((i = j) != head); 934 935 unlock_rmap(rmapp); 936 return ret; 937} 938 939bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) 940{ 941 gfn_t gfn; 942 bool ret = false; 943 944 if (kvm_is_radix(kvm)) { 945 for (gfn = range->start; gfn < range->end; gfn++) 946 ret |= kvm_age_radix(kvm, range->slot, gfn); 947 } else { 948 for (gfn = range->start; gfn < range->end; gfn++) 949 ret |= kvm_age_rmapp(kvm, range->slot, gfn); 950 } 951 952 return ret; 953} 954 955static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 956 unsigned long gfn) 957{ 958 struct revmap_entry *rev = kvm->arch.hpt.rev; 959 unsigned long head, i, j; 960 unsigned long *hp; 961 bool ret = true; 962 unsigned long *rmapp; 963 964 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 965 if (*rmapp & KVMPPC_RMAP_REFERENCED) 966 return true; 967 968 lock_rmap(rmapp); 969 if (*rmapp & KVMPPC_RMAP_REFERENCED) 970 goto out; 971 972 if (*rmapp & KVMPPC_RMAP_PRESENT) { 973 i = head = *rmapp & KVMPPC_RMAP_INDEX; 974 do { 975 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); 976 j = rev[i].forw; 977 if (be64_to_cpu(hp[1]) & HPTE_R_R) 978 goto out; 979 } while ((i = j) != head); 980 } 981 ret = false; 982 983 out: 984 unlock_rmap(rmapp); 985 return ret; 986} 987 988bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) 989{ 990 WARN_ON(range->start + 1 != range->end); 991 992 if (kvm_is_radix(kvm)) 993 return kvm_test_age_radix(kvm, range->slot, range->start); 994 else 995 return kvm_test_age_rmapp(kvm, range->slot, range->start); 996} 997 998bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) 999{ 1000 WARN_ON(range->start + 1 != range->end); 1001 1002 if (kvm_is_radix(kvm)) 1003 kvm_unmap_radix(kvm, range->slot, range->start); 1004 else 1005 kvm_unmap_rmapp(kvm, range->slot, range->start); 1006 1007 return false; 1008} 1009 1010static int vcpus_running(struct kvm *kvm) 1011{ 1012 return atomic_read(&kvm->arch.vcpus_running) != 0; 1013} 1014 1015/* 1016 * Returns the number of system pages that are dirty. 1017 * This can be more than 1 if we find a huge-page HPTE. 1018 */ 1019static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1020{ 1021 struct revmap_entry *rev = kvm->arch.hpt.rev; 1022 unsigned long head, i, j; 1023 unsigned long n; 1024 unsigned long v, r; 1025 __be64 *hptep; 1026 int npages_dirty = 0; 1027 1028 retry: 1029 lock_rmap(rmapp); 1030 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1031 unlock_rmap(rmapp); 1032 return npages_dirty; 1033 } 1034 1035 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1036 do { 1037 unsigned long hptep1; 1038 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 1039 j = rev[i].forw; 1040 1041 /* 1042 * Checking the C (changed) bit here is racy since there 1043 * is no guarantee about when the hardware writes it back. 1044 * If the HPTE is not writable then it is stable since the 1045 * page can't be written to, and we would have done a tlbie 1046 * (which forces the hardware to complete any writeback) 1047 * when making the HPTE read-only. 1048 * If vcpus are running then this call is racy anyway 1049 * since the page could get dirtied subsequently, so we 1050 * expect there to be a further call which would pick up 1051 * any delayed C bit writeback. 1052 * Otherwise we need to do the tlbie even if C==0 in 1053 * order to pick up any delayed writeback of C. 1054 */ 1055 hptep1 = be64_to_cpu(hptep[1]); 1056 if (!(hptep1 & HPTE_R_C) && 1057 (!hpte_is_writable(hptep1) || vcpus_running(kvm))) 1058 continue; 1059 1060 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 1061 /* unlock rmap before spinning on the HPTE lock */ 1062 unlock_rmap(rmapp); 1063 while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) 1064 cpu_relax(); 1065 goto retry; 1066 } 1067 1068 /* Now check and modify the HPTE */ 1069 if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { 1070 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 1071 continue; 1072 } 1073 1074 /* need to make it temporarily absent so C is stable */ 1075 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 1076 kvmppc_invalidate_hpte(kvm, hptep, i); 1077 v = be64_to_cpu(hptep[0]); 1078 r = be64_to_cpu(hptep[1]); 1079 if (r & HPTE_R_C) { 1080 hptep[1] = cpu_to_be64(r & ~HPTE_R_C); 1081 if (!(rev[i].guest_rpte & HPTE_R_C)) { 1082 rev[i].guest_rpte |= HPTE_R_C; 1083 note_hpte_modification(kvm, &rev[i]); 1084 } 1085 n = kvmppc_actual_pgsz(v, r); 1086 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1087 if (n > npages_dirty) 1088 npages_dirty = n; 1089 eieio(); 1090 } 1091 v &= ~HPTE_V_ABSENT; 1092 v |= HPTE_V_VALID; 1093 __unlock_hpte(hptep, v); 1094 } while ((i = j) != head); 1095 1096 unlock_rmap(rmapp); 1097 return npages_dirty; 1098} 1099 1100void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, 1101 struct kvm_memory_slot *memslot, 1102 unsigned long *map) 1103{ 1104 unsigned long gfn; 1105 1106 if (!vpa->dirty || !vpa->pinned_addr) 1107 return; 1108 gfn = vpa->gpa >> PAGE_SHIFT; 1109 if (gfn < memslot->base_gfn || 1110 gfn >= memslot->base_gfn + memslot->npages) 1111 return; 1112 1113 vpa->dirty = false; 1114 if (map) 1115 __set_bit_le(gfn - memslot->base_gfn, map); 1116} 1117 1118long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1119 struct kvm_memory_slot *memslot, unsigned long *map) 1120{ 1121 unsigned long i; 1122 unsigned long *rmapp; 1123 1124 preempt_disable(); 1125 rmapp = memslot->arch.rmap; 1126 for (i = 0; i < memslot->npages; ++i) { 1127 int npages = kvm_test_clear_dirty_npages(kvm, rmapp); 1128 /* 1129 * Note that if npages > 0 then i must be a multiple of npages, 1130 * since we always put huge-page HPTEs in the rmap chain 1131 * corresponding to their page base address. 1132 */ 1133 if (npages) 1134 set_dirty_bits(map, i, npages); 1135 ++rmapp; 1136 } 1137 preempt_enable(); 1138 return 0; 1139} 1140 1141void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, 1142 unsigned long *nb_ret) 1143{ 1144 struct kvm_memory_slot *memslot; 1145 unsigned long gfn = gpa >> PAGE_SHIFT; 1146 struct page *page, *pages[1]; 1147 int npages; 1148 unsigned long hva, offset; 1149 int srcu_idx; 1150 1151 srcu_idx = srcu_read_lock(&kvm->srcu); 1152 memslot = gfn_to_memslot(kvm, gfn); 1153 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1154 goto err; 1155 hva = gfn_to_hva_memslot(memslot, gfn); 1156 npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); 1157 if (npages < 1) 1158 goto err; 1159 page = pages[0]; 1160 srcu_read_unlock(&kvm->srcu, srcu_idx); 1161 1162 offset = gpa & (PAGE_SIZE - 1); 1163 if (nb_ret) 1164 *nb_ret = PAGE_SIZE - offset; 1165 return page_address(page) + offset; 1166 1167 err: 1168 srcu_read_unlock(&kvm->srcu, srcu_idx); 1169 return NULL; 1170} 1171 1172void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, 1173 bool dirty) 1174{ 1175 struct page *page = virt_to_page(va); 1176 struct kvm_memory_slot *memslot; 1177 unsigned long gfn; 1178 int srcu_idx; 1179 1180 put_page(page); 1181 1182 if (!dirty) 1183 return; 1184 1185 /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ 1186 gfn = gpa >> PAGE_SHIFT; 1187 srcu_idx = srcu_read_lock(&kvm->srcu); 1188 memslot = gfn_to_memslot(kvm, gfn); 1189 if (memslot && memslot->dirty_bitmap) 1190 set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); 1191 srcu_read_unlock(&kvm->srcu, srcu_idx); 1192} 1193 1194/* 1195 * HPT resizing 1196 */ 1197static int resize_hpt_allocate(struct kvm_resize_hpt *resize) 1198{ 1199 int rc; 1200 1201 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); 1202 if (rc < 0) 1203 return rc; 1204 1205 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", 1206 resize->hpt.virt); 1207 1208 return 0; 1209} 1210 1211static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, 1212 unsigned long idx) 1213{ 1214 struct kvm *kvm = resize->kvm; 1215 struct kvm_hpt_info *old = &kvm->arch.hpt; 1216 struct kvm_hpt_info *new = &resize->hpt; 1217 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; 1218 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; 1219 __be64 *hptep, *new_hptep; 1220 unsigned long vpte, rpte, guest_rpte; 1221 int ret; 1222 struct revmap_entry *rev; 1223 unsigned long apsize, avpn, pteg, hash; 1224 unsigned long new_idx, new_pteg, replace_vpte; 1225 int pshift; 1226 1227 hptep = (__be64 *)(old->virt + (idx << 4)); 1228 1229 /* Guest is stopped, so new HPTEs can't be added or faulted 1230 * in, only unmapped or altered by host actions. So, it's 1231 * safe to check this before we take the HPTE lock */ 1232 vpte = be64_to_cpu(hptep[0]); 1233 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1234 return 0; /* nothing to do */ 1235 1236 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 1237 cpu_relax(); 1238 1239 vpte = be64_to_cpu(hptep[0]); 1240 1241 ret = 0; 1242 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) 1243 /* Nothing to do */ 1244 goto out; 1245 1246 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1247 rpte = be64_to_cpu(hptep[1]); 1248 vpte = hpte_new_to_old_v(vpte, rpte); 1249 } 1250 1251 /* Unmap */ 1252 rev = &old->rev[idx]; 1253 guest_rpte = rev->guest_rpte; 1254 1255 ret = -EIO; 1256 apsize = kvmppc_actual_pgsz(vpte, guest_rpte); 1257 if (!apsize) 1258 goto out; 1259 1260 if (vpte & HPTE_V_VALID) { 1261 unsigned long gfn = hpte_rpn(guest_rpte, apsize); 1262 int srcu_idx = srcu_read_lock(&kvm->srcu); 1263 struct kvm_memory_slot *memslot = 1264 __gfn_to_memslot(kvm_memslots(kvm), gfn); 1265 1266 if (memslot) { 1267 unsigned long *rmapp; 1268 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1269 1270 lock_rmap(rmapp); 1271 kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); 1272 unlock_rmap(rmapp); 1273 } 1274 1275 srcu_read_unlock(&kvm->srcu, srcu_idx); 1276 } 1277 1278 /* Reload PTE after unmap */ 1279 vpte = be64_to_cpu(hptep[0]); 1280 BUG_ON(vpte & HPTE_V_VALID); 1281 BUG_ON(!(vpte & HPTE_V_ABSENT)); 1282 1283 ret = 0; 1284 if (!(vpte & HPTE_V_BOLTED)) 1285 goto out; 1286 1287 rpte = be64_to_cpu(hptep[1]); 1288 1289 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1290 vpte = hpte_new_to_old_v(vpte, rpte); 1291 rpte = hpte_new_to_old_r(rpte); 1292 } 1293 1294 pshift = kvmppc_hpte_base_page_shift(vpte, rpte); 1295 avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23); 1296 pteg = idx / HPTES_PER_GROUP; 1297 if (vpte & HPTE_V_SECONDARY) 1298 pteg = ~pteg; 1299 1300 if (!(vpte & HPTE_V_1TB_SEG)) { 1301 unsigned long offset, vsid; 1302 1303 /* We only have 28 - 23 bits of offset in avpn */ 1304 offset = (avpn & 0x1f) << 23; 1305 vsid = avpn >> 5; 1306 /* We can find more bits from the pteg value */ 1307 if (pshift < 23) 1308 offset |= ((vsid ^ pteg) & old_hash_mask) << pshift; 1309 1310 hash = vsid ^ (offset >> pshift); 1311 } else { 1312 unsigned long offset, vsid; 1313 1314 /* We only have 40 - 23 bits of seg_off in avpn */ 1315 offset = (avpn & 0x1ffff) << 23; 1316 vsid = avpn >> 17; 1317 if (pshift < 23) 1318 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift; 1319 1320 hash = vsid ^ (vsid << 25) ^ (offset >> pshift); 1321 } 1322 1323 new_pteg = hash & new_hash_mask; 1324 if (vpte & HPTE_V_SECONDARY) 1325 new_pteg = ~hash & new_hash_mask; 1326 1327 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); 1328 new_hptep = (__be64 *)(new->virt + (new_idx << 4)); 1329 1330 replace_vpte = be64_to_cpu(new_hptep[0]); 1331 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1332 unsigned long replace_rpte = be64_to_cpu(new_hptep[1]); 1333 replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte); 1334 } 1335 1336 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1337 BUG_ON(new->order >= old->order); 1338 1339 if (replace_vpte & HPTE_V_BOLTED) { 1340 if (vpte & HPTE_V_BOLTED) 1341 /* Bolted collision, nothing we can do */ 1342 ret = -ENOSPC; 1343 /* Discard the new HPTE */ 1344 goto out; 1345 } 1346 1347 /* Discard the previous HPTE */ 1348 } 1349 1350 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1351 rpte = hpte_old_to_new_r(vpte, rpte); 1352 vpte = hpte_old_to_new_v(vpte); 1353 } 1354 1355 new_hptep[1] = cpu_to_be64(rpte); 1356 new->rev[new_idx].guest_rpte = guest_rpte; 1357 /* No need for a barrier, since new HPT isn't active */ 1358 new_hptep[0] = cpu_to_be64(vpte); 1359 unlock_hpte(new_hptep, vpte); 1360 1361out: 1362 unlock_hpte(hptep, vpte); 1363 return ret; 1364} 1365 1366static int resize_hpt_rehash(struct kvm_resize_hpt *resize) 1367{ 1368 struct kvm *kvm = resize->kvm; 1369 unsigned long i; 1370 int rc; 1371 1372 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { 1373 rc = resize_hpt_rehash_hpte(resize, i); 1374 if (rc != 0) 1375 return rc; 1376 } 1377 1378 return 0; 1379} 1380 1381static void resize_hpt_pivot(struct kvm_resize_hpt *resize) 1382{ 1383 struct kvm *kvm = resize->kvm; 1384 struct kvm_hpt_info hpt_tmp; 1385 1386 /* Exchange the pending tables in the resize structure with 1387 * the active tables */ 1388 1389 resize_hpt_debug(resize, "resize_hpt_pivot()\n"); 1390 1391 spin_lock(&kvm->mmu_lock); 1392 asm volatile("ptesync" : : : "memory"); 1393 1394 hpt_tmp = kvm->arch.hpt; 1395 kvmppc_set_hpt(kvm, &resize->hpt); 1396 resize->hpt = hpt_tmp; 1397 1398 spin_unlock(&kvm->mmu_lock); 1399 1400 synchronize_srcu_expedited(&kvm->srcu); 1401 1402 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1403 kvmppc_setup_partition_table(kvm); 1404 1405 resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); 1406} 1407 1408static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) 1409{ 1410 if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock))) 1411 return; 1412 1413 if (!resize) 1414 return; 1415 1416 if (resize->error != -EBUSY) { 1417 if (resize->hpt.virt) 1418 kvmppc_free_hpt(&resize->hpt); 1419 kfree(resize); 1420 } 1421 1422 if (kvm->arch.resize_hpt == resize) 1423 kvm->arch.resize_hpt = NULL; 1424} 1425 1426static void resize_hpt_prepare_work(struct work_struct *work) 1427{ 1428 struct kvm_resize_hpt *resize = container_of(work, 1429 struct kvm_resize_hpt, 1430 work); 1431 struct kvm *kvm = resize->kvm; 1432 int err = 0; 1433 1434 if (WARN_ON(resize->error != -EBUSY)) 1435 return; 1436 1437 mutex_lock(&kvm->arch.mmu_setup_lock); 1438 1439 /* Request is still current? */ 1440 if (kvm->arch.resize_hpt == resize) { 1441 /* We may request large allocations here: 1442 * do not sleep with kvm->arch.mmu_setup_lock held for a while. 1443 */ 1444 mutex_unlock(&kvm->arch.mmu_setup_lock); 1445 1446 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", 1447 resize->order); 1448 1449 err = resize_hpt_allocate(resize); 1450 1451 /* We have strict assumption about -EBUSY 1452 * when preparing for HPT resize. 1453 */ 1454 if (WARN_ON(err == -EBUSY)) 1455 err = -EINPROGRESS; 1456 1457 mutex_lock(&kvm->arch.mmu_setup_lock); 1458 /* It is possible that kvm->arch.resize_hpt != resize 1459 * after we grab kvm->arch.mmu_setup_lock again. 1460 */ 1461 } 1462 1463 resize->error = err; 1464 1465 if (kvm->arch.resize_hpt != resize) 1466 resize_hpt_release(kvm, resize); 1467 1468 mutex_unlock(&kvm->arch.mmu_setup_lock); 1469} 1470 1471long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, 1472 struct kvm_ppc_resize_hpt *rhpt) 1473{ 1474 unsigned long flags = rhpt->flags; 1475 unsigned long shift = rhpt->shift; 1476 struct kvm_resize_hpt *resize; 1477 int ret; 1478 1479 if (flags != 0 || kvm_is_radix(kvm)) 1480 return -EINVAL; 1481 1482 if (shift && ((shift < 18) || (shift > 46))) 1483 return -EINVAL; 1484 1485 mutex_lock(&kvm->arch.mmu_setup_lock); 1486 1487 resize = kvm->arch.resize_hpt; 1488 1489 if (resize) { 1490 if (resize->order == shift) { 1491 /* Suitable resize in progress? */ 1492 ret = resize->error; 1493 if (ret == -EBUSY) 1494 ret = 100; /* estimated time in ms */ 1495 else if (ret) 1496 resize_hpt_release(kvm, resize); 1497 1498 goto out; 1499 } 1500 1501 /* not suitable, cancel it */ 1502 resize_hpt_release(kvm, resize); 1503 } 1504 1505 ret = 0; 1506 if (!shift) 1507 goto out; /* nothing to do */ 1508 1509 /* start new resize */ 1510 1511 resize = kzalloc(sizeof(*resize), GFP_KERNEL); 1512 if (!resize) { 1513 ret = -ENOMEM; 1514 goto out; 1515 } 1516 1517 resize->error = -EBUSY; 1518 resize->order = shift; 1519 resize->kvm = kvm; 1520 INIT_WORK(&resize->work, resize_hpt_prepare_work); 1521 kvm->arch.resize_hpt = resize; 1522 1523 schedule_work(&resize->work); 1524 1525 ret = 100; /* estimated time in ms */ 1526 1527out: 1528 mutex_unlock(&kvm->arch.mmu_setup_lock); 1529 return ret; 1530} 1531 1532static void resize_hpt_boot_vcpu(void *opaque) 1533{ 1534 /* Nothing to do, just force a KVM exit */ 1535} 1536 1537long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, 1538 struct kvm_ppc_resize_hpt *rhpt) 1539{ 1540 unsigned long flags = rhpt->flags; 1541 unsigned long shift = rhpt->shift; 1542 struct kvm_resize_hpt *resize; 1543 long ret; 1544 1545 if (flags != 0 || kvm_is_radix(kvm)) 1546 return -EINVAL; 1547 1548 if (shift && ((shift < 18) || (shift > 46))) 1549 return -EINVAL; 1550 1551 mutex_lock(&kvm->arch.mmu_setup_lock); 1552 1553 resize = kvm->arch.resize_hpt; 1554 1555 /* This shouldn't be possible */ 1556 ret = -EIO; 1557 if (WARN_ON(!kvm->arch.mmu_ready)) 1558 goto out_no_hpt; 1559 1560 /* Stop VCPUs from running while we mess with the HPT */ 1561 kvm->arch.mmu_ready = 0; 1562 smp_mb(); 1563 1564 /* Boot all CPUs out of the guest so they re-read 1565 * mmu_ready */ 1566 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1567 1568 ret = -ENXIO; 1569 if (!resize || (resize->order != shift)) 1570 goto out; 1571 1572 ret = resize->error; 1573 if (ret) 1574 goto out; 1575 1576 ret = resize_hpt_rehash(resize); 1577 if (ret) 1578 goto out; 1579 1580 resize_hpt_pivot(resize); 1581 1582out: 1583 /* Let VCPUs run again */ 1584 kvm->arch.mmu_ready = 1; 1585 smp_mb(); 1586out_no_hpt: 1587 resize_hpt_release(kvm, resize); 1588 mutex_unlock(&kvm->arch.mmu_setup_lock); 1589 return ret; 1590} 1591 1592/* 1593 * Functions for reading and writing the hash table via reads and 1594 * writes on a file descriptor. 1595 * 1596 * Reads return the guest view of the hash table, which has to be 1597 * pieced together from the real hash table and the guest_rpte 1598 * values in the revmap array. 1599 * 1600 * On writes, each HPTE written is considered in turn, and if it 1601 * is valid, it is written to the HPT as if an H_ENTER with the 1602 * exact flag set was done. When the invalid count is non-zero 1603 * in the header written to the stream, the kernel will make 1604 * sure that that many HPTEs are invalid, and invalidate them 1605 * if not. 1606 */ 1607 1608struct kvm_htab_ctx { 1609 unsigned long index; 1610 unsigned long flags; 1611 struct kvm *kvm; 1612 int first_pass; 1613}; 1614 1615#define HPTE_SIZE (2 * sizeof(unsigned long)) 1616 1617/* 1618 * Returns 1 if this HPT entry has been modified or has pending 1619 * R/C bit changes. 1620 */ 1621static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) 1622{ 1623 unsigned long rcbits_unset; 1624 1625 if (revp->guest_rpte & HPTE_GR_MODIFIED) 1626 return 1; 1627 1628 /* Also need to consider changes in reference and changed bits */ 1629 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1630 if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && 1631 (be64_to_cpu(hptp[1]) & rcbits_unset)) 1632 return 1; 1633 1634 return 0; 1635} 1636 1637static long record_hpte(unsigned long flags, __be64 *hptp, 1638 unsigned long *hpte, struct revmap_entry *revp, 1639 int want_valid, int first_pass) 1640{ 1641 unsigned long v, r, hr; 1642 unsigned long rcbits_unset; 1643 int ok = 1; 1644 int valid, dirty; 1645 1646 /* Unmodified entries are uninteresting except on the first pass */ 1647 dirty = hpte_dirty(revp, hptp); 1648 if (!first_pass && !dirty) 1649 return 0; 1650 1651 valid = 0; 1652 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1653 valid = 1; 1654 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1655 !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) 1656 valid = 0; 1657 } 1658 if (valid != want_valid) 1659 return 0; 1660 1661 v = r = 0; 1662 if (valid || dirty) { 1663 /* lock the HPTE so it's stable and read it */ 1664 preempt_disable(); 1665 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1666 cpu_relax(); 1667 v = be64_to_cpu(hptp[0]); 1668 hr = be64_to_cpu(hptp[1]); 1669 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1670 v = hpte_new_to_old_v(v, hr); 1671 hr = hpte_new_to_old_r(hr); 1672 } 1673 1674 /* re-evaluate valid and dirty from synchronized HPTE value */ 1675 valid = !!(v & HPTE_V_VALID); 1676 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1677 1678 /* Harvest R and C into guest view if necessary */ 1679 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); 1680 if (valid && (rcbits_unset & hr)) { 1681 revp->guest_rpte |= (hr & 1682 (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; 1683 dirty = 1; 1684 } 1685 1686 if (v & HPTE_V_ABSENT) { 1687 v &= ~HPTE_V_ABSENT; 1688 v |= HPTE_V_VALID; 1689 valid = 1; 1690 } 1691 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1692 valid = 0; 1693 1694 r = revp->guest_rpte; 1695 /* only clear modified if this is the right sort of entry */ 1696 if (valid == want_valid && dirty) { 1697 r &= ~HPTE_GR_MODIFIED; 1698 revp->guest_rpte = r; 1699 } 1700 unlock_hpte(hptp, be64_to_cpu(hptp[0])); 1701 preempt_enable(); 1702 if (!(valid == want_valid && (first_pass || dirty))) 1703 ok = 0; 1704 } 1705 hpte[0] = cpu_to_be64(v); 1706 hpte[1] = cpu_to_be64(r); 1707 return ok; 1708} 1709 1710static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1711 size_t count, loff_t *ppos) 1712{ 1713 struct kvm_htab_ctx *ctx = file->private_data; 1714 struct kvm *kvm = ctx->kvm; 1715 struct kvm_get_htab_header hdr; 1716 __be64 *hptp; 1717 struct revmap_entry *revp; 1718 unsigned long i, nb, nw; 1719 unsigned long __user *lbuf; 1720 struct kvm_get_htab_header __user *hptr; 1721 unsigned long flags; 1722 int first_pass; 1723 unsigned long hpte[2]; 1724 1725 if (!access_ok(buf, count)) 1726 return -EFAULT; 1727 if (kvm_is_radix(kvm)) 1728 return 0; 1729 1730 first_pass = ctx->first_pass; 1731 flags = ctx->flags; 1732 1733 i = ctx->index; 1734 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1735 revp = kvm->arch.hpt.rev + i; 1736 lbuf = (unsigned long __user *)buf; 1737 1738 nb = 0; 1739 while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1740 /* Initialize header */ 1741 hptr = (struct kvm_get_htab_header __user *)buf; 1742 hdr.n_valid = 0; 1743 hdr.n_invalid = 0; 1744 nw = nb; 1745 nb += sizeof(hdr); 1746 lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1747 1748 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1749 if (!first_pass) { 1750 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1751 !hpte_dirty(revp, hptp)) { 1752 ++i; 1753 hptp += 2; 1754 ++revp; 1755 } 1756 } 1757 hdr.index = i; 1758 1759 /* Grab a series of valid entries */ 1760 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1761 hdr.n_valid < 0xffff && 1762 nb + HPTE_SIZE < count && 1763 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1764 /* valid entry, write it out */ 1765 ++hdr.n_valid; 1766 if (__put_user(hpte[0], lbuf) || 1767 __put_user(hpte[1], lbuf + 1)) 1768 return -EFAULT; 1769 nb += HPTE_SIZE; 1770 lbuf += 2; 1771 ++i; 1772 hptp += 2; 1773 ++revp; 1774 } 1775 /* Now skip invalid entries while we can */ 1776 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && 1777 hdr.n_invalid < 0xffff && 1778 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1779 /* found an invalid entry */ 1780 ++hdr.n_invalid; 1781 ++i; 1782 hptp += 2; 1783 ++revp; 1784 } 1785 1786 if (hdr.n_valid || hdr.n_invalid) { 1787 /* write back the header */ 1788 if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1789 return -EFAULT; 1790 nw = nb; 1791 buf = (char __user *)lbuf; 1792 } else { 1793 nb = nw; 1794 } 1795 1796 /* Check if we've wrapped around the hash table */ 1797 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { 1798 i = 0; 1799 ctx->first_pass = 0; 1800 break; 1801 } 1802 } 1803 1804 ctx->index = i; 1805 1806 return nb; 1807} 1808 1809static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1810 size_t count, loff_t *ppos) 1811{ 1812 struct kvm_htab_ctx *ctx = file->private_data; 1813 struct kvm *kvm = ctx->kvm; 1814 struct kvm_get_htab_header hdr; 1815 unsigned long i, j; 1816 unsigned long v, r; 1817 unsigned long __user *lbuf; 1818 __be64 *hptp; 1819 unsigned long tmp[2]; 1820 ssize_t nb; 1821 long int err, ret; 1822 int mmu_ready; 1823 int pshift; 1824 1825 if (!access_ok(buf, count)) 1826 return -EFAULT; 1827 if (kvm_is_radix(kvm)) 1828 return -EINVAL; 1829 1830 /* lock out vcpus from running while we're doing this */ 1831 mutex_lock(&kvm->arch.mmu_setup_lock); 1832 mmu_ready = kvm->arch.mmu_ready; 1833 if (mmu_ready) { 1834 kvm->arch.mmu_ready = 0; /* temporarily */ 1835 /* order mmu_ready vs. vcpus_running */ 1836 smp_mb(); 1837 if (atomic_read(&kvm->arch.vcpus_running)) { 1838 kvm->arch.mmu_ready = 1; 1839 mutex_unlock(&kvm->arch.mmu_setup_lock); 1840 return -EBUSY; 1841 } 1842 } 1843 1844 err = 0; 1845 for (nb = 0; nb + sizeof(hdr) <= count; ) { 1846 err = -EFAULT; 1847 if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1848 break; 1849 1850 err = 0; 1851 if (nb + hdr.n_valid * HPTE_SIZE > count) 1852 break; 1853 1854 nb += sizeof(hdr); 1855 buf += sizeof(hdr); 1856 1857 err = -EINVAL; 1858 i = hdr.index; 1859 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || 1860 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) 1861 break; 1862 1863 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 1864 lbuf = (unsigned long __user *)buf; 1865 for (j = 0; j < hdr.n_valid; ++j) { 1866 __be64 hpte_v; 1867 __be64 hpte_r; 1868 1869 err = -EFAULT; 1870 if (__get_user(hpte_v, lbuf) || 1871 __get_user(hpte_r, lbuf + 1)) 1872 goto out; 1873 v = be64_to_cpu(hpte_v); 1874 r = be64_to_cpu(hpte_r); 1875 err = -EINVAL; 1876 if (!(v & HPTE_V_VALID)) 1877 goto out; 1878 pshift = kvmppc_hpte_base_page_shift(v, r); 1879 if (pshift <= 0) 1880 goto out; 1881 lbuf += 2; 1882 nb += HPTE_SIZE; 1883 1884 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1885 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1886 err = -EIO; 1887 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1888 tmp); 1889 if (ret != H_SUCCESS) { 1890 pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1891 "r=%lx\n", ret, i, v, r); 1892 goto out; 1893 } 1894 if (!mmu_ready && is_vrma_hpte(v)) { 1895 unsigned long senc, lpcr; 1896 1897 senc = slb_pgsize_encoding(1ul << pshift); 1898 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1899 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1900 if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 1901 lpcr = senc << (LPCR_VRMASD_SH - 4); 1902 kvmppc_update_lpcr(kvm, lpcr, 1903 LPCR_VRMASD); 1904 } else { 1905 kvmppc_setup_partition_table(kvm); 1906 } 1907 mmu_ready = 1; 1908 } 1909 ++i; 1910 hptp += 2; 1911 } 1912 1913 for (j = 0; j < hdr.n_invalid; ++j) { 1914 if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) 1915 kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1916 ++i; 1917 hptp += 2; 1918 } 1919 err = 0; 1920 } 1921 1922 out: 1923 /* Order HPTE updates vs. mmu_ready */ 1924 smp_wmb(); 1925 kvm->arch.mmu_ready = mmu_ready; 1926 mutex_unlock(&kvm->arch.mmu_setup_lock); 1927 1928 if (err) 1929 return err; 1930 return nb; 1931} 1932 1933static int kvm_htab_release(struct inode *inode, struct file *filp) 1934{ 1935 struct kvm_htab_ctx *ctx = filp->private_data; 1936 1937 filp->private_data = NULL; 1938 if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1939 atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1940 kvm_put_kvm(ctx->kvm); 1941 kfree(ctx); 1942 return 0; 1943} 1944 1945static const struct file_operations kvm_htab_fops = { 1946 .read = kvm_htab_read, 1947 .write = kvm_htab_write, 1948 .llseek = default_llseek, 1949 .release = kvm_htab_release, 1950}; 1951 1952int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1953{ 1954 int ret; 1955 struct kvm_htab_ctx *ctx; 1956 int rwflag; 1957 1958 /* reject flags we don't recognize */ 1959 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1960 return -EINVAL; 1961 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1962 if (!ctx) 1963 return -ENOMEM; 1964 kvm_get_kvm(kvm); 1965 ctx->kvm = kvm; 1966 ctx->index = ghf->start_index; 1967 ctx->flags = ghf->flags; 1968 ctx->first_pass = 1; 1969 1970 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; 1971 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); 1972 if (ret < 0) { 1973 kfree(ctx); 1974 kvm_put_kvm_no_destroy(kvm); 1975 return ret; 1976 } 1977 1978 if (rwflag == O_RDONLY) { 1979 mutex_lock(&kvm->slots_lock); 1980 atomic_inc(&kvm->arch.hpte_mod_interest); 1981 /* make sure kvmppc_do_h_enter etc. see the increment */ 1982 synchronize_srcu_expedited(&kvm->srcu); 1983 mutex_unlock(&kvm->slots_lock); 1984 } 1985 1986 return ret; 1987} 1988 1989struct debugfs_htab_state { 1990 struct kvm *kvm; 1991 struct mutex mutex; 1992 unsigned long hpt_index; 1993 int chars_left; 1994 int buf_index; 1995 char buf[64]; 1996}; 1997 1998static int debugfs_htab_open(struct inode *inode, struct file *file) 1999{ 2000 struct kvm *kvm = inode->i_private; 2001 struct debugfs_htab_state *p; 2002 2003 p = kzalloc(sizeof(*p), GFP_KERNEL); 2004 if (!p) 2005 return -ENOMEM; 2006 2007 kvm_get_kvm(kvm); 2008 p->kvm = kvm; 2009 mutex_init(&p->mutex); 2010 file->private_data = p; 2011 2012 return nonseekable_open(inode, file); 2013} 2014 2015static int debugfs_htab_release(struct inode *inode, struct file *file) 2016{ 2017 struct debugfs_htab_state *p = file->private_data; 2018 2019 kvm_put_kvm(p->kvm); 2020 kfree(p); 2021 return 0; 2022} 2023 2024static ssize_t debugfs_htab_read(struct file *file, char __user *buf, 2025 size_t len, loff_t *ppos) 2026{ 2027 struct debugfs_htab_state *p = file->private_data; 2028 ssize_t ret, r; 2029 unsigned long i, n; 2030 unsigned long v, hr, gr; 2031 struct kvm *kvm; 2032 __be64 *hptp; 2033 2034 kvm = p->kvm; 2035 if (kvm_is_radix(kvm)) 2036 return 0; 2037 2038 ret = mutex_lock_interruptible(&p->mutex); 2039 if (ret) 2040 return ret; 2041 2042 if (p->chars_left) { 2043 n = p->chars_left; 2044 if (n > len) 2045 n = len; 2046 r = copy_to_user(buf, p->buf + p->buf_index, n); 2047 n -= r; 2048 p->chars_left -= n; 2049 p->buf_index += n; 2050 buf += n; 2051 len -= n; 2052 ret = n; 2053 if (r) { 2054 if (!n) 2055 ret = -EFAULT; 2056 goto out; 2057 } 2058 } 2059 2060 i = p->hpt_index; 2061 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2062 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2063 ++i, hptp += 2) { 2064 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2065 continue; 2066 2067 /* lock the HPTE so it's stable and read it */ 2068 preempt_disable(); 2069 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 2070 cpu_relax(); 2071 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2072 hr = be64_to_cpu(hptp[1]); 2073 gr = kvm->arch.hpt.rev[i].guest_rpte; 2074 unlock_hpte(hptp, v); 2075 preempt_enable(); 2076 2077 if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) 2078 continue; 2079 2080 n = scnprintf(p->buf, sizeof(p->buf), 2081 "%6lx %.16lx %.16lx %.16lx\n", 2082 i, v, hr, gr); 2083 p->chars_left = n; 2084 if (n > len) 2085 n = len; 2086 r = copy_to_user(buf, p->buf, n); 2087 n -= r; 2088 p->chars_left -= n; 2089 p->buf_index = n; 2090 buf += n; 2091 len -= n; 2092 ret += n; 2093 if (r) { 2094 if (!ret) 2095 ret = -EFAULT; 2096 goto out; 2097 } 2098 } 2099 p->hpt_index = i; 2100 2101 out: 2102 mutex_unlock(&p->mutex); 2103 return ret; 2104} 2105 2106static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, 2107 size_t len, loff_t *ppos) 2108{ 2109 return -EACCES; 2110} 2111 2112static const struct file_operations debugfs_htab_fops = { 2113 .owner = THIS_MODULE, 2114 .open = debugfs_htab_open, 2115 .release = debugfs_htab_release, 2116 .read = debugfs_htab_read, 2117 .write = debugfs_htab_write, 2118 .llseek = generic_file_llseek, 2119}; 2120 2121void kvmppc_mmu_debugfs_init(struct kvm *kvm) 2122{ 2123 debugfs_create_file("htab", 0400, kvm->debugfs_dentry, kvm, 2124 &debugfs_htab_fops); 2125} 2126 2127void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 2128{ 2129 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 2130 2131 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2132 2133 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; 2134 2135 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; 2136}