pgtable.c (30823B)

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);
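
/*
 * A minimal usage sketch for the helpers above: a driver mmap handler that
 * wants write-combined access to MMIO typically folds the attribute into
 * vm_page_prot before remapping. The handler name and the pfn below are
 * assumptions for illustration only; pgprot_writecombine() and
 * io_remap_pfn_range() are the real interfaces.
 *
 *	static int example_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long pfn = ...;	// device MMIO page frame (elided)
 *
 *		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *		return io_remap_pfn_range(vma, vma->vm_start, pfn,
 *					  vma->vm_end - vma->vm_start,
 *					  vma->vm_page_prot);
 *	}
 */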

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (MACHINE_HAS_TLB_GUEST) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}

static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (MACHINE_HAS_TLB_GUEST) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pgste_t pgste_get_lock(pte_t *ptep)
{
	unsigned long new = 0;
#ifdef CONFIG_PGSTE
	unsigned long old;

	asm(
		"	lg	%0,%2\n"
		"0:	lgr	%1,%0\n"
		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
		"	csg	%0,%1,%2\n"
		"	jl	0b\n"
		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
		: "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
#endif
	return __pgste(new);
}

static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	asm(
		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
		"	stg	%1,%0\n"
		: "=Q" (ptep[PTRS_PER_PTE])
		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
		: "cc", "memory");
#endif
}
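
/*
 * The two helpers above implement a tiny lock in the PGSTE itself: the PGSTE
 * for a pte sits PTRS_PER_PTE entries after it (the second half of the page
 * table page), and the csg loop in pgste_get_lock() spins until it can set
 * the PCL bit (0x0080 in the topmost halfword). Every user in this file
 * follows the same pattern, normally with preemption disabled, e.g.:
 *
 *	preempt_disable();
 *	pgste = pgste_get_lock(ptep);
 *	... read or modify pgste_val(pgste) and/or the pte ...
 *	pgste_set_unlock(ptep, pgste);
 *	preempt_enable();
 */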

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
#endif
	return pgste;
}

static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!MACHINE_HAS_ESOP) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste_val(pgste) |= PGSTE_UC_BIT;
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste_val(pgste) ^= bits;
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);
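
/*
 * ptep_xchg_direct() flushes the TLB entry immediately, while
 * ptep_xchg_lazy() may only mark the pte invalid and defer the flush when
 * the mm is attached to this CPU alone (see ptep_flush_lazy() above). A
 * sketch of how an arch-level helper can wrap the lazy variant, roughly
 * along the lines of the s390 ptep_get_and_clear() in <asm/pgtable.h>; the
 * name example_get_and_clear is made up:
 *
 *	static inline pte_t example_get_and_clear(struct mm_struct *mm,
 *						  unsigned long addr, pte_t *ptep)
 *	{
 *		return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
 *	}
 */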

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (!MACHINE_HAS_NX)
		pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
	preempt_enable();
}
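
/*
 * ptep_modify_prot_start() and ptep_modify_prot_commit() must be used as a
 * pair: start disables preemption and invalidates the pte, commit installs
 * the new pte and re-enables preemption. A sketch of the generic caller
 * pattern (simplified from what mm/mprotect.c does; newprot stands for the
 * target protection and is an assumption here):
 *
 *	oldpte = ptep_modify_prot_start(vma, addr, ptep);
 *	newpte = pte_modify(oldpte, newprot);
 *	ptep_modify_prot_commit(vma, addr, ptep, oldpte, newpte);
 */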

static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (MACHINE_HAS_TLB_GUEST) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (MACHINE_HAS_IDTE) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}

static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_large(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (MACHINE_HAS_TLB_GUEST)
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (MACHINE_HAS_IDTE)
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * re-use __pmdp_csp() here
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (MACHINE_HAS_TLB_LC &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) |= PGSTE_IN_BIT;
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste_val(pgste) |= bit;
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}
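
/*
 * A hedged sketch of the intended calling pattern for ptep_force_prot():
 * the gmap code (arch/s390/mm/gmap.c) uses it to downgrade access to a guest
 * page and arm a notification bit. The caller holds the pte lock; -EAGAIN
 * means the current access rights are incompatible with the request, and the
 * caller typically resolves the fault and retries. The locals below are
 * assumptions for illustration:
 *
 *	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 *	rc = ptep_force_prot(mm, addr, ptep, PROT_READ, PGSTE_IN_BIT);
 *	pte_unmap_unlock(ptep, ptl);
 *	// on -EAGAIN: fix up the mapping (e.g. fault the page in) and retry
 */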

int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		pgste_val(spgste) |= PGSTE_VSIE_BIT;
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = pfn_swap_entry_to_page(entry);

		dec_mm_counter(mm, mm_counter(page));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste_val(pgste) &= ~PGSTE_UC_BIT;
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
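
/*
 * ptep_test_and_clear_uc() consumes the PGSTE_UC_BIT that pgste_set_pte()
 * sets for writable ptes; it is meant for synchronizing a guest dirty log.
 * A hedged sketch of a per-page check (the dirty-bitmap bookkeeping and the
 * hva/gfn_offset names are made up for illustration):
 *
 *	ptep = get_locked_pte(mm, hva, &ptl);
 *	if (ptep) {
 *		if (ptep_test_and_clear_uc(mm, hva, ptep))
 *			set_bit(gfn_offset, dirty_bitmap);
 *		pte_unmap_unlock(ptep, ptl);
 *	}
 */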

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}

	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_large(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations; they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated, and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
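
/*
 * The storage-key helpers above and below back SSKE/ISKE/RRBE style
 * intercepts. A hedged sketch of a caller (the hva translation and the exact
 * locking context are assumptions; the KVM handlers in arch/s390/kvm/priv.c
 * are the authoritative users):
 *
 *	mmap_read_lock(current->mm);
 *	rc = cond_set_guest_storage_key(current->mm, hva, key, &oldkey,
 *					nq, mr, mc);
 *	mmap_read_unlock(current->mm);
 *	// rc == 1: key updated, rc == 0: no update needed, rc < 0: error
 */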

/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}

	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_large(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	pgste_val(new) &= ~PGSTE_GR_BIT;

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}

	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_large(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);

/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste_val(pgste) = pgstev;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);
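
/*
 * pgste_perform_essa() backs the guest's ESSA instruction (collaborative
 * memory management); a return of 1 asks the caller to report the page in
 * the CBRL. A hedged sketch of a caller loop (the hva[] and cbrl[] buffers
 * are made up for illustration; KVM's ESSA intercept handler is the real
 * user):
 *
 *	for (i = 0; i < nr_pages; i++) {
 *		res = pgste_perform_essa(mm, hva[i], orc, NULL, NULL);
 *		if (res < 0)
 *			break;
 *		if (res == 1)
 *			cbrl[entries++] = hva[i];
 *	}
 */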

/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	pgste_val(new) &= ~bits;
	pgste_val(new) |= value & bits;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif