hugetlbpage.c (16232B)
1/* 2 * PPC Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor 6 * 7 * Based on the IA-32 version: 8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 9 */ 10 11#include <linux/mm.h> 12#include <linux/io.h> 13#include <linux/slab.h> 14#include <linux/hugetlb.h> 15#include <linux/export.h> 16#include <linux/of_fdt.h> 17#include <linux/memblock.h> 18#include <linux/moduleparam.h> 19#include <linux/swap.h> 20#include <linux/swapops.h> 21#include <linux/kmemleak.h> 22#include <asm/pgalloc.h> 23#include <asm/tlb.h> 24#include <asm/setup.h> 25#include <asm/hugetlb.h> 26#include <asm/pte-walk.h> 27 28bool hugetlb_disabled = false; 29 30#define hugepd_none(hpd) (hpd_val(hpd) == 0) 31 32#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \ 33 __builtin_ffs(sizeof(void *))) 34 35pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) 36{ 37 /* 38 * Only called for hugetlbfs pages, hence can ignore THP and the 39 * irq disabled walk. 40 */ 41 return __find_linux_pte(mm->pgd, addr, NULL, NULL); 42} 43 44static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 45 unsigned long address, unsigned int pdshift, 46 unsigned int pshift, spinlock_t *ptl) 47{ 48 struct kmem_cache *cachep; 49 pte_t *new; 50 int i; 51 int num_hugepd; 52 53 if (pshift >= pdshift) { 54 cachep = PGT_CACHE(PTE_T_ORDER); 55 num_hugepd = 1 << (pshift - pdshift); 56 } else { 57 cachep = PGT_CACHE(pdshift - pshift); 58 num_hugepd = 1; 59 } 60 61 if (!cachep) { 62 WARN_ONCE(1, "No page table cache created for hugetlb tables"); 63 return -ENOMEM; 64 } 65 66 new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); 67 68 BUG_ON(pshift > HUGEPD_SHIFT_MASK); 69 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); 70 71 if (!new) 72 return -ENOMEM; 73 74 /* 75 * Make sure other cpus find the hugepd set only after a 76 * properly initialized page table is visible to them. 77 * For more details look for comment in __pte_alloc(). 78 */ 79 smp_wmb(); 80 81 spin_lock(ptl); 82 /* 83 * We have multiple higher-level entries that point to the same 84 * actual pte location. Fill in each as we go and backtrack on error. 85 * We need all of these so the DTLB pgtable walk code can find the 86 * right higher-level entry without knowing if it's a hugepage or not. 87 */ 88 for (i = 0; i < num_hugepd; i++, hpdp++) { 89 if (unlikely(!hugepd_none(*hpdp))) 90 break; 91 hugepd_populate(hpdp, new, pshift); 92 } 93 /* If we bailed from the for loop early, an error occurred, clean up */ 94 if (i < num_hugepd) { 95 for (i = i - 1 ; i >= 0; i--, hpdp--) 96 *hpdp = __hugepd(0); 97 kmem_cache_free(cachep, new); 98 } else { 99 kmemleak_ignore(new); 100 } 101 spin_unlock(ptl); 102 return 0; 103} 104 105/* 106 * At this point we do the placement change only for BOOK3S 64. This would 107 * possibly work on other subarchs. 108 */ 109pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 110 unsigned long addr, unsigned long sz) 111{ 112 pgd_t *pg; 113 p4d_t *p4; 114 pud_t *pu; 115 pmd_t *pm; 116 hugepd_t *hpdp = NULL; 117 unsigned pshift = __ffs(sz); 118 unsigned pdshift = PGDIR_SHIFT; 119 spinlock_t *ptl; 120 121 addr &= ~(sz-1); 122 pg = pgd_offset(mm, addr); 123 p4 = p4d_offset(pg, addr); 124 125#ifdef CONFIG_PPC_BOOK3S_64 126 if (pshift == PGDIR_SHIFT) 127 /* 16GB huge page */ 128 return (pte_t *) p4; 129 else if (pshift > PUD_SHIFT) { 130 /* 131 * We need to use hugepd table 132 */ 133 ptl = &mm->page_table_lock; 134 hpdp = (hugepd_t *)p4; 135 } else { 136 pdshift = PUD_SHIFT; 137 pu = pud_alloc(mm, p4, addr); 138 if (!pu) 139 return NULL; 140 if (pshift == PUD_SHIFT) 141 return (pte_t *)pu; 142 else if (pshift > PMD_SHIFT) { 143 ptl = pud_lockptr(mm, pu); 144 hpdp = (hugepd_t *)pu; 145 } else { 146 pdshift = PMD_SHIFT; 147 pm = pmd_alloc(mm, pu, addr); 148 if (!pm) 149 return NULL; 150 if (pshift == PMD_SHIFT) 151 /* 16MB hugepage */ 152 return (pte_t *)pm; 153 else { 154 ptl = pmd_lockptr(mm, pm); 155 hpdp = (hugepd_t *)pm; 156 } 157 } 158 } 159#else 160 if (pshift >= PGDIR_SHIFT) { 161 ptl = &mm->page_table_lock; 162 hpdp = (hugepd_t *)p4; 163 } else { 164 pdshift = PUD_SHIFT; 165 pu = pud_alloc(mm, p4, addr); 166 if (!pu) 167 return NULL; 168 if (pshift >= PUD_SHIFT) { 169 ptl = pud_lockptr(mm, pu); 170 hpdp = (hugepd_t *)pu; 171 } else { 172 pdshift = PMD_SHIFT; 173 pm = pmd_alloc(mm, pu, addr); 174 if (!pm) 175 return NULL; 176 ptl = pmd_lockptr(mm, pm); 177 hpdp = (hugepd_t *)pm; 178 } 179 } 180#endif 181 if (!hpdp) 182 return NULL; 183 184 if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT) 185 return pte_alloc_map(mm, (pmd_t *)hpdp, addr); 186 187 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); 188 189 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, 190 pdshift, pshift, ptl)) 191 return NULL; 192 193 return hugepte_offset(*hpdp, addr, pdshift); 194} 195 196#ifdef CONFIG_PPC_BOOK3S_64 197/* 198 * Tracks gpages after the device tree is scanned and before the 199 * huge_boot_pages list is ready on pseries. 200 */ 201#define MAX_NUMBER_GPAGES 1024 202__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; 203__initdata static unsigned nr_gpages; 204 205/* 206 * Build list of addresses of gigantic pages. This function is used in early 207 * boot before the buddy allocator is setup. 208 */ 209void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) 210{ 211 if (!addr) 212 return; 213 while (number_of_pages > 0) { 214 gpage_freearray[nr_gpages] = addr; 215 nr_gpages++; 216 number_of_pages--; 217 addr += page_size; 218 } 219} 220 221static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) 222{ 223 struct huge_bootmem_page *m; 224 if (nr_gpages == 0) 225 return 0; 226 m = phys_to_virt(gpage_freearray[--nr_gpages]); 227 gpage_freearray[nr_gpages] = 0; 228 list_add(&m->list, &huge_boot_pages); 229 m->hstate = hstate; 230 return 1; 231} 232 233bool __init hugetlb_node_alloc_supported(void) 234{ 235 return false; 236} 237#endif 238 239 240int __init alloc_bootmem_huge_page(struct hstate *h, int nid) 241{ 242 243#ifdef CONFIG_PPC_BOOK3S_64 244 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) 245 return pseries_alloc_bootmem_huge_page(h); 246#endif 247 return __alloc_bootmem_huge_page(h, nid); 248} 249 250#ifndef CONFIG_PPC_BOOK3S_64 251#define HUGEPD_FREELIST_SIZE \ 252 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) 253 254struct hugepd_freelist { 255 struct rcu_head rcu; 256 unsigned int index; 257 void *ptes[]; 258}; 259 260static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); 261 262static void hugepd_free_rcu_callback(struct rcu_head *head) 263{ 264 struct hugepd_freelist *batch = 265 container_of(head, struct hugepd_freelist, rcu); 266 unsigned int i; 267 268 for (i = 0; i < batch->index; i++) 269 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]); 270 271 free_page((unsigned long)batch); 272} 273 274static void hugepd_free(struct mmu_gather *tlb, void *hugepte) 275{ 276 struct hugepd_freelist **batchp; 277 278 batchp = &get_cpu_var(hugepd_freelist_cur); 279 280 if (atomic_read(&tlb->mm->mm_users) < 2 || 281 mm_is_thread_local(tlb->mm)) { 282 kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte); 283 put_cpu_var(hugepd_freelist_cur); 284 return; 285 } 286 287 if (*batchp == NULL) { 288 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); 289 (*batchp)->index = 0; 290 } 291 292 (*batchp)->ptes[(*batchp)->index++] = hugepte; 293 if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { 294 call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback); 295 *batchp = NULL; 296 } 297 put_cpu_var(hugepd_freelist_cur); 298} 299#else 300static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {} 301#endif 302 303/* Return true when the entry to be freed maps more than the area being freed */ 304static bool range_is_outside_limits(unsigned long start, unsigned long end, 305 unsigned long floor, unsigned long ceiling, 306 unsigned long mask) 307{ 308 if ((start & mask) < floor) 309 return true; 310 if (ceiling) { 311 ceiling &= mask; 312 if (!ceiling) 313 return true; 314 } 315 return end - 1 > ceiling - 1; 316} 317 318static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, 319 unsigned long start, unsigned long end, 320 unsigned long floor, unsigned long ceiling) 321{ 322 pte_t *hugepte = hugepd_page(*hpdp); 323 int i; 324 325 unsigned long pdmask = ~((1UL << pdshift) - 1); 326 unsigned int num_hugepd = 1; 327 unsigned int shift = hugepd_shift(*hpdp); 328 329 /* Note: On fsl the hpdp may be the first of several */ 330 if (shift > pdshift) 331 num_hugepd = 1 << (shift - pdshift); 332 333 if (range_is_outside_limits(start, end, floor, ceiling, pdmask)) 334 return; 335 336 for (i = 0; i < num_hugepd; i++, hpdp++) 337 *hpdp = __hugepd(0); 338 339 if (shift >= pdshift) 340 hugepd_free(tlb, hugepte); 341 else 342 pgtable_free_tlb(tlb, hugepte, 343 get_hugepd_cache_index(pdshift - shift)); 344} 345 346static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 347 unsigned long addr, unsigned long end, 348 unsigned long floor, unsigned long ceiling) 349{ 350 pgtable_t token = pmd_pgtable(*pmd); 351 352 if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK)) 353 return; 354 355 pmd_clear(pmd); 356 pte_free_tlb(tlb, token, addr); 357 mm_dec_nr_ptes(tlb->mm); 358} 359 360static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 361 unsigned long addr, unsigned long end, 362 unsigned long floor, unsigned long ceiling) 363{ 364 pmd_t *pmd; 365 unsigned long next; 366 unsigned long start; 367 368 start = addr; 369 do { 370 unsigned long more; 371 372 pmd = pmd_offset(pud, addr); 373 next = pmd_addr_end(addr, end); 374 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { 375 if (pmd_none_or_clear_bad(pmd)) 376 continue; 377 378 /* 379 * if it is not hugepd pointer, we should already find 380 * it cleared. 381 */ 382 WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx)); 383 384 hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling); 385 386 continue; 387 } 388 /* 389 * Increment next by the size of the huge mapping since 390 * there may be more than one entry at this level for a 391 * single hugepage, but all of them point to 392 * the same kmem cache that holds the hugepte. 393 */ 394 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); 395 if (more > next) 396 next = more; 397 398 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, 399 addr, next, floor, ceiling); 400 } while (addr = next, addr != end); 401 402 if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK)) 403 return; 404 405 pmd = pmd_offset(pud, start & PUD_MASK); 406 pud_clear(pud); 407 pmd_free_tlb(tlb, pmd, start & PUD_MASK); 408 mm_dec_nr_pmds(tlb->mm); 409} 410 411static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, 412 unsigned long addr, unsigned long end, 413 unsigned long floor, unsigned long ceiling) 414{ 415 pud_t *pud; 416 unsigned long next; 417 unsigned long start; 418 419 start = addr; 420 do { 421 pud = pud_offset(p4d, addr); 422 next = pud_addr_end(addr, end); 423 if (!is_hugepd(__hugepd(pud_val(*pud)))) { 424 if (pud_none_or_clear_bad(pud)) 425 continue; 426 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, 427 ceiling); 428 } else { 429 unsigned long more; 430 /* 431 * Increment next by the size of the huge mapping since 432 * there may be more than one entry at this level for a 433 * single hugepage, but all of them point to 434 * the same kmem cache that holds the hugepte. 435 */ 436 more = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); 437 if (more > next) 438 next = more; 439 440 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, 441 addr, next, floor, ceiling); 442 } 443 } while (addr = next, addr != end); 444 445 if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK)) 446 return; 447 448 pud = pud_offset(p4d, start & PGDIR_MASK); 449 p4d_clear(p4d); 450 pud_free_tlb(tlb, pud, start & PGDIR_MASK); 451 mm_dec_nr_puds(tlb->mm); 452} 453 454/* 455 * This function frees user-level page tables of a process. 456 */ 457void hugetlb_free_pgd_range(struct mmu_gather *tlb, 458 unsigned long addr, unsigned long end, 459 unsigned long floor, unsigned long ceiling) 460{ 461 pgd_t *pgd; 462 p4d_t *p4d; 463 unsigned long next; 464 465 /* 466 * Because there are a number of different possible pagetable 467 * layouts for hugepage ranges, we limit knowledge of how 468 * things should be laid out to the allocation path 469 * (huge_pte_alloc(), above). Everything else works out the 470 * structure as it goes from information in the hugepd 471 * pointers. That means that we can't here use the 472 * optimization used in the normal page free_pgd_range(), of 473 * checking whether we're actually covering a large enough 474 * range to have to do anything at the top level of the walk 475 * instead of at the bottom. 476 * 477 * To make sense of this, you should probably go read the big 478 * block comment at the top of the normal free_pgd_range(), 479 * too. 480 */ 481 482 do { 483 next = pgd_addr_end(addr, end); 484 pgd = pgd_offset(tlb->mm, addr); 485 p4d = p4d_offset(pgd, addr); 486 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { 487 if (p4d_none_or_clear_bad(p4d)) 488 continue; 489 hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling); 490 } else { 491 unsigned long more; 492 /* 493 * Increment next by the size of the huge mapping since 494 * there may be more than one entry at the pgd level 495 * for a single hugepage, but all of them point to the 496 * same kmem cache that holds the hugepte. 497 */ 498 more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); 499 if (more > next) 500 next = more; 501 502 free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT, 503 addr, next, floor, ceiling); 504 } 505 } while (addr = next, addr != end); 506} 507 508struct page *follow_huge_pd(struct vm_area_struct *vma, 509 unsigned long address, hugepd_t hpd, 510 int flags, int pdshift) 511{ 512 pte_t *ptep; 513 spinlock_t *ptl; 514 struct page *page = NULL; 515 unsigned long mask; 516 int shift = hugepd_shift(hpd); 517 struct mm_struct *mm = vma->vm_mm; 518 519retry: 520 /* 521 * hugepage directory entries are protected by mm->page_table_lock 522 * Use this instead of huge_pte_lockptr 523 */ 524 ptl = &mm->page_table_lock; 525 spin_lock(ptl); 526 527 ptep = hugepte_offset(hpd, address, pdshift); 528 if (pte_present(*ptep)) { 529 mask = (1UL << shift) - 1; 530 page = pte_page(*ptep); 531 page += ((address & mask) >> PAGE_SHIFT); 532 if (flags & FOLL_GET) 533 get_page(page); 534 } else { 535 if (is_hugetlb_entry_migration(*ptep)) { 536 spin_unlock(ptl); 537 __migration_entry_wait(mm, ptep, ptl); 538 goto retry; 539 } 540 } 541 spin_unlock(ptl); 542 return page; 543} 544 545bool __init arch_hugetlb_valid_size(unsigned long size) 546{ 547 int shift = __ffs(size); 548 int mmu_psize; 549 550 /* Check that it is a page size supported by the hardware and 551 * that it fits within pagetable and slice limits. */ 552 if (size <= PAGE_SIZE || !is_power_of_2(size)) 553 return false; 554 555 mmu_psize = check_and_get_huge_psize(shift); 556 if (mmu_psize < 0) 557 return false; 558 559 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 560 561 return true; 562} 563 564static int __init add_huge_page_size(unsigned long long size) 565{ 566 int shift = __ffs(size); 567 568 if (!arch_hugetlb_valid_size((unsigned long)size)) 569 return -EINVAL; 570 571 hugetlb_add_hstate(shift - PAGE_SHIFT); 572 return 0; 573} 574 575static int __init hugetlbpage_init(void) 576{ 577 bool configured = false; 578 int psize; 579 580 if (hugetlb_disabled) { 581 pr_info("HugeTLB support is disabled!\n"); 582 return 0; 583 } 584 585 if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && 586 !mmu_has_feature(MMU_FTR_16M_PAGE)) 587 return -ENODEV; 588 589 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { 590 unsigned shift; 591 unsigned pdshift; 592 593 if (!mmu_psize_defs[psize].shift) 594 continue; 595 596 shift = mmu_psize_to_shift(psize); 597 598#ifdef CONFIG_PPC_BOOK3S_64 599 if (shift > PGDIR_SHIFT) 600 continue; 601 else if (shift > PUD_SHIFT) 602 pdshift = PGDIR_SHIFT; 603 else if (shift > PMD_SHIFT) 604 pdshift = PUD_SHIFT; 605 else 606 pdshift = PMD_SHIFT; 607#else 608 if (shift < PUD_SHIFT) 609 pdshift = PMD_SHIFT; 610 else if (shift < PGDIR_SHIFT) 611 pdshift = PUD_SHIFT; 612 else 613 pdshift = PGDIR_SHIFT; 614#endif 615 616 if (add_huge_page_size(1ULL << shift) < 0) 617 continue; 618 /* 619 * if we have pdshift and shift value same, we don't 620 * use pgt cache for hugepd. 621 */ 622 if (pdshift > shift) { 623 if (!IS_ENABLED(CONFIG_PPC_8xx)) 624 pgtable_cache_add(pdshift - shift); 625 } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || 626 IS_ENABLED(CONFIG_PPC_8xx)) { 627 pgtable_cache_add(PTE_T_ORDER); 628 } 629 630 configured = true; 631 } 632 633 if (!configured) 634 pr_info("Failed to initialize. Disabling HugeTLB"); 635 636 return 0; 637} 638 639arch_initcall(hugetlbpage_init); 640 641void __init gigantic_hugetlb_cma_reserve(void) 642{ 643 unsigned long order = 0; 644 645 if (radix_enabled()) 646 order = PUD_SHIFT - PAGE_SHIFT; 647 else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift) 648 /* 649 * For pseries we do use ibm,expected#pages for reserving 16G pages. 650 */ 651 order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; 652 653 if (order) { 654 VM_WARN_ON(order < MAX_ORDER); 655 hugetlb_cma_reserve(order); 656 } 657}