cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

hash_utils.c (57890B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * PowerPC64 port by Mike Corrigan and Dave Engebretsen
      4 *   {mikejc|engebret}@us.ibm.com
      5 *
      6 *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
      7 *
      8 * SMP scalability work:
      9 *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
     10 *
     11 *    Module name: htab.c
     12 *
     13 *    Description:
     14 *      PowerPC Hashed Page Table functions
     15 */
     16
     17#undef DEBUG
     18#undef DEBUG_LOW
     19
     20#define pr_fmt(fmt) "hash-mmu: " fmt
     21#include <linux/spinlock.h>
     22#include <linux/errno.h>
     23#include <linux/sched/mm.h>
     24#include <linux/proc_fs.h>
     25#include <linux/stat.h>
     26#include <linux/sysctl.h>
     27#include <linux/export.h>
     28#include <linux/ctype.h>
     29#include <linux/cache.h>
     30#include <linux/init.h>
     31#include <linux/signal.h>
     32#include <linux/memblock.h>
     33#include <linux/context_tracking.h>
     34#include <linux/libfdt.h>
     35#include <linux/pkeys.h>
     36#include <linux/hugetlb.h>
     37#include <linux/cpu.h>
     38#include <linux/pgtable.h>
     39#include <linux/debugfs.h>
     40#include <linux/random.h>
     41#include <linux/elf-randomize.h>
     42#include <linux/of_fdt.h>
     43
     44#include <asm/interrupt.h>
     45#include <asm/processor.h>
     46#include <asm/mmu.h>
     47#include <asm/mmu_context.h>
     48#include <asm/page.h>
     49#include <asm/types.h>
     50#include <linux/uaccess.h>
     51#include <asm/machdep.h>
     52#include <asm/io.h>
     53#include <asm/eeh.h>
     54#include <asm/tlb.h>
     55#include <asm/cacheflush.h>
     56#include <asm/cputable.h>
     57#include <asm/sections.h>
     58#include <asm/copro.h>
     59#include <asm/udbg.h>
     60#include <asm/code-patching.h>
     61#include <asm/fadump.h>
     62#include <asm/firmware.h>
     63#include <asm/tm.h>
     64#include <asm/trace.h>
     65#include <asm/ps3.h>
     66#include <asm/pte-walk.h>
     67#include <asm/asm-prototypes.h>
     68#include <asm/ultravisor.h>
     69
     70#include <mm/mmu_decl.h>
     71
     72#include "internal.h"
     73
     74
     75#ifdef DEBUG
     76#define DBG(fmt...) udbg_printf(fmt)
     77#else
     78#define DBG(fmt...)
     79#endif
     80
     81#ifdef DEBUG_LOW
     82#define DBG_LOW(fmt...) udbg_printf(fmt)
     83#else
     84#define DBG_LOW(fmt...)
     85#endif
     86
     87#define KB (1024)
     88#define MB (1024*KB)
     89#define GB (1024L*MB)
     90
     91/*
     92 * Note:  pte   --> Linux PTE
     93 *        HPTE  --> PowerPC Hashed Page Table Entry
     94 *
     95 * Execution context:
     96 *   htab_initialize is called with the MMU off (of course), but
     97 *   the kernel has been copied down to zero so it can directly
     98 *   reference global data.  At this point it is very difficult
     99 *   to print debug info.
    100 *
    101 */
    102
    103static unsigned long _SDR1;
    104
    105u8 hpte_page_sizes[1 << LP_BITS];
    106EXPORT_SYMBOL_GPL(hpte_page_sizes);
    107
    108struct hash_pte *htab_address;
    109unsigned long htab_size_bytes;
    110unsigned long htab_hash_mask;
    111EXPORT_SYMBOL_GPL(htab_hash_mask);
    112int mmu_linear_psize = MMU_PAGE_4K;
    113EXPORT_SYMBOL_GPL(mmu_linear_psize);
    114int mmu_virtual_psize = MMU_PAGE_4K;
    115int mmu_vmalloc_psize = MMU_PAGE_4K;
    116EXPORT_SYMBOL_GPL(mmu_vmalloc_psize);
    117int mmu_io_psize = MMU_PAGE_4K;
    118int mmu_kernel_ssize = MMU_SEGSIZE_256M;
    119EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
    120int mmu_highuser_ssize = MMU_SEGSIZE_256M;
    121u16 mmu_slb_size = 64;
    122EXPORT_SYMBOL_GPL(mmu_slb_size);
    123#ifdef CONFIG_PPC_64K_PAGES
    124int mmu_ci_restrictions;
    125#endif
    126#ifdef CONFIG_DEBUG_PAGEALLOC
    127static u8 *linear_map_hash_slots;
    128static unsigned long linear_map_hash_count;
    129static DEFINE_SPINLOCK(linear_map_hash_lock);
    130#endif /* CONFIG_DEBUG_PAGEALLOC */
    131struct mmu_hash_ops mmu_hash_ops;
    132EXPORT_SYMBOL(mmu_hash_ops);
    133
    134/*
    135 * These are definitions of page sizes arrays to be used when none
    136 * is provided by the firmware.
    137 */
    138
    139/*
    140 * Fallback (4k pages only)
    141 */
    142static struct mmu_psize_def mmu_psize_defaults[] = {
    143	[MMU_PAGE_4K] = {
    144		.shift	= 12,
    145		.sllp	= 0,
    146		.penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
    147		.avpnm	= 0,
    148		.tlbiel = 0,
    149	},
    150};
    151
    152/*
    153 * POWER4, GPUL, POWER5
    154 *
    155 * Support for 16Mb large pages
    156 */
    157static struct mmu_psize_def mmu_psize_defaults_gp[] = {
    158	[MMU_PAGE_4K] = {
    159		.shift	= 12,
    160		.sllp	= 0,
    161		.penc   = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
    162		.avpnm	= 0,
    163		.tlbiel = 1,
    164	},
    165	[MMU_PAGE_16M] = {
    166		.shift	= 24,
    167		.sllp	= SLB_VSID_L,
    168		.penc   = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
    169			    [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
    170		.avpnm	= 0x1UL,
    171		.tlbiel = 0,
    172	},
    173};
    174
    175static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
    176{
    177	unsigned long rb;
    178
    179	rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
    180
    181	asm volatile("tlbiel %0" : : "r" (rb));
    182}
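       /*
        * Worked example of the register image built above (assuming the
        * usual 64-bit PPC_BITLSHIFT(be) == 63 - be): PPC_BITLSHIFT(51) == 12
        * and PPC_BITLSHIFT(53) == 10, so set == 5 and is == 3 give
        * rb == (5 << 12) | (3 << 10) == 0x5c00.
        */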
    183
    184/*
    185 * tlbiel instruction for hash, set invalidation
    186 * i.e., r=1 and is=01 or is=10 or is=11
    187 */
    188static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
    189					unsigned int pid,
    190					unsigned int ric, unsigned int prs)
    191{
    192	unsigned long rb;
    193	unsigned long rs;
    194	unsigned int r = 0; /* hash format */
    195
    196	rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
    197	rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
    198
    199	asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
    200		     : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
    201		     : "memory");
    202}
    203
    204
    205static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
    206{
    207	unsigned int set;
    208
    209	asm volatile("ptesync": : :"memory");
    210
    211	for (set = 0; set < num_sets; set++)
    212		tlbiel_hash_set_isa206(set, is);
    213
    214	ppc_after_tlbiel_barrier();
    215}
    216
    217static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
    218{
    219	unsigned int set;
    220
    221	asm volatile("ptesync": : :"memory");
    222
    223	/*
    224	 * Flush the partition table cache if this is HV mode.
    225	 */
    226	if (early_cpu_has_feature(CPU_FTR_HVMODE))
    227		tlbiel_hash_set_isa300(0, is, 0, 2, 0);
    228
    229	/*
    230	 * Now invalidate the process table cache. UPRT=0 HPT modes (what
    231	 * current hardware implements) do not use the process table, but
    232	 * add the flushes anyway.
    233	 *
    234	 * From ISA v3.0B p. 1078:
    235	 *     The following forms are invalid.
    236	 *      * PRS=1, R=0, and RIC!=2 (The only process-scoped
    237	 *        HPT caching is of the Process Table.)
    238	 */
    239	tlbiel_hash_set_isa300(0, is, 0, 2, 1);
    240
    241	/*
    242	 * Then flush the sets of the TLB proper. Hash mode uses
    243	 * partition scoped TLB translations, which may be flushed
    244	 * in !HV mode.
    245	 */
    246	for (set = 0; set < num_sets; set++)
    247		tlbiel_hash_set_isa300(set, is, 0, 0, 0);
    248
    249	ppc_after_tlbiel_barrier();
    250
    251	asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
    252}
    253
    254void hash__tlbiel_all(unsigned int action)
    255{
    256	unsigned int is;
    257
    258	switch (action) {
    259	case TLB_INVAL_SCOPE_GLOBAL:
    260		is = 3;
    261		break;
    262	case TLB_INVAL_SCOPE_LPID:
    263		is = 2;
    264		break;
    265	default:
    266		BUG();
    267	}
    268
    269	if (early_cpu_has_feature(CPU_FTR_ARCH_300))
    270		tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
    271	else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
    272		tlbiel_all_isa206(POWER8_TLB_SETS, is);
    273	else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
    274		tlbiel_all_isa206(POWER7_TLB_SETS, is);
    275	else
    276		WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
    277}
    278
    279/*
    280 * 'R' and 'C' update notes:
    281 *  - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
    282 *     create writeable HPTEs without C set, because the hcall H_PROTECT
    283 *     that we use in that case will not update C
    284 *  - The above is however not a problem, because we also don't do that
    285 *     fancy "no flush" variant of eviction and we use H_REMOVE which will
    286 *     do the right thing and thus we don't have the race I described earlier
    287 *
    288 *    - Under bare metal,  we do have the race, so we need R and C set
    289 *    - We make sure R is always set and never lost
    290 *    - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
    291 */
    292unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags)
    293{
    294	unsigned long rflags = 0;
    295
    296	/* _PAGE_EXEC -> NOEXEC */
    297	if ((pteflags & _PAGE_EXEC) == 0)
    298		rflags |= HPTE_R_N;
    299	/*
    300	 * PPP bits:
    301	 * Linux uses slb key 0 for kernel and 1 for user.
    302	 * kernel RW areas are mapped with PPP=0b000
    303	 * User area is mapped with PPP=0b010 for read/write
    304	 * or PPP=0b011 for read-only (including writeable but clean pages).
    305	 */
    306	if (pteflags & _PAGE_PRIVILEGED) {
    307		/*
    308		 * Kernel read only mapped with ppp bits 0b110
    309		 */
    310		if (!(pteflags & _PAGE_WRITE)) {
    311			if (mmu_has_feature(MMU_FTR_KERNEL_RO))
    312				rflags |= (HPTE_R_PP0 | 0x2);
    313			else
    314				rflags |= 0x3;
    315		}
    316	} else {
    317		if (pteflags & _PAGE_RWX)
    318			rflags |= 0x2;
    319		if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
    320			rflags |= 0x1;
    321	}
    322	/*
    323	 * We can't allow hardware to update hpte bits. Hence always
    324	 * set 'R' bit and set 'C' if it is a write fault
    325	 */
    326	rflags |=  HPTE_R_R;
    327
    328	if (pteflags & _PAGE_DIRTY)
    329		rflags |= HPTE_R_C;
    330	/*
    331	 * Add in WIG bits
    332	 */
    333
    334	if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
    335		rflags |= HPTE_R_I;
    336	else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
    337		rflags |= (HPTE_R_I | HPTE_R_G);
    338	else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
    339		rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
    340	else
    341		/*
    342		 * Add memory coherence if cache inhibited is not set
    343		 */
    344		rflags |= HPTE_R_M;
    345
    346	rflags |= pte_to_hpte_pkey_bits(pteflags, flags);
    347	return rflags;
    348}
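       /*
        * Worked example of the conversion above: a dirty, writeable,
        * executable user mapping with normal (cacheable) attributes keeps
        * HPTE_R_N clear (_PAGE_EXEC is set), gets PPP == 0b010 (user
        * read/write, since both _PAGE_WRITE and _PAGE_DIRTY are set), and
        * picks up HPTE_R_R, HPTE_R_C (dirty) and HPTE_R_M (coherent), plus
        * any pkey bits: rflags == 0x2 | HPTE_R_R | HPTE_R_C | HPTE_R_M.
        */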
    349
    350int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
    351		      unsigned long pstart, unsigned long prot,
    352		      int psize, int ssize)
    353{
    354	unsigned long vaddr, paddr;
    355	unsigned int step, shift;
    356	int ret = 0;
    357
    358	shift = mmu_psize_defs[psize].shift;
    359	step = 1 << shift;
    360
    361	prot = htab_convert_pte_flags(prot, HPTE_USE_KERNEL_KEY);
    362
    363	DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
    364	    vstart, vend, pstart, prot, psize, ssize);
    365
    366	/* Carefully map only the possible range */
    367	vaddr = ALIGN(vstart, step);
    368	paddr = ALIGN(pstart, step);
    369	vend  = ALIGN_DOWN(vend, step);
    370
    371	for (; vaddr < vend; vaddr += step, paddr += step) {
    372		unsigned long hash, hpteg;
    373		unsigned long vsid = get_kernel_vsid(vaddr, ssize);
    374		unsigned long vpn  = hpt_vpn(vaddr, vsid, ssize);
    375		unsigned long tprot = prot;
    376		bool secondary_hash = false;
    377
    378		/*
    379		 * If we hit a bad address return error.
    380		 */
    381		if (!vsid)
    382			return -1;
    383		/* Make kernel text executable */
    384		if (overlaps_kernel_text(vaddr, vaddr + step))
    385			tprot &= ~HPTE_R_N;
    386
    387		/*
    388		 * If relocatable, check if this range overlaps the interrupt
    389		 * vectors that are copied down to real address 0 (e.g. for a
    390		 * kdump kernel) and, if so, mark the region executable. This
    391		 * is because on POWER8 systems with the relocation-on-exception
    392		 * feature enabled, exceptions are raised with the MMU on
    393		 * (IR=DR=1). Hence, in order to execute the interrupt handlers
    394		 * in virtual mode, the vector region needs to be marked as
    395		 * executable.
    396		 */
    397		if ((PHYSICAL_START > MEMORY_START) &&
    398			overlaps_interrupt_vector_text(vaddr, vaddr + step))
    399				tprot &= ~HPTE_R_N;
    400
    401		hash = hpt_hash(vpn, shift, ssize);
    402		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
    403
    404		BUG_ON(!mmu_hash_ops.hpte_insert);
    405repeat:
    406		ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
    407					       HPTE_V_BOLTED, psize, psize,
    408					       ssize);
    409		if (ret == -1) {
    410			/*
    411			 * Try to keep bolted entries in the primary group.
    412			 * Remove non-bolted entries and try the insert again.
    413			 */
    414			ret = mmu_hash_ops.hpte_remove(hpteg);
    415			if (ret != -1)
    416				ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
    417							       HPTE_V_BOLTED, psize, psize,
    418							       ssize);
    419			if (ret == -1 && !secondary_hash) {
    420				secondary_hash = true;
    421				hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP);
    422				goto repeat;
    423			}
    424		}
    425
    426		if (ret < 0)
    427			break;
    428
    429		cond_resched();
    430#ifdef CONFIG_DEBUG_PAGEALLOC
    431		if (debug_pagealloc_enabled() &&
    432			(paddr >> PAGE_SHIFT) < linear_map_hash_count)
    433			linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
    434#endif /* CONFIG_DEBUG_PAGEALLOC */
    435	}
    436	return ret < 0 ? ret : 0;
    437}
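       /*
        * Worked example of the group selection above, with
        * HPTES_PER_GROUP == 8: if htab_hash_mask == 0xfffff and
        * hash == 0x12345, the primary group starts at slot 0x12345 * 8 and
        * the secondary (fallback) group at (~0x12345 & 0xfffff) * 8, i.e.
        * 0xedcba * 8.
        */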
    438
    439int htab_remove_mapping(unsigned long vstart, unsigned long vend,
    440		      int psize, int ssize)
    441{
    442	unsigned long vaddr, time_limit;
    443	unsigned int step, shift;
    444	int rc;
    445	int ret = 0;
    446
    447	shift = mmu_psize_defs[psize].shift;
    448	step = 1 << shift;
    449
    450	if (!mmu_hash_ops.hpte_removebolted)
    451		return -ENODEV;
    452
    453	/* Unmap the full range specified */
    454	vaddr = ALIGN_DOWN(vstart, step);
    455	time_limit = jiffies + HZ;
    456
    457	for (;vaddr < vend; vaddr += step) {
    458		rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
    459
    460		/*
    461		 * For a large number of mappings, introduce a cond_resched()
    462		 * to prevent softlockup warnings.
    463		 */
    464		if (time_after(jiffies, time_limit)) {
    465			cond_resched();
    466			time_limit = jiffies + HZ;
    467		}
    468		if (rc == -ENOENT) {
    469			ret = -ENOENT;
    470			continue;
    471		}
    472		if (rc < 0)
    473			return rc;
    474	}
    475
    476	return ret;
    477}
    478
    479static bool disable_1tb_segments = false;
    480
    481static int __init parse_disable_1tb_segments(char *p)
    482{
    483	disable_1tb_segments = true;
    484	return 0;
    485}
    486early_param("disable_1tb_segments", parse_disable_1tb_segments);
    487
    488static int __init htab_dt_scan_seg_sizes(unsigned long node,
    489					 const char *uname, int depth,
    490					 void *data)
    491{
    492	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
    493	const __be32 *prop;
    494	int size = 0;
    495
    496	/* We are scanning "cpu" nodes only */
    497	if (type == NULL || strcmp(type, "cpu") != 0)
    498		return 0;
    499
    500	prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
    501	if (prop == NULL)
    502		return 0;
    503	for (; size >= 4; size -= 4, ++prop) {
    504		if (be32_to_cpu(prop[0]) == 40) {
    505			DBG("1T segment support detected\n");
    506
    507			if (disable_1tb_segments) {
    508				DBG("1T segments disabled by command line\n");
    509				break;
    510			}
    511
    512			cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
    513			return 1;
    514		}
    515	}
    516	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
    517	return 0;
    518}
    519
    520static int __init get_idx_from_shift(unsigned int shift)
    521{
    522	int idx = -1;
    523
    524	switch (shift) {
    525	case 0xc:
    526		idx = MMU_PAGE_4K;
    527		break;
    528	case 0x10:
    529		idx = MMU_PAGE_64K;
    530		break;
    531	case 0x14:
    532		idx = MMU_PAGE_1M;
    533		break;
    534	case 0x18:
    535		idx = MMU_PAGE_16M;
    536		break;
    537	case 0x22:
    538		idx = MMU_PAGE_16G;
    539		break;
    540	}
    541	return idx;
    542}
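       /*
        * The case values above are page shifts in hex: 0xc (12) -> 4K,
        * 0x10 (16) -> 64K, 0x14 (20) -> 1M, 0x18 (24) -> 16M and
        * 0x22 (34) -> 16G.
        */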
    543
    544static int __init htab_dt_scan_page_sizes(unsigned long node,
    545					  const char *uname, int depth,
    546					  void *data)
    547{
    548	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
    549	const __be32 *prop;
    550	int size = 0;
    551
    552	/* We are scanning "cpu" nodes only */
    553	if (type == NULL || strcmp(type, "cpu") != 0)
    554		return 0;
    555
    556	prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
    557	if (!prop)
    558		return 0;
    559
    560	pr_info("Page sizes from device-tree:\n");
    561	size /= 4;
    562	cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
    563	while(size > 0) {
    564		unsigned int base_shift = be32_to_cpu(prop[0]);
    565		unsigned int slbenc = be32_to_cpu(prop[1]);
    566		unsigned int lpnum = be32_to_cpu(prop[2]);
    567		struct mmu_psize_def *def;
    568		int idx, base_idx;
    569
    570		size -= 3; prop += 3;
    571		base_idx = get_idx_from_shift(base_shift);
    572		if (base_idx < 0) {
    573			/* skip the pte encoding also */
    574			prop += lpnum * 2; size -= lpnum * 2;
    575			continue;
    576		}
    577		def = &mmu_psize_defs[base_idx];
    578		if (base_idx == MMU_PAGE_16M)
    579			cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
    580
    581		def->shift = base_shift;
    582		if (base_shift <= 23)
    583			def->avpnm = 0;
    584		else
    585			def->avpnm = (1 << (base_shift - 23)) - 1;
    586		def->sllp = slbenc;
    587		/*
    588		 * We don't know for sure what's up with tlbiel, so
    589		 * for now we only set it for 4K and 64K pages
    590		 */
    591		if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
    592			def->tlbiel = 1;
    593		else
    594			def->tlbiel = 0;
    595
    596		while (size > 0 && lpnum) {
    597			unsigned int shift = be32_to_cpu(prop[0]);
    598			int penc  = be32_to_cpu(prop[1]);
    599
    600			prop += 2; size -= 2;
    601			lpnum--;
    602
    603			idx = get_idx_from_shift(shift);
    604			if (idx < 0)
    605				continue;
    606
    607			if (penc == -1)
    608				pr_err("Invalid penc for base_shift=%d "
    609				       "shift=%d\n", base_shift, shift);
    610
    611			def->penc[idx] = penc;
    612			pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
    613				" avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
    614				base_shift, shift, def->sllp,
    615				def->avpnm, def->tlbiel, def->penc[idx]);
    616		}
    617	}
    618
    619	return 1;
    620}
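       /*
        * Illustrative "ibm,segment-page-sizes" layout for the parser above
        * (example values, not taken from any particular machine): the cells
        * <0x0c 0x000 2  0x0c 0x00  0x10 0x07> describe a 4K base segment
        * (base_shift 0xc, sllp 0) with two encodings, 4K pages with penc 0
        * and 64K pages with penc 7.
        */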
    621
    622#ifdef CONFIG_HUGETLB_PAGE
    623/*
    624 * Scan for 16G memory blocks that have been set aside for huge pages
    625 * and reserve those blocks for 16G huge pages.
    626 */
    627static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
    628					const char *uname, int depth,
    629					void *data) {
    630	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
    631	const __be64 *addr_prop;
    632	const __be32 *page_count_prop;
    633	unsigned int expected_pages;
    634	long unsigned int phys_addr;
    635	long unsigned int block_size;
    636
    637	/* We are scanning "memory" nodes only */
    638	if (type == NULL || strcmp(type, "memory") != 0)
    639		return 0;
    640
    641	/*
    642	 * This property is the log base 2 of the number of virtual pages that
    643	 * will represent this memory block.
    644	 */
    645	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
    646	if (page_count_prop == NULL)
    647		return 0;
    648	expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
    649	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
    650	if (addr_prop == NULL)
    651		return 0;
    652	phys_addr = be64_to_cpu(addr_prop[0]);
    653	block_size = be64_to_cpu(addr_prop[1]);
    654	if (block_size != (16 * GB))
    655		return 0;
    656	printk(KERN_INFO "Huge page(16GB) memory: "
    657			"addr = 0x%lX size = 0x%lX pages = %d\n",
    658			phys_addr, block_size, expected_pages);
    659	if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
    660		memblock_reserve(phys_addr, block_size * expected_pages);
    661		pseries_add_gpage(phys_addr, block_size, expected_pages);
    662	}
    663	return 0;
    664}
    665#endif /* CONFIG_HUGETLB_PAGE */
    666
    667static void __init mmu_psize_set_default_penc(void)
    668{
    669	int bpsize, apsize;
    670	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
    671		for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
    672			mmu_psize_defs[bpsize].penc[apsize] = -1;
    673}
    674
    675#ifdef CONFIG_PPC_64K_PAGES
    676
    677static bool __init might_have_hea(void)
    678{
    679	/*
    680	 * The HEA ethernet adapter requires awareness of the
    681	 * GX bus. Without that awareness we can easily assume
    682	 * we will never see an HEA ethernet device.
    683	 */
    684#ifdef CONFIG_IBMEBUS
    685	return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
    686		firmware_has_feature(FW_FEATURE_SPLPAR);
    687#else
    688	return false;
    689#endif
    690}
    691
    692#endif /* #ifdef CONFIG_PPC_64K_PAGES */
    693
    694static void __init htab_scan_page_sizes(void)
    695{
    696	int rc;
    697
    698	/* set the invalid penc to -1 */
    699	mmu_psize_set_default_penc();
    700
    701	/* Default to 4K pages only */
    702	memcpy(mmu_psize_defs, mmu_psize_defaults,
    703	       sizeof(mmu_psize_defaults));
    704
    705	/*
    706	 * Try to find the available page sizes in the device-tree
    707	 */
    708	rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
    709	if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
    710		/*
    711		 * Nothing in the device-tree, but the CPU supports 16M pages,
    712		 * so let's fall back to a known size list for 16M-capable CPUs.
    713		 */
    714		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
    715		       sizeof(mmu_psize_defaults_gp));
    716	}
    717
    718#ifdef CONFIG_HUGETLB_PAGE
    719	if (!hugetlb_disabled && !early_radix_enabled() ) {
    720		/* Reserve 16G huge page memory sections for huge pages */
    721		of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
    722	}
    723#endif /* CONFIG_HUGETLB_PAGE */
    724}
    725
    726/*
    727 * Fill in the hpte_page_sizes[] array.
    728 * We go through the mmu_psize_defs[] array looking for all the
    729 * supported base/actual page size combinations.  Each combination
    730 * has a unique pagesize encoding (penc) value in the low bits of
    731 * the LP field of the HPTE.  For actual page sizes less than 1MB,
    732 * some of the upper LP bits are used for RPN bits, meaning that
    733 * we need to fill in several entries in hpte_page_sizes[].
    734 *
    735 * In diagrammatic form, with r = RPN bits and z = page size bits:
    736 *        PTE LP     actual page size
    737 *    rrrr rrrz		>=8KB
    738 *    rrrr rrzz		>=16KB
    739 *    rrrr rzzz		>=32KB
    740 *    rrrr zzzz		>=64KB
    741 *    ...
    742 *
    743 * The zzzz bits are implementation-specific but are chosen so that
    744 * no encoding for a larger page size uses the same value in its
    745 * low-order N bits as the encoding for the 2^(12+N) byte page size
    746 * (if it exists).
    747 */
    748static void __init init_hpte_page_sizes(void)
    749{
    750	long int ap, bp;
    751	long int shift, penc;
    752
    753	for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
    754		if (!mmu_psize_defs[bp].shift)
    755			continue;	/* not a supported page size */
    756		for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
    757			penc = mmu_psize_defs[bp].penc[ap];
    758			if (penc == -1 || !mmu_psize_defs[ap].shift)
    759				continue;
    760			shift = mmu_psize_defs[ap].shift - LP_SHIFT;
    761			if (shift <= 0)
    762				continue;	/* should never happen */
    763			/*
    764			 * For page sizes less than 1MB, this loop
    765			 * replicates the entry for all possible values
    766			 * of the rrrr bits.
    767			 */
    768			while (penc < (1 << LP_BITS)) {
    769				hpte_page_sizes[penc] = (ap << 4) | bp;
    770				penc += 1 << shift;
    771			}
    772		}
    773	}
    774}
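       /*
        * Worked example, with LP_SHIFT == 12 and LP_BITS == 8: a 64K actual
        * page size in a 64K base segment with a penc of, say, 1 has
        * shift == 16 - 12 == 4, so entries 1, 17, 33, ..., 241 (every LP
        * value of the form 0bxxxx0001) are filled with
        * (MMU_PAGE_64K << 4) | MMU_PAGE_64K.
        */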
    775
    776static void __init htab_init_page_sizes(void)
    777{
    778	bool aligned = true;
    779	init_hpte_page_sizes();
    780
    781	if (!debug_pagealloc_enabled()) {
    782		/*
    783		 * Pick a size for the linear mapping. Currently, we only
    784		 * support 16M, 1M and 4K which is the default
    785		 */
    786		if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) &&
    787		    (unsigned long)_stext % 0x1000000) {
    788			if (mmu_psize_defs[MMU_PAGE_16M].shift)
    789				pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n");
    790			aligned = false;
    791		}
    792
    793		if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned)
    794			mmu_linear_psize = MMU_PAGE_16M;
    795		else if (mmu_psize_defs[MMU_PAGE_1M].shift)
    796			mmu_linear_psize = MMU_PAGE_1M;
    797	}
    798
    799#ifdef CONFIG_PPC_64K_PAGES
    800	/*
    801	 * Pick a size for the ordinary pages. Default is 4K; we support
    802	 * 64K for user mappings and vmalloc if supported by the processor.
    803	 * We only use 64k for ioremap if the processor
    804	 * (and firmware) support cache-inhibited large pages.
    805	 * If not, we use 4k and set mmu_ci_restrictions so that
    806	 * hash_page knows to switch processes that use cache-inhibited
    807	 * mappings to 4k pages.
    808	 */
    809	if (mmu_psize_defs[MMU_PAGE_64K].shift) {
    810		mmu_virtual_psize = MMU_PAGE_64K;
    811		mmu_vmalloc_psize = MMU_PAGE_64K;
    812		if (mmu_linear_psize == MMU_PAGE_4K)
    813			mmu_linear_psize = MMU_PAGE_64K;
    814		if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
    815			/*
    816			 * When running on pSeries using 64k pages for ioremap
    817			 * would stop us accessing the HEA ethernet. So if we
    818			 * have the chance of ever seeing one, stay at 4k.
    819			 */
    820			if (!might_have_hea())
    821				mmu_io_psize = MMU_PAGE_64K;
    822		} else
    823			mmu_ci_restrictions = 1;
    824	}
    825#endif /* CONFIG_PPC_64K_PAGES */
    826
    827#ifdef CONFIG_SPARSEMEM_VMEMMAP
    828	/*
    829	 * We try to use 16M pages for vmemmap if that is supported
    830	 * and we have at least 1G of RAM at boot
    831	 */
    832	if (mmu_psize_defs[MMU_PAGE_16M].shift &&
    833	    memblock_phys_mem_size() >= 0x40000000)
    834		mmu_vmemmap_psize = MMU_PAGE_16M;
    835	else
    836		mmu_vmemmap_psize = mmu_virtual_psize;
    837#endif /* CONFIG_SPARSEMEM_VMEMMAP */
    838
    839	printk(KERN_DEBUG "Page orders: linear mapping = %d, "
    840	       "virtual = %d, io = %d"
    841#ifdef CONFIG_SPARSEMEM_VMEMMAP
    842	       ", vmemmap = %d"
    843#endif
    844	       "\n",
    845	       mmu_psize_defs[mmu_linear_psize].shift,
    846	       mmu_psize_defs[mmu_virtual_psize].shift,
    847	       mmu_psize_defs[mmu_io_psize].shift
    848#ifdef CONFIG_SPARSEMEM_VMEMMAP
    849	       ,mmu_psize_defs[mmu_vmemmap_psize].shift
    850#endif
    851	       );
    852}
    853
    854static int __init htab_dt_scan_pftsize(unsigned long node,
    855				       const char *uname, int depth,
    856				       void *data)
    857{
    858	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
    859	const __be32 *prop;
    860
    861	/* We are scanning "cpu" nodes only */
    862	if (type == NULL || strcmp(type, "cpu") != 0)
    863		return 0;
    864
    865	prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
    866	if (prop != NULL) {
    867		/* pft_size[0] is the NUMA CEC cookie */
    868		ppc64_pft_size = be32_to_cpu(prop[1]);
    869		return 1;
    870	}
    871	return 0;
    872}
    873
    874unsigned htab_shift_for_mem_size(unsigned long mem_size)
    875{
    876	unsigned memshift = __ilog2(mem_size);
    877	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
    878	unsigned pteg_shift;
    879
    880	/* round mem_size up to next power of 2 */
    881	if ((1UL << memshift) < mem_size)
    882		memshift += 1;
    883
    884	/* aim for 2 pages / pteg */
    885	pteg_shift = memshift - (pshift + 1);
    886
    887	/*
    888	 * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab
    889	 * size permitted by the architecture.
    890	 */
    891	return max(pteg_shift + 7, 18U);
    892}
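       /*
        * Worked example: 8GB of RAM with 4K base pages gives memshift == 33,
        * pteg_shift == 33 - 13 == 20 and a result of 27, i.e. a 128MB hash
        * table (2^20 PTEGs of 128 bytes each). With 64K pages the same
        * memory gives 23 (an 8MB table), still above the architectural
        * 2^18 minimum.
        */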
    893
    894static unsigned long __init htab_get_table_size(void)
    895{
    896	/*
    897	 * If the hash table size isn't already provided by the platform, we
    898	 * try to retrieve it from the device-tree. If it's not there either,
    899	 * we calculate it now based on the total RAM size.
    900	 */
    901	if (ppc64_pft_size == 0)
    902		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
    903	if (ppc64_pft_size)
    904		return 1UL << ppc64_pft_size;
    905
    906	return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
    907}
    908
    909#ifdef CONFIG_MEMORY_HOTPLUG
    910static int resize_hpt_for_hotplug(unsigned long new_mem_size)
    911{
    912	unsigned target_hpt_shift;
    913
    914	if (!mmu_hash_ops.resize_hpt)
    915		return 0;
    916
    917	target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
    918
    919	/*
    920	 * To avoid lots of HPT resizes if memory size is fluctuating
    921	 * across a boundary, we deliberately have some hysteresis
    922	 * here: we immediately increase the HPT size if the target
    923	 * shift exceeds the current shift, but we won't attempt to
    924	 * reduce unless the target shift is at least 2 below the
    925	 * current shift
    926	 */
    927	if (target_hpt_shift > ppc64_pft_size ||
    928	    target_hpt_shift < ppc64_pft_size - 1)
    929		return mmu_hash_ops.resize_hpt(target_hpt_shift);
    930
    931	return 0;
    932}
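       /*
        * Example of the hysteresis above: with ppc64_pft_size == 26, a
        * target shift of 27 grows the HPT immediately, a target of 25 is
        * ignored, and only a target of 24 or below shrinks it.
        */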
    933
    934int hash__create_section_mapping(unsigned long start, unsigned long end,
    935				 int nid, pgprot_t prot)
    936{
    937	int rc;
    938
    939	if (end >= H_VMALLOC_START) {
    940		pr_warn("Outside the supported range\n");
    941		return -1;
    942	}
    943
    944	resize_hpt_for_hotplug(memblock_phys_mem_size());
    945
    946	rc = htab_bolt_mapping(start, end, __pa(start),
    947			       pgprot_val(prot), mmu_linear_psize,
    948			       mmu_kernel_ssize);
    949
    950	if (rc < 0) {
    951		int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
    952					      mmu_kernel_ssize);
    953		BUG_ON(rc2 && (rc2 != -ENOENT));
    954	}
    955	return rc;
    956}
    957
    958int hash__remove_section_mapping(unsigned long start, unsigned long end)
    959{
    960	int rc = htab_remove_mapping(start, end, mmu_linear_psize,
    961				     mmu_kernel_ssize);
    962
    963	if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
    964		pr_warn("Hash collision while resizing HPT\n");
    965
    966	return rc;
    967}
    968#endif /* CONFIG_MEMORY_HOTPLUG */
    969
    970static void __init hash_init_partition_table(phys_addr_t hash_table,
    971					     unsigned long htab_size)
    972{
    973	mmu_partition_table_init();
    974
    975	/*
    976	 * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
    977	 * For now, UPRT is 0 and we have no segment table.
    978	 */
    979	htab_size =  __ilog2(htab_size) - 18;
    980	mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false);
    981	pr_info("Partition table %p\n", partition_tb);
    982}
    983
    984static void __init htab_initialize(void)
    985{
    986	unsigned long table;
    987	unsigned long pteg_count;
    988	unsigned long prot;
    989	phys_addr_t base = 0, size = 0, end;
    990	u64 i;
    991
    992	DBG(" -> htab_initialize()\n");
    993
    994	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
    995		mmu_kernel_ssize = MMU_SEGSIZE_1T;
    996		mmu_highuser_ssize = MMU_SEGSIZE_1T;
    997		printk(KERN_INFO "Using 1TB segments\n");
    998	}
    999
   1000	if (stress_slb_enabled)
   1001		static_branch_enable(&stress_slb_key);
   1002
   1003	/*
   1004	 * Calculate the required size of the htab.  We want the number of
   1005	 * PTEGs to equal one half the number of real pages.
   1006	 */
   1007	htab_size_bytes = htab_get_table_size();
   1008	pteg_count = htab_size_bytes >> 7;
   1009
   1010	htab_hash_mask = pteg_count - 1;
   1011
   1012	if (firmware_has_feature(FW_FEATURE_LPAR) ||
   1013	    firmware_has_feature(FW_FEATURE_PS3_LV1)) {
   1014		/* Using a hypervisor which owns the htab */
   1015		htab_address = NULL;
   1016		_SDR1 = 0;
   1017#ifdef CONFIG_FA_DUMP
   1018		/*
   1019		 * If firmware-assisted dump is active, firmware preserves
   1020		 * the contents of the htab along with the entire partition
   1021		 * memory. In that case, clear the htab so that we don't end
   1022		 * up using old mappings.
   1023		 */
   1024		if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
   1025			mmu_hash_ops.hpte_clear_all();
   1026#endif
   1027	} else {
   1028		unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE;
   1029
   1030#ifdef CONFIG_PPC_CELL
   1031		/*
   1032		 * Cell may require the hash table down low when using the
   1033		 * Axon IOMMU in order to fit the dynamic region over it, see
   1034		 * comments in cell/iommu.c
   1035		 */
   1036		if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) {
   1037			limit = 0x80000000;
   1038			pr_info("Hash table forced below 2G for Axon IOMMU\n");
   1039		}
   1040#endif /* CONFIG_PPC_CELL */
   1041
   1042		table = memblock_phys_alloc_range(htab_size_bytes,
   1043						  htab_size_bytes,
   1044						  0, limit);
   1045		if (!table)
   1046			panic("ERROR: Failed to allocate %pa bytes below %pa\n",
   1047			      &htab_size_bytes, &limit);
   1048
   1049		DBG("Hash table allocated at %lx, size: %lx\n", table,
   1050		    htab_size_bytes);
   1051
   1052		htab_address = __va(table);
   1053
   1054		/* htab absolute addr + encoded htabsize */
   1055		_SDR1 = table + __ilog2(htab_size_bytes) - 18;
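       		/*
       		 * e.g. a 128MB hash table has __ilog2(htab_size_bytes) == 27,
       		 * so the HTABSIZE value encoded in the low bits here is 9.
       		 */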
   1056
   1057		/* Initialize the HPT with no entries */
   1058		memset((void *)table, 0, htab_size_bytes);
   1059
   1060		if (!cpu_has_feature(CPU_FTR_ARCH_300))
   1061			/* Set SDR1 */
   1062			mtspr(SPRN_SDR1, _SDR1);
   1063		else
   1064			hash_init_partition_table(table, htab_size_bytes);
   1065	}
   1066
   1067	prot = pgprot_val(PAGE_KERNEL);
   1068
   1069#ifdef CONFIG_DEBUG_PAGEALLOC
   1070	if (debug_pagealloc_enabled()) {
   1071		linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
   1072		linear_map_hash_slots = memblock_alloc_try_nid(
   1073				linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
   1074				ppc64_rma_size,	NUMA_NO_NODE);
   1075		if (!linear_map_hash_slots)
   1076			panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
   1077			      __func__, linear_map_hash_count, &ppc64_rma_size);
   1078	}
   1079#endif /* CONFIG_DEBUG_PAGEALLOC */
   1080
   1081	/* Create the bolted linear mapping in the hash table */
   1082	for_each_mem_range(i, &base, &end) {
   1083		size = end - base;
   1084		base = (unsigned long)__va(base);
   1085
   1086		DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
   1087		    base, size, prot);
   1088
   1089		if ((base + size) >= H_VMALLOC_START) {
   1090			pr_warn("Outside the supported range\n");
   1091			continue;
   1092		}
   1093
   1094		BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
   1095				prot, mmu_linear_psize, mmu_kernel_ssize));
   1096	}
   1097	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
   1098
   1099	/*
   1100	 * If we have a memory_limit and we've allocated TCEs then we need to
   1101	 * explicitly map the TCE area at the top of RAM. We also cope with the
   1102	 * case that the TCEs start below memory_limit.
   1103	 * tce_alloc_start/end are 16MB aligned so the mapping should work
   1104	 * for either 4K or 16MB pages.
   1105	 */
   1106	if (tce_alloc_start) {
   1107		tce_alloc_start = (unsigned long)__va(tce_alloc_start);
   1108		tce_alloc_end = (unsigned long)__va(tce_alloc_end);
   1109
   1110		if (base + size >= tce_alloc_start)
   1111			tce_alloc_start = base + size + 1;
   1112
   1113		BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
   1114					 __pa(tce_alloc_start), prot,
   1115					 mmu_linear_psize, mmu_kernel_ssize));
   1116	}
   1117
   1118
   1119	DBG(" <- htab_initialize()\n");
   1120}
   1121#undef KB
   1122#undef MB
   1123
   1124void __init hash__early_init_devtree(void)
   1125{
   1126	/* Initialize segment sizes */
   1127	of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
   1128
   1129	/* Initialize page sizes */
   1130	htab_scan_page_sizes();
   1131}
   1132
   1133static struct hash_mm_context init_hash_mm_context;
   1134void __init hash__early_init_mmu(void)
   1135{
   1136#ifndef CONFIG_PPC_64K_PAGES
   1137	/*
   1138	 * We have code in __hash_page_4K() and elsewhere, which assumes it can
   1139	 * do the following:
   1140	 *   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
   1141	 *
   1142	 * Where the slot number is between 0-15, and values of 8-15 indicate
   1143	 * the secondary bucket. For that code to work H_PAGE_F_SECOND and
   1144	 * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
   1145	 * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
   1146	 * with a BUILD_BUG_ON().
   1147	 */
   1148	BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul  << (H_PAGE_F_GIX_SHIFT + 3)));
   1149#endif /* CONFIG_PPC_64K_PAGES */
   1150
   1151	htab_init_page_sizes();
   1152
   1153	/*
   1154	 * initialize page table size
   1155	 */
   1156	__pte_frag_nr = H_PTE_FRAG_NR;
   1157	__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
   1158	__pmd_frag_nr = H_PMD_FRAG_NR;
   1159	__pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
   1160
   1161	__pte_index_size = H_PTE_INDEX_SIZE;
   1162	__pmd_index_size = H_PMD_INDEX_SIZE;
   1163	__pud_index_size = H_PUD_INDEX_SIZE;
   1164	__pgd_index_size = H_PGD_INDEX_SIZE;
   1165	__pud_cache_index = H_PUD_CACHE_INDEX;
   1166	__pte_table_size = H_PTE_TABLE_SIZE;
   1167	__pmd_table_size = H_PMD_TABLE_SIZE;
   1168	__pud_table_size = H_PUD_TABLE_SIZE;
   1169	__pgd_table_size = H_PGD_TABLE_SIZE;
   1170	/*
   1171	 * 4k uses the hugepd format, so for hash set them to
   1172	 * zero.
   1173	 */
   1174	__pmd_val_bits = HASH_PMD_VAL_BITS;
   1175	__pud_val_bits = HASH_PUD_VAL_BITS;
   1176	__pgd_val_bits = HASH_PGD_VAL_BITS;
   1177
   1178	__kernel_virt_start = H_KERN_VIRT_START;
   1179	__vmalloc_start = H_VMALLOC_START;
   1180	__vmalloc_end = H_VMALLOC_END;
   1181	__kernel_io_start = H_KERN_IO_START;
   1182	__kernel_io_end = H_KERN_IO_END;
   1183	vmemmap = (struct page *)H_VMEMMAP_START;
   1184	ioremap_bot = IOREMAP_BASE;
   1185
   1186#ifdef CONFIG_PCI
   1187	pci_io_base = ISA_IO_BASE;
   1188#endif
   1189
   1190	/* Select appropriate backend */
   1191	if (firmware_has_feature(FW_FEATURE_PS3_LV1))
   1192		ps3_early_mm_init();
   1193	else if (firmware_has_feature(FW_FEATURE_LPAR))
   1194		hpte_init_pseries();
   1195	else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
   1196		hpte_init_native();
   1197
   1198	if (!mmu_hash_ops.hpte_insert)
   1199		panic("hash__early_init_mmu: No MMU hash ops defined!\n");
   1200
   1201	/*
   1202	 * Initialize the MMU Hash table and create the linear mapping
   1203	 * of memory. Has to be done before SLB initialization as this is
   1204	 * currently where the page size encoding is obtained.
   1205	 */
   1206	htab_initialize();
   1207
   1208	init_mm.context.hash_context = &init_hash_mm_context;
   1209	mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT);
   1210
   1211	pr_info("Initializing hash mmu with SLB\n");
   1212	/* Initialize SLB management */
   1213	slb_initialize();
   1214
   1215	if (cpu_has_feature(CPU_FTR_ARCH_206)
   1216			&& cpu_has_feature(CPU_FTR_HVMODE))
   1217		tlbiel_all();
   1218}
   1219
   1220#ifdef CONFIG_SMP
   1221void hash__early_init_mmu_secondary(void)
   1222{
   1223	/* Initialize hash table for that CPU */
   1224	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
   1225
   1226		if (!cpu_has_feature(CPU_FTR_ARCH_300))
   1227			mtspr(SPRN_SDR1, _SDR1);
   1228		else
   1229			set_ptcr_when_no_uv(__pa(partition_tb) |
   1230					    (PATB_SIZE_SHIFT - 12));
   1231	}
   1232	/* Initialize SLB */
   1233	slb_initialize();
   1234
   1235	if (cpu_has_feature(CPU_FTR_ARCH_206)
   1236			&& cpu_has_feature(CPU_FTR_HVMODE))
   1237		tlbiel_all();
   1238
   1239#ifdef CONFIG_PPC_MEM_KEYS
   1240	if (mmu_has_feature(MMU_FTR_PKEY))
   1241		mtspr(SPRN_UAMOR, default_uamor);
   1242#endif
   1243}
   1244#endif /* CONFIG_SMP */
   1245
   1246/*
   1247 * Called by asm hashtable.S for doing lazy icache flush
   1248 */
   1249unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
   1250{
   1251	struct page *page;
   1252
   1253	if (!pfn_valid(pte_pfn(pte)))
   1254		return pp;
   1255
   1256	page = pte_page(pte);
   1257
   1258	/* page is dirty */
   1259	if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) {
   1260		if (trap == INTERRUPT_INST_STORAGE) {
   1261			flush_dcache_icache_page(page);
   1262			set_bit(PG_dcache_clean, &page->flags);
   1263		} else
   1264			pp |= HPTE_R_N;
   1265	}
   1266	return pp;
   1267}
   1268
   1269static unsigned int get_paca_psize(unsigned long addr)
   1270{
   1271	unsigned char *psizes;
   1272	unsigned long index, mask_index;
   1273
   1274	if (addr < SLICE_LOW_TOP) {
   1275		psizes = get_paca()->mm_ctx_low_slices_psize;
   1276		index = GET_LOW_SLICE_INDEX(addr);
   1277	} else {
   1278		psizes = get_paca()->mm_ctx_high_slices_psize;
   1279		index = GET_HIGH_SLICE_INDEX(addr);
   1280	}
   1281	mask_index = index & 0x1;
   1282	return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
   1283}
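       /*
        * The slice psize arrays above pack two 4-bit page size indices per
        * byte: e.g. slice index 5 lives in psizes[2], and mask_index == 1
        * selects its high nibble.
        */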
   1284
   1285
   1286/*
   1287 * Demote a segment to using 4k pages.
   1288 * For now this makes the whole process use 4k pages.
   1289 */
   1290#ifdef CONFIG_PPC_64K_PAGES
   1291void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
   1292{
   1293	if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
   1294		return;
   1295	slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
   1296	copro_flush_all_slbs(mm);
   1297	if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
   1298
   1299		copy_mm_to_paca(mm);
   1300		slb_flush_and_restore_bolted();
   1301	}
   1302}
   1303#endif /* CONFIG_PPC_64K_PAGES */
   1304
   1305#ifdef CONFIG_PPC_SUBPAGE_PROT
   1306/*
   1307 * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
   1308 * Userspace sets the subpage permissions using the subpage_prot system call.
   1309 *
   1310 * Result is 0: full permissions, _PAGE_WRITE: read-only,
   1311 * _PAGE_RWX: no access.
   1312 */
   1313static int subpage_protection(struct mm_struct *mm, unsigned long ea)
   1314{
   1315	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
   1316	u32 spp = 0;
   1317	u32 **sbpm, *sbpp;
   1318
   1319	if (!spt)
   1320		return 0;
   1321
   1322	if (ea >= spt->maxaddr)
   1323		return 0;
   1324	if (ea < 0x100000000UL) {
   1325		/* addresses below 4GB use spt->low_prot */
   1326		sbpm = spt->low_prot;
   1327	} else {
   1328		sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
   1329		if (!sbpm)
   1330			return 0;
   1331	}
   1332	sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
   1333	if (!sbpp)
   1334		return 0;
   1335	spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
   1336
   1337	/* extract 2-bit bitfield for this 4k subpage */
   1338	spp >>= 30 - 2 * ((ea >> 12) & 0xf);
   1339
   1340	/*
   1341	 * 0 -> full permission
   1342	 * 1 -> Read only
   1343	 * 2 -> no access.
   1344	 * We return the flags that need to be cleared.
   1345	 */
   1346	spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
   1347	return spp;
   1348}
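       /*
        * Worked example of the extraction above: the 4k subpage at index 3
        * of a 64k page uses shift 30 - 6 == 24, so bits 25:24 of the sbpp
        * word hold its 2-bit code; code 1 clears _PAGE_WRITE (read-only),
        * while codes 2 and 3 clear _PAGE_RWX (no access).
        */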
   1349
   1350#else /* CONFIG_PPC_SUBPAGE_PROT */
   1351static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
   1352{
   1353	return 0;
   1354}
   1355#endif
   1356
   1357void hash_failure_debug(unsigned long ea, unsigned long access,
   1358			unsigned long vsid, unsigned long trap,
   1359			int ssize, int psize, int lpsize, unsigned long pte)
   1360{
   1361	if (!printk_ratelimit())
   1362		return;
   1363	pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
   1364		ea, access, current->comm);
   1365	pr_info("    trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
   1366		trap, vsid, ssize, psize, lpsize, pte);
   1367}
   1368
   1369static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
   1370			     int psize, bool user_region)
   1371{
   1372	if (user_region) {
   1373		if (psize != get_paca_psize(ea)) {
   1374			copy_mm_to_paca(mm);
   1375			slb_flush_and_restore_bolted();
   1376		}
   1377	} else if (get_paca()->vmalloc_sllp !=
   1378		   mmu_psize_defs[mmu_vmalloc_psize].sllp) {
   1379		get_paca()->vmalloc_sllp =
   1380			mmu_psize_defs[mmu_vmalloc_psize].sllp;
   1381		slb_vmalloc_update();
   1382	}
   1383}
   1384
   1385/*
   1386 * Result code is:
   1387 *  0 - handled
   1388 *  1 - normal page fault
   1389 * -1 - critical hash insertion error
   1390 * -2 - access not permitted by subpage protection mechanism
   1391 */
   1392int hash_page_mm(struct mm_struct *mm, unsigned long ea,
   1393		 unsigned long access, unsigned long trap,
   1394		 unsigned long flags)
   1395{
   1396	bool is_thp;
   1397	pgd_t *pgdir;
   1398	unsigned long vsid;
   1399	pte_t *ptep;
   1400	unsigned hugeshift;
   1401	int rc, user_region = 0;
   1402	int psize, ssize;
   1403
   1404	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
   1405		ea, access, trap);
   1406	trace_hash_fault(ea, access, trap);
   1407
   1408	/* Get region & vsid */
   1409	switch (get_region_id(ea)) {
   1410	case USER_REGION_ID:
   1411		user_region = 1;
   1412		if (! mm) {
   1413			DBG_LOW(" user region with no mm !\n");
   1414			rc = 1;
   1415			goto bail;
   1416		}
   1417		psize = get_slice_psize(mm, ea);
   1418		ssize = user_segment_size(ea);
   1419		vsid = get_user_vsid(&mm->context, ea, ssize);
   1420		break;
   1421	case VMALLOC_REGION_ID:
   1422		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
   1423		psize = mmu_vmalloc_psize;
   1424		ssize = mmu_kernel_ssize;
   1425		flags |= HPTE_USE_KERNEL_KEY;
   1426		break;
   1427
   1428	case IO_REGION_ID:
   1429		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
   1430		psize = mmu_io_psize;
   1431		ssize = mmu_kernel_ssize;
   1432		flags |= HPTE_USE_KERNEL_KEY;
   1433		break;
   1434	default:
   1435		/*
   1436		 * Not a valid range
   1437		 * Send the problem up to do_page_fault()
   1438		 */
   1439		rc = 1;
   1440		goto bail;
   1441	}
   1442	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
   1443
   1444	/* Bad address. */
   1445	if (!vsid) {
   1446		DBG_LOW("Bad address!\n");
   1447		rc = 1;
   1448		goto bail;
   1449	}
   1450	/* Get pgdir */
   1451	pgdir = mm->pgd;
   1452	if (pgdir == NULL) {
   1453		rc = 1;
   1454		goto bail;
   1455	}
   1456
   1457	/* Check CPU locality */
   1458	if (user_region && mm_is_thread_local(mm))
   1459		flags |= HPTE_LOCAL_UPDATE;
   1460
   1461#ifndef CONFIG_PPC_64K_PAGES
   1462	/*
   1463	 * If we use 4K pages and our psize is not 4K, then we might
   1464	 * be hitting a special driver mapping, and need to align the
   1465	 * address before we fetch the PTE.
   1466	 *
   1467	 * It could also be a hugepage mapping, in which case this is
   1468	 * not necessary, but it's not harmful, either.
   1469	 */
   1470	if (psize != MMU_PAGE_4K)
   1471		ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
   1472#endif /* CONFIG_PPC_64K_PAGES */
   1473
   1474	/* Get PTE and page size from page tables */
   1475	ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
   1476	if (ptep == NULL || !pte_present(*ptep)) {
   1477		DBG_LOW(" no PTE !\n");
   1478		rc = 1;
   1479		goto bail;
   1480	}
   1481
   1482	/*
   1483	 * Add _PAGE_PRESENT to the required access perm. If there are parallel
   1484	 * updates to the pte that can possibly clear _PAGE_PTE, catch that too.
   1485	 *
   1486	 * We can safely use the returned pte address in the rest of the function
   1487	 * because we do set H_PAGE_BUSY which prevents further updates to pte
   1488	 * from generic code.
   1489	 */
   1490	access |= _PAGE_PRESENT | _PAGE_PTE;
   1491
   1492	/*
   1493	 * Pre-check access permissions (will be re-checked atomically
   1494	 * in __hash_page_XX but this pre-check is a fast path)
   1495	 */
   1496	if (!check_pte_access(access, pte_val(*ptep))) {
   1497		DBG_LOW(" no access !\n");
   1498		rc = 1;
   1499		goto bail;
   1500	}
   1501
   1502	if (hugeshift) {
   1503		if (is_thp)
   1504			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
   1505					     trap, flags, ssize, psize);
   1506#ifdef CONFIG_HUGETLB_PAGE
   1507		else
   1508			rc = __hash_page_huge(ea, access, vsid, ptep, trap,
   1509					      flags, ssize, hugeshift, psize);
   1510#else
   1511		else {
   1512			/*
   1513			 * If we have a hugepage shift but it is not a THP and
   1514			 * hugetlb is disabled, something is really wrong.
   1515			 */
   1516			rc = 1;
   1517			WARN_ON(1);
   1518		}
   1519#endif
   1520		if (current->mm == mm)
   1521			check_paca_psize(ea, mm, psize, user_region);
   1522
   1523		goto bail;
   1524	}
   1525
   1526#ifndef CONFIG_PPC_64K_PAGES
   1527	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
   1528#else
   1529	DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
   1530		pte_val(*(ptep + PTRS_PER_PTE)));
   1531#endif
   1532	/* Do actual hashing */
   1533#ifdef CONFIG_PPC_64K_PAGES
   1534	/* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
   1535	if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
   1536		demote_segment_4k(mm, ea);
   1537		psize = MMU_PAGE_4K;
   1538	}
   1539
   1540	/*
   1541	 * If this PTE is non-cacheable and we have restrictions on
   1542	 * using non cacheable large pages, then we switch to 4k
   1543	 */
   1544	if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
   1545		if (user_region) {
   1546			demote_segment_4k(mm, ea);
   1547			psize = MMU_PAGE_4K;
   1548		} else if (ea < VMALLOC_END) {
   1549			/*
   1550			 * some driver did a non-cacheable mapping
   1551			 * in vmalloc space, so switch vmalloc
   1552			 * to 4k pages
   1553			 */
   1554			printk(KERN_ALERT "Reducing vmalloc segment "
   1555			       "to 4kB pages because of "
   1556			       "non-cacheable mapping\n");
   1557			psize = mmu_vmalloc_psize = MMU_PAGE_4K;
   1558			copro_flush_all_slbs(mm);
   1559		}
   1560	}
   1561
   1562#endif /* CONFIG_PPC_64K_PAGES */
   1563
   1564	if (current->mm == mm)
   1565		check_paca_psize(ea, mm, psize, user_region);
   1566
   1567#ifdef CONFIG_PPC_64K_PAGES
   1568	if (psize == MMU_PAGE_64K)
   1569		rc = __hash_page_64K(ea, access, vsid, ptep, trap,
   1570				     flags, ssize);
   1571	else
   1572#endif /* CONFIG_PPC_64K_PAGES */
   1573	{
   1574		int spp = subpage_protection(mm, ea);
   1575		if (access & spp)
   1576			rc = -2;
   1577		else
   1578			rc = __hash_page_4K(ea, access, vsid, ptep, trap,
   1579					    flags, ssize, spp);
   1580	}
   1581
   1582	/*
   1583	 * Dump some info in case of hash insertion failure; such failures
   1584	 * should never happen, so it is really useful to know if/when they do.
   1585	 */
   1586	if (rc == -1)
   1587		hash_failure_debug(ea, access, vsid, trap, ssize, psize,
   1588				   psize, pte_val(*ptep));
   1589#ifndef CONFIG_PPC_64K_PAGES
   1590	DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
   1591#else
   1592	DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
   1593		pte_val(*(ptep + PTRS_PER_PTE)));
   1594#endif
   1595	DBG_LOW(" -> rc=%d\n", rc);
   1596
   1597bail:
   1598	return rc;
   1599}
   1600EXPORT_SYMBOL_GPL(hash_page_mm);
   1601
   1602int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
   1603	      unsigned long dsisr)
   1604{
   1605	unsigned long flags = 0;
   1606	struct mm_struct *mm = current->mm;
   1607
   1608	if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
   1609	    (get_region_id(ea) == IO_REGION_ID))
   1610		mm = &init_mm;
   1611
   1612	if (dsisr & DSISR_NOHPTE)
   1613		flags |= HPTE_NOHPTE_UPDATE;
   1614
   1615	return hash_page_mm(mm, ea, access, trap, flags);
   1616}
   1617EXPORT_SYMBOL_GPL(hash_page);
   1618
   1619DEFINE_INTERRUPT_HANDLER(do_hash_fault)
   1620{
   1621	unsigned long ea = regs->dar;
   1622	unsigned long dsisr = regs->dsisr;
   1623	unsigned long access = _PAGE_PRESENT | _PAGE_READ;
   1624	unsigned long flags = 0;
   1625	struct mm_struct *mm;
   1626	unsigned int region_id;
   1627	long err;
   1628
   1629	if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
   1630		hash__do_page_fault(regs);
   1631		return;
   1632	}
   1633
   1634	region_id = get_region_id(ea);
   1635	if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
   1636		mm = &init_mm;
   1637	else
   1638		mm = current->mm;
   1639
   1640	if (dsisr & DSISR_NOHPTE)
   1641		flags |= HPTE_NOHPTE_UPDATE;
   1642
   1643	if (dsisr & DSISR_ISSTORE)
   1644		access |= _PAGE_WRITE;
   1645	/*
   1646	 * We set _PAGE_PRIVILEGED only when
   1647	 * kernel mode accesses kernel space.
   1648	 *
   1649	 * _PAGE_PRIVILEGED is NOT set
   1650	 * 1) when kernel mode accesses user space
   1651	 * 2) when user space accesses kernel space.
   1652	 */
   1653	access |= _PAGE_PRIVILEGED;
   1654	if (user_mode(regs) || (region_id == USER_REGION_ID))
   1655		access &= ~_PAGE_PRIVILEGED;
   1656
   1657	if (TRAP(regs) == INTERRUPT_INST_STORAGE)
   1658		access |= _PAGE_EXEC;
   1659
   1660	err = hash_page_mm(mm, ea, access, TRAP(regs), flags);
   1661	if (unlikely(err < 0)) {
   1662		// failed to insert a hash PTE due to a hypervisor error
   1663		if (user_mode(regs)) {
   1664			if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
   1665				_exception(SIGSEGV, regs, SEGV_ACCERR, ea);
   1666			else
   1667				_exception(SIGBUS, regs, BUS_ADRERR, ea);
   1668		} else {
   1669			bad_page_fault(regs, SIGBUS);
   1670		}
   1671		err = 0;
   1672
   1673	} else if (err) {
   1674		hash__do_page_fault(regs);
   1675	}
   1676}
   1677
   1678static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
   1679{
   1680	int psize = get_slice_psize(mm, ea);
   1681
   1682	/* We only prefault standard pages for now */
   1683	if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
   1684		return false;
   1685
   1686	/*
   1687	 * Don't prefault if subpage protection is enabled for the EA.
   1688	 */
   1689	if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
   1690		return false;
   1691
   1692	return true;
   1693}
   1694
   1695static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
   1696			 bool is_exec, unsigned long trap)
   1697{
   1698	unsigned long vsid;
   1699	pgd_t *pgdir;
   1700	int rc, ssize, update_flags = 0;
   1701	unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
   1702	unsigned long flags;
   1703
   1704	BUG_ON(get_region_id(ea) != USER_REGION_ID);
   1705
   1706	if (!should_hash_preload(mm, ea))
   1707		return;
   1708
   1709	DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
    1710		" trap=%lx)\n", mm, mm->pgd, ea, access, trap);
   1711
   1712	/* Get Linux PTE if available */
   1713	pgdir = mm->pgd;
   1714	if (pgdir == NULL)
   1715		return;
   1716
   1717	/* Get VSID */
   1718	ssize = user_segment_size(ea);
   1719	vsid = get_user_vsid(&mm->context, ea, ssize);
   1720	if (!vsid)
   1721		return;
   1722
   1723#ifdef CONFIG_PPC_64K_PAGES
   1724	/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
    1725	 * a 64K kernel), then we don't preload; hash_page() will take
    1726	 * care of it once we actually try to access the page.
    1727	 * That way we don't have to duplicate all of the logic for segment
    1728	 * page size demotion here.
    1729	 * Called with the PTL held, hence we can be sure the value won't
    1730	 * change in between.
   1731	 */
   1732	if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
   1733		return;
   1734#endif /* CONFIG_PPC_64K_PAGES */
   1735
   1736	/*
   1737	 * __hash_page_* must run with interrupts off, including PMI interrupts
   1738	 * off, as it sets the H_PAGE_BUSY bit.
   1739	 *
   1740	 * It's otherwise possible for perf interrupts to hit at any time and
    1741	 * take a hash fault reading the user stack, which could take a
   1742	 * hash miss and deadlock on the same H_PAGE_BUSY bit.
   1743	 *
   1744	 * Interrupts must also be off for the duration of the
   1745	 * mm_is_thread_local test and update, to prevent preempt running the
   1746	 * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
   1747	 */
   1748	powerpc_local_irq_pmu_save(flags);
   1749
    1750	/* Is that local to this CPU? */
   1751	if (mm_is_thread_local(mm))
   1752		update_flags |= HPTE_LOCAL_UPDATE;
   1753
   1754	/* Hash it in */
   1755#ifdef CONFIG_PPC_64K_PAGES
   1756	if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
   1757		rc = __hash_page_64K(ea, access, vsid, ptep, trap,
   1758				     update_flags, ssize);
   1759	else
   1760#endif /* CONFIG_PPC_64K_PAGES */
   1761		rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
   1762				    ssize, subpage_protection(mm, ea));
   1763
    1764	/* Dump some info in case of hash insertion failure; such failures
    1765	 * should never happen, so it is really useful to know if/when they do.
   1766	 */
   1767	if (rc == -1)
   1768		hash_failure_debug(ea, access, vsid, trap, ssize,
   1769				   mm_ctx_user_psize(&mm->context),
   1770				   mm_ctx_user_psize(&mm->context),
   1771				   pte_val(*ptep));
   1772
   1773	powerpc_local_irq_pmu_restore(flags);
   1774}
   1775
   1776/*
   1777 * This is called at the end of handling a user page fault, when the
   1778 * fault has been handled by updating a PTE in the linux page tables.
   1779 * We use it to preload an HPTE into the hash table corresponding to
   1780 * the updated linux PTE.
   1781 *
   1782 * This must always be called with the pte lock held.
   1783 */
   1784void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
   1785		      pte_t *ptep)
   1786{
   1787	/*
   1788	 * We don't need to worry about _PAGE_PRESENT here because we are
   1789	 * called with either mm->page_table_lock held or ptl lock held
   1790	 */
   1791	unsigned long trap;
   1792	bool is_exec;
   1793
   1794	if (radix_enabled())
   1795		return;
   1796
   1797	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
   1798	if (!pte_young(*ptep) || address >= TASK_SIZE)
   1799		return;
   1800
   1801	/*
   1802	 * We try to figure out if we are coming from an instruction
   1803	 * access fault and pass that down to __hash_page so we avoid
   1804	 * double-faulting on execution of fresh text. We have to test
   1805	 * for regs NULL since init will get here first thing at boot.
   1806	 *
   1807	 * We also avoid filling the hash if not coming from a fault.
   1808	 */
   1809
   1810	trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
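        	/* 0x300 is a data storage interrupt, 0x400 an instruction storage one */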
   1811	switch (trap) {
   1812	case 0x300:
   1813		is_exec = false;
   1814		break;
   1815	case 0x400:
   1816		is_exec = true;
   1817		break;
   1818	default:
   1819		return;
   1820	}
   1821
   1822	hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
   1823}
   1824
   1825#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
   1826static inline void tm_flush_hash_page(int local)
   1827{
   1828	/*
    1829	 * Transactions are not aborted by tlbiel, only tlbie. Without an
    1830	 * abort, syncing a page back to a block device with PIO could pick up
    1831	 * transactional data (bad!), so we force an abort here. Before the
    1832	 * sync the page will be made read-only, which will flush_hash_page.
    1833	 * BIG ISSUE here: if the kernel uses a page from userspace without
    1834	 * unmapping it first, it may see the speculated version.
   1835	 */
   1836	if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
   1837	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
   1838		tm_enable();
   1839		tm_abort(TM_CAUSE_TLBI);
   1840	}
   1841}
   1842#else
   1843static inline void tm_flush_hash_page(int local)
   1844{
   1845}
   1846#endif
   1847
   1848/*
   1849 * Return the global hash slot, corresponding to the given PTE, which contains
   1850 * the HPTE.
   1851 */
   1852unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
   1853		int ssize, real_pte_t rpte, unsigned int subpg_index)
   1854{
   1855	unsigned long hash, gslot, hidx;
   1856
   1857	hash = hpt_hash(vpn, shift, ssize);
   1858	hidx = __rpte_to_hidx(rpte, subpg_index);
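        	/* An HPTE in the secondary group lives at the complement of the hash */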
   1859	if (hidx & _PTEIDX_SECONDARY)
   1860		hash = ~hash;
   1861	gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
   1862	gslot += hidx & _PTEIDX_GROUP_IX;
   1863	return gslot;
   1864}
   1865
   1866void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
   1867		     unsigned long flags)
   1868{
   1869	unsigned long index, shift, gslot;
   1870	int local = flags & HPTE_LOCAL_UPDATE;
   1871
   1872	DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
   1873	pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
   1874		gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
   1875		DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
   1876		/*
    1877		 * We use the same base page size and actual psize because we
    1878		 * don't use these functions for hugepages.
   1879		 */
   1880		mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
   1881					     ssize, local);
   1882	} pte_iterate_hashed_end();
   1883
   1884	tm_flush_hash_page(local);
   1885}
   1886
   1887#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1888void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
   1889			 pmd_t *pmdp, unsigned int psize, int ssize,
   1890			 unsigned long flags)
   1891{
   1892	int i, max_hpte_count, valid;
   1893	unsigned long s_addr;
   1894	unsigned char *hpte_slot_array;
   1895	unsigned long hidx, shift, vpn, hash, slot;
   1896	int local = flags & HPTE_LOCAL_UPDATE;
   1897
   1898	s_addr = addr & HPAGE_PMD_MASK;
   1899	hpte_slot_array = get_hpte_slot_array(pmdp);
   1900	/*
    1901	 * If we try to do a huge PTE update after a withdraw is done,
    1902	 * we will find the hpte_slot_array below to be NULL. This happens
    1903	 * when we do split_huge_pmd.
   1904	 */
   1905	if (!hpte_slot_array)
   1906		return;
   1907
   1908	if (mmu_hash_ops.hugepage_invalidate) {
   1909		mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
   1910						 psize, ssize, local);
   1911		goto tm_abort;
   1912	}
   1913	/*
    1914	 * No bulk HPTE removal support; invalidate each entry
   1915	 */
   1916	shift = mmu_psize_defs[psize].shift;
   1917	max_hpte_count = HPAGE_PMD_SIZE >> shift;
   1918	for (i = 0; i < max_hpte_count; i++) {
   1919		/*
    1920		 * 8 bits per HPTE entry:
   1921		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
   1922		 */
   1923		valid = hpte_valid(hpte_slot_array, i);
   1924		if (!valid)
   1925			continue;
    1926		hidx = hpte_hash_index(hpte_slot_array, i);
   1927
   1928		/* get the vpn */
   1929		addr = s_addr + (i * (1ul << shift));
   1930		vpn = hpt_vpn(addr, vsid, ssize);
   1931		hash = hpt_hash(vpn, shift, ssize);
   1932		if (hidx & _PTEIDX_SECONDARY)
   1933			hash = ~hash;
   1934
   1935		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
   1936		slot += hidx & _PTEIDX_GROUP_IX;
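        		/*
        		 * Invalidate with the base page size (psize) and the 16M
        		 * actual page size of the hugepage mapping.
        		 */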
   1937		mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
   1938					     MMU_PAGE_16M, ssize, local);
   1939	}
   1940tm_abort:
   1941	tm_flush_hash_page(local);
   1942}
   1943#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   1944
   1945void flush_hash_range(unsigned long number, int local)
   1946{
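        	/*
        	 * Use the platform's bulk flush if it has one, otherwise flush
        	 * each entry of the per-CPU TLB batch individually.
        	 */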
   1947	if (mmu_hash_ops.flush_hash_range)
   1948		mmu_hash_ops.flush_hash_range(number, local);
   1949	else {
   1950		int i;
   1951		struct ppc64_tlb_batch *batch =
   1952			this_cpu_ptr(&ppc64_tlb_batch);
   1953
   1954		for (i = 0; i < number; i++)
   1955			flush_hash_page(batch->vpn[i], batch->pte[i],
   1956					batch->psize, batch->ssize, local);
   1957	}
   1958}
   1959
   1960long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
   1961			   unsigned long pa, unsigned long rflags,
   1962			   unsigned long vflags, int psize, int ssize)
   1963{
   1964	unsigned long hpte_group;
   1965	long slot;
   1966
   1967repeat:
   1968	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
   1969
   1970	/* Insert into the hash table, primary slot */
   1971	slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
   1972					psize, psize, ssize);
   1973
   1974	/* Primary is full, try the secondary */
   1975	if (unlikely(slot == -1)) {
   1976		hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
   1977		slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
   1978						vflags | HPTE_V_SECONDARY,
   1979						psize, psize, ssize);
   1980		if (slot == -1) {
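        			/*
        			 * Both groups are full: pick the primary or the
        			 * secondary group (based on the timebase low bit),
        			 * evict an entry from it and retry the insert.
        			 */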
   1981			if (mftb() & 0x1)
   1982				hpte_group = (hash & htab_hash_mask) *
   1983						HPTES_PER_GROUP;
   1984
   1985			mmu_hash_ops.hpte_remove(hpte_group);
   1986			goto repeat;
   1987		}
   1988	}
   1989
   1990	return slot;
   1991}
   1992
   1993#ifdef CONFIG_DEBUG_PAGEALLOC
   1994static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
   1995{
   1996	unsigned long hash;
   1997	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
   1998	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
   1999	unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY);
   2000	long ret;
   2001
   2002	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
   2003
    2004	/* Don't create HPTE entries for a bad address */
   2005	if (!vsid)
   2006		return;
   2007
   2008	ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
   2009				    HPTE_V_BOLTED,
   2010				    mmu_linear_psize, mmu_kernel_ssize);
   2011
    2012	BUG_ON(ret < 0);
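        	/*
        	 * Remember the slot with the top bit set so that
        	 * kernel_unmap_linear_page() knows this page is currently hashed in.
        	 */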
   2013	spin_lock(&linear_map_hash_lock);
   2014	BUG_ON(linear_map_hash_slots[lmi] & 0x80);
   2015	linear_map_hash_slots[lmi] = ret | 0x80;
   2016	spin_unlock(&linear_map_hash_lock);
   2017}
   2018
   2019static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
   2020{
   2021	unsigned long hash, hidx, slot;
   2022	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
   2023	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
   2024
   2025	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
   2026	spin_lock(&linear_map_hash_lock);
   2027	BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
   2028	hidx = linear_map_hash_slots[lmi] & 0x7f;
   2029	linear_map_hash_slots[lmi] = 0;
   2030	spin_unlock(&linear_map_hash_lock);
   2031	if (hidx & _PTEIDX_SECONDARY)
   2032		hash = ~hash;
   2033	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
   2034	slot += hidx & _PTEIDX_GROUP_IX;
   2035	mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
   2036				     mmu_linear_psize,
   2037				     mmu_kernel_ssize, 0);
   2038}
   2039
   2040void hash__kernel_map_pages(struct page *page, int numpages, int enable)
   2041{
   2042	unsigned long flags, vaddr, lmi;
   2043	int i;
   2044
   2045	local_irq_save(flags);
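        	/* lmi is the linear mapping index, i.e. the page frame number */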
   2046	for (i = 0; i < numpages; i++, page++) {
   2047		vaddr = (unsigned long)page_address(page);
   2048		lmi = __pa(vaddr) >> PAGE_SHIFT;
   2049		if (lmi >= linear_map_hash_count)
   2050			continue;
   2051		if (enable)
   2052			kernel_map_linear_page(vaddr, lmi);
   2053		else
   2054			kernel_unmap_linear_page(vaddr, lmi);
   2055	}
   2056	local_irq_restore(flags);
   2057}
   2058#endif /* CONFIG_DEBUG_PAGEALLOC */
   2059
   2060void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
   2061				phys_addr_t first_memblock_size)
   2062{
   2063	/*
    2064	 * We don't currently support the first MEMBLOCK not mapping
    2065	 * physical address 0 on these processors.
   2066	 */
   2067	BUG_ON(first_memblock_base != 0);
   2068
   2069	/*
    2070	 * On virtualized systems the first entry is our RMA region aka VRMA;
   2071	 * non-virtualized 64-bit hash MMU systems don't have a limitation
   2072	 * on real mode access.
   2073	 *
    2074	 * For guests on platforms before POWER9, we clamp the limit to 1G
   2075	 * to avoid some funky things such as RTAS bugs etc...
   2076	 *
   2077	 * On POWER9 we limit to 1TB in case the host erroneously told us that
   2078	 * the RMA was >1TB. Effective address bits 0:23 are treated as zero
   2079	 * (meaning the access is aliased to zero i.e. addr = addr % 1TB)
   2080	 * for virtual real mode addressing and so it doesn't make sense to
   2081	 * have an area larger than 1TB as it can't be addressed.
   2082	 */
   2083	if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
   2084		ppc64_rma_size = first_memblock_size;
   2085		if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
   2086			ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
   2087		else
   2088			ppc64_rma_size = min_t(u64, ppc64_rma_size,
   2089					       1UL << SID_SHIFT_1T);
   2090
   2091		/* Finally limit subsequent allocations */
   2092		memblock_set_current_limit(ppc64_rma_size);
   2093	} else {
   2094		ppc64_rma_size = ULONG_MAX;
   2095	}
   2096}
   2097
   2098#ifdef CONFIG_DEBUG_FS
   2099
   2100static int hpt_order_get(void *data, u64 *val)
   2101{
   2102	*val = ppc64_pft_size;
   2103	return 0;
   2104}
   2105
   2106static int hpt_order_set(void *data, u64 val)
   2107{
   2108	int ret;
   2109
   2110	if (!mmu_hash_ops.resize_hpt)
   2111		return -ENODEV;
   2112
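        	/* Hold off CPU hotplug while the hash page table is resized */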
   2113	cpus_read_lock();
   2114	ret = mmu_hash_ops.resize_hpt(val);
   2115	cpus_read_unlock();
   2116
   2117	return ret;
   2118}
   2119
   2120DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
   2121
   2122static int __init hash64_debugfs(void)
   2123{
   2124	debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL,
   2125			    &fops_hpt_order);
   2126	return 0;
   2127}
   2128machine_device_initcall(pseries, hash64_debugfs);
   2129#endif /* CONFIG_DEBUG_FS */
   2130
   2131void __init print_system_hash_info(void)
   2132{
   2133	pr_info("ppc64_pft_size    = 0x%llx\n", ppc64_pft_size);
   2134
   2135	if (htab_hash_mask)
   2136		pr_info("htab_hash_mask    = 0x%lx\n", htab_hash_mask);
   2137}
   2138
   2139unsigned long arch_randomize_brk(struct mm_struct *mm)
   2140{
   2141	/*
   2142	 * If we are using 1TB segments and we are allowed to randomise
   2143	 * the heap, we can put it above 1TB so it is backed by a 1TB
   2144	 * segment. Otherwise the heap will be in the bottom 1TB
   2145	 * which always uses 256MB segments and this may result in a
   2146	 * performance penalty.
   2147	 */
   2148	if (is_32bit_task())
   2149		return randomize_page(mm->brk, SZ_32M);
   2150	else if (!radix_enabled() && mmu_highuser_ssize == MMU_SEGSIZE_1T)
   2151		return randomize_page(max_t(unsigned long, mm->brk, SZ_1T), SZ_1G);
   2152	else
   2153		return randomize_page(mm->brk, SZ_1G);
   2154}