dump_pagetables.c (12462B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Debug helper to dump the current kernel pagetables of the system 4 * so that we can see what the various memory ranges are set to. 5 * 6 * (C) Copyright 2008 Intel Corporation 7 * 8 * Author: Arjan van de Ven <arjan@linux.intel.com> 9 */ 10 11#include <linux/debugfs.h> 12#include <linux/kasan.h> 13#include <linux/mm.h> 14#include <linux/init.h> 15#include <linux/sched.h> 16#include <linux/seq_file.h> 17#include <linux/highmem.h> 18#include <linux/pci.h> 19#include <linux/ptdump.h> 20 21#include <asm/e820/types.h> 22 23/* 24 * The dumper groups pagetable entries of the same type into one, and for 25 * that it needs to keep some state when walking, and flush this state 26 * when a "break" in the continuity is found. 27 */ 28struct pg_state { 29 struct ptdump_state ptdump; 30 int level; 31 pgprotval_t current_prot; 32 pgprotval_t effective_prot; 33 pgprotval_t prot_levels[5]; 34 unsigned long start_address; 35 const struct addr_marker *marker; 36 unsigned long lines; 37 bool to_dmesg; 38 bool check_wx; 39 unsigned long wx_pages; 40 struct seq_file *seq; 41}; 42 43struct addr_marker { 44 unsigned long start_address; 45 const char *name; 46 unsigned long max_lines; 47}; 48 49/* Address space markers hints */ 50 51#ifdef CONFIG_X86_64 52 53enum address_markers_idx { 54 USER_SPACE_NR = 0, 55 KERNEL_SPACE_NR, 56#ifdef CONFIG_MODIFY_LDT_SYSCALL 57 LDT_NR, 58#endif 59 LOW_KERNEL_NR, 60 VMALLOC_START_NR, 61 VMEMMAP_START_NR, 62#ifdef CONFIG_KASAN 63 KASAN_SHADOW_START_NR, 64 KASAN_SHADOW_END_NR, 65#endif 66 CPU_ENTRY_AREA_NR, 67#ifdef CONFIG_X86_ESPFIX64 68 ESPFIX_START_NR, 69#endif 70#ifdef CONFIG_EFI 71 EFI_END_NR, 72#endif 73 HIGH_KERNEL_NR, 74 MODULES_VADDR_NR, 75 MODULES_END_NR, 76 FIXADDR_START_NR, 77 END_OF_SPACE_NR, 78}; 79 80static struct addr_marker address_markers[] = { 81 [USER_SPACE_NR] = { 0, "User Space" }, 82 [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, 83 [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, 84 [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 85 [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, 86#ifdef CONFIG_KASAN 87 /* 88 * These fields get initialized with the (dynamic) 89 * KASAN_SHADOW_{START,END} values in pt_dump_init(). 90 */ 91 [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, 92 [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, 93#endif 94#ifdef CONFIG_MODIFY_LDT_SYSCALL 95 [LDT_NR] = { 0UL, "LDT remap" }, 96#endif 97 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, 98#ifdef CONFIG_X86_ESPFIX64 99 [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 100#endif 101#ifdef CONFIG_EFI 102 [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, 103#endif 104 [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, 105 [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, 106 [MODULES_END_NR] = { MODULES_END, "End Modules" }, 107 [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, 108 [END_OF_SPACE_NR] = { -1, NULL } 109}; 110 111#define INIT_PGD ((pgd_t *) &init_top_pgt) 112 113#else /* CONFIG_X86_64 */ 114 115enum address_markers_idx { 116 USER_SPACE_NR = 0, 117 KERNEL_SPACE_NR, 118 VMALLOC_START_NR, 119 VMALLOC_END_NR, 120#ifdef CONFIG_HIGHMEM 121 PKMAP_BASE_NR, 122#endif 123#ifdef CONFIG_MODIFY_LDT_SYSCALL 124 LDT_NR, 125#endif 126 CPU_ENTRY_AREA_NR, 127 FIXADDR_START_NR, 128 END_OF_SPACE_NR, 129}; 130 131static struct addr_marker address_markers[] = { 132 [USER_SPACE_NR] = { 0, "User Space" }, 133 [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, 134 [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 135 [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, 136#ifdef CONFIG_HIGHMEM 137 [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, 138#endif 139#ifdef CONFIG_MODIFY_LDT_SYSCALL 140 [LDT_NR] = { 0UL, "LDT remap" }, 141#endif 142 [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, 143 [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, 144 [END_OF_SPACE_NR] = { -1, NULL } 145}; 146 147#define INIT_PGD (swapper_pg_dir) 148 149#endif /* !CONFIG_X86_64 */ 150 151/* Multipliers for offsets within the PTEs */ 152#define PTE_LEVEL_MULT (PAGE_SIZE) 153#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) 154#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) 155#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) 156#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) 157 158#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ 159({ \ 160 if (to_dmesg) \ 161 printk(KERN_INFO fmt, ##args); \ 162 else \ 163 if (m) \ 164 seq_printf(m, fmt, ##args); \ 165}) 166 167#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \ 168({ \ 169 if (to_dmesg) \ 170 printk(KERN_CONT fmt, ##args); \ 171 else \ 172 if (m) \ 173 seq_printf(m, fmt, ##args); \ 174}) 175 176/* 177 * Print a readable form of a pgprot_t to the seq_file 178 */ 179static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) 180{ 181 static const char * const level_name[] = 182 { "pgd", "p4d", "pud", "pmd", "pte" }; 183 184 if (!(pr & _PAGE_PRESENT)) { 185 /* Not present */ 186 pt_dump_cont_printf(m, dmsg, " "); 187 } else { 188 if (pr & _PAGE_USER) 189 pt_dump_cont_printf(m, dmsg, "USR "); 190 else 191 pt_dump_cont_printf(m, dmsg, " "); 192 if (pr & _PAGE_RW) 193 pt_dump_cont_printf(m, dmsg, "RW "); 194 else 195 pt_dump_cont_printf(m, dmsg, "ro "); 196 if (pr & _PAGE_PWT) 197 pt_dump_cont_printf(m, dmsg, "PWT "); 198 else 199 pt_dump_cont_printf(m, dmsg, " "); 200 if (pr & _PAGE_PCD) 201 pt_dump_cont_printf(m, dmsg, "PCD "); 202 else 203 pt_dump_cont_printf(m, dmsg, " "); 204 205 /* Bit 7 has a different meaning on level 3 vs 4 */ 206 if (level <= 3 && pr & _PAGE_PSE) 207 pt_dump_cont_printf(m, dmsg, "PSE "); 208 else 209 pt_dump_cont_printf(m, dmsg, " "); 210 if ((level == 4 && pr & _PAGE_PAT) || 211 ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) 212 pt_dump_cont_printf(m, dmsg, "PAT "); 213 else 214 pt_dump_cont_printf(m, dmsg, " "); 215 if (pr & _PAGE_GLOBAL) 216 pt_dump_cont_printf(m, dmsg, "GLB "); 217 else 218 pt_dump_cont_printf(m, dmsg, " "); 219 if (pr & _PAGE_NX) 220 pt_dump_cont_printf(m, dmsg, "NX "); 221 else 222 pt_dump_cont_printf(m, dmsg, "x "); 223 } 224 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); 225} 226 227static void note_wx(struct pg_state *st, unsigned long addr) 228{ 229 unsigned long npages; 230 231 npages = (addr - st->start_address) / PAGE_SIZE; 232 233#ifdef CONFIG_PCI_BIOS 234 /* 235 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. 236 * Inform about it, but avoid the warning. 237 */ 238 if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && 239 addr <= PAGE_OFFSET + BIOS_END) { 240 pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); 241 return; 242 } 243#endif 244 /* Account the WX pages */ 245 st->wx_pages += npages; 246 WARN_ONCE(__supported_pte_mask & _PAGE_NX, 247 "x86/mm: Found insecure W+X mapping at address %pS\n", 248 (void *)st->start_address); 249} 250 251static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) 252{ 253 struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); 254 pgprotval_t prot = val & PTE_FLAGS_MASK; 255 pgprotval_t effective; 256 257 if (level > 0) { 258 pgprotval_t higher_prot = st->prot_levels[level - 1]; 259 260 effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | 261 ((higher_prot | prot) & _PAGE_NX); 262 } else { 263 effective = prot; 264 } 265 266 st->prot_levels[level] = effective; 267} 268 269/* 270 * This function gets called on a break in a continuous series 271 * of PTE entries; the next one is different so we need to 272 * print what we collected so far. 273 */ 274static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, 275 u64 val) 276{ 277 struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); 278 pgprotval_t new_prot, new_eff; 279 pgprotval_t cur, eff; 280 static const char units[] = "BKMGTPE"; 281 struct seq_file *m = st->seq; 282 283 new_prot = val & PTE_FLAGS_MASK; 284 if (!val) 285 new_eff = 0; 286 else 287 new_eff = st->prot_levels[level]; 288 289 /* 290 * If we have a "break" in the series, we need to flush the state that 291 * we have now. "break" is either changing perms, levels or 292 * address space marker. 293 */ 294 cur = st->current_prot; 295 eff = st->effective_prot; 296 297 if (st->level == -1) { 298 /* First entry */ 299 st->current_prot = new_prot; 300 st->effective_prot = new_eff; 301 st->level = level; 302 st->marker = address_markers; 303 st->lines = 0; 304 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 305 st->marker->name); 306 } else if (new_prot != cur || new_eff != eff || level != st->level || 307 addr >= st->marker[1].start_address) { 308 const char *unit = units; 309 unsigned long delta; 310 int width = sizeof(unsigned long) * 2; 311 312 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) 313 note_wx(st, addr); 314 315 /* 316 * Now print the actual finished series 317 */ 318 if (!st->marker->max_lines || 319 st->lines < st->marker->max_lines) { 320 pt_dump_seq_printf(m, st->to_dmesg, 321 "0x%0*lx-0x%0*lx ", 322 width, st->start_address, 323 width, addr); 324 325 delta = addr - st->start_address; 326 while (!(delta & 1023) && unit[1]) { 327 delta >>= 10; 328 unit++; 329 } 330 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", 331 delta, *unit); 332 printk_prot(m, st->current_prot, st->level, 333 st->to_dmesg); 334 } 335 st->lines++; 336 337 /* 338 * We print markers for special areas of address space, 339 * such as the start of vmalloc space etc. 340 * This helps in the interpretation. 341 */ 342 if (addr >= st->marker[1].start_address) { 343 if (st->marker->max_lines && 344 st->lines > st->marker->max_lines) { 345 unsigned long nskip = 346 st->lines - st->marker->max_lines; 347 pt_dump_seq_printf(m, st->to_dmesg, 348 "... %lu entr%s skipped ... \n", 349 nskip, 350 nskip == 1 ? "y" : "ies"); 351 } 352 st->marker++; 353 st->lines = 0; 354 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 355 st->marker->name); 356 } 357 358 st->start_address = addr; 359 st->current_prot = new_prot; 360 st->effective_prot = new_eff; 361 st->level = level; 362 } 363} 364 365static void ptdump_walk_pgd_level_core(struct seq_file *m, 366 struct mm_struct *mm, pgd_t *pgd, 367 bool checkwx, bool dmesg) 368{ 369 const struct ptdump_range ptdump_ranges[] = { 370#ifdef CONFIG_X86_64 371 {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, 372 {GUARD_HOLE_END_ADDR, ~0UL}, 373#else 374 {0, ~0UL}, 375#endif 376 {0, 0} 377}; 378 379 struct pg_state st = { 380 .ptdump = { 381 .note_page = note_page, 382 .effective_prot = effective_prot, 383 .range = ptdump_ranges 384 }, 385 .level = -1, 386 .to_dmesg = dmesg, 387 .check_wx = checkwx, 388 .seq = m 389 }; 390 391 ptdump_walk_pgd(&st.ptdump, mm, pgd); 392 393 if (!checkwx) 394 return; 395 if (st.wx_pages) 396 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", 397 st.wx_pages); 398 else 399 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); 400} 401 402void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) 403{ 404 ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true); 405} 406 407void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, 408 bool user) 409{ 410 pgd_t *pgd = mm->pgd; 411#ifdef CONFIG_PAGE_TABLE_ISOLATION 412 if (user && boot_cpu_has(X86_FEATURE_PTI)) 413 pgd = kernel_to_user_pgdp(pgd); 414#endif 415 ptdump_walk_pgd_level_core(m, mm, pgd, false, false); 416} 417EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); 418 419void ptdump_walk_user_pgd_level_checkwx(void) 420{ 421#ifdef CONFIG_PAGE_TABLE_ISOLATION 422 pgd_t *pgd = INIT_PGD; 423 424 if (!(__supported_pte_mask & _PAGE_NX) || 425 !boot_cpu_has(X86_FEATURE_PTI)) 426 return; 427 428 pr_info("x86/mm: Checking user space page tables\n"); 429 pgd = kernel_to_user_pgdp(pgd); 430 ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false); 431#endif 432} 433 434void ptdump_walk_pgd_level_checkwx(void) 435{ 436 ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); 437} 438 439static int __init pt_dump_init(void) 440{ 441 /* 442 * Various markers are not compile-time constants, so assign them 443 * here. 444 */ 445#ifdef CONFIG_X86_64 446 address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; 447 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 448 address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; 449#ifdef CONFIG_MODIFY_LDT_SYSCALL 450 address_markers[LDT_NR].start_address = LDT_BASE_ADDR; 451#endif 452#ifdef CONFIG_KASAN 453 address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; 454 address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; 455#endif 456#endif 457#ifdef CONFIG_X86_32 458 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; 459 address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; 460# ifdef CONFIG_HIGHMEM 461 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; 462# endif 463 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; 464 address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; 465# ifdef CONFIG_MODIFY_LDT_SYSCALL 466 address_markers[LDT_NR].start_address = LDT_BASE_ADDR; 467# endif 468#endif 469 return 0; 470} 471__initcall(pt_dump_init);