From 747ada36ee23225d81657e4d633ac93b8ccbea7d Mon Sep 17 00:00:00 2001 From: Olaf Dabrunz Date: Wed, 11 Jun 2008 16:35:13 +0200 Subject: pci: add PCI IDs for devices that need boot irq quirks Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- include/linux/pci_ids.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 65953822c9cb..7f3f101e03c1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2235,6 +2235,10 @@ #define PCI_DEVICE_ID_INTEL_PXH_0 0x0329 #define PCI_DEVICE_ID_INTEL_PXH_1 0x032A #define PCI_DEVICE_ID_INTEL_PXHV 0x032C +#define PCI_DEVICE_ID_INTEL_80332_0 0x0330 +#define PCI_DEVICE_ID_INTEL_80332_1 0x0332 +#define PCI_DEVICE_ID_INTEL_80333_0 0x0370 +#define PCI_DEVICE_ID_INTEL_80333_1 0x0372 #define PCI_DEVICE_ID_INTEL_82375 0x0482 #define PCI_DEVICE_ID_INTEL_82424 0x0483 #define PCI_DEVICE_ID_INTEL_82378 0x0484 @@ -2307,6 +2311,7 @@ #define PCI_DEVICE_ID_INTEL_ESB_4 0x25a4 #define PCI_DEVICE_ID_INTEL_ESB_5 0x25a6 #define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab +#define PCI_DEVICE_ID_INTEL_ESB_10 0x25ac #define PCI_DEVICE_ID_INTEL_82820_HB 0x2500 #define PCI_DEVICE_ID_INTEL_82820_UP_HB 0x2501 #define PCI_DEVICE_ID_INTEL_82850_HB 0x2530 -- cgit v1.2.3-71-gd317 From e1d3a90846b40ad3160bf4b648d36c6badad39ac Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:17 +0200 Subject: pci, acpi: reroute PCI interrupt to legacy boot interrupt equivalent Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel does during interrupt handling). On chipsets where this INTx generation cannot be disabled, we reroute the valid interrupts to their legacy equivalent to get rid of spurious interrupts that might otherwise bring down (vital) interrupt lines through spurious interrupt detection in note_interrupt(). This patch benefited from discussions with Alexander Graf, Torsten Duwe, Ihno Krumreich, Daniel Gollub, Hannes Reinecke. The conclusions we drew and the patch itself are the authors' responsibility alone. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- drivers/acpi/pci_irq.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/quirks.c | 28 +++++++++++++++++++++++++ include/linux/pci.h | 6 ++++++ 3 files changed, 90 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 89022a74faee..b37cb0a9826e 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -384,6 +384,27 @@ acpi_pci_free_irq(struct acpi_prt_entry *entry, return irq; } +#ifdef CONFIG_X86_IO_APIC +extern int noioapicquirk; + +static int bridge_has_boot_interrupt_variant(struct pci_bus *bus) +{ + struct pci_bus *bus_it; + + for (bus_it = bus ; bus_it ; bus_it = bus_it->parent) { + if (!bus_it->self) + return 0; + + printk(KERN_INFO "vendor=%04x device=%04x\n", bus_it->self->vendor, + bus_it->self->device); + + if (bus_it->self->irq_reroute_variant) + return bus_it->self->irq_reroute_variant; + } + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + /* * acpi_pci_irq_lookup * success: return IRQ >= 0 @@ -413,6 +434,41 @@ acpi_pci_irq_lookup(struct pci_bus *bus, } ret = func(entry, triggering, polarity, link); + +#ifdef CONFIG_X86_IO_APIC + /* + * Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the + * IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel + * does during interrupt handling). When this INTx generation cannot be + * disabled, we reroute these interrupts to their legacy equivalent to + * get rid of spurious interrupts. + */ + if (!noioapicquirk) { + switch (bridge_has_boot_interrupt_variant(bus)) { + case 0: + /* no rerouting necessary */ + break; + + case INTEL_IRQ_REROUTE_VARIANT: + /* + * Remap according to INTx routing table in 6700PXH + * specs, intel order number 302628-002, section + * 2.15.2. Other chipsets (80332, ...) have the same + * mapping and are handled here as well. + */ + printk(KERN_INFO "pci irq %d -> rerouted to legacy " + "irq %d\n", ret, (ret % 4) + 16); + ret = (ret % 4) + 16; + break; + + default: + printk(KERN_INFO "not rerouting irq %d to legacy irq: " + "unknown mapping\n", ret); + break; + } + } +#endif /* CONFIG_X86_IO_APIC */ + return ret; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index eb97564316d0..ac634ae2eb08 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1364,6 +1364,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x260a, quirk_intel_pcie_pm); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x260b, quirk_intel_pcie_pm); #ifdef CONFIG_X86_IO_APIC +/* + * Boot interrupts on some chipsets cannot be turned off. For these chipsets, + * remap the original interrupt in the linux kernel to the boot interrupt, so + * that a PCI device's interrupt handler is installed on the boot interrupt + * line instead. + */ +static void quirk_reroute_to_boot_interrupts_intel(struct pci_dev *dev) +{ + int i; + + if (noioapicquirk) + return; + + dev->irq_reroute_variant = INTEL_IRQ_REROUTE_VARIANT; + + printk(KERN_INFO "PCI quirk: reroute interrupts for 0x%04x:0x%04x\n", + dev->vendor, dev->device); + return; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_80333_0, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_80333_1, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PXH_0, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PXH_1, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PXHV, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_80332_0, quirk_reroute_to_boot_interrupts_intel); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_80332_1, quirk_reroute_to_boot_interrupts_intel); + /* * On some chipsets we can disable the generation of legacy INTx boot * interrupts. diff --git a/include/linux/pci.h b/include/linux/pci.h index d18b1dd49fab..6755cf5ac109 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -117,6 +117,11 @@ enum pci_dev_flags { PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG = (__force pci_dev_flags_t) 1, }; +enum pci_irq_reroute_variant { + INTEL_IRQ_REROUTE_VARIANT = 1, + MAX_IRQ_REROUTE_VARIANTS = 3 +}; + typedef unsigned short __bitwise pci_bus_flags_t; enum pci_bus_flags { PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1, @@ -194,6 +199,7 @@ struct pci_dev { unsigned int no_d1d2:1; /* only allow d0 or d3 */ unsigned int block_ucfg_access:1; /* userspace config space access is blocked */ unsigned int broken_parity_status:1; /* Device generates false positive parity */ + unsigned int irq_reroute_variant:2; /* device needs IRQ rerouting variant */ unsigned int msi_enabled:1; unsigned int msix_enabled:1; unsigned int is_managed:1; -- cgit v1.2.3-71-gd317 From 92be3d6bdf2cb34972ab50e12ad4da1076e690da Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Oct 2008 09:48:08 +0800 Subject: kexec/i386: allocate page table pages dynamically Impact: save .text size when kexec is built in but not loaded This patch adds an architecture specific struct kimage_arch into struct kimage. The pointers to page table pages used by kexec are added to struct kimage_arch. The page tables pages are dynamically allocated in machine_kexec_prepare instead of statically from BSS segment. This will save up to 20k memory when kexec image is not loaded. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kexec.h | 14 ++++++++ arch/x86/kernel/machine_kexec_32.c | 67 ++++++++++++++++++++++++++------------ include/linux/kexec.h | 4 +++ 3 files changed, 64 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a1f22771a15a..df9c41a9c6ae 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -170,6 +170,20 @@ relocate_kernel(unsigned long indirection_page, unsigned long start_address) ATTRIB_NORET; #endif +#ifdef CONFIG_X86_32 +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + pgd_t *pgd; +#ifdef CONFIG_X86_PAE + pmd_t *pmd0; + pmd_t *pmd1; +#endif + pte_t *pte0; + pte_t *pte1; +}; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 7a385746509a..1100312847a5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -25,15 +26,6 @@ #include #include -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u32 kexec_pgd[1024] PAGE_ALIGNED; -#ifdef CONFIG_X86_PAE -static u32 kexec_pmd0[1024] PAGE_ALIGNED; -static u32 kexec_pmd1[1024] PAGE_ALIGNED; -#endif -static u32 kexec_pte0[1024] PAGE_ALIGNED; -static u32 kexec_pte1[1024] PAGE_ALIGNED; - static void set_idt(void *newidt, __u16 limit) { struct desc_ptr curidt; @@ -76,6 +68,37 @@ static void load_segments(void) #undef __STR } +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_page((unsigned long)image->arch.pgd); +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + free_page((unsigned long)image->arch.pmd1); +#endif + free_page((unsigned long)image->arch.pte0); + free_page((unsigned long)image->arch.pte1); +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + machine_kexec_free_page_tables(image); + return -ENOMEM; + } + return 0; +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -87,13 +110,14 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Make control page executable. + * - Make control page executable. + * - Allocate page tables */ int machine_kexec_prepare(struct kimage *image) { if (nx_enabled) set_pages_x(image->control_code_page, 1); - return 0; + return machine_kexec_alloc_page_tables(image); } /* @@ -104,6 +128,7 @@ void machine_kexec_cleanup(struct kimage *image) { if (nx_enabled) set_pages_nx(image->control_code_page, 1); + machine_kexec_free_page_tables(image); } /* @@ -150,18 +175,18 @@ void machine_kexec(struct kimage *image) relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_PGD] = __pa(kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PGD] = __pa(image->arch.pgd); + page_list[VA_PGD] = (unsigned long)image->arch.pgd; #ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PMD_1] = __pa(kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PMD_0] = __pa(image->arch.pmd0); + page_list[VA_PMD_0] = (unsigned long)image->arch.pmd0; + page_list[PA_PMD_1] = __pa(image->arch.pmd1); + page_list[VA_PMD_1] = (unsigned long)image->arch.pmd1; #endif - page_list[PA_PTE_0] = __pa(kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PTE_1] = __pa(kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_PTE_0] = __pa(image->arch.pte0); + page_list[VA_PTE_0] = (unsigned long)image->arch.pte0; + page_list[PA_PTE_1] = __pa(image->arch.pte1); + page_list[VA_PTE_1] = (unsigned long)image->arch.pte1; if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 17f76fc05173..adc34f2c6eff 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -100,6 +100,10 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + +#ifdef ARCH_HAS_KIMAGE_ARCH + struct kimage_arch arch; +#endif }; -- cgit v1.2.3-71-gd317 From fd8cd7e1919fc1c27fe2fdccd2a1cd32f791ef0f Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 3 Nov 2008 15:50:38 -0800 Subject: x86: vmware: look for DMI string in the product serial key Impact: Should permit VMware detection on older platforms where the vendor is changed. Could theoretically cause a regression if some weird serial number scheme contains the string "VMware" by pure chance. Seems unlikely, especially with the mixed case. In some user configured cases, VMware may choose not to put a VMware specific DMI string, but the product serial key is always there and is VMware specific. Add a interface to check the serial key, when checking for VMware in the DMI information. Signed-off-by: Alok N Kataria Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 7 ++++++- drivers/firmware/dmi_scan.c | 11 +++++++++++ include/linux/dmi.h | 2 ++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index a0905ecfe7d2..c034bda842d9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -61,6 +61,11 @@ static unsigned long __vmware_get_tsc_khz(void) return tsc_hz; } +/* + * While checking the dmi string infomation, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. + */ int vmware_platform(void) { if (cpu_has_hypervisor) { @@ -74,7 +79,7 @@ int vmware_platform(void) hyper_vendor_id[12] = '\0'; if (!strcmp(hyper_vendor_id, "VMwareVMware")) return 1; - } else if (dmi_available && dmi_name_in_vendors("VMware") && + } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) return 1; diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 3e526b6d00cb..d66d41282907 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -467,6 +467,17 @@ const char *dmi_get_system_info(int field) } EXPORT_SYMBOL(dmi_get_system_info); +/** + * dmi_name_in_serial - Check if string is in the DMI product serial + * information. + */ +int dmi_name_in_serial(const char *str) +{ + int f = DMI_PRODUCT_SERIAL; + if (dmi_ident[f] && strstr(dmi_ident[f], str)) + return 1; + return 0; +} /** * dmi_name_in_vendors - Check if string is anywhere in the DMI vendor information. diff --git a/include/linux/dmi.h b/include/linux/dmi.h index e5084eb5943a..2bfda178f274 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -44,6 +44,7 @@ extern const struct dmi_device * dmi_find_device(int type, const char *name, extern void dmi_scan_machine(void); extern int dmi_get_year(int field); extern int dmi_name_in_vendors(const char *str); +extern int dmi_name_in_serial(const char *str); extern int dmi_available; extern int dmi_walk(void (*decode)(const struct dmi_header *)); @@ -56,6 +57,7 @@ static inline const struct dmi_device * dmi_find_device(int type, const char *na static inline void dmi_scan_machine(void) { return; } static inline int dmi_get_year(int year) { return 0; } static inline int dmi_name_in_vendors(const char *s) { return 0; } +static inline int dmi_name_in_serial(const char *s) { return 0; } #define dmi_available 0 static inline int dmi_walk(void (*decode)(const struct dmi_header *)) { return -1; } -- cgit v1.2.3-71-gd317 From d211af055d0c12dc3416c2886e6fbdc6eb74a381 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 24 Nov 2008 15:38:45 +0100 Subject: i386: get rid of the use of KPROBE_ENTRY / KPROBE_END entry_32.S is now the only user of KPROBE_ENTRY / KPROBE_END, treewide. This patch reorders entry_64.S and explicitly generates a separate section for functions that need the protection. The generated code before and after the patch is equal. The KPROBE_ENTRY and KPROBE_END macro's are removed too. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 438 +++++++++++++++++++++++---------------------- include/linux/linkage.h | 8 - 2 files changed, 224 insertions(+), 222 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index bd02ec77edc4..6e96028d1a9c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -688,65 +688,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 - ALIGN -error_code: - /* the function address is in %fs's slot on the stack */ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 - cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(page_fault) - ENTRY(coprocessor_error) RING0_INT_FRAME pushl $0 @@ -777,140 +718,6 @@ ENTRY(device_not_available) CFI_ENDPROC END(device_not_available) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl TSS_sysenter_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ - CFI_REL_OFFSET eip, 0 - -KPROBE_ENTRY(debug) - RING0_INT_FRAME - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - je nmi_espfix_stack - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_nocheck_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * - * create the pointer to lss back - */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 - .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return - CFI_ENDPROC -KPROBE_END(nmi) - #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret @@ -926,19 +733,6 @@ ENTRY(native_irq_enable_sysexit) END(native_irq_enable_sysexit) #endif -KPROBE_ENTRY(int3) - RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(int3) - ENTRY(overflow) RING0_INT_FRAME pushl $0 @@ -1003,14 +797,6 @@ ENTRY(stack_segment) CFI_ENDPROC END(stack_segment) -KPROBE_ENTRY(general_protection) - RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 - jmp error_code - CFI_ENDPROC -KPROBE_END(general_protection) - ENTRY(alignment_check) RING0_EC_FRAME pushl $do_alignment_check @@ -1220,3 +1006,227 @@ END(mcount) #include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 + +ENTRY(debug) + RING0_INT_FRAME + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 9fd1f859021b..fee9e59649c1 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -64,14 +64,6 @@ name: #endif -#define KPROBE_ENTRY(name) \ - .pushsection .kprobes.text, "ax"; \ - ENTRY(name) - -#define KPROBE_END(name) \ - END(name); \ - .popsection - #ifndef END #define END(name) \ .size name, .-name -- cgit v1.2.3-71-gd317 From 3c8bb73ace6249bd089b70c941440441940e3365 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:27 -0800 Subject: x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3 Impact: Code transformation, new functions added should have no effect. Drivers use mmap followed by pgprot_* and remap_pfn_range or vm_insert_pfn, in order to export reserved memory to userspace. Currently, such mappings are not tracked and hence not kept consistent with other mappings (/dev/mem, pci resource, ioremap) for the sme memory, that may exist in the system. The following patchset adds x86 PAT attribute tracking and untracking for pfnmap related APIs. First three patches in the patchset are changing the generic mm code to fit in this tracking. Last four patches are x86 specific to make things work with x86 PAT code. The patchset aso introduces pgprot_writecombine interface, which gives writecombine mapping when enabled, falling back to pgprot_noncached otherwise. This patch: While working on x86 PAT, we faced some hurdles with trackking remap_pfn_range() regions, as we do not have any information to say whether that PFNMAP mapping is linear for the entire vma range or it is smaller granularity regions within the vma. A simple solution to this is to use vm_pgoff as an indicator for linear mapping over the vma region. Currently, remap_pfn_range only sets vm_pgoff for COW mappings. Below patch changes the logic and sets the vm_pgoff irrespective of COW. This will still not be enough for the case where pfn is zero (vma region mapped to physical address zero). But, for all the other cases, we can look at pfnmap VMAs and say whether the mappng is for the entire vma region or not. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 9 +++++++++ mm/memory.c | 7 +++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f743418..2be8d9b5e46f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -145,6 +145,15 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) +{ + return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); +} + +static inline int is_pfn_mapping(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_PFNMAP); +} /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/memory.c b/mm/memory.c index 164951c47305..cef95c8c77fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1575,11 +1575,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; + if (addr == vma->vm_start && end == vma->vm_end) vma->vm_pgoff = pfn; - } + else if (is_cow_mapping(vma->vm_flags)) + return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; -- cgit v1.2.3-71-gd317 From e121e418441525b5636321fe03d16f0193ad218e Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:28 -0800 Subject: x86: PAT: add follow_pfnmp_pte routine to help tracking pfnmap pages - v3 Impact: New currently unused interface. Add a generic interface to follow pfn in a pfnmap vma range. This is used by one of the subsequent x86 PAT related patch to keep track of memory types for vma regions across vma copy and free. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 3 +++ mm/memory.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2be8d9b5e46f..a25024ff9c11 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1223,6 +1223,9 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ +int follow_pfnmap_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *ret_ptep); + typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index cef95c8c77fa..8ca6bbf34ad6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1111,6 +1111,49 @@ no_page_table: return page; } +int follow_pfnmap_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *ret_ptep) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + if (!is_pfn_mapping(vma)) + goto err; + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto err; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto err; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto err; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + + pte = *ptep; + if (!pte_present(pte)) + goto err_unlock; + + *ret_ptep = pte; + pte_unmap_unlock(ptep, ptl); + return 0; + +err_unlock: + pte_unmap_unlock(ptep, ptl); +err: + return -EINVAL; +} + /* Can we do the FOLL_ANON optimization? */ static inline int use_zero_page(struct vm_area_struct *vma) { -- cgit v1.2.3-71-gd317 From 2ab640379a0ab4cef746ced1d7e04a0941774bcb Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:29 -0800 Subject: x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3 Impact: Introduces new hooks, which are currently null. Introduce generic hooks in remap_pfn_range and vm_insert_pfn and corresponding copy and free routines with reserve and free tracking. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 6 +++++ mm/memory.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a25024ff9c11..87ecb40e11a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,6 +155,12 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) return (vma->vm_flags & VM_PFNMAP); } +extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_vma_copy(struct vm_area_struct *vma); +extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); + /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask diff --git a/mm/memory.c b/mm/memory.c index 8ca6bbf34ad6..1e8f0d347c0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -99,6 +99,50 @@ int randomize_va_space __read_mostly = 2; #endif +#ifndef track_pfn_vma_new +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + return 0; +} +#endif + +#ifndef track_pfn_vma_copy +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + return 0; +} +#endif + +#ifndef untrack_pfn_vma +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -669,6 +713,16 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_pfn_mapping(vma)) { + /* + * We do not free on error cases below as remove_vma + * gets called on error from higher level routine + */ + ret = track_pfn_vma_copy(vma); + if (ret) + return ret; + } + /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -915,6 +969,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; + if (is_pfn_mapping(vma)) + untrack_pfn_vma(vma, 0, 0); + while (start != end) { if (!tlb_start_valid) { tlb_start = start; @@ -1473,6 +1530,7 @@ out: int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + int ret; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1487,7 +1545,15 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - return insert_pfn(vma, addr, pfn, vma->vm_page_prot); + if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) + return -EINVAL; + + ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); + + if (ret) + untrack_pfn_vma(vma, pfn, PAGE_SIZE); + + return ret; } EXPORT_SYMBOL(vm_insert_pfn); @@ -1625,6 +1691,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); + if (err) + return -EINVAL; + BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); @@ -1636,6 +1706,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); + + if (err) + untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + return err; } EXPORT_SYMBOL(remap_pfn_range); -- cgit v1.2.3-71-gd317 From 6bd9cd50c830eb88d571c492ec370a30bf999e15 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:26 -0800 Subject: x86: PAT: clarify is_linear_pfn_mapping() interface Impact: Documentation only Incremental patches to address the review comments from Nick Piggin for v3 version of x86 PAT pfnmap changes patchset here http://lkml.indiana.edu/hypermail/linux/kernel/0812.2/01330.html This patch: Clarify is_linear_pfn_mapping() and its usage. It is used by x86 PAT code for performance reasons. Identifying pfnmap as linear over entire vma helps speedup reserve and free of memtype for the region. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 87ecb40e11a0..35f811b0cd69 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -145,6 +145,14 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ +/* + * This interface is used by x86 PAT code to identify a pfn mapping that is + * linear over entire vma. This is to optimize PAT code that deals with + * marking the physical region with a particular prot. This is not for generic + * mm use. Note also that this check will not work if the pfn mapping is + * linear for a vma starting at physical address 0. In which case PAT code + * falls back to slow path of reserving physical range page by page. + */ static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) { return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff); -- cgit v1.2.3-71-gd317 From d87fe6607c31944f7572f965c1507ae77026c133 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:27 -0800 Subject: x86: PAT: modify follow_phys to return phys_addr prot and return value Impact: Changes and globalizes an existing static interface. Follow_phys does similar things as follow_pfnmap_pte. Make a minor change to follow_phys so that it can be used in place of follow_pfnmap_pte. Physical address return value with 0 as error return does not work in follow_phys as the actual physical address 0 mapping may exist in pte. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 2 ++ mm/memory.c | 31 ++++++++++++++----------------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35f811b0cd69..2f6e2f886d4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -804,6 +804,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_phys(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); diff --git a/mm/memory.c b/mm/memory.c index 1e8f0d347c0e..79f28e35d4fc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2981,9 +2981,9 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ #ifdef CONFIG_HAVE_IOREMAP_PROT -static resource_size_t follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot) +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) { pgd_t *pgd; pud_t *pud; @@ -2992,24 +2992,26 @@ static resource_size_t follow_phys(struct vm_area_struct *vma, spinlock_t *ptl; resource_size_t phys_addr = 0; struct mm_struct *mm = vma->vm_mm; + int ret = -EINVAL; - VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; + goto out; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto no_page_table; + goto out; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto no_page_table; + goto out; /* We cannot handle huge page PFN maps. Luckily they don't exist. */ if (pmd_huge(*pmd)) - goto no_page_table; + goto out; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) @@ -3024,13 +3026,13 @@ static resource_size_t follow_phys(struct vm_area_struct *vma, phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ *prot = pgprot_val(pte_pgprot(pte)); + *phys = phys_addr; + ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: - return phys_addr; -no_page_table: - return 0; + return ret; } int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, @@ -3041,12 +3043,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *maddr; int offset = addr & (PAGE_SIZE-1); - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - phys_addr = follow_phys(vma, addr, write, &prot); - - if (!phys_addr) + if (follow_phys(vma, addr, write, &prot, &phys_addr)) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); -- cgit v1.2.3-71-gd317 From 982d789ab76c8a11426852fec2fdf2f412e21c0c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:28 -0800 Subject: x86: PAT: remove follow_pfnmap_pte in favor of follow_phys Impact: Cleanup - removes a new function in favor of a recently modified older one. Replace follow_pfnmap_pte in pat code with follow_phys. follow_phys lso returns protection eliminating the need of pte_pgprot call. Using follow_phys also eliminates the need for pte_pa. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/mm/pat.c | 30 +++++++++++------------------ include/linux/mm.h | 3 --- mm/memory.c | 43 ------------------------------------------ 4 files changed, 11 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 579f8ceee948..2aa792bbd7e0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -230,11 +230,6 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } -static inline u64 pte_pa(pte_t pte) -{ - return pte_val(pte) & PTE_PFN_MASK; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5254bae84f4..541bcc944a5b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -685,8 +685,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int retval = 0; unsigned long i, j; u64 paddr; - pgprot_t prot; - pte_t pte; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -696,26 +695,22 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) if (is_linear_pfn_mapping(vma)) { /* - * reserve the whole chunk starting from vm_pgoff, - * But, we have to get the protection from pte. + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. */ - if (follow_pfnmap_pte(vma, vma_start, &pte)) { + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); - return -1; + return -EINVAL; } - prot = pte_pgprot(pte); - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot); + return reserve_pfn_range(paddr, vma_size, __pgprot(prot)); } /* reserve entire vma page by page, using pfn and prot from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); - prot = pte_pgprot(pte); - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot)); if (retval) goto cleanup_ret; } @@ -724,10 +719,9 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) cleanup_ret: /* Reserve error: Cleanup partial reservation and return error */ for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } @@ -797,6 +791,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, { unsigned long i; u64 paddr; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -821,12 +816,9 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } else { /* free entire vma, page by page, using the pfn from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - pte_t pte; - - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f6e2f886d4b..36f9b3fa5e15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,9 +1239,6 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ -int follow_pfnmap_pte(struct vm_area_struct *vma, - unsigned long address, pte_t *ret_ptep); - typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 79f28e35d4fc..6b29f39a5a3e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1168,49 +1168,6 @@ no_page_table: return page; } -int follow_pfnmap_pte(struct vm_area_struct *vma, unsigned long address, - pte_t *ret_ptep) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - if (!is_pfn_mapping(vma)) - goto err; - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto err; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto err; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto err; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) - goto err_unlock; - - *ret_ptep = pte; - pte_unmap_unlock(ptep, ptl); - return 0; - -err_unlock: - pte_unmap_unlock(ptep, ptl); -err: - return -EINVAL; -} - /* Can we do the FOLL_ANON optimization? */ static inline int use_zero_page(struct vm_area_struct *vma) { -- cgit v1.2.3-71-gd317 From 34801ba9bf0381fcf0e2b08179d2c07f2c6ede74 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:29 -0800 Subject: x86: PAT: move track untrack pfnmap stubs to asm-generic Impact: Cleanup and branch hints only. Move the track and untrack pfn stub routines from memory.c to asm-generic. Also add unlikely to pfnmap related calls in fork and exit path. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 6 ++---- include/asm-generic/pgtable.h | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 6 ------ mm/memory.c | 48 ++---------------------------------------- 4 files changed, 50 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2aa792bbd7e0..875192bf72cb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -339,12 +339,10 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#ifndef __ASSEMBLY__ /* Indicate that x86 has its own track and untrack pfn vma functions */ -#define track_pfn_vma_new track_pfn_vma_new -#define track_pfn_vma_copy track_pfn_vma_copy -#define untrack_pfn_vma untrack_pfn_vma +#define __HAVE_PFNMAP_TRACKING -#ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b84633801fb6..72ebe91005a8 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -293,6 +293,52 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_flush_lazy_cpu_mode() do {} while (0) #endif +#ifndef __HAVE_PFNMAP_TRACKING +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + */ +static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + return 0; +} + +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + */ +static inline int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + return 0; +} + +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +static inline void untrack_pfn_vma(struct vm_area_struct *vma, + unsigned long pfn, unsigned long size) +{ +} +#else +extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_vma_copy(struct vm_area_struct *vma); +extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 36f9b3fa5e15..d3ddd735e375 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -163,12 +163,6 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) return (vma->vm_flags & VM_PFNMAP); } -extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size); -extern int track_pfn_vma_copy(struct vm_area_struct *vma); -extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask diff --git a/mm/memory.c b/mm/memory.c index 6b29f39a5a3e..f01b7eed6e16 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -99,50 +99,6 @@ int randomize_va_space __read_mostly = 2; #endif -#ifndef track_pfn_vma_new -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size) -{ - return 0; -} -#endif - -#ifndef track_pfn_vma_copy -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). - */ -int track_pfn_vma_copy(struct vm_area_struct *vma) -{ - return 0; -} -#endif - -#ifndef untrack_pfn_vma -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * untrack_pfn_vma is called while unmapping a pfnmap for a region. - * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). - */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) -{ -} -#endif - static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -713,7 +669,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (is_pfn_mapping(vma)) { + if (unlikely(is_pfn_mapping(vma))) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -969,7 +925,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; - if (is_pfn_mapping(vma)) + if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); while (start != end) { -- cgit v1.2.3-71-gd317