From b5ff8161e37cef3265e186ecded23324e4dc2973 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:10:49 -0700 Subject: mm: thp: introduce separate TTU flag for thp freezing TTU_MIGRATION is used to convert pte into migration entry until thp split completes. This behavior conflicts with thp migration added later patches, so let's introduce a new TTU flag specifically for freezing. try_to_unmap() is used both for thp split (via freeze_page()) and page migration (via __unmap_and_move()). In freeze_page(), ttu_flag given for head page is like below (assuming anonymous thp): (TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | \ TTU_MIGRATION | TTU_SPLIT_HUGE_PMD) and ttu_flag given for tail pages is: (TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | \ TTU_MIGRATION) __unmap_and_move() calls try_to_unmap() with ttu_flag: (TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS) Now I'm trying to insert a branch for thp migration at the top of try_to_unmap_one() like below static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { ... /* PMD-mapped THP migration entry */ if (!pvmw.pte && (flags & TTU_MIGRATION)) { if (!PageAnon(page)) continue; set_pmd_migration_entry(&pvmw, page); continue; } ... } so try_to_unmap() for tail pages called by thp split can go into thp migration code path (which converts *pmd* into migration entry), while the expectation is to freeze thp (which converts *pte* into migration entry.) I detected this failure as a "bad page state" error in a testcase where split_huge_page() is called from queue_pages_pte_range(). Link: http://lkml.kernel.org/r/20170717193955.20207-4-zi.yan@sent.com Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Acked-by: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 43ef2c30cb0f..f8ca2e74b819 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -93,8 +93,9 @@ enum ttu_flags { TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ + TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ }; #ifdef CONFIG_MMU -- cgit v1.2.3-71-gd317 From 9c670ea37947a82cb6d4df69139f7e46ed71a0ac Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:10:53 -0700 Subject: mm: thp: introduce CONFIG_ARCH_ENABLE_THP_MIGRATION Introduce CONFIG_ARCH_ENABLE_THP_MIGRATION to limit thp migration functionality to x86_64, which should be safer at the first step. Link: http://lkml.kernel.org/r/20170717193955.20207-5-zi.yan@sent.com Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Reviewed-by: Anshuman Khandual Cc: "H. Peter Anvin" Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Kirill A. 
Shutemov Cc: Mel Gorman Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 4 ++++ include/linux/huge_mm.h | 10 ++++++++++ mm/Kconfig | 3 +++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b278a33ccbb..07d9b6c75328 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2343,6 +2343,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on X86_64 && HUGETLB_PAGE && MIGRATION +config ARCH_ENABLE_THP_MIGRATION + def_bool y + depends on X86_64 && TRANSPARENT_HUGEPAGE + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ee696347f928..d8f35a0865dc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -233,6 +233,11 @@ void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) +static inline bool thp_migration_supported(void) +{ + return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -336,6 +341,11 @@ static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, { return NULL; } + +static inline bool thp_migration_supported(void) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 0ded10a22639..9ef8c2ea92ad 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,6 +262,9 @@ config MIGRATION config ARCH_ENABLE_HUGEPAGE_MIGRATION bool +config ARCH_ENABLE_THP_MIGRATION + bool + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -- cgit v1.2.3-71-gd317 From 616b8371539a6c487404c3b8fb04078016dab4ba Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 8 Sep 2017 16:10:57 -0700 Subject: mm: thp: enable thp migration in generic path Add thp migration's core code, including conversions between a PMD entry and a swap entry, setting PMD migration entry, removing PMD migration entry, and waiting on PMD migration entries. This patch makes it possible to support thp migration. If you fail to allocate a destination page as a thp, you just split the source thp as we do now, and then enter the normal page migration. If you succeed to allocate destination thp, you enter thp migration. Subsequent patches actually enable thp migration for each caller of page migration by allowing its get_new_page() callback to allocate thps. [zi.yan@cs.rutgers.edu: fix gcc-4.9.0 -Wmissing-braces warning] Link: http://lkml.kernel.org/r/A0ABA698-7486-46C3-B209-E95A9048B22C@cs.rutgers.edu [akpm@linux-foundation.org: fix x86_64 allnoconfig warning] Signed-off-by: Zi Yan Acked-by: Kirill A. Shutemov Cc: "H. 
Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 2 + include/linux/swapops.h | 73 ++++++++++++++++++++++++++++++++-- mm/huge_memory.c | 84 ++++++++++++++++++++++++++++++++++++--- mm/migrate.c | 32 ++++++++++++++- mm/page_vma_mapped.c | 18 +++++++-- mm/pgtable-generic.c | 3 +- mm/rmap.c | 13 ++++++ 7 files changed, 212 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 46bf71c77a25..972a4698c530 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -210,7 +210,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; } ((type) << (SWP_TYPE_FIRST_BIT)) \ | ((offset) << SWP_OFFSET_FIRST_BIT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c5ff7b217ee6..82089fd88c29 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -103,7 +103,8 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) #ifdef CONFIG_MIGRATION static inline swp_entry_t make_migration_entry(struct page *page, int write) { - BUG_ON(!PageLocked(page)); + BUG_ON(!PageLocked(compound_head(page))); + return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, page_to_pfn(page)); } @@ -126,7 +127,7 @@ static inline struct page *migration_entry_to_page(swp_entry_t entry) * Any use of migration entries may only occur while the * corresponding page is locked */ - BUG_ON(!PageLocked(p)); + BUG_ON(!PageLocked(compound_head(p))); return p; } @@ -148,7 +149,11 @@ static inline int is_migration_entry(swp_entry_t swp) { return 0; } -#define migration_entry_to_page(swp) NULL +static inline struct page *migration_entry_to_page(swp_entry_t entry) +{ + return NULL; +} + static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } @@ -163,6 +168,68 @@ static inline int is_write_migration_entry(swp_entry_t entry) #endif +struct page_vma_mapped_walk; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page); + +extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, + struct page *new); + +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); + +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) +{ + swp_entry_t arch_entry; + + arch_entry = __pmd_to_swp_entry(pmd); + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) +{ + swp_entry_t arch_entry; + + arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); + return __swp_entry_to_pmd(arch_entry); +} + +static inline int is_pmd_migration_entry(pmd_t pmd) +{ + return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); +} +#else +static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page 
*page) +{ + BUILD_BUG(); +} + +static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, + struct page *new) +{ + BUILD_BUG(); +} + +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } + +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) +{ + return swp_entry(0, 0); +} + +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) +{ + return __pmd(0); +} + +static inline int is_pmd_migration_entry(pmd_t pmd) +{ + return 0; +} +#endif + #ifdef CONFIG_MEMORY_FAILURE extern atomic_long_t num_poisoned_pages __read_mostly; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a97833ef0f1..937f007794dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1684,10 +1684,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { - struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page, true); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(!PageHead(page), page); + struct page *page = NULL; + int flush_needed = 1; + + if (pmd_present(orig_pmd)) { + page = pmd_page(orig_pmd); + page_remove_rmap(page, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else if (thp_migration_supported()) { + swp_entry_t entry; + + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); + entry = pmd_to_swp_entry(orig_pmd); + page = pfn_to_page(swp_offset(entry)); + flush_needed = 0; + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + if (PageAnon(page)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); @@ -1696,8 +1710,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } + spin_unlock(ptl); - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + if (flush_needed) + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } @@ -2745,3 +2761,61 @@ static int __init split_huge_pages_debugfs(void) } late_initcall(split_huge_pages_debugfs); #endif + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + pmd_t pmdval; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + mmu_notifier_invalidate_range_start(mm, address, + address + HPAGE_PMD_SIZE); + + flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); + pmdval = *pvmw->pmd; + pmdp_invalidate(vma, address, pvmw->pmd); + if (pmd_dirty(pmdval)) + set_page_dirty(page); + entry = make_migration_entry(page, pmd_write(pmdval)); + pmdval = swp_entry_to_pmd(entry); + set_pmd_at(mm, address, pvmw->pmd, pmdval); + page_remove_rmap(page, true); + put_page(page); + + mmu_notifier_invalidate_range_end(mm, address, + address + HPAGE_PMD_SIZE); +} + +void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + unsigned long mmun_start = address & HPAGE_PMD_MASK; + pmd_t pmde; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + entry = pmd_to_swp_entry(*pvmw->pmd); + get_page(new); + pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, 
vma); + + flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); + page_add_anon_rmap(new, vma, mmun_start, true); + set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + update_mmu_cache_pmd(vma, address, pvmw->pmd); +} +#endif diff --git a/mm/migrate.c b/mm/migrate.c index e84eeb4e4356..bf5366a2176b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -216,6 +216,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, new = page - pvmw.page->index + linear_page_index(vma, pvmw.address); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + remove_migration_pmd(&pvmw, new); + continue; + } +#endif + get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) @@ -330,6 +339,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, __migration_entry_wait(mm, pte, ptl); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl; + struct page *page; + + ptl = pmd_lock(mm, pmd); + if (!is_pmd_migration_entry(*pmd)) + goto unlock; + page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + if (!get_page_unless_zero(page)) + goto unlock; + spin_unlock(ptl); + wait_on_page_locked(page); + put_page(page); + return; +unlock: + spin_unlock(ptl); +} +#endif + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -1088,7 +1118,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) { + if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { lock_page(page); rc = split_huge_page(page); unlock_page(page); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8ec6ba230bb9..3bd3008db4cb 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -138,16 +138,28 @@ restart: if (!pud_present(*pud)) return false; pvmw->pmd = pmd_offset(pud, pvmw->address); - if (pmd_trans_huge(*pvmw->pmd)) { + if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (!pmd_present(*pvmw->pmd)) - return not_found(pvmw); if (likely(pmd_trans_huge(*pvmw->pmd))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); if (pmd_page(*pvmw->pmd) != page) return not_found(pvmw); return true; + } else if (!pmd_present(*pvmw->pmd)) { + if (thp_migration_supported()) { + if (!(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + + if (migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; + } + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + return not_found(pvmw); } else { /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c99d9512a45b..1175f6a24fdb 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); pmd 
= pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/rmap.c b/mm/rmap.c index 5b26af8a7a29..7dc9c02f7106 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1360,6 +1360,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte && (flags & TTU_MIGRATION)) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + + if (!PageAnon(page)) + continue; + + set_pmd_migration_entry(&pvmw, page); + continue; + } +#endif + /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced -- cgit v1.2.3-71-gd317 From 84c3fc4e9c563d8fb91cfdf5948da48fe1af34d3 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 8 Sep 2017 16:11:01 -0700 Subject: mm: thp: check pmd migration entry in common path When THP migration is being used, memory management code needs to handle pmd migration entries properly. This patch uses !pmd_present() or is_swap_pmd() (depending on whether pmd_none() needs separate code or not) to check pmd migration entries at the places where a pmd entry is present. Since pmd-related code uses split_huge_page(), split_huge_pmd(), pmd_trans_huge(), pmd_trans_unstable(), or pmd_none_or_trans_huge_or_clear_bad(), this patch: 1. adds pmd migration entry split code in split_huge_pmd(), 2. takes care of pmd migration entries whenever pmd_trans_huge() is present, 3. makes pmd_none_or_trans_huge_or_clear_bad() pmd migration entry aware. Since split_huge_page() uses split_huge_pmd() and pmd_trans_unstable() is equivalent to pmd_none_or_trans_huge_or_clear_bad(), we do not change them. Until this commit, a pmd entry should be: 1. pointing to a pte page, 2. is_swap_pmd(), 3. pmd_trans_huge(), 4. pmd_devmap(), or 5. pmd_none(). Signed-off-by: Zi Yan Cc: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 32 +++++++++++++-------- include/asm-generic/pgtable.h | 18 +++++++++++- include/linux/huge_mm.h | 14 ++++++++-- mm/gup.c | 22 +++++++++++++-- mm/huge_memory.c | 65 +++++++++++++++++++++++++++++++++++++++---- mm/memcontrol.c | 5 ++++ mm/memory.c | 12 ++++++-- mm/mprotect.c | 4 +-- mm/mremap.c | 2 +- 9 files changed, 147 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a290966f91ec..8eec35af32e4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -608,7 +608,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - smaps_pmd_entry(pmd, addr, walk); + if (pmd_present(*pmd)) + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; } @@ -1012,6 +1013,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } + if (!pmd_present(*pmd)) + goto out; + page = pmd_page(*pmd); /* Clear accessed and referenced bits. 
*/ @@ -1293,27 +1297,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; + struct page *page = NULL; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; - /* - * Currently pmd for thp is always present because thp - * can not be swapped-out, migrated, or HWPOISONed - * (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ if (pmd_present(pmd)) { - struct page *page = pmd_page(pmd); - - if (page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + page = pmd_page(pmd); flags |= PM_PRESENT; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + else if (is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags |= PM_SWAP; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +#endif + + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 4d7bb98f4134..4f93a6d10a47 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -846,7 +846,23 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) #ifdef CONFIG_TRANSPARENT_HUGEPAGE barrier(); #endif - if (pmd_none(pmdval) || pmd_trans_huge(pmdval)) + /* + * !pmd_present() checks for pmd migration entries + * + * The complete check uses is_pmd_migration_entry() in linux/swapops.h + * But using that requires moving current function and pmd_trans_unstable() + * to linux/swapops.h to resovle dependency, which is too much code move. + * + * !pmd_present() is equivalent to is_pmd_migration_entry() currently, + * because !pmd_present() pages can only be under migration not swapped + * out. + * + * pmd_none() is preseved for future condition checks on pmd migration + * entries and not confusing with this function name, although it is + * redundant with !pmd_present(). 
+ */ + if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || + (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) return 1; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d8f35a0865dc..14bc21c2ee7f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -147,7 +147,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (pmd_trans_huge(*____pmd) \ + if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ @@ -178,12 +178,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); extern spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); + +static inline int is_swap_pmd(pmd_t pmd) +{ + return !pmd_none(pmd) && !pmd_present(pmd); +} + /* mmap_sem must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; @@ -299,6 +305,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } +static inline int is_swap_pmd(pmd_t pmd) +{ + return 0; +} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { diff --git a/mm/gup.c b/mm/gup.c index 33d651deeae2..76fd199aaae2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(*pmd)); + if (is_pmd_migration_entry(*pmd)) + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); +retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -1534,7 +1552,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 937f007794dd..b82585eabe85 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -928,6 +928,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; 
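	/*
	 * Editor's illustrative note, not part of the patch: the hunk that
	 * follows handles the case where the source pmd is already a pmd
	 * migration entry at fork time.  The entry is copied to the child,
	 * and a writable migration entry is downgraded to a read-only one
	 * in both parent and child, so that neither mapping can become
	 * writable once migration completes and copy-on-write semantics
	 * are preserved -- the pmd-level analogue of the pte migration-entry
	 * handling done for ordinary ptes during fork.
	 */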
pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; @@ -1599,6 +1616,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) goto out; + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto out; + } + page = pmd_page(orig_pmd); /* * If other processes are mapping this page, we couldn't discard @@ -1810,6 +1833,25 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, preserve_write = prot_numa && pmd_write(*pmd); ret = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (is_swap_pmd(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + if (is_write_migration_entry(entry)) { + pmd_t newpmd; + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpmd = swp_entry_to_pmd(entry); + set_pmd_at(mm, addr, pmd, newpmd); + } + goto unlock; + } +#endif + /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1875,7 +1917,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -1993,14 +2036,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty, soft_dirty; + bool young, write, dirty, soft_dirty, pmd_migration = false; unsigned long addr; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2025,7 +2069,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - page = pmd_page(*pmd); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + pmd_migration = is_pmd_migration_entry(*pmd); + if (pmd_migration) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + } else +#endif + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); @@ -2044,7 +2097,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * transferred to avoid any possibility of altering * permissions across VMAs. 
*/ - if (freeze) { + if (freeze || pmd_migration) { swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); @@ -2143,7 +2196,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); - } else if (!pmd_devmap(*pmd)) + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6532b219b222..f1f3f5b41155 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4664,6 +4664,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) diff --git a/mm/memory.c b/mm/memory.c index 13ee83b43878..886033b95fd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1065,7 +1065,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1326,7 +1327,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -3911,6 +3912,13 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_swap_pmd(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + if (is_pmd_migration_entry(orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); diff --git a/mm/mprotect.c b/mm/mprotect.c index bd0f409922cb..a1bfe9545770 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -149,7 +149,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) continue; @@ -159,7 +159,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); } else { diff --git a/mm/mremap.c b/mm/mremap.c index 7395564daa6c..cfec004c4ff9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -223,7 +223,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (pmd_trans_huge(*old_pmd)) { + if 
(is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ -- cgit v1.2.3-71-gd317 From ab6e3d0939bb332d72444a532f0f72e0dfde7b7b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:11:04 -0700 Subject: mm: soft-dirty: keep soft-dirty bits over thp migration Soft dirty bit is designed to keep tracked over page migration. This patch makes it work in the same manner for thp migration too. Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++++ fs/proc/task_mmu.c | 27 ++++++++++++++++----------- include/asm-generic/pgtable.h | 34 +++++++++++++++++++++++++++++++++- include/linux/swapops.h | 2 ++ mm/huge_memory.c | 27 ++++++++++++++++++++++++--- 5 files changed, 92 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index bbeae4a2bd01..5b4c44d419c5 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1172,6 +1172,23 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pmd_swp_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY); +} +#endif #endif #define PKRU_AD_BIT 0x1 diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8eec35af32e4..4b21c4e51ce4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -978,17 +978,22 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, { pmd_t pmd = *pmdp; - /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) - pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) - pmd = pmd_mkyoung(pmd); - - pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_soft_dirty(pmd); - - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_soft_dirty(pmd); + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 4f93a6d10a47..8e0243036564 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -630,7 +630,24 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_start_context_switch(prev) do {} while (0) #endif -#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) +{ + return pmd; +} + +static inline int pmd_swp_soft_dirty(pmd_t pmd) +{ + 
return 0; +} + +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pmd; +} +#endif +#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; @@ -675,6 +692,21 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } + +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) +{ + return pmd; +} + +static inline int pmd_swp_soft_dirty(pmd_t pmd) +{ + return 0; +} + +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pmd; +} #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 82089fd88c29..45b092aa6419 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -183,6 +183,8 @@ static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { swp_entry_t arch_entry; + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); arch_entry = __pmd_to_swp_entry(pmd); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b82585eabe85..269b5df58543 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -937,6 +937,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_write_migration_entry(entry)) { make_migration_entry_read(&entry); pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } set_pmd_at(dst_mm, addr, dst_pmd, pmd); @@ -1756,6 +1758,17 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, } #endif +static pmd_t move_soft_dirty_pmd(pmd_t pmd) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (unlikely(is_pmd_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); +#endif + return pmd; +} + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1798,7 +1811,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + pmd = move_soft_dirty_pmd(pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd); if (new_ptl != old_ptl) spin_unlock(new_ptl); if (force_flush) @@ -1846,6 +1860,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ make_migration_entry_read(&entry); newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); set_pmd_at(mm, addr, pmd, newpmd); } goto unlock; @@ -2824,6 +2840,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, unsigned long address = pvmw->address; pmd_t pmdval; swp_entry_t entry; + pmd_t pmdswp; if (!(pvmw->pmd && !pvmw->pte)) return; @@ -2837,8 +2854,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_dirty(pmdval)) set_page_dirty(page); entry = make_migration_entry(page, pmd_write(pmdval)); - pmdval = swp_entry_to_pmd(entry); - set_pmd_at(mm, address, pvmw->pmd, pmdval); + pmdswp = swp_entry_to_pmd(entry); + if (pmd_soft_dirty(pmdval)) + pmdswp = pmd_swp_mksoft_dirty(pmdswp); + set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, true); put_page(page); @@ -2861,6 +2880,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); get_page(new); pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + 
if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_mksoft_dirty(pmde); if (is_write_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); -- cgit v1.2.3-71-gd317 From 8135d8926c08e553e39b0b040c6d01f0daef0676 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:11:15 -0700 Subject: mm: memory_hotplug: memory hotremove supports thp migration This patch enables thp migration for memory hotremove. Link: http://lkml.kernel.org/r/20170717193955.20207-11-zi.yan@sent.com Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 15 ++++++++++++++- mm/memory_hotplug.c | 4 +++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3e0d405dc842..ce15989521a1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -35,15 +35,28 @@ static inline struct page *new_page_nodemask(struct page *page, int preferred_nid, nodemask_t *nodemask) { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + unsigned int order = 0; + struct page *new_page = NULL; if (PageHuge(page)) return alloc_huge_page_nodemask(page_hstate(compound_head(page)), preferred_nid, nodemask); + if (thp_migration_supported() && PageTransHuge(page)) { + order = HPAGE_PMD_ORDER; + gfp_mask |= GFP_TRANSHUGE; + } + if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; - return __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask); + new_page = __alloc_pages_nodemask(gfp_mask, order, + preferred_nid, nodemask); + + if (new_page && PageTransHuge(page)) + prep_transhuge_page(new_page); + + return new_page; } #ifdef CONFIG_MIGRATION diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 73bf17df6899..1f92fb84770d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1380,7 +1380,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } + } else if (thp_migration_supported() && PageTransHuge(page)) + pfn = page_to_pfn(compound_head(page)) + + hpage_nr_pages(page) - 1; if (!get_page_unless_zero(page)) continue; -- cgit v1.2.3-71-gd317 From 133ff0eac95b7dc6edf89dc51bd139a0630bbae7 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:23 -0700 Subject: mm/hmm: heterogeneous memory management (HMM for short) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM provides 3 separate types of functionality: - Mirroring: synchronize CPU page table and device page table - Device memory: allocating struct page for device memory - Migration: migrating regular memory to device memory This patch introduces some common helpers and definitions to all of those 3 functionality. Link: http://lkml.kernel.org/r/20170817000548.32038-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 152 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 6 ++ kernel/fork.c | 3 + mm/Kconfig | 13 ++++ mm/Makefile | 2 +- mm/hmm.c | 74 +++++++++++++++++++++++ 6 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 include/linux/hmm.h create mode 100644 mm/hmm.c (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h new file mode 100644 index 000000000000..5d83fec6dfdd --- /dev/null +++ b/include/linux/hmm.h @@ -0,0 +1,152 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse + */ +/* + * Heterogeneous Memory Management (HMM) + * + * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it + * is for. Here we focus on the HMM API description, with some explanation of + * the underlying implementation. + * + * Short description: HMM provides a set of helpers to share a virtual address + * space between CPU and a device, so that the device can access any valid + * address of the process (while still obeying memory protection). HMM also + * provides helpers to migrate process memory to device memory, and back. Each + * set of functionality (address space mirroring, and migration to and from + * device memory) can be used independently of the other. + * + * + * HMM address space mirroring API: + * + * Use HMM address space mirroring if you want to mirror range of the CPU page + * table of a process into a device page table. Here, "mirror" means "keep + * synchronized". Prerequisites: the device must provide the ability to write- + * protect its page tables (at PAGE_SIZE granularity), and must be able to + * recover from the resulting potential page faults. + * + * HMM guarantees that at any point in time, a given virtual address points to + * either the same memory in both CPU and device page tables (that is: CPU and + * device page tables each point to the same pages), or that one page table (CPU + * or device) points to no entry, while the other still points to the old page + * for the address. The latter case happens when the CPU page table update + * happens first, and then the update is mirrored over to the device page table. + * This does not cause any issue, because the CPU page table cannot start + * pointing to a new page until the device page table is invalidated. + * + * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any + * updates to each device driver that has registered a mirror. It also provides + * some API calls to help with taking a snapshot of the CPU page table, and to + * synchronize with any updates that might happen concurrently. + * + * + * HMM migration to and from device memory: + * + * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with + * a new MEMORY_DEVICE_PRIVATE type. 
This provides a struct page for each page + * of the device memory, and allows the device driver to manage its memory + * using those struct pages. Having struct pages for device memory makes + * migration easier. Because that memory is not addressable by the CPU it must + * never be pinned to the device; in other words, any CPU page fault can always + * cause the device memory to be migrated (copied/moved) back to regular memory. + * + * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that + * allows use of a device DMA engine to perform the copy operation between + * regular system memory and device memory. + */ +#ifndef LINUX_HMM_H +#define LINUX_HMM_H + +#include + +#if IS_ENABLED(CONFIG_HMM) + + +/* + * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page + * + * Flags: + * HMM_PFN_VALID: pfn is valid + * HMM_PFN_WRITE: CPU page table has write permission set + */ +typedef unsigned long hmm_pfn_t; + +#define HMM_PFN_VALID (1 << 0) +#define HMM_PFN_WRITE (1 << 1) +#define HMM_PFN_SHIFT 2 + +/* + * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t + * @pfn: hmm_pfn_t to convert to struct page + * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise + * + * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page + * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL. + */ +static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) +{ + if (!(pfn & HMM_PFN_VALID)) + return NULL; + return pfn_to_page(pfn >> HMM_PFN_SHIFT); +} + +/* + * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t + * @pfn: hmm_pfn_t to extract pfn from + * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise + */ +static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) +{ + if (!(pfn & HMM_PFN_VALID)) + return -1UL; + return (pfn >> HMM_PFN_SHIFT); +} + +/* + * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page + * @page: struct page pointer for which to create the hmm_pfn_t + * Returns: valid hmm_pfn_t for the page + */ +static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page) +{ + return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; +} + +/* + * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn + * @pfn: pfn value for which to create the hmm_pfn_t + * Returns: valid hmm_pfn_t for the pfn + */ +static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) +{ + return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; +} + + +/* Below are for HMM internal use only! Not to be used by device driver! */ +void hmm_mm_destroy(struct mm_struct *mm); + +static inline void hmm_mm_init(struct mm_struct *mm) +{ + mm->hmm = NULL; +} + +#else /* IS_ENABLED(CONFIG_HMM) */ + +/* Below are for HMM internal use only! Not to be used by device driver! 
*/ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} + +#endif /* IS_ENABLED(CONFIG_HMM) */ +#endif /* LINUX_HMM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f45ad815b7d7..46f4ecf5479a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ struct address_space; struct mem_cgroup; +struct hmm; /* * Each physical page in the system has a struct page associated with @@ -503,6 +504,11 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; + +#if IS_ENABLED(CONFIG_HMM) + /* HMM needs to track a few things per mm */ + struct hmm *hmm; +#endif } __randomize_layout; extern struct mm_struct init_mm; diff --git a/kernel/fork.c b/kernel/fork.c index 24a4c0be80d5..2ccbbbfcb7b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/mm/Kconfig b/mm/Kconfig index 9ef8c2ea92ad..037fa26d16a2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -692,6 +692,19 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM + bool + default y + depends on (X86_64 || PPC64) + depends on ZONE_DEVICE + depends on MMU && 64BIT + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + +config HMM + bool + config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index 411bd24d4a7c..1cde2a8bed97 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o hmm.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000000..de032ff9e576 --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,74 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. + */ +#include +#include +#include +#include + + +#ifdef CONFIG_HMM +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + */ +struct hmm { + struct mm_struct *mm; +}; + +/* + * hmm_register - register HMM against an mm (HMM internal) + * + * @mm: mm struct to attach to + * + * This is not intended to be used directly by device drivers. 
It allocates an + * HMM struct if mm does not have one, and initializes it. + */ +static struct hmm *hmm_register(struct mm_struct *mm) +{ + if (!mm->hmm) { + struct hmm *hmm = NULL; + + hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + if (!hmm) + return NULL; + hmm->mm = mm; + + spin_lock(&mm->page_table_lock); + if (!mm->hmm) + mm->hmm = hmm; + else + kfree(hmm); + spin_unlock(&mm->page_table_lock); + } + + /* + * The hmm struct can only be freed once the mm_struct goes away, + * hence we should always have pre-allocated an new hmm struct + * above. + */ + return mm->hmm; +} + +void hmm_mm_destroy(struct mm_struct *mm) +{ + kfree(mm->hmm); +} +#endif /* CONFIG_HMM */ -- cgit v1.2.3-71-gd317 From c0b124054f9e42eb6da545a10fe9122a7d7c3f72 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:27 -0700 Subject: mm/hmm/mirror: mirror process address space on device with HMM helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a heterogeneous memory management (HMM) process address space mirroring. In a nutshell this provide an API to mirror process address space on a device. This boils down to keeping CPU and device page table synchronize (we assume that both device and CPU are cache coherent like PCIe device can be). This patch provide a simple API for device driver to achieve address space mirroring thus avoiding each device driver to grow its own CPU page table walker and its own CPU page table synchronization mechanism. This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more hardware in the future. [jglisse@redhat.com: fix hmm for "mmu_notifier kill invalidate_page callback"] Link: http://lkml.kernel.org/r/20170830231955.GD9445@redhat.com Link: http://lkml.kernel.org/r/20170817000548.32038-4-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 110 +++++++++++++++++++++++++++++++++++++ mm/Kconfig | 12 +++++ mm/hmm.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 260 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 5d83fec6dfdd..76310726ee9c 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -72,6 +72,7 @@ #if IS_ENABLED(CONFIG_HMM) +struct hmm; /* * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page @@ -134,6 +135,115 @@ static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) } +#if IS_ENABLED(CONFIG_HMM_MIRROR) +/* + * Mirroring: how to synchronize device page table with CPU page table. + * + * A device driver that is participating in HMM mirroring must always + * synchronize with CPU page table updates. For this, device drivers can either + * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device + * drivers can decide to register one mirror per device per process, or just + * one mirror per process for a group of devices. The pattern is: + * + * int device_bind_address_space(..., struct mm_struct *mm, ...) 
+ * { + * struct device_address_space *das; + * + * // Device driver specific initialization, and allocation of das + * // which contains an hmm_mirror struct as one of its fields. + * ... + * + * ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops); + * if (ret) { + * // Cleanup on error + * return ret; + * } + * + * // Other device driver specific initialization + * ... + * } + * + * Once an hmm_mirror is registered for an address space, the device driver + * will get callbacks through sync_cpu_device_pagetables() operation (see + * hmm_mirror_ops struct). + * + * Device driver must not free the struct containing the hmm_mirror struct + * before calling hmm_mirror_unregister(). The expected usage is to do that when + * the device driver is unbinding from an address space. + * + * + * void device_unbind_address_space(struct device_address_space *das) + * { + * // Device driver specific cleanup + * ... + * + * hmm_mirror_unregister(&das->mirror); + * + * // Other device driver specific cleanup, and now das can be freed + * ... + * } + */ + +struct hmm_mirror; + +/* + * enum hmm_update_type - type of update + * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) + */ +enum hmm_update_type { + HMM_UPDATE_INVALIDATE, +}; + +/* + * struct hmm_mirror_ops - HMM mirror device operations callback + * + * @update: callback to update range on a device + */ +struct hmm_mirror_ops { + /* sync_cpu_device_pagetables() - synchronize page tables + * + * @mirror: pointer to struct hmm_mirror + * @update_type: type of update that occurred to the CPU page table + * @start: virtual start address of the range to update + * @end: virtual end address of the range to update + * + * This callback ultimately originates from mmu_notifiers when the CPU + * page table is updated. The device driver must update its page table + * in response to this callback. The update argument tells what action + * to perform. + * + * The device driver must not return from this callback until the device + * page tables are completely updated (TLBs flushed, etc); this is a + * synchronous call. + */ + void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, + enum hmm_update_type update_type, + unsigned long start, + unsigned long end); +}; + +/* + * struct hmm_mirror - mirror struct for a device driver + * + * @hmm: pointer to struct hmm (which is unique per mm_struct) + * @ops: device driver callback for HMM mirror operations + * @list: for list of mirrors of a given mm + * + * Each address space (mm_struct) being mirrored by a device must register one + * instance of an hmm_mirror struct with HMM. HMM will track the list of all + * mirrors for each mm_struct. + */ +struct hmm_mirror { + struct hmm *hmm; + const struct hmm_mirror_ops *ops; + struct list_head list; +}; + +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); +void hmm_mirror_unregister(struct hmm_mirror *mirror); +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ + + /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/Kconfig b/mm/Kconfig index 037fa26d16a2..254db99f263d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,6 +705,18 @@ config ARCH_HAS_HMM config HMM bool +config HMM_MIRROR + bool "HMM mirror CPU page table into a device page table" + depends on ARCH_HAS_HMM + select MMU_NOTIFIER + select HMM + help + Select HMM_MIRROR if you want to mirror range of the CPU page table of a + process into a device page table. 
Here, mirror means "keep synchronized". + Prerequisites: the device must provide the ability to write-protect its + page tables (at PAGE_SIZE granularity), and must be able to recover from + the resulting potential page faults. + config FRAME_VECTOR bool diff --git a/mm/hmm.c b/mm/hmm.c index de032ff9e576..d37daf9edcd3 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -21,16 +21,27 @@ #include #include #include +#include #ifdef CONFIG_HMM +static const struct mmu_notifier_ops hmm_mmu_notifier_ops; + /* * struct hmm - HMM per mm struct * * @mm: mm struct this HMM struct is bound to + * @sequence: we track updates to the CPU page table with a sequence number + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list */ struct hmm { struct mm_struct *mm; + atomic_t sequence; + struct list_head mirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; }; /* @@ -43,27 +54,48 @@ struct hmm { */ static struct hmm *hmm_register(struct mm_struct *mm) { - if (!mm->hmm) { - struct hmm *hmm = NULL; - - hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); - if (!hmm) - return NULL; - hmm->mm = mm; - - spin_lock(&mm->page_table_lock); - if (!mm->hmm) - mm->hmm = hmm; - else - kfree(hmm); - spin_unlock(&mm->page_table_lock); - } + struct hmm *hmm = READ_ONCE(mm->hmm); + bool cleanup = false; /* * The hmm struct can only be freed once the mm_struct goes away, * hence we should always have pre-allocated an new hmm struct * above. */ + if (hmm) + return hmm; + + hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + if (!hmm) + return NULL; + INIT_LIST_HEAD(&hmm->mirrors); + init_rwsem(&hmm->mirrors_sem); + atomic_set(&hmm->sequence, 0); + hmm->mmu_notifier.ops = NULL; + hmm->mm = mm; + + /* + * We should only get here if hold the mmap_sem in write mode ie on + * registration of first mirror through hmm_mirror_register() + */ + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } + + spin_lock(&mm->page_table_lock); + if (!mm->hmm) + mm->hmm = hmm; + else + cleanup = true; + spin_unlock(&mm->page_table_lock); + + if (cleanup) { + mmu_notifier_unregister(&hmm->mmu_notifier, mm); + kfree(hmm); + } + return mm->hmm; } @@ -72,3 +104,94 @@ void hmm_mm_destroy(struct mm_struct *mm) kfree(mm->hmm); } #endif /* CONFIG_HMM */ + +#if IS_ENABLED(CONFIG_HMM_MIRROR) +static void hmm_invalidate_range(struct hmm *hmm, + enum hmm_update_type action, + unsigned long start, + unsigned long end) +{ + struct hmm_mirror *mirror; + + down_read(&hmm->mirrors_sem); + list_for_each_entry(mirror, &hmm->mirrors, list) + mirror->ops->sync_cpu_device_pagetables(mirror, action, + start, end); + up_read(&hmm->mirrors_sem); +} + +static void hmm_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hmm *hmm = mm->hmm; + + VM_BUG_ON(!hmm); + + atomic_inc(&hmm->sequence); +} + +static void hmm_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hmm *hmm = mm->hmm; + + VM_BUG_ON(!hmm); + + hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); +} + +static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .invalidate_range_start = hmm_invalidate_range_start, + .invalidate_range_end = hmm_invalidate_range_end, +}; + +/* + * hmm_mirror_register() - register a mirror against an mm + * + * 
@mirror: new mirror struct to register + * @mm: mm to register against + * + * To start mirroring a process address space, the device driver must register + * an HMM mirror struct. + * + * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! + */ +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) +{ + /* Sanity check */ + if (!mm || !mirror || !mirror->ops) + return -EINVAL; + + mirror->hmm = hmm_register(mm); + if (!mirror->hmm) + return -ENOMEM; + + down_write(&mirror->hmm->mirrors_sem); + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + + return 0; +} +EXPORT_SYMBOL(hmm_mirror_register); + +/* + * hmm_mirror_unregister() - unregister a mirror + * + * @mirror: new mirror struct to register + * + * Stop mirroring a process address space, and cleanup. + */ +void hmm_mirror_unregister(struct hmm_mirror *mirror) +{ + struct hmm *hmm = mirror->hmm; + + down_write(&hmm->mirrors_sem); + list_del(&mirror->list); + up_write(&hmm->mirrors_sem); +} +EXPORT_SYMBOL(hmm_mirror_unregister); +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3-71-gd317 From da4c3c735ea4dcc2a0b0ff0bd4803c336361b6f5 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:31 -0700 Subject: mm/hmm/mirror: helper to snapshot CPU page table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This does not use existing page table walker because we want to share same code for our page fault handler. Link: http://lkml.kernel.org/r/20170817000548.32038-5-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 55 +++++++++- mm/hmm.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 338 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 76310726ee9c..62899c9829c9 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -79,13 +79,26 @@ struct hmm; * * Flags: * HMM_PFN_VALID: pfn is valid + * HMM_PFN_READ: CPU page table has read permission set * HMM_PFN_WRITE: CPU page table has write permission set + * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory + * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none() + * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the + * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not + * be mirrored by a device, because the entry will never have HMM_PFN_VALID + * set and the pfn value is undefined. 
+ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE) */ typedef unsigned long hmm_pfn_t; #define HMM_PFN_VALID (1 << 0) -#define HMM_PFN_WRITE (1 << 1) -#define HMM_PFN_SHIFT 2 +#define HMM_PFN_READ (1 << 1) +#define HMM_PFN_WRITE (1 << 2) +#define HMM_PFN_ERROR (1 << 3) +#define HMM_PFN_EMPTY (1 << 4) +#define HMM_PFN_SPECIAL (1 << 5) +#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6) +#define HMM_PFN_SHIFT 7 /* * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t @@ -241,6 +254,44 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); + + +/* + * struct hmm_range - track invalidation lock on virtual address range + * + * @list: all range lock are on a list + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @pfns: array of pfns (big enough for the range) + * @valid: pfns array did not change since it has been fill by an HMM function + */ +struct hmm_range { + struct list_head list; + unsigned long start; + unsigned long end; + hmm_pfn_t *pfns; + bool valid; +}; + +/* + * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device + * driver lock that serializes device page table updates, then call + * hmm_vma_range_done(), to check if the snapshot is still valid. The same + * device driver page table update lock must also be used in the + * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page + * table invalidation serializes on it. + * + * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL + * hmm_vma_get_pfns() WITHOUT ERROR ! + * + * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! 
+ */ +int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns); +bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/hmm.c b/mm/hmm.c index d37daf9edcd3..172984848d51 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -19,8 +19,12 @@ */ #include #include +#include +#include #include #include +#include +#include #include @@ -31,14 +35,18 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * struct hmm - HMM per mm struct * * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list * @sequence: we track updates to the CPU page table with a sequence number + * @ranges: list of range being snapshotted * @mirrors: list of mirrors for this mm * @mmu_notifier: mmu notifier to track updates to CPU page table * @mirrors_sem: read/write semaphore protecting the mirrors list */ struct hmm { struct mm_struct *mm; + spinlock_t lock; atomic_t sequence; + struct list_head ranges; struct list_head mirrors; struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; @@ -72,6 +80,8 @@ static struct hmm *hmm_register(struct mm_struct *mm) init_rwsem(&hmm->mirrors_sem); atomic_set(&hmm->sequence, 0); hmm->mmu_notifier.ops = NULL; + INIT_LIST_HEAD(&hmm->ranges); + spin_lock_init(&hmm->lock); hmm->mm = mm; /* @@ -112,6 +122,22 @@ static void hmm_invalidate_range(struct hmm *hmm, unsigned long end) { struct hmm_mirror *mirror; + struct hmm_range *range; + + spin_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { + unsigned long addr, idx, npages; + + if (end < range->start || start >= range->end) + continue; + + range->valid = false; + addr = max(start, range->start); + idx = (addr - range->start) >> PAGE_SHIFT; + npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); + } + spin_unlock(&hmm->lock); down_read(&hmm->mirrors_sem); list_for_each_entry(mirror, &hmm->mirrors, list) @@ -194,4 +220,263 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) up_write(&hmm->mirrors_sem); } EXPORT_SYMBOL(hmm_mirror_unregister); + +static void hmm_pfns_special(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = HMM_PFN_SPECIAL; +} + +static int hmm_pfns_bad(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = HMM_PFN_ERROR; + + return 0; +} + +static int hmm_vma_walk_hole(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = HMM_PFN_EMPTY; + + return 0; +} + +static int hmm_vma_walk_clear(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = 0; + + return 0; +} + +static int hmm_vma_walk_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + struct vm_area_struct *vma = walk->vma; + 
hmm_pfn_t *pfns = range->pfns; + unsigned long addr = start, i; + hmm_pfn_t flag; + pte_t *ptep; + + i = (addr - range->start) >> PAGE_SHIFT; + flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; + +again: + if (pmd_none(*pmdp)) + return hmm_vma_walk_hole(start, end, walk); + + if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { + unsigned long pfn; + pmd_t pmd; + + /* + * No need to take pmd_lock here, even if some other threads + * is splitting the huge pmd we will get that event through + * mmu_notifier callback. + * + * So just read pmd value and check again its a transparent + * huge or device mapping one and compute corresponding pfn + * values. + */ + pmd = pmd_read_atomic(pmdp); + barrier(); + if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + goto again; + if (pmd_protnone(pmd)) + return hmm_vma_walk_clear(start, end, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; + return 0; + } + + if (pmd_bad(*pmdp)) + return hmm_pfns_bad(start, end, walk); + + ptep = pte_offset_map(pmdp, addr); + for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + pte_t pte = *ptep; + + pfns[i] = 0; + + if (pte_none(pte) || !pte_present(pte)) { + pfns[i] = HMM_PFN_EMPTY; + continue; + } + + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + } + pte_unmap(ptep - 1); + + return 0; +} + +/* + * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses + * @vma: virtual memory area containing the virtual address range + * @range: used to track snapshot validity + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @entries: array of hmm_pfn_t: provided by the caller, filled in by function + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See hmm_vma_range_done() for further + * information. + * + * The range struct is initialized here. It tracks the CPU page table, but only + * if the function returns success (0), in which case the caller must then call + * hmm_vma_range_done() to stop CPU page table update tracking on this range. + * + * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS + * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! + */ +int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns) +{ + struct mm_walk mm_walk; + struct hmm *hmm; + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return -EINVAL; + } + + /* Sanity check, this really should not happen ! */ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) + return -ENOMEM; + /* Caller must have registered a mirror, via hmm_mirror_register() ! 
*/ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.private = range; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + walk_page_range(start, end, &mm_walk); + + return 0; +} +EXPORT_SYMBOL(hmm_vma_get_pfns); + +/* + * hmm_vma_range_done() - stop tracking change to CPU page table over a range + * @vma: virtual memory area containing the virtual address range + * @range: range being tracked + * Returns: false if range data has been invalidated, true otherwise + * + * Range struct is used to track updates to the CPU page table after a call to + * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done + * using the data, or wants to lock updates to the data it got from those + * functions, it must call the hmm_vma_range_done() function, which will then + * stop tracking CPU page table updates. + * + * Note that device driver must still implement general CPU page table update + * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using + * the mmu_notifier API directly. + * + * CPU page table update tracking done through hmm_range is only temporary and + * to be used while trying to duplicate CPU page table contents for a range of + * virtual addresses. + * + * There are two ways to use this : + * again: + * hmm_vma_get_pfns(vma, range, start, end, pfns); + * trans = device_build_page_table_update_transaction(pfns); + * device_page_table_lock(); + * if (!hmm_vma_range_done(vma, range)) { + * device_page_table_unlock(); + * goto again; + * } + * device_commit_transaction(trans); + * device_page_table_unlock(); + * + * Or: + * hmm_vma_get_pfns(vma, range, start, end, pfns); + * device_page_table_lock(); + * hmm_vma_range_done(vma, range); + * device_update_page_table(pfns); + * device_page_table_unlock(); + */ +bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +{ + unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; + struct hmm *hmm; + + if (range->end <= range->start) { + BUG(); + return false; + } + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + memset(range->pfns, 0, sizeof(*range->pfns) * npages); + return false; + } + + spin_lock(&hmm->lock); + list_del_rcu(&range->list); + spin_unlock(&hmm->lock); + + return range->valid; +} +EXPORT_SYMBOL(hmm_vma_range_done); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3-71-gd317 From 74eee180b935fcb9b83a56dd7648fb75caf38f0e Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:35 -0700 Subject: mm/hmm/mirror: device page fault handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles page fault on behalf of device driver, unlike handle_mm_fault() it does not trigger migration back to system memory for device memory. 
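As a reference, here is a minimal sketch of the driver-side retry loop this helper is designed for, following the usage pattern documented in mm/hmm.c by this patch. It assumes an hmm_mirror has already been registered via hmm_mirror_register(); struct dev_mirror, devpt_lock and dev_commit_pfns() are hypothetical driver-side names used purely for illustration, and the only real requirement they stand for is that one lock serializes both the device page table update and the sync_cpu_device_pagetables() callback.

/* Hypothetical driver-side state; not part of this patch. */
struct dev_mirror {
	struct hmm_mirror mirror;	/* registered with hmm_mirror_register() */
	struct mutex devpt_lock;	/* serializes device page table updates */
};

static int dev_fault_range(struct dev_mirror *dm, struct mm_struct *mm,
			   unsigned long start, unsigned long end, bool write)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct hmm_range range;
	hmm_pfn_t *pfns;
	int ret;

	pfns = kcalloc(npages, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return -ENOMEM;

retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma || start < vma->vm_start || end > vma->vm_end) {
		ret = -EFAULT;
		goto out_unlock;
	}

	/* block = false: hmm_vma_fault() may drop mmap_sem and return -EAGAIN */
	ret = hmm_vma_fault(vma, &range, start, end, pfns, write, false);
	if (ret == -EAGAIN) {
		/* mmap_sem was dropped for us; stop tracking and start over */
		hmm_vma_range_done(vma, &range);
		goto retry;
	}
	if (ret)
		goto out_unlock;

	/* Same lock used by the sync_cpu_device_pagetables() callback */
	mutex_lock(&dm->devpt_lock);
	if (!hmm_vma_range_done(vma, &range)) {
		/* CPU page table changed while faulting: pfns are stale */
		mutex_unlock(&dm->devpt_lock);
		up_read(&mm->mmap_sem);
		goto retry;
	}
	dev_commit_pfns(dm, start, end, pfns);	/* hypothetical device PT update */
	mutex_unlock(&dm->devpt_lock);

out_unlock:
	up_read(&mm->mmap_sem);
	kfree(pfns);
	return ret;
}

The point of the sketch is that hmm_vma_range_done() is checked under the device page table lock, so a concurrent CPU page table invalidation either completes before the check or marks the snapshot invalid and forces a retry, exactly as described in the hmm_vma_range_done() documentation.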
Link: http://lkml.kernel.org/r/20170817000548.32038-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 27 ++++++ mm/hmm.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 271 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 62899c9829c9..61a6535fe438 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -292,6 +292,33 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, unsigned long end, hmm_pfn_t *pfns); bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); + + +/* + * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will + * not migrate any device memory back to system memory. The hmm_pfn_t array will + * be updated with the fault result and current snapshot of the CPU page table + * for the range. + * + * The mmap_sem must be taken in read mode before entering and it might be + * dropped by the function if the block argument is false. In that case, the + * function returns -EAGAIN. + * + * Return value does not reflect if the fault was successful for every single + * address or not. Therefore, the caller must to inspect the hmm_pfn_t array to + * determine fault status for each address. + * + * Trying to fault inside an invalid vma will result in -EINVAL. + * + * See the function description in mm/hmm.c for further documentation. + */ +int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/hmm.c b/mm/hmm.c index 172984848d51..f6c745b9a25a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -221,6 +221,36 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) } EXPORT_SYMBOL(hmm_mirror_unregister); +struct hmm_vma_walk { + struct hmm_range *range; + unsigned long last; + bool fault; + bool block; + bool write; +}; + +static int hmm_vma_do_fault(struct mm_walk *walk, + unsigned long addr, + hmm_pfn_t *pfn) +{ + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct vm_area_struct *vma = walk->vma; + int r; + + flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; + flags |= hmm_vma_walk->write ? 
FAULT_FLAG_WRITE : 0; + r = handle_mm_fault(vma, addr, flags); + if (r & VM_FAULT_RETRY) + return -EBUSY; + if (r & VM_FAULT_ERROR) { + *pfn = HMM_PFN_ERROR; + return -EFAULT; + } + + return -EAGAIN; +} + static void hmm_pfns_special(hmm_pfn_t *pfns, unsigned long addr, unsigned long end) @@ -244,34 +274,62 @@ static int hmm_pfns_bad(unsigned long addr, return 0; } +static void hmm_pfns_clear(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = 0; +} + static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; hmm_pfn_t *pfns = range->pfns; unsigned long i; + hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = HMM_PFN_EMPTY; + if (hmm_vma_walk->fault) { + int ret; - return 0; + ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + if (ret != -EAGAIN) + return ret; + } + } + + return hmm_vma_walk->fault ? -EAGAIN : 0; } static int hmm_vma_walk_clear(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; hmm_pfn_t *pfns = range->pfns; unsigned long i; + hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = 0; + if (hmm_vma_walk->fault) { + int ret; - return 0; + ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + if (ret != -EAGAIN) + return ret; + } + } + + return hmm_vma_walk->fault ? -EAGAIN : 0; } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -279,15 +337,18 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; hmm_pfn_t *pfns = range->pfns; unsigned long addr = start, i; + bool write_fault; hmm_pfn_t flag; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; + write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: if (pmd_none(*pmdp)) @@ -316,6 +377,9 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); + if (write_fault && !pmd_write(pmd)) + return hmm_vma_walk_clear(start, end, walk); + pfn = pmd_pfn(pmd) + pte_index(addr); flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) @@ -332,13 +396,55 @@ again: pfns[i] = 0; - if (pte_none(pte) || !pte_present(pte)) { + if (pte_none(pte)) { pfns[i] = HMM_PFN_EMPTY; + if (hmm_vma_walk->fault) + goto fault; continue; } + if (!pte_present(pte)) { + swp_entry_t entry; + + if (!non_swap_entry(entry)) { + if (hmm_vma_walk->fault) + goto fault; + continue; + } + + entry = pte_to_swp_entry(pte); + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. 
+ */ + if (is_migration_entry(entry)) { + if (hmm_vma_walk->fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + continue; + } else { + /* Report error for everything else */ + pfns[i] = HMM_PFN_ERROR; + } + continue; + } + + if (write_fault && !pte_write(pte)) + goto fault; + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + continue; + +fault: + pte_unmap(ptep); + /* Fault all pages in range */ + return hmm_vma_walk_clear(start, end, walk); } pte_unmap(ptep - 1); @@ -371,6 +477,7 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, unsigned long end, hmm_pfn_t *pfns) { + struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; @@ -402,9 +509,12 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, list_add_rcu(&range->list, &hmm->ranges); spin_unlock(&hmm->lock); + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + mm_walk.vma = vma; mm_walk.mm = vma->vm_mm; - mm_walk.private = range; mm_walk.pte_entry = NULL; mm_walk.test_walk = NULL; mm_walk.hugetlb_entry = NULL; @@ -412,7 +522,6 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; walk_page_range(start, end, &mm_walk); - return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); @@ -439,7 +548,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); * if (!hmm_vma_range_done(vma, range)) { @@ -450,7 +559,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); * device_page_table_lock(); * hmm_vma_range_done(vma, range); * device_update_page_table(pfns); @@ -479,4 +588,127 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) return range->valid; } EXPORT_SYMBOL(hmm_vma_range_done); + +/* + * hmm_vma_fault() - try to fault some address in a virtual address range + * @vma: virtual memory area containing the virtual address range + * @range: use to track pfns array content validity + * @start: fault range virtual start address (inclusive) + * @end: fault range virtual end address (exclusive) + * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted + * @write: is it a write fault + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * + * This is similar to a regular CPU page fault except that it will not trigger + * any memory migration if the memory being faulted is not accessible by CPUs. + * + * On error, for one virtual address in the range, the function will set the + * hmm_pfn_t error flag for the corresponding pfn entry. 
+ * + * Expected use pattern: + * retry: + * down_read(&mm->mmap_sem); + * // Find vma and address device wants to fault, initialize hmm_pfn_t + * // array accordingly + * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * switch (ret) { + * case -EAGAIN: + * hmm_vma_range_done(vma, range); + * // You might want to rate limit or yield to play nicely, you may + * // also commit any valid pfn in the array assuming that you are + * // getting true from hmm_vma_range_monitor_end() + * goto retry; + * case 0: + * break; + * default: + * // Handle error ! + * up_read(&mm->mmap_sem) + * return; + * } + * // Take device driver lock that serialize device page table update + * driver_lock_device_page_table_update(); + * hmm_vma_range_done(vma, range); + * // Commit pfns we got from hmm_vma_fault() + * driver_unlock_device_page_table_update(); + * up_read(&mm->mmap_sem) + * + * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) + * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! + * + * YOU HAVE BEEN WARNED ! + */ +int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block) +{ + struct hmm_vma_walk hmm_vma_walk; + struct mm_walk mm_walk; + struct hmm *hmm; + int ret; + + /* Sanity check, this really should not happen ! */ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + hmm_pfns_clear(pfns, start, end); + return -ENOMEM; + } + /* Caller must have registered a mirror using hmm_mirror_register() */ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return 0; + } + + hmm_vma_walk.fault = true; + hmm_vma_walk.write = write; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + hmm_vma_walk.last = range->start; + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + } while (ret == -EAGAIN); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); + hmm_vma_range_done(vma, range); + } + return ret; +} +EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3-71-gd317 From 3072e413e305e353cd4654f8a57d953b66e85bf3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 8 Sep 2017 16:11:39 -0700 Subject: mm/memory_hotplug: introduce add_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are new users of memory hotplug emerging. Some of them require different subset of arch_add_memory. There are some which only require allocation of struct pages without mapping those pages to the kernel address space. We currently have __add_pages for that purpose. 
But this is rather lowlevel and not very suitable for the code outside of the memory hotplug. E.g. x86_64 wants to update max_pfn which should be done by the caller. Introduce add_pages() which should care about those details if they are needed. Each architecture should define its implementation and select CONFIG_ARCH_HAS_ADD_PAGES. All others use the currently existing __add_pages. Link: http://lkml.kernel.org/r/20170817000548.32038-7-jglisse@redhat.com Signed-off-by: Michal Hocko Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Aneesh Kumar Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 4 ++++ arch/x86/mm/init_64.c | 22 +++++++++++++++------- include/linux/memory_hotplug.h | 11 +++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07d9b6c75328..a3e6e6136a47 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2323,6 +2323,10 @@ source "kernel/livepatch/Kconfig" endmenu +config ARCH_HAS_ADD_PAGES + def_bool y + depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 136422d7d539..048fbe8fc274 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -761,7 +761,7 @@ void __init paging_init(void) * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need * updating. */ -static void update_end_of_memory_vars(u64 start, u64 size) +static void update_end_of_memory_vars(u64 start, u64 size) { unsigned long end_pfn = PFN_UP(start + size); @@ -772,22 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start, size); + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); return ret; } + +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + init_memory_mapping(start, start + size); + + return add_pages(nid, start_pfn, nr_pages, want_memblock); +} EXPORT_SYMBOL_GPL(arch_add_memory); #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5e6e4cc36ff4..0995e1a2b458 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -133,6 +133,17 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); +#ifndef CONFIG_ARCH_HAS_ADD_PAGES +static inline int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock) +{ + return __add_pages(nid, start_pfn, nr_pages, want_memblock); +} +#else /* 
ARCH_HAS_ADD_PAGES */ +int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock); +#endif /* ARCH_HAS_ADD_PAGES */ + #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); #else -- cgit v1.2.3-71-gd317 From 5042db43cc26f51eed51c56192e2c2317e44315f Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:43 -0700 Subject: mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM (heterogeneous memory management) need struct page to support migration from system main memory to device memory. Reasons for HMM and migration to device memory is explained with HMM core patch. This patch deals with device memory that is un-addressable memory (ie CPU can not access it). Hence we do not want those struct page to be manage like regular memory. That is why we extend ZONE_DEVICE to support different types of memory. A persistent memory type is define for existing user of ZONE_DEVICE and a new device un-addressable type is added for the un-addressable memory type. There is a clear separation between what is expected from each memory type and existing user of ZONE_DEVICE are un-affected by new requirement and new use of the un-addressable type. All specific code path are protect with test against the memory type. Because memory is un-addressable we use a new special swap type for when a page is migrated to device memory (this reduces the number of maximum swap file). The main two additions beside memory type to ZONE_DEVICE is two callbacks. First one, page_free() is call whenever page refcount reach 1 (which means the page is free as ZONE_DEVICE page never reach a refcount of 0). This allow device driver to manage its memory and associated struct page. The second callback page_fault() happens when there is a CPU access to an address that is back by a device page (which are un-addressable by the CPU). This callback is responsible to migrate the page back to system main memory. Device driver can not block migration back to system memory, HMM make sure that such page can not be pin into device memory. If device is in some error condition and can not migrate memory back then a CPU page fault to device memory should end with SIGBUS. [arnd@arndb.de: fix warning] Link: http://lkml.kernel.org/r/20170823133213.712917-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170817000548.32038-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Arnd Bergmann Acked-by: Dan Williams Cc: Ross Zwisler Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 +++++ include/linux/ioport.h | 1 + include/linux/memremap.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 12 ++++++++ include/linux/swap.h | 24 ++++++++++++++-- include/linux/swapops.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ kernel/memremap.c | 34 ++++++++++++++++++++++ mm/Kconfig | 11 +++++++- mm/memory.c | 61 ++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 10 +++++-- mm/mprotect.c | 14 ++++++++++ 11 files changed, 309 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4b21c4e51ce4..90ab657f8e56 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -549,6 +549,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = find_get_entry(vma->vm_file->f_mapping, @@ -713,6 +715,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } if (page) { int mapcount = page_mapcount(page); @@ -1276,6 +1280,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); + + if (is_device_private_entry(entry)) + page = device_private_entry_to_page(entry); } if (page && !PageAnon(page)) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6230064d7f95..3a4f69137bc2 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -130,6 +130,7 @@ enum { IORES_DESC_ACPI_NV_STORAGE = 3, IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, + IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, }; /* helpers to define resources */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 93416196ba64..8e164ec9eed0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,6 +4,8 @@ #include #include +#include + struct resource; struct device; @@ -35,18 +37,89 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) } #endif +/* + * Specialize ZONE_DEVICE memory into multiple types each having differents + * usage. + * + * MEMORY_DEVICE_HOST: + * Persistent device memory (pmem): struct page might be allocated in different + * memory and architecture might want to perform special actions. It is similar + * to regular memory, in that the CPU can access it transparently. However, + * it is likely to have different bandwidth and latency than regular memory. + * See Documentation/nvdimm/nvdimm.txt for more information. + * + * MEMORY_DEVICE_PRIVATE: + * Device memory that is not directly addressable by the CPU: CPU can neither + * read nor write private memory. In this case, we do still have struct pages + * backing the device memory. Doing so simplifies the implementation, but it is + * important to remember that there are certain points at which the struct page + * must be treated as an opaque object, rather than a "normal" struct page. 
+ * + * A more complete discussion of unaddressable memory may be found in + * include/linux/hmm.h and Documentation/vm/hmm.txt. + */ +enum memory_type { + MEMORY_DEVICE_HOST = 0, + MEMORY_DEVICE_PRIVATE, +}; + +/* + * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two + * callbacks: + * page_fault() + * page_free() + * + * Additional notes about MEMORY_DEVICE_PRIVATE may be found in + * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief + * explanation in include/linux/memory_hotplug.h. + * + * The page_fault() callback must migrate page back, from device memory to + * system memory, so that the CPU can access it. This might fail for various + * reasons (device issues, device have been unplugged, ...). When such error + * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and + * set the CPU page table entry to "poisoned". + * + * Note that because memory cgroup charges are transferred to the device memory, + * this should never fail due to memory restrictions. However, allocation + * of a regular system page might still fail because we are out of memory. If + * that happens, the page_fault() callback must return VM_FAULT_OOM. + * + * The page_fault() callback can also try to migrate back multiple pages in one + * chunk, as an optimization. It must, however, prioritize the faulting address + * over all the others. + * + * + * The page_free() callback is called once the page refcount reaches 1 + * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. + * This allows the device driver to implement its own memory management.) + */ +typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp); +typedef void (*dev_page_free_t)(struct page *page, void *data); + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @page_fault: callback when CPU fault on an unaddressable device page + * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping * @dev: host device of the mapping for debug + * @data: private data pointer for page_free() + * @type: memory type: see MEMORY_* in memory_hotplug.h */ struct dev_pagemap { + dev_page_fault_t page_fault; + dev_page_free_t page_free; struct vmem_altmap *altmap; const struct resource *res; struct percpu_ref *ref; struct device *dev; + void *data; + enum memory_type type; }; #ifdef CONFIG_ZONE_DEVICE diff --git a/include/linux/mm.h b/include/linux/mm.h index 39db8e54c5d5..a74c4e954352 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,11 +792,23 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } + +static inline bool is_device_private_page(const struct page *page) +{ + /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */ + return ((page_zonenum(page) == ZONE_DEVICE) && + (page->pgmap->type == MEMORY_DEVICE_PRIVATE)); +} #else static inline bool is_zone_device_page(const struct page *page) { return false; } + +static inline bool is_device_private_page(const struct page *page) +{ + return false; +} #endif static inline void get_page(struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8bf3487fb204..8a807292037f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -50,6 +50,23 @@ 
static inline int current_is_kswapd(void) * actions on faults. */ +/* + * Unaddressable device memory support. See include/linux/hmm.h and + * Documentation/vm/hmm.txt. Short description is we need struct pages for + * device memory that is unaddressable (inaccessible) by CPU, so that we can + * migrate part of a process memory to device memory. + * + * When a page is migrated from CPU to device, we set the CPU page table entry + * to a special SWP_DEVICE_* entry. + */ +#ifdef CONFIG_DEVICE_PRIVATE +#define SWP_DEVICE_NUM 2 +#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) +#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) +#else +#define SWP_DEVICE_NUM 0 +#endif + /* * NUMA node memory migration support */ @@ -72,7 +89,8 @@ static inline int current_is_kswapd(void) #endif #define MAX_SWAPFILES \ - ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) + ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) /* * Magic header for a swap area. The first part of the union is @@ -469,8 +487,8 @@ static inline void show_swap_cache_info(void) { } -#define free_swap_and_cache(swp) is_migration_entry(swp) -#define swapcache_prepare(swp) is_migration_entry(swp) +#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));}) +#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));}) static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 45b092aa6419..291c4b534658 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -100,6 +100,74 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); } +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +static inline swp_entry_t make_device_private_entry(struct page *page, bool write) +{ + return swp_entry(write ? 
SWP_DEVICE_WRITE : SWP_DEVICE_READ, + page_to_pfn(page)); +} + +static inline bool is_device_private_entry(swp_entry_t entry) +{ + int type = swp_type(entry); + return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; +} + +static inline void make_device_private_entry_read(swp_entry_t *entry) +{ + *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry)); +} + +static inline bool is_write_device_private_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); +} + +static inline struct page *device_private_entry_to_page(swp_entry_t entry) +{ + return pfn_to_page(swp_offset(entry)); +} + +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp); +#else /* CONFIG_DEVICE_PRIVATE */ +static inline swp_entry_t make_device_private_entry(struct page *page, bool write) +{ + return swp_entry(0, 0); +} + +static inline void make_device_private_entry_read(swp_entry_t *entry) +{ +} + +static inline bool is_device_private_entry(swp_entry_t entry) +{ + return false; +} + +static inline bool is_write_device_private_entry(swp_entry_t entry) +{ + return false; +} + +static inline struct page *device_private_entry_to_page(swp_entry_t entry) +{ + return NULL; +} + +static inline int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + return VM_FAULT_SIGBUS; +} +#endif /* CONFIG_DEVICE_PRIVATE */ + #ifdef CONFIG_MIGRATION static inline swp_entry_t make_migration_entry(struct page *page, int write) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 066e73c2fcc9..f1d1e0dfe8b4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -219,6 +221,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ pgoff += 1UL << order, order = order_at((res), pgoff)) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + struct page *page = device_private_entry_to_page(entry); + + /* + * The page_fault() callback must migrate page back to system memory + * so that CPU can access it. This might fail for various reasons + * (device issue, device was unsafely unplugged, ...). When such + * error conditions happen, the callback must return VM_FAULT_SIGBUS. + * + * Note that because memory cgroup charges are accounted to the device + * memory, this should never fail because of memory restrictions (but + * allocation of regular system page might still fail because we are + * out of memory). 
+ * + * There is a more in-depth description of what that callback can and + * cannot do, in include/linux/memremap.h + */ + return page->pgmap->page_fault(vma, addr, page, flags, pmdp); +} +EXPORT_SYMBOL(device_private_entry_fault); +#endif /* CONFIG_DEVICE_PRIVATE */ + static void pgmap_radix_release(struct resource *res) { unsigned long pgoff, order; @@ -356,6 +386,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, } pgmap->ref = ref; pgmap->res = &page_map->res; + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_fault = NULL; + pgmap->page_free = NULL; + pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; diff --git a/mm/Kconfig b/mm/Kconfig index 254db99f263d..ec27855db133 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -676,7 +676,7 @@ config ARCH_HAS_ZONE_DEVICE bool config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" + bool "Device memory (pmem, HMM, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP @@ -717,6 +717,15 @@ config HMM_MIRROR page tables (at PAGE_SIZE granularity), and must be able to recover from the resulting potential page faults. +config DEVICE_PRIVATE + bool "Unaddressable device memory (GPU memory, ...)" + depends on ARCH_HAS_HMM + + help + Allows creation of struct pages to represent unaddressable device + memory; i.e., memory that is only accessible from the device (or + group of devices). You likely also want to select HMM_MIRROR. + config FRAME_VECTOR bool diff --git a/mm/memory.c b/mm/memory.c index 886033b95fd2..079eeac0b009 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -956,6 +957,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } + } else if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); + + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + page_dup_rmap(page, false); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). + */ + if (is_write_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + make_device_private_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1274,6 +1304,29 @@ again: } continue; } + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry) && is_device_private_entry(entry)) { + struct page *page = device_private_entry_to_page(entry); + + if (unlikely(details && details->check_mapping)) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping != + page_rmapping(page)) + continue; + } + + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); + put_page(page); + continue; + } + /* If details->check_mapping, we leave swap entries. 
*/ if (unlikely(details)) continue; @@ -2776,6 +2829,14 @@ int do_swap_page(struct vm_fault *vmf) if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); + } else if (is_device_private_entry(entry)) { + /* + * For un-addressable device memory we call the pgmap + * fault handler callback. The callback must migrate + * the page back to some CPU accessible page. + */ + ret = device_private_entry_fault(vma, vmf->address, entry, + vmf->flags, vmf->pmd); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f92fb84770d..e882cb6da994 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -99,7 +99,7 @@ void mem_hotplug_done(void) /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res; + struct resource *res, *conflict; res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); @@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->start = start; res->end = start + size - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) { + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_debug("Device unaddressable memory block " + "memory hotplug at %#010llx !\n", + (unsigned long long)start); + } pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); return ERR_PTR(-EEXIST); diff --git a/mm/mprotect.c b/mm/mprotect.c index a1bfe9545770..6d3e2f082290 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -125,6 +125,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pages++; } + + if (is_write_device_private_entry(entry)) { + pte_t newpte; + + /* + * We do not preserve soft-dirtiness. See + * copy_one_pte() for explanation. + */ + make_device_private_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_pte_at(mm, addr, pte, newpte); + + pages++; + } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3-71-gd317 From 7b2d55d2c8961ae9d456d3133f4ae2f0fbd3e14f Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:46 -0700 Subject: mm/ZONE_DEVICE: special case put_page() for device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A ZONE_DEVICE page that reach a refcount of 1 is free ie no longer have any user. For device private pages this is important to catch and thus we need to special case put_page() for this. Link: http://lkml.kernel.org/r/20170817000548.32038-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Kirill A. Shutemov Cc: Dan Williams Cc: Ross Zwisler Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 13 +++++++++++++ include/linux/mm.h | 31 ++++++++++++++++++++++--------- kernel/memremap.c | 25 ++++++++++++++++++++++++- mm/hmm.c | 8 ++++++++ 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8e164ec9eed0..8aa6b82679e2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -126,6 +126,14 @@ struct dev_pagemap { void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); + +static inline bool is_zone_device_page(const struct page *page); + +static inline bool is_device_private_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; +} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -144,6 +152,11 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { return NULL; } + +static inline bool is_device_private_page(const struct page *page) +{ + return false; +} #endif /** diff --git a/include/linux/mm.h b/include/linux/mm.h index a74c4e954352..eccdab4bb44a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -23,6 +23,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -792,25 +793,25 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } - -static inline bool is_device_private_page(const struct page *page) -{ - /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */ - return ((page_zonenum(page) == ZONE_DEVICE) && - (page->pgmap->type == MEMORY_DEVICE_PRIVATE)); -} #else static inline bool is_zone_device_page(const struct page *page) { return false; } +#endif -static inline bool is_device_private_page(const struct page *page) +#ifdef CONFIG_DEVICE_PRIVATE +void put_zone_device_private_page(struct page *page); +#else +static inline void put_zone_device_private_page(struct page *page) { - return false; } #endif +static inline bool is_device_private_page(const struct page *page); + +DECLARE_STATIC_KEY_FALSE(device_private_key); + static inline void get_page(struct page *page) { page = compound_head(page); @@ -826,6 +827,18 @@ static inline void put_page(struct page *page) { page = compound_head(page); + /* + * For private device pages we need to catch refcount transition from + * 2 to 1, when refcount reach one it means the private device page is + * free and we need to inform the device driver through callback. See + * include/linux/memremap.h and HMM for details. + */ + if (static_branch_unlikely(&device_private_key) && + unlikely(is_device_private_page(page))) { + put_zone_device_private_page(page); + return; + } + if (put_page_testzero(page)) __put_page(page); } diff --git a/kernel/memremap.c b/kernel/memremap.c index f1d1e0dfe8b4..1403cf16fa61 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,7 +11,6 @@ * General Public License for more details. */ #include -#include #include #include #include @@ -500,3 +499,27 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? 
pgmap->altmap : NULL; } #endif /* CONFIG_ZONE_DEVICE */ + + +#ifdef CONFIG_DEVICE_PRIVATE +void put_zone_device_private_page(struct page *page) +{ + int count = page_ref_dec_return(page); + + /* + * If refcount is 1 then page is freed and refcount is stable as nobody + * holds a reference on the page. + */ + if (count == 1) { + /* Clear Active bit in case of parallel mark_page_accessed */ + __ClearPageActive(page); + __ClearPageWaiters(page); + + page->mapping = NULL; + + page->pgmap->page_free(page, page->pgmap->data); + } else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_zone_device_private_page); +#endif /* CONFIG_DEVICE_PRIVATE */ diff --git a/mm/hmm.c b/mm/hmm.c index f6c745b9a25a..3c6265d4254b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -25,9 +25,17 @@ #include #include #include +#include #include +/* + * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h + */ +DEFINE_STATIC_KEY_FALSE(device_private_key); +EXPORT_SYMBOL(device_private_key); + + #ifdef CONFIG_HMM static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -- cgit v1.2.3-71-gd317 From 4ef589dc9b10cdcae75a2b2b0e9b2c5e8a92c378 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:58 -0700 Subject: mm/hmm/devmem: device memory hotplug using ZONE_DEVICE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduce a simple struct and associated helpers for device driver to use when hotpluging un-addressable device memory as ZONE_DEVICE. It will find a unuse physical address range and trigger memory hotplug for it which allocates and initialize struct page for the device memory. Device driver should use this helper during device initialization to hotplug the device memory. It should only need to remove the memory once the device is going offline (shutdown or hotremove). There should not be any userspace API to hotplug memory expect maybe for host device driver to allow to add more memory to a guest device driver. Device's memory is manage by the device driver and HMM only provides helpers to that effect. Link: http://lkml.kernel.org/r/20170817000548.32038-12-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Signed-off-by: Balbir Singh Cc: Aneesh Kumar Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 155 +++++++++++++++++++++ mm/hmm.c | 379 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 533 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 61a6535fe438..16f916b437cc 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -72,6 +72,11 @@ #if IS_ENABLED(CONFIG_HMM) +#include +#include +#include + + struct hmm; /* @@ -322,6 +327,156 @@ int hmm_vma_fault(struct vm_area_struct *vma, #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +struct hmm_devmem; + +struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, + unsigned long addr); + +/* + * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events + * + * @free: call when refcount on page reach 1 and thus is no longer use + * @fault: call when there is a page fault to unaddressable memory + * + * Both callback happens from page_free() and page_fault() callback of struct + * dev_pagemap respectively. See include/linux/memremap.h for more details on + * those. + * + * The hmm_devmem_ops callback are just here to provide a coherent and + * uniq API to device driver and device driver should not register their + * own page_free() or page_fault() but rely on the hmm_devmem_ops call- + * back. + */ +struct hmm_devmem_ops { + /* + * free() - free a device page + * @devmem: device memory structure (see struct hmm_devmem) + * @page: pointer to struct page being freed + * + * Call back occurs whenever a device page refcount reach 1 which + * means that no one is holding any reference on the page anymore + * (ZONE_DEVICE page have an elevated refcount of 1 as default so + * that they are not release to the general page allocator). + * + * Note that callback has exclusive ownership of the page (as no + * one is holding any reference). + */ + void (*free)(struct hmm_devmem *devmem, struct page *page); + /* + * fault() - CPU page fault or get user page (GUP) + * @devmem: device memory structure (see struct hmm_devmem) + * @vma: virtual memory area containing the virtual address + * @addr: virtual address that faulted or for which there is a GUP + * @page: pointer to struct page backing virtual address (unreliable) + * @flags: FAULT_FLAG_* (see include/linux/mm.h) + * @pmdp: page middle directory + * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR + * on error + * + * The callback occurs whenever there is a CPU page fault or GUP on a + * virtual address. This means that the device driver must migrate the + * page back to regular memory (CPU accessible). + * + * The device driver is free to migrate more than one page from the + * fault() callback as an optimization. However if device decide to + * migrate more than one page it must always priotirize the faulting + * address over the others. + * + * The struct page pointer is only given as an hint to allow quick + * lookup of internal device driver data. A concurrent migration + * might have already free that page and the virtual address might + * not longer be back by it. So it should not be modified by the + * callback. + * + * Note that mmap semaphore is held in read mode at least when this + * callback occurs, hence the vma is valid upon callback entry. 
+ */ + int (*fault)(struct hmm_devmem *devmem, + struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp); +}; + +/* + * struct hmm_devmem - track device memory + * + * @completion: completion object for device memory + * @pfn_first: first pfn for this resource (set by hmm_devmem_add()) + * @pfn_last: last pfn for this resource (set by hmm_devmem_add()) + * @resource: IO resource reserved for this chunk of memory + * @pagemap: device page map for that chunk + * @device: device to bind resource to + * @ops: memory operations callback + * @ref: per CPU refcount + * + * This an helper structure for device drivers that do not wish to implement + * the gory details related to hotplugging new memoy and allocating struct + * pages. + * + * Device drivers can directly use ZONE_DEVICE memory on their own if they + * wish to do so. + */ +struct hmm_devmem { + struct completion completion; + unsigned long pfn_first; + unsigned long pfn_last; + struct resource *resource; + struct device *device; + struct dev_pagemap pagemap; + const struct hmm_devmem_ops *ops; + struct percpu_ref ref; +}; + +/* + * To add (hotplug) device memory, HMM assumes that there is no real resource + * that reserves a range in the physical address space (this is intended to be + * use by unaddressable device memory). It will reserve a physical range big + * enough and allocate struct page for it. + * + * The device driver can wrap the hmm_devmem struct inside a private device + * driver struct. The device driver must call hmm_devmem_remove() before the + * device goes away and before freeing the hmm_devmem struct memory. + */ +struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, + struct device *device, + unsigned long size); +void hmm_devmem_remove(struct hmm_devmem *devmem); + +/* + * hmm_devmem_page_set_drvdata - set per-page driver data field + * + * @page: pointer to struct page + * @data: driver data value to set + * + * Because page can not be on lru we have an unsigned long that driver can use + * to store a per page field. This just a simple helper to do that. + */ +static inline void hmm_devmem_page_set_drvdata(struct page *page, + unsigned long data) +{ + unsigned long *drvdata = (unsigned long *)&page->pgmap; + + drvdata[1] = data; +} + +/* + * hmm_devmem_page_get_drvdata - get per page driver data field + * + * @page: pointer to struct page + * Return: driver data value + */ +static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +{ + unsigned long *drvdata = (unsigned long *)&page->pgmap; + + return drvdata[1]; +} +#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ + + /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index 3c6265d4254b..afb51078a5cf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -23,10 +23,16 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include + +#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) /* @@ -426,7 +432,15 @@ again: * This is a special swap entry, ignore migration, use * device and report anything else as error. 
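Taken together, free() and fault() are the only hooks a driver has to implement for device private memory. Below is a hedged, driver-side sketch of how they might be filled in; dummy_mark_slot_free() and dummy_migrate_back() are invented helper names, and using the per-page drvdata word this way is only one possible convention, not something this patch mandates.

static void dummy_devmem_free(struct hmm_devmem *devmem, struct page *page)
{
	/*
	 * Refcount dropped to 1, so the page is back under driver control.
	 * Recover the driver-private word stored at allocation time and
	 * recycle the backing device memory (hypothetical helper).
	 */
	unsigned long slot = hmm_devmem_page_get_drvdata(page);

	dummy_mark_slot_free(devmem, slot);
}

static int dummy_devmem_fault(struct hmm_devmem *devmem,
			      struct vm_area_struct *vma,
			      unsigned long addr,
			      const struct page *page,
			      unsigned int flags,
			      pmd_t *pmdp)
{
	/*
	 * CPU touched un-addressable memory: migrate the data back to a
	 * regular page. A real driver would typically build this helper
	 * on top of migrate_vma() (hypothetical helper below).
	 */
	if (dummy_migrate_back(devmem, vma, addr, pmdp))
		return VM_FAULT_SIGBUS;
	return 0;	/* fault handled */
}

static const struct hmm_devmem_ops dummy_devmem_ops = {
	.free	= dummy_devmem_free,
	.fault	= dummy_devmem_fault,
};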
*/ - if (is_migration_entry(entry)) { + if (is_device_private_entry(entry)) { + pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); + if (is_write_device_private_entry(entry)) { + pfns[i] |= HMM_PFN_WRITE; + } else if (write_fault) + goto fault; + pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; + pfns[i] |= flag; + } else if (is_migration_entry(entry)) { if (hmm_vma_walk->fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; @@ -720,3 +734,366 @@ int hmm_vma_fault(struct vm_area_struct *vma, } EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + + page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (!page) + return NULL; + lock_page(page); + return page; +} +EXPORT_SYMBOL(hmm_vma_alloc_locked_page); + + +static void hmm_devmem_ref_release(struct percpu_ref *ref) +{ + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + complete(&devmem->completion); +} + +static void hmm_devmem_ref_exit(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_exit(ref); + devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); +} + +static void hmm_devmem_ref_kill(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_kill(ref); + wait_for_completion(&devmem->completion); + devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); +} + +static int hmm_devmem_fault(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp) +{ + struct hmm_devmem *devmem = page->pgmap->data; + + return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); +} + +static void hmm_devmem_free(struct page *page, void *data) +{ + struct hmm_devmem *devmem = data; + + devmem->ops->free(devmem, page); +} + +static DEFINE_MUTEX(hmm_devmem_lock); +static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); + +static void hmm_devmem_radix_release(struct resource *resource) +{ + resource_size_t key, align_start, align_size, align_end; + + align_start = resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&hmm_devmem_lock); + for (key = resource->start; + key <= resource->end; + key += PA_SECTION_SIZE) + radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&hmm_devmem_lock); +} + +static void hmm_devmem_release(struct device *dev, void *data) +{ + struct hmm_devmem *devmem = data; + struct resource *resource = devmem->resource; + unsigned long start_pfn, npages; + struct zone *zone; + struct page *page; + + if (percpu_ref_tryget_live(&devmem->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(&devmem->ref); + } + + /* pages are dead and unused, undo the arch mapping */ + start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; + npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; + + page = pfn_to_page(start_pfn); + zone = page_zone(page); + + mem_hotplug_begin(); + __remove_pages(zone, start_pfn, npages); + mem_hotplug_done(); + + hmm_devmem_radix_release(resource); +} + +static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + return 
radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); +} + +static int hmm_devmem_pages_create(struct hmm_devmem *devmem) +{ + resource_size_t key, align_start, align_size, align_end; + struct device *device = devmem->device; + int ret, nid, is_ram; + unsigned long pfn; + + align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(devmem->resource->start + + resource_size(devmem->resource), + PA_SECTION_SIZE) - align_start; + + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE); + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, devmem->resource); + return -ENXIO; + } + if (is_ram == REGION_INTERSECTS) + return -ENXIO; + + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.res = devmem->resource; + devmem->pagemap.page_fault = hmm_devmem_fault; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.dev = devmem->device; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; + + mutex_lock(&hmm_devmem_lock); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { + struct hmm_devmem *dup; + + rcu_read_lock(); + dup = hmm_devmem_find(key); + rcu_read_unlock(); + if (dup) { + dev_err(device, "%s: collides with mapping for %s\n", + __func__, dev_name(dup->device)); + mutex_unlock(&hmm_devmem_lock); + ret = -EBUSY; + goto error; + } + ret = radix_tree_insert(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT, + devmem); + if (ret) { + dev_err(device, "%s: failed: %d\n", __func__, ret); + mutex_unlock(&hmm_devmem_lock); + goto error_radix; + } + } + mutex_unlock(&hmm_devmem_lock); + + nid = dev_to_node(device); + if (nid < 0) + nid = numa_mem_id(); + + mem_hotplug_begin(); + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct page for the device memory. More- + * over the device memory is un-accessible thus we do not want to + * create a linear mapping for the memory like arch_add_memory() + * would do. 
+ */ + ret = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, false); + if (ret) { + mem_hotplug_done(); + goto error_add_memory; + } + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); + mem_hotplug_done(); + + for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { + struct page *page = pfn_to_page(pfn); + + page->pgmap = &devmem->pagemap; + } + return 0; + +error_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); +error_radix: + hmm_devmem_radix_release(devmem->resource); +error: + return ret; +} + +static int hmm_devmem_match(struct device *dev, void *data, void *match_data) +{ + struct hmm_devmem *devmem = data; + + return devmem->resource == match_data; +} + +static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) +{ + devres_release(devmem->device, &hmm_devmem_release, + &hmm_devmem_match, devmem->resource); +} + +/* + * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory + * + * @ops: memory event device driver callback (see struct hmm_devmem_ops) + * @device: device struct to bind the resource too + * @size: size in bytes of the device memory to add + * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise + * + * This function first finds an empty range of physical address big enough to + * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which + * in turn allocates struct pages. It does not do anything beyond that; all + * events affecting the memory will go through the various callbacks provided + * by hmm_devmem_ops struct. + * + * Device driver should call this function during device initialization and + * is then responsible of memory management. HMM only provides helpers. + */ +struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, + struct device *device, + unsigned long size) +{ + struct hmm_devmem *devmem; + resource_size_t addr; + int ret; + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = NULL; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + size = ALIGN(size, PA_SECTION_SIZE); + addr = min((unsigned long)iomem_resource.end, + (1UL << MAX_PHYSMEM_BITS) - 1); + addr = addr - size + 1UL; + + /* + * FIXME add a new helper to quickly walk resource tree and find free + * range + * + * FIXME what about ioport_resource resource ? 
+ */ + for (; addr > size && addr >= iomem_resource.start; addr -= size) { + ret = region_intersects(addr, size, 0, IORES_DESC_NONE); + if (ret != REGION_DISJOINT) + continue; + + devmem->resource = devm_request_mem_region(device, addr, size, + dev_name(device)); + if (!devmem->resource) { + ret = -ENOMEM; + goto error_no_resource; + } + break; + } + if (!devmem->resource) { + ret = -ERANGE; + goto error_no_resource; + } + + devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_pages; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_pages: + devm_release_mem_region(device, devmem->resource->start, + resource_size(devmem->resource)); +error_no_resource: +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add); + +/* + * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) + * + * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory + * + * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf + * of the device driver. It will free struct page and remove the resource that + * reserved the physical address range for this device memory. + */ +void hmm_devmem_remove(struct hmm_devmem *devmem) +{ + resource_size_t start, size; + struct device *device; + + if (!devmem) + return; + + device = devmem->device; + start = devmem->resource->start; + size = resource_size(devmem->resource); + + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); + hmm_devmem_pages_remove(devmem); + + devm_release_mem_region(device, start, size); +} +EXPORT_SYMBOL(hmm_devmem_remove); +#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ -- cgit v1.2.3-71-gd317 From 858b54dabf4363daa3a97b9a722130a8e7cea8c9 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:02 -0700 Subject: mm/hmm/devmem: dummy HMM device for ZONE_DEVICE memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduce a dummy HMM device class so device driver can use it to create hmm_device for the sole purpose of registering device memory. It is useful to device driver that want to manage multiple physical device memory under same struct device umbrella. Link: http://lkml.kernel.org/r/20170817000548.32038-13-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 22 ++++++++++++++- mm/hmm.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 16f916b437cc..67a03b20a2db 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -72,11 +72,11 @@ #if IS_ENABLED(CONFIG_HMM) +#include #include #include #include - struct hmm; /* @@ -474,6 +474,26 @@ static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) return drvdata[1]; } + + +/* + * struct hmm_device - fake device to hang device memory onto + * + * @device: device struct + * @minor: device minor number + */ +struct hmm_device { + struct device device; + unsigned int minor; +}; + +/* + * A device driver that wants to handle multiple devices memory through a + * single fake device can use hmm_device to do so. This is purely a helper and + * it is not strictly needed, in order to make use of any HMM functionality. + */ +struct hmm_device *hmm_device_new(void *drvdata); +void hmm_device_put(struct hmm_device *hmm_device); #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ diff --git a/mm/hmm.c b/mm/hmm.c index afb51078a5cf..c9d23ef80552 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -19,6 +19,7 @@ */ #include #include +#include #include #include #include @@ -1096,4 +1097,84 @@ void hmm_devmem_remove(struct hmm_devmem *devmem) devm_release_mem_region(device, start, size); } EXPORT_SYMBOL(hmm_devmem_remove); + +/* + * A device driver that wants to handle multiple devices memory through a + * single fake device can use hmm_device to do so. This is purely a helper + * and it is not needed to make use of any HMM functionality. 
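Putting the helpers declared above together, a driver that has no struct device of its own could create one through this fake class and then hotplug its memory. The sequence below is only a hedged sketch of the intended call order; dummy_devmem_ops refers to the callbacks sketched earlier, and the SZ_1G pool size and dummy_* names are illustrative.

static struct hmm_device *dummy_hmm_device;
static struct hmm_devmem *dummy_devmem;

static int dummy_driver_init(void *drvdata)
{
	dummy_hmm_device = hmm_device_new(drvdata);
	if (IS_ERR(dummy_hmm_device))
		return PTR_ERR(dummy_hmm_device);

	/* Reserve and hotplug 1GB of un-addressable device memory. */
	dummy_devmem = hmm_devmem_add(&dummy_devmem_ops,
				      &dummy_hmm_device->device,
				      SZ_1G);
	if (IS_ERR(dummy_devmem)) {
		hmm_device_put(dummy_hmm_device);
		return PTR_ERR(dummy_devmem);
	}
	return 0;
}

static void dummy_driver_exit(void)
{
	/* Remove the memory before the struct device goes away. */
	hmm_devmem_remove(dummy_devmem);
	hmm_device_put(dummy_hmm_device);
}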
+ */ +#define HMM_DEVICE_MAX 256 + +static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); +static DEFINE_SPINLOCK(hmm_device_lock); +static struct class *hmm_device_class; +static dev_t hmm_device_devt; + +static void hmm_device_release(struct device *device) +{ + struct hmm_device *hmm_device; + + hmm_device = container_of(device, struct hmm_device, device); + spin_lock(&hmm_device_lock); + clear_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + kfree(hmm_device); +} + +struct hmm_device *hmm_device_new(void *drvdata) +{ + struct hmm_device *hmm_device; + + hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); + if (!hmm_device) + return ERR_PTR(-ENOMEM); + + spin_lock(&hmm_device_lock); + hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); + if (hmm_device->minor >= HMM_DEVICE_MAX) { + spin_unlock(&hmm_device_lock); + kfree(hmm_device); + return ERR_PTR(-EBUSY); + } + set_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); + hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), + hmm_device->minor); + hmm_device->device.release = hmm_device_release; + dev_set_drvdata(&hmm_device->device, drvdata); + hmm_device->device.class = hmm_device_class; + device_initialize(&hmm_device->device); + + return hmm_device; +} +EXPORT_SYMBOL(hmm_device_new); + +void hmm_device_put(struct hmm_device *hmm_device) +{ + put_device(&hmm_device->device); +} +EXPORT_SYMBOL(hmm_device_put); + +static int __init hmm_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hmm_device_devt, 0, + HMM_DEVICE_MAX, + "hmm_device"); + if (ret) + return ret; + + hmm_device_class = class_create(THIS_MODULE, "hmm_device"); + if (IS_ERR(hmm_device_class)) { + unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); + return PTR_ERR(hmm_device_class); + } + return 0; +} + +device_initcall(hmm_init); #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ -- cgit v1.2.3-71-gd317 From 2916ecc0f9d435d849c98f4da50e453124c87531 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:06 -0700 Subject: mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new migration mode that allow to offload the copy to a device DMA engine. This changes the workflow of migration and not all address_space migratepage callback can support this. This is intended to be use by migrate_vma() which itself is use for thing like HMM (see include/linux/hmm.h). No additional per-filesystem migratepage testing is needed. I disables MIGRATE_SYNC_NO_COPY in all problematic migratepage() callback and i added comment in those to explain why (part of this patch). The commit message is unclear it should say that any callback that wish to support this new mode need to be aware of the difference in the migration flow from other mode. Some of these callbacks do extra locking while copying (aio, zsmalloc, balloon, ...) and for DMA to be effective you want to copy multiple pages in one DMA operations. But in the problematic case you can not easily hold the extra lock accross multiple call to this callback. Usual flow is: For each page { 1 - lock page 2 - call migratepage() callback 3 - (extra locking in some migratepage() callback) 4 - migrate page state (freeze refcount, update page cache, buffer head, ...) 
5 - copy page 6 - (unlock any extra lock of migratepage() callback) 7 - return from migratepage() callback 8 - unlock page } The new mode MIGRATE_SYNC_NO_COPY: 1 - lock multiple pages For each page { 2 - call migratepage() callback 3 - abort in all problematic migratepage() callback 4 - migrate page state (freeze refcount, update page cache, buffer head, ...) } // finished all calls to migratepage() callback 5 - DMA copy multiple pages 6 - unlock all the pages To support MIGRATE_SYNC_NO_COPY in the problematic case we would need a new callback migratepages() (for instance) that deals with multiple pages in one transaction. Because the problematic cases are not important for current usage I did not wanted to complexify this patchset even more for no good reason. Link: http://lkml.kernel.org/r/20170817000548.32038-14-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 8 +++++++ fs/f2fs/data.c | 5 ++++- fs/hugetlbfs/inode.c | 5 ++++- fs/ubifs/file.c | 5 ++++- include/linux/migrate.h | 5 +++++ include/linux/migrate_mode.h | 5 +++++ mm/balloon_compaction.c | 8 +++++++ mm/migrate.c | 52 ++++++++++++++++++++++++++++++++++---------- mm/zsmalloc.c | 8 +++++++ 9 files changed, 86 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index 8f0127526299..b5d69f28d8b1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, pgoff_t idx; int rc; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the ctx->completion_lock. That does not work with the + * migration workflow of MIGRATE_SYNC_NO_COPY. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + rc = 0; /* mapping->private_lock here protects against the kioctx teardown. 
*/ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a791aac4c5af..fb96bb71da00 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2253,7 +2253,10 @@ int f2fs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); set_page_private(newpage, page_private(page)); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7c02b3f738e1..8c6f4b8f910f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -830,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f90a466ea5db..a02aa59d1e24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); } - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ce15989521a1..7db4c812a2a6 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -72,6 +72,7 @@ extern void putback_movable_page(struct page *page); extern int migrate_prep(void); extern int migrate_prep_local(void); +extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); @@ -92,6 +93,10 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } +static inline void migrate_page_states(struct page *newpage, struct page *page) +{ +} + static inline void migrate_page_copy(struct page *newpage, struct page *page) {} diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index ebf3d89a3919..bdf66af9b937 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -6,11 +6,16 @@ * on most operations but not ->writepage as the potential stall time * is too significant * MIGRATE_SYNC will block when migrating pages + * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages + * with the CPU. Instead, page copy happens outside the migratepage() + * callback and is likely using a DMA engine. See migrate_vma() and HMM + * (mm/hmm.c) for users of this mode. */ enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, + MIGRATE_SYNC_NO_COPY, }; #endif /* MIGRATE_MODE_H_INCLUDED */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b06d9fe23a28..68d28924ba79 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping, { struct balloon_dev_info *balloon = balloon_page_device(page); + /* + * We can not easily support the no copy case here so ignore it as it + * is unlikely to be use with ballon pages. 
See include/linux/hmm.h for + * user of the MIGRATE_SYNC_NO_COPY mode. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/migrate.c b/mm/migrate.c index 1088cef6ef8b..71de36cfb673 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -634,15 +634,10 @@ static void copy_huge_page(struct page *dst, struct page *src) /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_states(struct page *newpage, struct page *page) { int cpupid; - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - if (PageError(page)) SetPageError(newpage); if (PageReferenced(page)) @@ -696,6 +691,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) mem_cgroup_migrate(page, newpage); } +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + if (PageHuge(page) || PageTransHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); + + migrate_page_states(newpage, page); +} EXPORT_SYMBOL(migrate_page_copy); /************************************************************ @@ -721,7 +727,10 @@ int migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -771,12 +780,15 @@ int buffer_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); bh = head; do { unlock_buffer(bh); - put_bh(bh); + put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -835,8 +847,13 @@ static int fallback_migrate_page(struct address_space *mapping, { if (PageDirty(page)) { /* Only writeback pages in full synchronous migration */ - if (mode != MIGRATE_SYNC) + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: return -EBUSY; + } return writeout(mapping, page); } @@ -973,7 +990,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ - if (mode != MIGRATE_SYNC) { + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: rc = -EBUSY; goto out_unlock; } @@ -1243,8 +1264,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; if (!trylock_page(hpage)) { - if (!force || mode != MIGRATE_SYNC) + if (!force) goto out; + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: + goto out; + } lock_page(hpage); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 62457eb82330..5ad75ec4151c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1969,6 +1969,14 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, unsigned int obj_idx; int ret = -EAGAIN; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the zs lock, which does not work with + * MIGRATE_SYNC_NO_COPY workflow. 
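The hunks above all follow the same pattern, so it is worth spelling out once. The sketch below shows the two choices a ->migratepage() implementation has once MIGRATE_SYNC_NO_COPY exists; foo_migrate_page() and foo_needs_locked_copy() are hypothetical, and the mapping move itself is assumed to have already succeeded, as in the f2fs and ubifs hunks.

static int foo_migrate_page(struct address_space *mapping,
			    struct page *newpage, struct page *page,
			    enum migrate_mode mode)
{
	/*
	 * Choice 1: if the data copy must happen under a lock that cannot
	 * be held across several calls, refuse the offloaded-copy mode,
	 * as aio, balloon and zsmalloc do.
	 */
	if (foo_needs_locked_copy(mapping) && mode == MIGRATE_SYNC_NO_COPY)
		return -EINVAL;

	/* ... radix tree slot and page private data moved here ... */

	/*
	 * Choice 2: transfer only flags and state; the caller (for
	 * instance migrate_vma()) does the data copy itself, e.g. by DMA.
	 */
	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);

	return MIGRATEPAGE_SUCCESS;
}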
+ */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); -- cgit v1.2.3-71-gd317 From 8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:09 -0700 Subject: mm/migrate: new memory migration helper for use with device memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch add a new memory migration helpers, which migrate memory backing a range of virtual address of a process to different memory (which can be allocated through special allocator). It differs from numa migration by working on a range of virtual address and thus by doing migration in chunk that can be large enough to use DMA engine or special copy offloading engine. Expected users are any one with heterogeneous memory where different memory have different characteristics (latency, bandwidth, ...). As an example IBM platform with CAPI bus can make use of this feature to migrate between regular memory and CAPI device memory. New CPU architecture with a pool of high performance memory not manage as cache but presented as regular memory (while being faster and with lower latency than DDR) will also be prime user of this patch. Migration to private device memory will be useful for device that have large pool of such like GPU, NVidia plans to use HMM for that. Link: http://lkml.kernel.org/r/20170817000548.32038-15-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 104 ++++++++++ mm/migrate.c | 492 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 596 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7db4c812a2a6..8f73cebfc3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -156,4 +156,108 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, } #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ + +#ifdef CONFIG_MIGRATION + +#define MIGRATE_PFN_VALID (1UL << 0) +#define MIGRATE_PFN_MIGRATE (1UL << 1) +#define MIGRATE_PFN_LOCKED (1UL << 2) +#define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_ERROR (1UL << 4) +#define MIGRATE_PFN_SHIFT 5 + +static inline struct page *migrate_pfn_to_page(unsigned long mpfn) +{ + if (!(mpfn & MIGRATE_PFN_VALID)) + return NULL; + return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT); +} + +static inline unsigned long migrate_pfn(unsigned long pfn) +{ + return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; +} + +/* + * struct migrate_vma_ops - migrate operation callback + * + * @alloc_and_copy: alloc destination memory and copy source memory to it + * @finalize_and_map: allow caller to map the successfully migrated pages + * + * + * The alloc_and_copy() callback happens once all source pages have been locked, + * unmapped and checked (checked whether pinned or not). 
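Both arrays carry a pfn and its flags packed into a single unsigned long via the helpers defined above. As a small, hedged illustration of that encoding, this is how an alloc_and_copy() implementation would typically fill a destination entry and how any entry is decoded back to a struct page; fill_dst_entry() and dst_entry_to_page() are illustrative wrappers, not part of the API.

static void fill_dst_entry(unsigned long *dst, unsigned long i,
			   struct page *newpage)
{
	/* Destination pages must be handed over locked. */
	lock_page(newpage);
	dst[i] = migrate_pfn(page_to_pfn(newpage)) | MIGRATE_PFN_LOCKED;
}

static struct page *dst_entry_to_page(const unsigned long *dst,
				      unsigned long i)
{
	/* Returns NULL unless MIGRATE_PFN_VALID is set in the entry. */
	return migrate_pfn_to_page(dst[i]);
}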
All pages that can be + * migrated will have an entry in the src array set with the pfn value of the + * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other + * flags might be set but should be ignored by the callback). + * + * The alloc_and_copy() callback can then allocate destination memory and copy + * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and + * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the + * callback must update each corresponding entry in the dst array with the pfn + * value of the destination page and with the MIGRATE_PFN_VALID and + * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages + * locked, via lock_page()). + * + * At this point the alloc_and_copy() callback is done and returns. + * + * Note that the callback does not have to migrate all the pages that are + * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration + * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also + * set in the src array entry). If the device driver cannot migrate a device + * page back to system memory, then it must set the corresponding dst array + * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to + * access any of the virtual addresses originally backed by this page. Because + * a SIGBUS is such a severe result for the userspace process, the device + * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an + * unrecoverable state. + * + * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES + * OR BAD THINGS WILL HAPPEN ! + * + * + * The finalize_and_map() callback happens after struct page migration from + * source to destination (destination struct pages are the struct pages for the + * memory allocated by the alloc_and_copy() callback). Migration can fail, and + * thus the finalize_and_map() allows the driver to inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table from within the finalize_and_map() + * callback because both destination and source page are still locked, and the + * mmap_sem is held in read mode (hence no one can unmap the range being + * migrated). + * + * Once callback is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) then it returns. At this point, + * the HMM core will finish up the final steps, and the migration is complete. + * + * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY + * ENTRIES OR BAD THINGS WILL HAPPEN ! 
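Condensed into code, the rules above give callbacks roughly like the following hedged sketch. It migrates to ordinary system pages so that it stays self-contained; a real device driver would allocate device memory and replace copy_highpage() with its DMA engine. The dummy_* names are invented for illustration.

static void dummy_alloc_and_copy(struct vm_area_struct *vma,
				 const unsigned long *src,
				 unsigned long *dst,
				 unsigned long start,
				 unsigned long end,
				 void *private)
{
	unsigned long addr, i;

	for (i = 0, addr = start; addr < end; addr += PAGE_SIZE, i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		struct page *dpage;

		dst[i] = 0;
		if (!spage || !(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
		if (!dpage)
			continue;	/* this entry is simply skipped */

		/* Destination must be locked; copy while both are stable. */
		lock_page(dpage);
		copy_highpage(dpage, spage);
		dst[i] = migrate_pfn(page_to_pfn(dpage)) |
			 MIGRATE_PFN_LOCKED;
	}
}

static void dummy_finalize_and_map(struct vm_area_struct *vma,
				   const unsigned long *src,
				   const unsigned long *dst,
				   unsigned long start,
				   unsigned long end,
				   void *private)
{
	/*
	 * Entries still carrying MIGRATE_PFN_MIGRATE in src migrated
	 * successfully; a device driver would update its own page tables
	 * here. Nothing to do in this CPU-only sketch.
	 */
}

static const struct migrate_vma_ops dummy_migrate_ops = {
	.alloc_and_copy		= dummy_alloc_and_copy,
	.finalize_and_map	= dummy_finalize_and_map,
};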
+ */ +struct migrate_vma_ops { + void (*alloc_and_copy)(struct vm_area_struct *vma, + const unsigned long *src, + unsigned long *dst, + unsigned long start, + unsigned long end, + void *private); + void (*finalize_and_map)(struct vm_area_struct *vma, + const unsigned long *src, + const unsigned long *dst, + unsigned long start, + unsigned long end, + void *private); +}; + +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private); + +#endif /* CONFIG_MIGRATION */ + #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate.c b/mm/migrate.c index 71de36cfb673..991e8886093f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -428,6 +428,14 @@ int migrate_page_move_mapping(struct address_space *mapping, int expected_count = 1 + extra_count; void **pslot; + /* + * ZONE_DEVICE pages have 1 refcount always held by their device + * + * Note that DAX memory will never reach that point as it does not have + * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h). + */ + expected_count += is_zone_device_page(page); + if (!mapping) { /* Anonymous page without mapping */ if (page_count(page) != expected_count) @@ -2106,3 +2114,487 @@ out_unlock: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ + + +struct migrate_vma { + struct vm_area_struct *vma; + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; +}; + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_hole(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret || pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_hole(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn, pfn; + struct page *page; + pte_t pte; + + pte = *ptep; + pfn = pte_pfn(pte); + + if (!pte_present(pte)) { + mpfn = pfn = 0; + goto next; + } + + /* FIXME support THP */ + page = vm_normal_page(migrate->vma, addr, pte); + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = pfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that 
blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + migrate->cpages++; + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + +next: + migrate->src[migrate->npages++] = mpfn; + } + pte_unmap_unlock(ptep - 1, ptl); + + return 0; +} + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mm_walk mm_walk; + + mm_walk.pmd_entry = migrate_vma_collect_pmd; + mm_walk.pte_entry = NULL; + mm_walk.pte_hole = migrate_vma_collect_hole; + mm_walk.hugetlb_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.vma = migrate->vma; + mm_walk.mm = migrate->vma->vm_mm; + mm_walk.private = migrate; + + walk_page_range(migrate->start, migrate->end, &mm_walk); + + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * migrate_page_move_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). + */ + if (PageCompound(page)) + return false; + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_prepare() - lock pages and isolate them from the lru + * @migrate: migrate struct containing all migration information + * + * This locks pages that have been collected by migrate_vma_collect(). Once each + * page is locked it is isolated from the lru (for non-device pages). Finally, + * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be + * migrated by concurrent kernel threads. + */ +static void migrate_vma_prepare(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + bool allow_drain = true; + unsigned long i; + + lru_add_drain(); + + for (i = 0; (i < npages) && migrate->cpages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) + continue; + + /* + * Because we are migrating several pages there can be + * a deadlock between 2 concurrent migration where each + * are waiting on each other page lock. + * + * Make migrate_vma() a best effort thing and backoff + * for any page we can not lock right away. 
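In other words, for an ordinary page the pin test reduces to a single comparison: every pte mapping contributes one reference and one mapcount, and migration itself holds exactly one extra reference, so anything beyond that must be a pin (GUP, O_DIRECT, ...). A hedged restatement of just that branch, with plain_page_is_pinned() as an invented name:

static bool plain_page_is_pinned(struct page *page)
{
	int extra = 1;	/* reference held by the migration code itself */

	return (page_count(page) - extra) > page_mapcount(page);
}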
+ */ + if (!trylock_page(page)) { + migrate->src[i] = 0; + migrate->cpages--; + put_page(page); + continue; + } + migrate->src[i] |= MIGRATE_PFN_LOCKED; + + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + continue; + } + + if (!migrate_vma_check_page(page)) { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + + putback_lru_page(page); + } + } +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Replace page mapping (CPU page table pte) with a special migration pte entry + * and check again if it has been pinned. Pinned pages are restored because we + * cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + try_to_unmap(page, flags); + if (page_mapped(page) || !migrate_vma_check_page(page)) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } + } + + for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + restore--; + + putback_lru_page(page); + } +} + +/* + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +static void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!page || !newpage) + continue; + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + mapping = page_mapping(page); + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } +} + +/* + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. 
+ */ +static void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) + continue; + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + migrate->cpages--; + + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + putback_lru_page(newpage); + } + } +} + +/* + * migrate_vma() - migrate a range of memory inside vma + * + * @ops: migration callback for allocating destination memory and copying + * @vma: virtual memory area containing the range to be migrated + * @start: start address of the range to migrate (inclusive) + * @end: end address of the range to migrate (exclusive) + * @src: array of hmm_pfn_t containing source pfns + * @dst: array of hmm_pfn_t containing destination pfns + * @private: pointer passed back to each of the callback + * Returns: 0 on success, error code otherwise + * + * This function tries to migrate a range of memory virtual address range, using + * callbacks to allocate and copy memory from source to destination. First it + * collects all the pages backing each virtual address in the range, saving this + * inside the src array. Then it locks those pages and unmaps them. Once the pages + * are locked and unmapped, it checks whether each page is pinned or not. Pages + * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) + * in the corresponding src array entry. It then restores any pages that are + * pinned, by remapping and unlocking those pages. + * + * At this point it calls the alloc_and_copy() callback. For documentation on + * what is expected from that callback, see struct migrate_vma_ops comments in + * include/linux/migrate.h + * + * After the alloc_and_copy() callback, this function goes over each entry in + * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then the function tries to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the struct + * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src + * array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * It then calls the finalize_and_map() callback. See comments for "struct + * migrate_vma_ops", in include/linux/migrate.h for details about + * finalize_and_map() behavior. + * + * After the finalize_and_map() callback, for successfully migrated pages, this + * function updates the CPU page table to point to new pages, otherwise it + * restores the CPU page table to point to the original source pages. + * + * Function returns 0 after the above steps, even if no pages were migrated + * (The function only returns an error if any of the arguments are invalid.) + * + * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT + * unsigned long entries. 
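For completeness, a hedged sketch of a caller. It reuses the dummy_migrate_ops callbacks sketched earlier; the 16-page window, the on-stack arrays and dummy_migrate_window() itself are arbitrary illustration choices, and the caller is assumed to hold mmap_sem in read mode with start page-aligned inside the vma.

#define DUMMY_NR_PAGES	16UL

static int dummy_migrate_window(struct vm_area_struct *vma,
				unsigned long start)
{
	unsigned long src[DUMMY_NR_PAGES];
	unsigned long dst[DUMMY_NR_PAGES];
	unsigned long end = start + (DUMMY_NR_PAGES << PAGE_SHIFT);

	/* A larger window would want kmalloc'ed arrays instead. */
	return migrate_vma(&dummy_migrate_ops, vma, start, end,
			   src, dst, NULL);
}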
+ */ +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + struct migrate_vma migrate; + + /* Sanity check the arguments */ + start &= PAGE_MASK; + end &= PAGE_MASK; + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + return -EINVAL; + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end <= vma->vm_start || end > vma->vm_end) + return -EINVAL; + if (!ops || !src || !dst || start >= end) + return -EINVAL; + + memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); + migrate.src = src; + migrate.dst = dst; + migrate.start = start; + migrate.npages = 0; + migrate.cpages = 0; + migrate.end = end; + migrate.vma = vma; + + /* Collect, and try to unmap source pages */ + migrate_vma_collect(&migrate); + if (!migrate.cpages) + return 0; + + /* Lock and isolate page */ + migrate_vma_prepare(&migrate); + if (!migrate.cpages) + return 0; + + /* Unmap pages */ + migrate_vma_unmap(&migrate); + if (!migrate.cpages) + return 0; + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the callback. + * + * Note that migration can fail in migrate_vma_struct_page() for each + * individual page. + */ + ops->alloc_and_copy(vma, src, dst, start, end, private); + + /* This does the real migration of struct page */ + migrate_vma_pages(&migrate); + + ops->finalize_and_map(vma, src, dst, start, end, private); + + /* Unlock and remap pages */ + migrate_vma_finalize(&migrate); + + return 0; +} +EXPORT_SYMBOL(migrate_vma); -- cgit v1.2.3-71-gd317 From a5430dda8a3a1cdd532e37270e6f36436241b6e7 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:17 -0700 Subject: mm/migrate: support un-addressable ZONE_DEVICE page in migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow to unmap and restore special swap entry of un-addressable ZONE_DEVICE memory. Link: http://lkml.kernel.org/r/20170817000548.32038-17-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Kirill A. Shutemov Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 10 +++- mm/migrate.c | 149 +++++++++++++++++++++++++++++++++++++++--------- mm/page_vma_mapped.c | 10 ++++ mm/rmap.c | 26 +++++++++ 4 files changed, 165 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8f73cebfc3f5..8dc8f0a3f1af 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -159,12 +159,18 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #ifdef CONFIG_MIGRATION +/* + * Watch out for PAE architecture, which has an unsigned long, and might not + * have enough bits to store all physical address and flags. So far we have + * enough room for all our flags. 
+ */ #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED (1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) -#define MIGRATE_PFN_ERROR (1UL << 4) -#define MIGRATE_PFN_SHIFT 5 +#define MIGRATE_PFN_DEVICE (1UL << 4) +#define MIGRATE_PFN_ERROR (1UL << 5) +#define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) { diff --git a/mm/migrate.c b/mm/migrate.c index 652b2c642eed..77cb2fef08ea 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -237,7 +238,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - flush_dcache_page(new); + if (unlikely(is_zone_device_page(new)) && + is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else + flush_dcache_page(new); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -2205,17 +2212,40 @@ again: pte = *ptep; pfn = pte_pfn(pte); - if (!pte_present(pte)) { + if (pte_none(pte)) { mpfn = pfn = 0; goto next; } + if (!pte_present(pte)) { + mpfn = pfn = 0; + + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = device_private_entry_to_page(entry); + mpfn = migrate_pfn(page_to_pfn(page))| + MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + if (is_write_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + page = vm_normal_page(migrate->vma, addr, pte); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + /* FIXME support THP */ - page = vm_normal_page(migrate->vma, addr, pte); if (!page || !page->mapping || PageTransCompound(page)) { mpfn = pfn = 0; goto next; } + pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks @@ -2228,8 +2258,6 @@ again: */ get_page(page); migrate->cpages++; - mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; /* * Optimize for the common case where page is only mapped once @@ -2256,10 +2284,13 @@ again: */ page_remove_rmap(page, false); put_page(page); - unmapped++; + + if (pte_present(pte)) + unmapped++; } next: + migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = mpfn; } arch_leave_lazy_mmu_mode(); @@ -2329,6 +2360,28 @@ static bool migrate_vma_check_page(struct page *page) if (PageCompound(page)) return false; + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) { + /* + * Private page can never be pin as they have no valid pte and + * GUP will fail for those. Yet if there is a pending migration + * a thread might try to wait on the pte migration entry and + * will bump the page reference count. Sadly there is no way to + * differentiate a regular pin from migration wait. Hence to + * avoid 2 racing thread trying to migrate back to CPU to enter + * infinite loop (one stoping migration because the other is + * waiting on pte migration entry). We always return true here. + * + * FIXME proper solution is to rework migration_entry_wait() so + * it does not need to take a reference on page. 
+ */ + if (is_device_private_page(page)) + return true; + + /* Other ZONE_DEVICE memory type are not supported */ + return false; + } + if ((page_count(page) - extra) > page_mapcount(page)) return false; @@ -2379,24 +2432,30 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) migrate->src[i] |= MIGRATE_PFN_LOCKED; } - if (!PageLRU(page) && allow_drain) { - /* Drain CPU's pagevec */ - lru_add_drain_all(); - allow_drain = false; - } + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } - if (isolate_lru_page(page)) { - if (remap) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - } else { - migrate->src[i] = 0; - unlock_page(page); - migrate->cpages--; - put_page(page); + if (isolate_lru_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + } + continue; } - continue; + + /* Drop the reference we took in collect */ + put_page(page); } if (!migrate_vma_check_page(page)) { @@ -2405,14 +2464,19 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) migrate->cpages--; restore++; - get_page(page); - putback_lru_page(page); + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } } else { migrate->src[i] = 0; unlock_page(page); migrate->cpages--; - putback_lru_page(page); + if (!is_zone_device_page(page)) + putback_lru_page(page); + else + put_page(page); } } } @@ -2483,7 +2547,10 @@ restore: unlock_page(page); restore--; - putback_lru_page(page); + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); } } @@ -2514,6 +2581,26 @@ static void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); + if (is_zone_device_page(newpage)) { + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when + * migrating to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else { + /* + * Other types of ZONE_DEVICE page are not + * supported. 
+ */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; @@ -2554,11 +2641,17 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) unlock_page(page); migrate->cpages--; - putback_lru_page(page); + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); if (newpage != page) { unlock_page(newpage); - putback_lru_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); } } } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 3bd3008db4cb..6a03946469a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) return false; if (migration_entry_to_page(entry) - pvmw->page >= @@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) WARN_ON_ONCE(1); #endif } else { + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(*pvmw->pte); + if (is_device_private_entry(entry) && + device_private_entry_to_page(entry) == pvmw->page) + return true; + } + if (!pte_present(*pvmw->pte)) return false; diff --git a/mm/rmap.c b/mm/rmap.c index 7dc9c02f7106..0618cd85b862 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -63,6 +63,7 @@ #include #include #include +#include #include @@ -1346,6 +1347,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && + is_zone_device_page(page) && !is_device_private_page(page)) + return true; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, flags & TTU_SPLIT_FREEZE, page); @@ -1403,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address = pvmw.address; + if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION) && + is_zone_device_page(page)) { + swp_entry_t entry; + pte_t swp_pte; + + pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, 0); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + goto discard; + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { -- cgit v1.2.3-71-gd317 From 8315ada7f095bfa2cae0cd1e915b95bf6226897d Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:21 -0700 Subject: mm/migrate: allow migrate_vma() to alloc new page on empty entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows callers of migrate_vma() to allocate new page for empty CPU page table entry (pte_none or back by zero page). This is only for anonymous memory and it won't allow new page to be instanced if the userfaultfd is armed. This is useful to device driver that want to migrate a range of virtual address and would rather allocate new memory than having to fault later on. 
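For illustration, a driver-side alloc_and_copy() callback under this extended
contract could look like the sketch below. The my_dev_*() helpers are
hypothetical stand-ins for device-specific allocation, clear and DMA-copy
routines; nothing here is taken from the patch itself. The point to note is
that an empty CPU entry arrives with MIGRATE_PFN_MIGRATE set but without
MIGRATE_PFN_VALID, so there is no source page to copy from and the freshly
allocated destination page only needs to be cleared.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical driver helpers, assumed to be defined elsewhere. */
static struct page *my_dev_alloc_page(void *private);
static void my_dev_copy_to_device(struct page *dpage, struct page *spage);
static void my_dev_clear_page(struct page *dpage);

static void my_dev_alloc_and_copy(struct vm_area_struct *vma,
				  const unsigned long *src,
				  unsigned long *dst,
				  unsigned long start,
				  unsigned long end,
				  void *private)
{
	const unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long i;

	for (i = 0; i < npages; i++) {
		struct page *dpage;

		/* Core code refused this entry (pinned, file backed, ...). */
		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = my_dev_alloc_page(private);
		if (!dpage)
			continue;	/* dst[i] stays 0, entry is not migrated */
		lock_page(dpage);

		if (src[i] & MIGRATE_PFN_VALID)
			my_dev_copy_to_device(dpage, migrate_pfn_to_page(src[i]));
		else
			my_dev_clear_page(dpage);	/* empty pte or zero page */

		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		if (is_zone_device_page(dpage))
			dst[i] |= MIGRATE_PFN_DEVICE;
	}
}

The callback is wired up through a struct migrate_vma_ops together with a
finalize_and_map() callback, and migrate_vma() is then given src/dst arrays of
(end - start) >> PAGE_SHIFT entries; whether a given entry really migrated
still has to be re-checked in finalize_and_map(), as the updated comment in
include/linux/migrate.h below also stresses.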
Link: http://lkml.kernel.org/r/20170817000548.32038-18-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++ mm/migrate.c | 205 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 205 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8dc8f0a3f1af..d4e6d12a0b40 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -218,6 +218,15 @@ static inline unsigned long migrate_pfn(unsigned long pfn) * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an * unrecoverable state. * + * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing device driver to allocate device memory for those unback virtual + * address. For this the device driver simply have to allocate device memory + * and properly set the destination entry like for regular migration. Note that + * this can still fails and thus inside the device driver must check if the + * migration was successful for those entry inside the finalize_and_map() + * callback just like for regular migration. + * * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES * OR BAD THINGS WILL HAPPEN ! * diff --git a/mm/migrate.c b/mm/migrate.c index 77cb2fef08ea..e581253ef330 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -2140,6 +2141,22 @@ static int migrate_vma_collect_hole(unsigned long start, struct migrate_vma *migrate = walk->private; unsigned long addr; + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = 0; @@ -2178,7 +2195,7 @@ again: spin_unlock(ptl); split_huge_pmd(vma, pmdp, addr); if (pmd_trans_unstable(pmdp)) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_skip(start, end, walk); } else { int ret; @@ -2186,19 +2203,22 @@ again: get_page(page); spin_unlock(ptl); if (unlikely(!trylock_page(page))) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_skip(start, end, walk); ret = split_huge_page(page); unlock_page(page); put_page(page); - if (ret || pmd_none(*pmdp)) + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) return migrate_vma_collect_hole(start, end, walk); } } if (unlikely(pmd_bad(*pmdp))) - return migrate_vma_collect_hole(start, end, walk); + return migrate_vma_collect_skip(start, end, walk); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -2213,7 +2233,9 @@ again: pfn = pte_pfn(pte); if (pte_none(pte)) { - mpfn = pfn = 0; + mpfn = 
MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; goto next; } @@ -2235,6 +2257,12 @@ again: if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } page = vm_normal_page(migrate->vma, addr, pte); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; @@ -2554,6 +2582,135 @@ restore: } } +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + unsigned long *dst) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + struct mem_cgroup *memcg; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). + */ + if (pte_alloc(mm, pmdp, addr)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_zone_device_page(page) && is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else { + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + flush = true; + } else if (!pte_none(*ptep)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + /* + * Check for usefaultfd but do not deliver the fault. Instead, + * just back off. 
+ */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); + if (!is_zone_device_page(page)) + lru_cache_add_active_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + /* * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information @@ -2566,7 +2723,10 @@ static void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; - unsigned long addr, i; + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr, i, mmu_start; + bool notified = false; for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); @@ -2574,10 +2734,27 @@ static void migrate_vma_pages(struct migrate_vma *migrate) struct address_space *mapping; int r; - if (!page || !newpage) + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + continue; + } + if (!notified) { + mmu_start = addr; + notified = true; + mmu_notifier_invalidate_range_start(mm, + mmu_start, + migrate->end); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i], + &migrate->dst[i]); continue; + } mapping = page_mapping(page); @@ -2605,6 +2782,10 @@ static void migrate_vma_pages(struct migrate_vma *migrate) if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } + + if (notified) + mmu_notifier_invalidate_range_end(mm, mmu_start, + migrate->end); } /* @@ -2627,8 +2808,14 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); struct page *page = migrate_pfn_to_page(migrate->src[i]); - if (!page) + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } continue; + } + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { if (newpage) { unlock_page(newpage); -- cgit v1.2.3-71-gd317 From df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:24 -0700 Subject: mm/device-public-memory: device memory cache coherent with CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform with advance system bus (like CAPI or CCIX) allow device memory to be accessible from CPU in a cache coherent fashion. Add a new type of ZONE_DEVICE to represent such memory. The use case are the same as for the un-addressable device memory but without all the corners cases. Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Paul E. 
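As a rough illustration of the distinction this patch introduces (the my_*
names below are invented for the example and are not part of the patch):
device private memory is unaddressable and is kept behind special swap
entries, whereas device public memory is cache coherent, is mapped with
regular (devmap) ptes, and must never be pinned so that it can always be
evicted.

#include <linux/memremap.h>
#include <linux/mm.h>

enum my_dev_mem_kind {		/* hypothetical classification for the example */
	MY_MEM_NORMAL,
	MY_MEM_DEVICE_PRIVATE,
	MY_MEM_DEVICE_PUBLIC,
};

static enum my_dev_mem_kind my_classify_page(const struct page *page)
{
	if (is_device_private_page(page))
		return MY_MEM_DEVICE_PRIVATE;	/* CPU cannot access; swap-entry ptes */
	if (is_device_public_page(page))
		return MY_MEM_DEVICE_PUBLIC;	/* CPU coherent; normal ptes, never pinned */
	return MY_MEM_NORMAL;			/* system RAM or other ZONE_DEVICE memory */
}

This is the same test that remove_migration_pte() and migrate_vma_pages()
perform in the diff below when deciding whether to install a device private
swap entry or an ordinary devmap pte for the new page.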
McKenney Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: Ross Zwisler Cc: Balbir Singh Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/hmm.h | 4 ++-- include/linux/ioport.h | 1 + include/linux/memremap.h | 21 ++++++++++++++++++ include/linux/mm.h | 20 ++++++++++------- kernel/memremap.c | 8 +++---- mm/Kconfig | 11 ++++++++++ mm/gup.c | 7 ++++++ mm/hmm.c | 4 ++-- mm/madvise.c | 2 +- mm/memcontrol.c | 12 +++++----- mm/memory.c | 46 +++++++++++++++++++++++++++++++++----- mm/migrate.c | 57 ++++++++++++++++++++++++++++++++---------------- mm/swap.c | 11 ++++++++++ 14 files changed, 159 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90ab657f8e56..281880c7e694 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1267,7 +1267,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; - page = vm_normal_page(vma, addr, pte); + page = _vm_normal_page(vma, addr, pte, true); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 67a03b20a2db..6d3b0b4fed4e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, @@ -494,7 +494,7 @@ struct hmm_device { */ struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ /* Below are for HMM internal use only! Not to be used by device driver! */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 3a4f69137bc2..f5cf32e80041 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -131,6 +131,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, + IORES_DESC_DEVICE_PUBLIC_MEMORY = 7, }; /* helpers to define resources */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8aa6b82679e2..f8ee1c73ad2d 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.txt. + * + * MEMORY_DEVICE_PUBLIC: + * Device memory that is cache coherent from device and CPU point of view. This + * is use on platform that have an advance system bus (like CAPI or CCIX). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allow to pin such memory so that it can always be evicted. 
*/ enum memory_type { MEMORY_DEVICE_HOST = 0, MEMORY_DEVICE_PRIVATE, + MEMORY_DEVICE_PUBLIC, }; /* @@ -92,6 +100,8 @@ enum memory_type { * The page_free() callback is called once the page refcount reaches 1 * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. * This allows the device driver to implement its own memory management.) + * + * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter. */ typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, unsigned long addr, @@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page) return is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } + +static inline bool is_device_public_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PUBLIC; +} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page) { return false; } + +static inline bool is_device_public_page(const struct page *page) +{ + return false; +} #endif /** diff --git a/include/linux/mm.h b/include/linux/mm.h index eccdab4bb44a..de66a1127db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -800,15 +800,16 @@ static inline bool is_zone_device_page(const struct page *page) } #endif -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page); +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page); #else -static inline void put_zone_device_private_page(struct page *page) +static inline void put_zone_device_private_or_public_page(struct page *page) { } -#endif +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ static inline bool is_device_private_page(const struct page *page); +static inline bool is_device_public_page(const struct page *page); DECLARE_STATIC_KEY_FALSE(device_private_key); @@ -834,8 +835,9 @@ static inline void put_page(struct page *page) * include/linux/memremap.h and HMM for details. 
*/ if (static_branch_unlikely(&device_private_key) && - unlikely(is_device_private_page(page))) { - put_zone_device_private_page(page); + unlikely(is_device_private_page(page) || + is_device_public_page(page))) { + put_zone_device_private_or_public_page(page); return; } @@ -1224,8 +1226,10 @@ struct zap_details { pgoff_t last_index; /* Highest page->index to unmap */ }; -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte); +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device); +#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) + struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); diff --git a/kernel/memremap.c b/kernel/memremap.c index ea0e18a2a5f2..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) #endif /* CONFIG_ZONE_DEVICE */ -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) { int count = page_ref_dec_return(page); @@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_page); -#endif /* CONFIG_DEVICE_PRIVATE */ +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/Kconfig b/mm/Kconfig index ec27855db133..7bea16697d87 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -720,12 +720,23 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ARCH_HAS_HMM + select HMM help Allows creation of struct pages to represent unaddressable device memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool diff --git a/mm/gup.c b/mm/gup.c index 76fd199aaae2..b2b4d4263768 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). 
+ */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: diff --git a/mm/hmm.c b/mm/hmm.c index c9d23ef80552..b31d56662202 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, unsigned long addr) { @@ -1177,4 +1177,4 @@ static int __init hmm_init(void) } device_initcall(hmm_init); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/madvise.c b/mm/madvise.c index eea1c733286f..21261ff0466f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8aa98f9bc723..126a939b600a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4623,10 +4623,11 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). For now we such page is - * charge like a regular page would be as for all intent and purposes it is - * just special memory taking the place of a regular page. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. * * See Documentations/vm/hmm.txt and include/linux/hmm.h * @@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_public_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory.c b/mm/memory.c index 079eeac0b009..ad0ea1af1f44 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). 
+ * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1267,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to diff --git a/mm/migrate.c b/mm/migrate.c index e581253ef330..618aeb5e9cde 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -239,10 +240,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - if (unlikely(is_zone_device_page(new)) && - is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } } else flush_dcache_page(new); @@ -437,12 +442,11 @@ int migrate_page_move_mapping(struct address_space *mapping, void **pslot; /* - * ZONE_DEVICE pages have 1 refcount always held by their device - * - * Note that DAX memory will never reach that point as it does not have - * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h). + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. */ - expected_count += is_zone_device_page(page); + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); if (!mapping) { /* Anonymous page without mapping */ @@ -2123,7 +2127,6 @@ out_unlock: #endif /* CONFIG_NUMA */ - struct migrate_vma { struct vm_area_struct *vma; unsigned long *dst; @@ -2263,7 +2266,7 @@ again: pfn = 0; goto next; } - page = vm_normal_page(migrate->vma, addr, pte); + page = _vm_normal_page(migrate->vma, addr, pte, true); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2406,10 +2409,19 @@ static bool migrate_vma_check_page(struct page *page) if (is_device_private_page(page)) return true; - /* Other ZONE_DEVICE memory type are not supported */ - return false; + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. 
+ */ + if (!is_device_public_page(page)) + return false; + extra++; } + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + if ((page_count(page) - extra) > page_mapcount(page)) return false; @@ -2647,11 +2659,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page) && is_device_private_page(page)) { - swp_entry_t swp_entry; - - swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); - entry = swp_entry_to_pte(swp_entry); + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -2768,7 +2787,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else { + } else if (!is_device_public_page(newpage)) { /* * Other types of ZONE_DEVICE page are not * supported. diff --git a/mm/swap.c b/mm/swap.c index 62d96b8e5eb3..9295ae960d66 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold) if (is_huge_zero_page(page)) continue; + /* Device public page can not be huge page */ + if (is_device_public_page(page)) { + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, + flags); + locked_pgdat = NULL; + } + put_zone_device_private_or_public_page(page); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; -- cgit v1.2.3-71-gd317 From d3df0a423397c9a1ae05c3857e8c04240dd85e68 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:28 -0700 Subject: mm/hmm: add new helper to hotplug CDM memory region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike unaddressable memory, coherent device memory has a real resource associated with it on the system (as CPU can address it). Add a new helper to hotplug such memory within the HMM framework. Link: http://lkml.kernel.org/r/20170817000548.32038-20-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Balbir Singh Cc: Aneesh Kumar Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 3 ++ mm/hmm.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 86 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 6d3b0b4fed4e..8385e75356ca 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -443,6 +443,9 @@ struct hmm_devmem { struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, struct device *device, unsigned long size); +struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, + struct device *device, + struct resource *res); void hmm_devmem_remove(struct hmm_devmem *devmem); /* diff --git a/mm/hmm.c b/mm/hmm.c index b31d56662202..bdb49b836bf2 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -839,7 +839,11 @@ static void hmm_devmem_release(struct device *dev, void *data) zone = page_zone(page); mem_hotplug_begin(); - __remove_pages(zone, start_pfn, npages); + if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) + __remove_pages(zone, start_pfn, npages); + else + arch_remove_memory(start_pfn << PAGE_SHIFT, + npages << PAGE_SHIFT); mem_hotplug_done(); hmm_devmem_radix_release(resource); @@ -875,7 +879,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) if (is_ram == REGION_INTERSECTS) return -ENXIO; - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) + devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; + else + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.res = devmem->resource; devmem->pagemap.page_fault = hmm_devmem_fault; devmem->pagemap.page_free = hmm_devmem_free; @@ -920,9 +928,15 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) * over the device memory is un-accessible thus we do not want to * create a linear mapping for the memory like arch_add_memory() * would do. + * + * For device public memory, which is accesible by the CPU, we do + * want the linear mapping and thus use arch_add_memory(). 
*/ - ret = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, false); + if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) + ret = arch_add_memory(nid, align_start, align_size, false); + else + ret = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, false); if (ret) { mem_hotplug_done(); goto error_add_memory; @@ -1069,6 +1083,67 @@ error_percpu_ref: } EXPORT_SYMBOL(hmm_devmem_add); +struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, + struct device *device, + struct resource *res) +{ + struct hmm_devmem *devmem; + int ret; + + if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) + return ERR_PTR(-EINVAL); + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = res; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_devm_add_action; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add_resource); + /* * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) * @@ -1082,6 +1157,7 @@ void hmm_devmem_remove(struct hmm_devmem *devmem) { resource_size_t start, size; struct device *device; + bool cdm = false; if (!devmem) return; @@ -1090,11 +1166,13 @@ void hmm_devmem_remove(struct hmm_devmem *devmem) start = devmem->resource->start; size = resource_size(devmem->resource); + cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; hmm_devmem_ref_kill(&devmem->ref); hmm_devmem_ref_exit(&devmem->ref); hmm_devmem_pages_remove(devmem); - devm_release_mem_region(device, start, size); + if (!cdm) + devm_release_mem_region(device, start, size); } EXPORT_SYMBOL(hmm_devmem_remove); -- cgit v1.2.3-71-gd317 From 6b368cd4a44ce95b33f1d31f2f932e6ae707f319 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:32 -0700 Subject: mm/hmm: avoid bloating arch that do not make use of HMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves all new code including new page migration helper behind kernel Kconfig option so that there is no codee bloat for arch or user that do not want to use HMM or any of its associated features. 
arm allyesconfig (without all the patchset, then with and this patch): text data bss dec hex filename 83721896 46511131 27582964 157815991 96814b7 ../without/vmlinux 83722364 46511131 27582964 157816459 968168b vmlinux [jglisse@redhat.com: struct hmm is only use by HMM mirror functionality] Link: http://lkml.kernel.org/r/20170825213133.27286-1-jglisse@redhat.com [sfr@canb.auug.org.au: fix build (arm multi_v7_defconfig)] Link: http://lkml.kernel.org/r/20170828181849.323ab81b@canb.auug.org.au Link: http://lkml.kernel.org/r/20170818032858.7447-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Stephen Rothwell Cc: Dan Williams Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 9 ++++++--- include/linux/memremap.h | 22 +++++++--------------- include/linux/migrate.h | 13 +++++++++++++ include/linux/mm.h | 26 +++++++++++++++++--------- mm/Kconfig | 4 ++++ mm/Makefile | 3 ++- mm/hmm.c | 7 +++---- mm/migrate.c | 2 ++ 8 files changed, 54 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 8385e75356ca..28e14345bd8d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -501,18 +501,21 @@ void hmm_device_put(struct hmm_device *hmm_device); /* Below are for HMM internal use only! Not to be used by device driver! */ +#if IS_ENABLED(CONFIG_HMM_MIRROR) void hmm_mm_destroy(struct mm_struct *mm); static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } +#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#else /* IS_ENABLED(CONFIG_HMM) */ -/* Below are for HMM internal use only! Not to be used by device driver! 
*/ +#else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} - #endif /* IS_ENABLED(CONFIG_HMM) */ #endif /* LINUX_HMM_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8ee1c73ad2d..79f8ba7c3894 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -138,18 +138,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct dev_pagemap *find_dev_pagemap(resource_size_t phys); static inline bool is_zone_device_page(const struct page *page); - -static inline bool is_device_private_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool is_device_public_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PUBLIC; -} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -168,17 +156,21 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { return NULL; } +#endif +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) static inline bool is_device_private_page(const struct page *page) { - return false; + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_device_public_page(const struct page *page) { - return false; + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PUBLIC; } -#endif +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn diff --git a/include/linux/migrate.h b/include/linux/migrate.h index d4e6d12a0b40..643c7ae7d7b4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -265,6 +265,7 @@ struct migrate_vma_ops { void *private); }; +#if defined(CONFIG_MIGRATE_VMA_HELPER) int migrate_vma(const struct migrate_vma_ops *ops, struct vm_area_struct *vma, unsigned long start, @@ -272,6 +273,18 @@ int migrate_vma(const struct migrate_vma_ops *ops, unsigned long *src, unsigned long *dst, void *private); +#else +static inline int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + return -EINVAL; +} +#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */ #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/mm.h b/include/linux/mm.h index de66a1127db4..5195e272fc4a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -800,18 +800,27 @@ static inline bool is_zone_device_page(const struct page *page) } #endif -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) void put_zone_device_private_or_public_page(struct page *page); -#else +DECLARE_STATIC_KEY_FALSE(device_private_key); +#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key) +static inline bool is_device_private_page(const struct page *page); +static inline bool is_device_public_page(const struct page *page); +#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ static inline void put_zone_device_private_or_public_page(struct page *page) { } +#define IS_HMM_ENABLED 0 +static inline bool is_device_private_page(const struct page *page) +{ + return false; +} +static inline bool is_device_public_page(const struct page *page) +{ + return 
false; +} #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -static inline bool is_device_private_page(const struct page *page); -static inline bool is_device_public_page(const struct page *page); - -DECLARE_STATIC_KEY_FALSE(device_private_key); static inline void get_page(struct page *page) { @@ -834,9 +843,8 @@ static inline void put_page(struct page *page) * free and we need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (static_branch_unlikely(&device_private_key) && - unlikely(is_device_private_page(page) || - is_device_public_page(page))) { + if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) || + unlikely(is_device_public_page(page)))) { put_zone_device_private_or_public_page(page); return; } diff --git a/mm/Kconfig b/mm/Kconfig index 7bea16697d87..9c4bdddd80c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -702,8 +702,12 @@ config ARCH_HAS_HMM depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP +config MIGRATE_VMA_HELPER + bool + config HMM bool + select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" diff --git a/mm/Makefile b/mm/Makefile index 1cde2a8bed97..e3ac3aeb533b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ - debug.o hmm.o $(mmu-y) + debug.o $(mmu-y) obj-y += init-mm.o @@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o +obj-$(CONFIG_HMM) += hmm.o diff --git a/mm/hmm.c b/mm/hmm.c index bdb49b836bf2..a88a847bccba 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -35,15 +35,16 @@ #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) - +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) /* * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h */ DEFINE_STATIC_KEY_FALSE(device_private_key); EXPORT_SYMBOL(device_private_key); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -#ifdef CONFIG_HMM +#if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; /* @@ -128,9 +129,7 @@ void hmm_mm_destroy(struct mm_struct *mm) { kfree(mm->hmm); } -#endif /* CONFIG_HMM */ -#if IS_ENABLED(CONFIG_HMM_MIRROR) static void hmm_invalidate_range(struct hmm *hmm, enum hmm_update_type action, unsigned long start, diff --git a/mm/migrate.c b/mm/migrate.c index 618aeb5e9cde..6954c1435833 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2127,6 +2127,7 @@ out_unlock: #endif /* CONFIG_NUMA */ +#if defined(CONFIG_MIGRATE_VMA_HELPER) struct migrate_vma { struct vm_area_struct *vma; unsigned long *dst; @@ -2980,3 +2981,4 @@ int migrate_vma(const struct migrate_vma_ops *ops, return 0; } EXPORT_SYMBOL(migrate_vma); +#endif /* defined(MIGRATE_VMA_HELPER) */ -- cgit v1.2.3-71-gd317 From de540a9763efcd5b6339158ac2e5932fb3e691b9 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:35 -0700 Subject: mm/hmm: fix build when HMM is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combinatorial Kconfig is painfull. Withi this patch all below combination build. 
1) 2) CONFIG_HMM_MIRROR=y 3) CONFIG_DEVICE_PRIVATE=y 4) CONFIG_DEVICE_PUBLIC=y 5) CONFIG_HMM_MIRROR=y CONFIG_DEVICE_PUBLIC=y 6) CONFIG_HMM_MIRROR=y CONFIG_DEVICE_PRIVATE=y 7) CONFIG_DEVICE_PRIVATE=y CONFIG_DEVICE_PUBLIC=y 8) CONFIG_HMM_MIRROR=y CONFIG_DEVICE_PRIVATE=y CONFIG_DEVICE_PUBLIC=y Link: http://lkml.kernel.org/r/20170826002149.20919-1-jglisse@redhat.com Reported-by: Randy Dunlap Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 28e14345bd8d..96e69979f84d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -498,7 +498,7 @@ struct hmm_device { struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - +#endif /* IS_ENABLED(CONFIG_HMM) */ /* Below are for HMM internal use only! Not to be used by device driver! */ #if IS_ENABLED(CONFIG_HMM_MIRROR) @@ -517,5 +517,4 @@ static inline void hmm_mm_init(struct mm_struct *mm) {} #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM) */ #endif /* LINUX_HMM_H */ -- cgit v1.2.3-71-gd317 From 3a321d2a3dde812142e06ab5c2f062ed860182a5 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Fri, 8 Sep 2017 16:12:48 -0700 Subject: mm: change the call sites of numa statistics items Patch series "Separate NUMA statistics from zone statistics", v2. Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. This significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (pointed out by Dave Hansen). A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017-JesperBrouer.pdf To mitigate this overhead, this patchset separates NUMA statistics from zone statistics framework, and update NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter (suggested by Ying Huang). The rationality is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 31.3% drop of CPU cycles(537-->369, see below) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Meanwhile, this patchset keeps the same style of virtual memory statistics with little end-user-visible effects (only move the numa stats to show behind zone page stats, see the first patch for details). I did an experiment of single page allocation and reclaim concurrently using Jesper's page_bench03 benchmark on a 2-Socket Broadwell-based server (88 processors with 126G memory) with different size of threshold of pcp counter. 
Benchmark provided by Jesper D Brouer(increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cycles Throughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default 256 468 412397590 512 428 450550704 4096 399 482520943 20000 394 489009617 30000 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics This patch (of 3): In this patch, NUMA statistics is separated from zone statistics framework, all the call sites of NUMA stats are changed to use numa-stats-specific functions, it does not have any functionality change except that the number of NUMA stats is shown behind zone page stats when users *read* the zone info. E.g. cat /proc/zoneinfo ***Base*** ***With this patch*** nr_free_pages 3976 nr_free_pages 3976 nr_zone_inactive_anon 0 nr_zone_inactive_anon 0 nr_zone_active_anon 0 nr_zone_active_anon 0 nr_zone_inactive_file 0 nr_zone_inactive_file 0 nr_zone_active_file 0 nr_zone_active_file 0 nr_zone_unevictable 0 nr_zone_unevictable 0 nr_zone_write_pending 0 nr_zone_write_pending 0 nr_mlock 0 nr_mlock 0 nr_page_table_pages 0 nr_page_table_pages 0 nr_kernel_stack 0 nr_kernel_stack 0 nr_bounce 0 nr_bounce 0 nr_zspages 0 nr_zspages 0 numa_hit 0 *nr_free_cma 0* numa_miss 0 numa_hit 0 numa_foreign 0 numa_miss 0 numa_interleave 0 numa_foreign 0 numa_local 0 numa_interleave 0 numa_other 0 numa_local 0 *nr_free_cma 0* numa_other 0 ... ... vm stats threshold: 10 vm stats threshold: 10 ... ... The next patch updates the numa stats counter size and threshold. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1503568801-21305-2-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Reported-by: Jesper Dangaard Brouer Acked-by: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Christopher Lameter Cc: Dave Hansen Cc: Andi Kleen Cc: Ying Huang Cc: Aaron Lu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 22 ++++--- include/linux/mmzone.h | 25 +++++--- include/linux/vmstat.h | 29 ++++++++- mm/page_alloc.c | 10 +-- mm/vmstat.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 220 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index d8dc83017d8d..3855902f2c5b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_node_page_state(dev->id, NUMA_HIT), - sum_zone_node_page_state(dev->id, NUMA_MISS), - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_node_page_state(dev->id, NUMA_LOCAL), - sum_zone_node_page_state(dev->id, NUMA_OTHER)); + sum_zone_numa_state(dev->id, NUMA_HIT), + sum_zone_numa_state(dev->id, NUMA_MISS), + sum_zone_numa_state(dev->id, NUMA_FOREIGN), + sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_numa_state(dev->id, NUMA_LOCAL), + sum_zone_numa_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; 
i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + sum_zone_numa_state(nid, i)); +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + + NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e7e92c8f4883..e65d91c02e30 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,20 @@ struct zone_padding { #define ZONE_PADDING(name) #endif +#ifdef CONFIG_NUMA +enum numa_stat_item { + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ + NUMA_LOCAL, /* allocation from local node */ + NUMA_OTHER, /* allocation from other node */ + NR_VM_NUMA_STAT_ITEMS +}; +#else +#define NR_VM_NUMA_STAT_ITEMS 0 +#endif + enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, @@ -131,14 +145,6 @@ enum zone_stat_item { NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ -#endif -#ifdef CONFIG_NUMA - NUMA_HIT, /* allocated in intended node */ - NUMA_MISS, /* allocated in non intended node */ - NUMA_FOREIGN, /* was intended here, hit elsewhere */ - NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ - NUMA_LOCAL, /* allocation from local node */ - NUMA_OTHER, /* allocation from other node */ #endif NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; @@ -276,6 +282,8 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; + s8 numa_stat_threshold; + s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; @@ -496,6 +504,7 @@ struct zone { ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; + atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 97e11ab573f0..9ac82e29948f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -107,8 +107,33 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with per cpu differentials. 
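The split NUMA counters reuse the same per-cpu differential idea as the existing zone counters: each CPU batches small deltas privately and only folds them into the shared atomic once a threshold is crossed. A minimal single-threaded user-space sketch of that scheme (names and the threshold are made up for illustration; this is not kernel code):

	#include <stdio.h>

	#define NCPUS		4
	#define THRESHOLD	32	/* fold into the shared counter past this */

	static long global_numa_hit;	/* plays the role of vm_numa_stat[NUMA_HIT] */
	static int cpu_diff[NCPUS];	/* plays the role of vm_numa_stat_diff[] */

	static void inc_numa_hit(int cpu)
	{
		if (++cpu_diff[cpu] > THRESHOLD) {	/* rarely taken slow path */
			global_numa_hit += cpu_diff[cpu];
			cpu_diff[cpu] = 0;
		}
	}

	int main(void)
	{
		for (int i = 0; i < 1000; i++)
			inc_numa_hit(i % NCPUS);
		printf("folded numa_hit: %ld (plus whatever is still per-cpu)\n",
		       global_numa_hit);
		return 0;
	}

The writer fast path touches only CPU-local memory; the shared counter, which is what bounces cache lines between sockets, is updated roughly once per THRESHOLD events.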
*/ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; +extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; +#ifdef CONFIG_NUMA +static inline void zone_numa_state_add(long x, struct zone *zone, + enum numa_stat_item item) +{ + atomic_long_add(x, &zone->vm_numa_stat[item]); + atomic_long_add(x, &vm_numa_stat[item]); +} + +static inline unsigned long global_numa_state(enum numa_stat_item item) +{ + long x = atomic_long_read(&vm_numa_stat[item]); + + return x; +} + +static inline unsigned long zone_numa_state(struct zone *zone, + enum numa_stat_item item) +{ + long x = atomic_long_read(&zone->vm_numa_stat[item]); + + return x; +} +#endif /* CONFIG_NUMA */ + static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { @@ -194,8 +219,10 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, #ifdef CONFIG_NUMA +extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, - enum zone_stat_item item); + enum zone_stat_item item); +extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); #else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9add06fe768..45583cd8dd56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order) static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - enum zone_stat_item local_stat = NUMA_LOCAL; + enum numa_stat_item local_stat = NUMA_LOCAL; if (z->node != numa_node_id()) local_stat = NUMA_OTHER; if (z->node == preferred_zone->node) - __inc_zone_state(z, NUMA_HIT); + __inc_numa_state(z, NUMA_HIT); else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); + __inc_numa_state(z, NUMA_MISS); + __inc_numa_state(preferred_zone, NUMA_FOREIGN); } - __inc_zone_state(z, local_stat); + __inc_numa_state(z, local_stat); #endif } diff --git a/mm/vmstat.c b/mm/vmstat.c index c7e4b8458023..daea02833e2e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -87,8 +87,10 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -192,7 +194,10 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; - +#ifdef CONFIG_NUMA + per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold + = threshold; +#endif /* Base nodestat threshold on the largest populated zone. */ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -226,9 +231,14 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; +#ifdef CONFIG_NUMA + per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold + = threshold; +#endif + } } } @@ -604,6 +614,32 @@ EXPORT_SYMBOL(dec_node_page_state); * Fold a differential into the global counters. 
* Returns the number of counters updated. */ +#ifdef CONFIG_NUMA +static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) +{ + int i; + int changes = 0; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (numa_diff[i]) { + atomic_long_add(numa_diff[i], &vm_numa_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); + changes++; + } + return changes; +} +#else static int fold_diff(int *zone_diff, int *node_diff) { int i; @@ -622,6 +658,7 @@ static int fold_diff(int *zone_diff, int *node_diff) } return changes; } +#endif /* CONFIG_NUMA */ /* * Update the zone counters for the current cpu. @@ -645,6 +682,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; @@ -666,6 +706,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } #ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); + if (v) { + + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + __this_cpu_write(p->expire, 3); + } + } + if (do_pagesets) { cond_resched(); /* @@ -712,7 +764,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } +#ifdef CONFIG_NUMA + changes += fold_diff(global_zone_diff, global_numa_diff, + global_node_diff); +#else changes += fold_diff(global_zone_diff, global_node_diff); +#endif return changes; } @@ -727,6 +784,9 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { @@ -743,6 +803,18 @@ void cpu_vm_stats_fold(int cpu) atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (p->vm_numa_stat_diff[i]) { + int v; + + v = p->vm_numa_stat_diff[i]; + p->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + } +#endif } for_each_online_pgdat(pgdat) { @@ -761,7 +833,11 @@ void cpu_vm_stats_fold(int cpu) } } +#ifdef CONFIG_NUMA + fold_diff(global_zone_diff, global_numa_diff, global_node_diff); +#else fold_diff(global_zone_diff, global_node_diff); +#endif } /* @@ -779,10 +855,38 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]); } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (pset->vm_numa_stat_diff[i]) { + int v = pset->vm_numa_stat_diff[i]; + + pset->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + atomic_long_add(v, &vm_numa_stat[i]); + } +#endif } #endif #ifdef CONFIG_NUMA +void __inc_numa_state(struct zone *zone, + enum numa_stat_item item) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_numa_stat_diff + item; + s8 v, t; + + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->numa_stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; + + zone_numa_state_add(v + overstep, zone, 
item); + __this_cpu_write(*p, -overstep); + } +} + /* * Determine the per node value of a stat item. This function * is called frequently in a NUMA machine, so try to be as @@ -801,6 +905,19 @@ unsigned long sum_zone_node_page_state(int node, return count; } +unsigned long sum_zone_numa_state(int node, + enum numa_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_numa_state(zones + i, item); + + return count; +} + /* * Determine the per node value of a stat item. */ @@ -937,6 +1054,9 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif + "nr_free_cma", + + /* enum numa_stat_item counters */ #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -945,7 +1065,6 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif - "nr_free_cma", /* Node-based counters */ "nr_inactive_anon", @@ -1106,7 +1225,6 @@ const char * const vmstat_text[] = { }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ - #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_PROC_FS) static void *frag_start(struct seq_file *m, loff_t *pos) @@ -1384,7 +1502,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + + NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); } } @@ -1421,6 +1540,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + zone_numa_state(zone, i)); +#endif + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1497,6 +1623,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); @@ -1512,6 +1639,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + v[i] = global_numa_state(i); + v += NR_VM_NUMA_STAT_ITEMS; +#endif + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) v[i] = global_node_page_state(i); v += NR_VM_NODE_STAT_ITEMS; @@ -1613,6 +1746,16 @@ int vmstat_refresh(struct ctl_table *table, int write, err = -EINVAL; } } +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_numa_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); + err = -EINVAL; + } + } +#endif if (err) return err; if (write) @@ -1654,13 +1797,19 @@ static bool need_update(int cpu) struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); +#ifdef CONFIG_NUMA + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1); +#endif /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. 
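The "fast way of checking" mentioned in the comment above relies on memchr_inv(), which returns a pointer to the first byte that differs from the given value, or NULL when the whole range matches, so a single call tells whether any per-cpu diff is still pending. A user-space model with a simplified stand-in for memchr_inv() (illustrative only, not the kernel implementation):

	#include <stddef.h>
	#include <stdio.h>

	/* Simplified model of the kernel's memchr_inv(): return the first byte
	 * that is *not* equal to c, or NULL if every byte matches. */
	static const void *memchr_inv_model(const void *start, int c, size_t bytes)
	{
		const unsigned char *p = start;

		for (size_t i = 0; i < bytes; i++)
			if (p[i] != (unsigned char)c)
				return p + i;
		return NULL;
	}

	int main(void)
	{
		signed char vm_stat_diff[8] = { 0 };	/* stands in for p->vm_stat_diff[] */

		printf("dirty? %s\n",
		       memchr_inv_model(vm_stat_diff, 0, sizeof(vm_stat_diff)) ? "yes" : "no");
		vm_stat_diff[5] = -3;
		printf("dirty? %s\n",
		       memchr_inv_model(vm_stat_diff, 0, sizeof(vm_stat_diff)) ? "yes" : "no");
		return 0;
	}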
*/ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) return true; - +#ifdef CONFIG_NUMA + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + return true; +#endif } return false; } -- cgit v1.2.3-71-gd317 From 1d90ca897cb05cf38bd62f36756d219e02913b7d Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Fri, 8 Sep 2017 16:12:52 -0700 Subject: mm: update NUMA counter threshold size There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (suggested by Dave Hansen). This patch updates NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter(suggested by Ying Huang). The rationality is that these statistics counters don't affect the kernel's decision, unlike other VM counters, so it's not a problem to use a large threshold. With this patchset, we see 31.3% drop of CPU cycles(537-->369) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Brouer(increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Threshold CPU cycles Throughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default (base) 256 468 412397590 512 428 450550704 4096 399 482520943 20000 394 489009617 30000 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Link: http://lkml.kernel.org/r/1503568801-21305-3-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Mel Gorman Cc: Aaron Lu Cc: Andi Kleen Cc: Christopher Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 +-- mm/vmstat.c | 28 ++++++++++------------------ 2 files changed, 11 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e65d91c02e30..356a814e7c8e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -282,8 +282,7 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; diff --git a/mm/vmstat.c b/mm/vmstat.c index daea02833e2e..153d8129c155 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif + /* Base nodestat threshold on the largest populated zone. 
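How much the fixed, near-U16_MAX threshold helps can be estimated with simple arithmetic: one shared-counter update happens roughly every threshold-plus-one local increments, so the new threshold of U16_MAX - 2 means one global update per 65534 events instead of one per ~126 with the default zone threshold of 125 from the table above. A user-space sketch of that calculation (loop count and names are illustrative only):

	#include <stdio.h>
	#include <stdint.h>

	/* Count how often a per-cpu u16 diff would spill into the shared atomic
	 * counter for a given threshold -- these spills are the cache-bouncing
	 * events the patch is trying to minimise. */
	static unsigned long spills(unsigned long increments, unsigned int threshold)
	{
		uint16_t diff = 0;
		unsigned long n = 0;

		while (increments--) {
			if (++diff > threshold) {
				n++;	/* would touch zone->vm_numa_stat and vm_numa_stat */
				diff = 0;
			}
		}
		return n;
	}

	int main(void)
	{
		const unsigned long allocs = 10000000;

		printf("threshold 125:   %lu global updates\n", spills(allocs, 125));
		printf("threshold 65533: %lu global updates\n", spills(allocs, 65533));
		return 0;
	}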
*/ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -874,16 +868,14 @@ void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->numa_stat_threshold); - if (unlikely(v > t)) { - s8 overstep = t >> 1; - zone_numa_state_add(v + overstep, zone, item); - __this_cpu_write(*p, -overstep); + if (unlikely(v > NUMA_STATS_THRESHOLD)) { + zone_numa_state_add(v, zone, item); + __this_cpu_write(*p, 0); } } @@ -1798,7 +1790,7 @@ static bool need_update(int cpu) BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); #ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1); + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif /* * The fast way of checking if there are any vmstat diffs. -- cgit v1.2.3-71-gd317 From 638032224ed762a29baca1fc37f1168efc2554ae Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Fri, 8 Sep 2017 16:12:55 -0700 Subject: mm: consider the number in local CPUs when reading NUMA stats To avoid deviation, the per cpu number of NUMA stats in vm_numa_stat_diff[] is included when a user *reads* the NUMA stats. Since NUMA stats does not be read by users frequently, and kernel does not need it to make a decision, it will not be a problem to make the readers more expensive. Link: http://lkml.kernel.org/r/1503568801-21305-4-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Reported-by: Jesper Dangaard Brouer Acked-by: Mel Gorman Cc: Aaron Lu Cc: Andi Kleen Cc: Christopher Lameter Cc: Dave Hansen Cc: Johannes Weiner Cc: Michal Hocko Cc: Tim Chen Cc: Ying Huang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 6 +++++- mm/vmstat.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9ac82e29948f..ade7cb5f1359 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum numa_stat_item item) return x; } -static inline unsigned long zone_numa_state(struct zone *zone, +static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum numa_stat_item item) { long x = atomic_long_read(&zone->vm_numa_stat[item]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 153d8129c155..4bb13e72ac97 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -897,6 +897,10 @@ unsigned long sum_zone_node_page_state(int node, return count; } +/* + * Determine the per node value of a numa stat item. To avoid deviation, + * the per cpu stat number in vm_numa_stat_diff[] is also included. 
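The read-side trade-off described above can be sketched in user space: the cheap reader only looks at the folded global value, while the snapshot reader also walks every CPU's unfolded residue, which is exactly what makes it both more expensive and more precise (the names below are made up, not the kernel ones):

	#include <stdio.h>

	#define NCPUS 4

	static long           global_numa_hit;		/* folded part */
	static unsigned short cpu_diff[NCPUS];		/* unfolded per-cpu residue */

	/* Cheap reader: may lag behind by up to NCPUS * threshold events. */
	static long numa_hit_fast(void)
	{
		return global_numa_hit;
	}

	/* Snapshot reader: also walks the per-cpu residue, as the new
	 * zone_numa_state_snapshot() does.  Costlier, but only paid on the
	 * rare read side. */
	static long numa_hit_snapshot(void)
	{
		long x = global_numa_hit;

		for (int cpu = 0; cpu < NCPUS; cpu++)
			x += cpu_diff[cpu];
		return x;
	}

	int main(void)
	{
		global_numa_hit = 1000;
		cpu_diff[0] = 7;
		cpu_diff[3] = 2;
		printf("fast: %ld  snapshot: %ld\n", numa_hit_fast(), numa_hit_snapshot());
		return 0;
	}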
+ */ unsigned long sum_zone_numa_state(int node, enum numa_stat_item item) { @@ -905,7 +909,7 @@ unsigned long sum_zone_numa_state(int node, unsigned long count = 0; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state(zones + i, item); + count += zone_numa_state_snapshot(zones + i, item); return count; } @@ -1536,7 +1540,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - zone_numa_state(zone, i)); + zone_numa_state_snapshot(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1792,6 +1796,7 @@ static bool need_update(int cpu) #ifdef CONFIG_NUMA BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif + /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. -- cgit v1.2.3-71-gd317 From 855d97657d4d335970622f0d95ca4966d3f77ee7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:13:38 -0700 Subject: proc: uninline proc_create() Save some code from ~320 invocations all clearing last argument. add/remove: 3/0 grow/shrink: 0/158 up/down: 45/-702 (-657) function old new delta proc_create - 17 +17 __ksymtab_proc_create - 16 +16 __kstrtab_proc_create - 12 +12 yam_init_driver 301 298 -3 ... cifs_proc_init 249 228 -21 via_fb_pci_probe 2304 2280 -24 Link: http://lkml.kernel.org/r/20170819094702.GA27864@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 8 ++++++++ include/linux/proc_fs.h | 8 +------- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e3cda0b5968f..85566abe6f83 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -499,6 +499,14 @@ out: } EXPORT_SYMBOL(proc_create_data); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops) +{ + return proc_create_data(name, mode, parent, proc_fops, NULL); +} +EXPORT_SYMBOL(proc_create); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 2d2bf592d9db..76124dd4e36d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -28,13 +28,7 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t, const struct file_operations *, void *); -static inline struct proc_dir_entry *proc_create( - const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops) -{ - return proc_create_data(name, mode, parent, proc_fops, NULL); -} - +struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); extern void *PDE_DATA(const struct inode *); -- cgit v1.2.3-71-gd317 From 604df322363e5770735df85368f83cac4a955a24 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Sep 2017 16:13:45 -0700 Subject: linux/kernel.h: move DIV_ROUND_DOWN_ULL() macro This macro is useful to avoid link error on 32-bit systems. We have the same definition in two drivers, so move it to include/linux/kernel.h While we are here, refactor DIV_ROUND_UP_ULL() by using DIV_ROUND_DOWN_ULL(). 
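The relationship between the two macros is easy to see with user-space equivalents; the real kernel macros differ only in going through do_div(), which is what avoids the 64-bit division helper, and hence the link error, on 32-bit builds. A small sketch (plain '/' is used here instead of do_div()):

	#include <stdio.h>

	/* User-space equivalents of the two macros for illustration. */
	#define DIV_ROUND_DOWN_ULL(ll, d)  ((unsigned long long)(ll) / (d))
	#define DIV_ROUND_UP_ULL(ll, d) \
		DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d))

	int main(void)
	{
		unsigned long long bytes = 10ULL * 1000 * 1000 * 1000;	/* 10 GB */

		printf("full 4k pages: %llu\n", DIV_ROUND_DOWN_ULL(bytes, 4096));
		printf("pages needed:  %llu\n", DIV_ROUND_UP_ULL(bytes, 4096));
		return 0;
	}

DIV_ROUND_UP_ULL() is then just DIV_ROUND_DOWN_ULL() applied to ll + d - 1, which is the refactoring this patch performs.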
Link: http://lkml.kernel.org/r/1500945156-12907-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Mark Brown Cc: Cyrille Pitchen Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Liam Girdwood Cc: Boris Brezillon Cc: Marek Vasut Cc: Brian Norris Cc: Richard Weinberger Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/nand/denali.c | 3 --- include/linux/kernel.h | 7 +++++-- sound/soc/codecs/pcm512x.c | 3 --- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c index d723be352148..3087b0ba7b7f 100644 --- a/drivers/mtd/nand/denali.c +++ b/drivers/mtd/nand/denali.c @@ -980,9 +980,6 @@ static int denali_erase(struct mtd_info *mtd, int page) return irq_status & INTR__ERASE_COMP ? 0 : NAND_STATUS_FAIL; } -#define DIV_ROUND_DOWN_ULL(ll, d) \ - ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) - static int denali_setup_data_interface(struct mtd_info *mtd, int chipnr, const struct nand_data_interface *conf) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6607225d0ea4..0ad4c3044cf9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -78,8 +78,11 @@ #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP -#define DIV_ROUND_UP_ULL(ll,d) \ - ({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; }) + +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) + +#define DIV_ROUND_UP_ULL(ll, d) DIV_ROUND_DOWN_ULL((ll) + (d) - 1, (d)) #if BITS_PER_LONG == 32 # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d) diff --git a/sound/soc/codecs/pcm512x.c b/sound/soc/codecs/pcm512x.c index f1005a31c709..68feae262476 100644 --- a/sound/soc/codecs/pcm512x.c +++ b/sound/soc/codecs/pcm512x.c @@ -30,9 +30,6 @@ #include "pcm512x.h" -#define DIV_ROUND_DOWN_ULL(ll, d) \ - ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) - #define PCM512x_NUM_SUPPLIES 3 static const char * const pcm512x_supply_names[PCM512x_NUM_SUPPLIES] = { "AVDD", -- cgit v1.2.3-71-gd317 From 3b3c4babd898715926d24ae10aa64778ace33aae Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 8 Sep 2017 16:13:48 -0700 Subject: lib/string.c: add multibyte memset functions Patch series "Multibyte memset variations", v4. A relatively common idiom we're missing is a function to fill an area of memory with a pattern which is larger than a single byte. I first noticed this with a zram patch which wanted to fill a page with an 'unsigned long' value. There turn out to be quite a few places in the kernel which can benefit from using an optimised function rather than a loop; sometimes text size, sometimes speed, and sometimes both. The optimised PowerPC version (not included here) improves performance by about 30% on POWER8 on just the raw memset_l(). Most of the extra lines of code come from the three testcases I added. This patch (of 8): memset16(), memset32() and memset64() are like memset(), but allow the caller to fill the destination with a value larger than a single byte. memset_l() and memset_p() allow the caller to use unsigned long and pointer values respectively. Link: http://lkml.kernel.org/r/20170720184539.31609-2-willy@infradead.org Signed-off-by: Matthew Wilcox Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: "Martin K. 
Petersen" Cc: David Miller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Minchan Kim Cc: Ralf Baechle Cc: Richard Henderson Cc: Russell King Cc: Sam Ravnborg Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 30 +++++++++++++++++++++++ lib/string.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index a467e617eeb0..c8bdafffd2f0 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -99,6 +99,36 @@ extern __kernel_size_t strcspn(const char *,const char *); #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif + +#ifndef __HAVE_ARCH_MEMSET16 +extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); +#endif + +#ifndef __HAVE_ARCH_MEMSET32 +extern void *memset32(uint32_t *, uint32_t, __kernel_size_t); +#endif + +#ifndef __HAVE_ARCH_MEMSET64 +extern void *memset64(uint64_t *, uint64_t, __kernel_size_t); +#endif + +static inline void *memset_l(unsigned long *p, unsigned long v, + __kernel_size_t n) +{ + if (BITS_PER_LONG == 32) + return memset32((uint32_t *)p, v, n); + else + return memset64((uint64_t *)p, v, n); +} + +static inline void *memset_p(void **p, void *v, __kernel_size_t n) +{ + if (BITS_PER_LONG == 32) + return memset32((uint32_t *)p, (uintptr_t)v, n); + else + return memset64((uint64_t *)p, (uintptr_t)v, n); +} + #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif diff --git a/lib/string.c b/lib/string.c index ebbb99c775bd..198148bb61fd 100644 --- a/lib/string.c +++ b/lib/string.c @@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count) } EXPORT_SYMBOL(memzero_explicit); +#ifndef __HAVE_ARCH_MEMSET16 +/** + * memset16() - Fill a memory area with a uint16_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint16_t instead + * of a byte. Remember that @count is the number of uint16_ts to + * store, not the number of bytes. + */ +void *memset16(uint16_t *s, uint16_t v, size_t count) +{ + uint16_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset16); +#endif + +#ifndef __HAVE_ARCH_MEMSET32 +/** + * memset32() - Fill a memory area with a uint32_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint32_t instead + * of a byte. Remember that @count is the number of uint32_ts to + * store, not the number of bytes. + */ +void *memset32(uint32_t *s, uint32_t v, size_t count) +{ + uint32_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset32); +#endif + +#ifndef __HAVE_ARCH_MEMSET64 +/** + * memset64() - Fill a memory area with a uint64_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint64_t instead + * of a byte. Remember that @count is the number of uint64_ts to + * store, not the number of bytes. 
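The one thing callers have to keep in mind is that, unlike memset(), the count is a number of elements rather than bytes. A user-space model of memset16() and a typical use, blanking a row of text-mode cells (local stand-in, not the kernel implementation):

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	/* Local stand-in with the same semantics as the new kernel memset16():
	 * count is the number of uint16_t elements, not bytes. */
	static void *memset16_model(uint16_t *s, uint16_t v, size_t count)
	{
		uint16_t *xs = s;

		while (count--)
			*xs++ = v;
		return s;
	}

	int main(void)
	{
		uint16_t vga_row[80];			/* one row of text-mode cells */
		uint16_t blank = (0x07 << 8) | ' ';	/* grey-on-black space */

		/* Note the element count: passing sizeof(vga_row) would be wrong. */
		memset16_model(vga_row, blank, 80);
		printf("cell 0 = 0x%04x\n", vga_row[0]);
		return 0;
	}

The vga patch later in this series wraps exactly this pattern in scr_memsetw(), dividing its byte count by 2 before calling memset16().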
+ */ +void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + uint64_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset64); +#endif + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another -- cgit v1.2.3-71-gd317 From ac036f9570a2d318b7d8dbbdbf0e269d7cc68cef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 8 Sep 2017 16:14:15 -0700 Subject: vga: optimise console scrolling Where possible, call memset16(), memmove() or memcpy() instead of using open-coded loops. I don't like the calling convention that uses a byte count instead of a count of u16s, but it's a little late to change that. Reduces code size of fbcon.o by almost 400 bytes on my laptop build. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170720184539.31609-9-willy@infradead.org Signed-off-by: Matthew Wilcox Cc: Ralf Baechle Cc: David Miller Cc: Sam Ravnborg Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Minchan Kim Cc: Richard Henderson Cc: Russell King Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/vga.h | 7 +++++++ arch/powerpc/include/asm/vga.h | 8 ++++++++ arch/sparc/include/asm/vga.h | 25 +++++++++++++++++++++++++ include/linux/vt_buffer.h | 13 +++++++++++++ 4 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/arch/mips/include/asm/vga.h b/arch/mips/include/asm/vga.h index f82c83749a08..975ff51f80c4 100644 --- a/arch/mips/include/asm/vga.h +++ b/arch/mips/include/asm/vga.h @@ -6,6 +6,7 @@ #ifndef _ASM_VGA_H #define _ASM_VGA_H +#include #include #include @@ -40,9 +41,15 @@ static inline u16 scr_readw(volatile const u16 *addr) return le16_to_cpu(*addr); } +static inline void scr_memsetw(u16 *s, u16 v, unsigned int count) +{ + memset16(s, cpu_to_le16(v), count / 2); +} + #define scr_memcpyw(d, s, c) memcpy(d, s, c) #define scr_memmovew(d, s, c) memmove(d, s, c) #define VT_BUF_HAVE_MEMCPYW #define VT_BUF_HAVE_MEMMOVEW +#define VT_BUF_HAVE_MEMSETW #endif /* _ASM_VGA_H */ diff --git a/arch/powerpc/include/asm/vga.h b/arch/powerpc/include/asm/vga.h index ab3acd2f2786..7a7b541b7493 100644 --- a/arch/powerpc/include/asm/vga.h +++ b/arch/powerpc/include/asm/vga.h @@ -33,8 +33,16 @@ static inline u16 scr_readw(volatile const u16 *addr) return le16_to_cpu(*addr); } +#define VT_BUF_HAVE_MEMSETW +static inline void scr_memsetw(u16 *s, u16 v, unsigned int n) +{ + memset16(s, cpu_to_le16(v), n / 2); +} + #define VT_BUF_HAVE_MEMCPYW +#define VT_BUF_HAVE_MEMMOVEW #define scr_memcpyw memcpy +#define scr_memmovew memmove #endif /* !CONFIG_VGA_CONSOLE && !CONFIG_MDA_CONSOLE */ diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h index ec0e9967d93d..f54e8b6fb197 100644 --- a/arch/sparc/include/asm/vga.h +++ b/arch/sparc/include/asm/vga.h @@ -8,9 +8,13 @@ #define _LINUX_ASM_VGA_H_ #include +#include #include #define VT_BUF_HAVE_RW +#define VT_BUF_HAVE_MEMSETW +#define VT_BUF_HAVE_MEMCPYW +#define VT_BUF_HAVE_MEMMOVEW #undef scr_writew #undef scr_readw @@ -29,6 +33,27 @@ static inline u16 scr_readw(const u16 *addr) return *addr; } +static inline void scr_memsetw(u16 *p, u16 v, unsigned int n) +{ + BUG_ON((long) p >= 0); + + memset16(p, cpu_to_le16(v), n / 2); +} + +static inline void scr_memcpyw(u16 *d, u16 *s, unsigned int n) +{ + BUG_ON((long) d >= 0); + + memcpy(d, s, n); +} + +static inline void scr_memmovew(u16 *d, u16 *s, unsigned 
int n) +{ + BUG_ON((long) d >= 0); + + memmove(d, s, n); +} + #define VGA_MAP_MEM(x,s) (x) #endif diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h index f38c10ba3ff5..30b6e0d2a942 100644 --- a/include/linux/vt_buffer.h +++ b/include/linux/vt_buffer.h @@ -13,6 +13,7 @@ #ifndef _LINUX_VT_BUFFER_H_ #define _LINUX_VT_BUFFER_H_ +#include #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE) #include @@ -26,24 +27,33 @@ #ifndef VT_BUF_HAVE_MEMSETW static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) { +#ifdef VT_BUF_HAVE_RW count /= 2; while (count--) scr_writew(c, s++); +#else + memset16(s, c, count / 2); +#endif } #endif #ifndef VT_BUF_HAVE_MEMCPYW static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count) { +#ifdef VT_BUF_HAVE_RW count /= 2; while (count--) scr_writew(scr_readw(s++), d++); +#else + memcpy(d, s, count); +#endif } #endif #ifndef VT_BUF_HAVE_MEMMOVEW static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) { +#ifdef VT_BUF_HAVE_RW if (d < s) scr_memcpyw(d, s, count); else { @@ -53,6 +63,9 @@ static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) while (count--) scr_writew(scr_readw(--s), --d); } +#else + memmove(d, s, count); +#endif } #endif -- cgit v1.2.3-71-gd317 From 9b130ad5bb8255ee8534d92d67e12b2a4887eacb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:14:18 -0700 Subject: treewide: make "nr_cpu_ids" unsigned First, number of CPUs can't be negative number. Second, different signnnedness leads to suboptimal code in the following cases: 1) kmalloc(nr_cpu_ids * sizeof(X)); "int" has to be sign extended to size_t. 2) while (loff_t *pos < nr_cpu_ids) MOVSXD is 1 byte longed than the same MOV. Other cases exist as well. Basically compiler is told that nr_cpu_ids can't be negative which can't be deduced if it is "int". Code savings on allyesconfig kernel: -3KB add/remove: 0/0 grow/shrink: 25/264 up/down: 261/-3631 (-3370) function old new delta coretemp_cpu_online 450 512 +62 rcu_init_one 1234 1272 +38 pci_device_probe 374 399 +25 ... 
pgdat_reclaimable_pages 628 556 -72 select_fallback_rq 446 369 -77 task_numa_find_cpu 1923 1807 -116 Link: http://lkml.kernel.org/r/20170819114959.GA30580@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/smp.c | 2 +- arch/powerpc/kernel/paca.c | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/sysdev/xive/native.c | 4 ++-- arch/tile/kernel/setup.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- drivers/base/cpu.c | 4 ++-- drivers/scsi/scsi_debug.c | 2 +- include/linux/cpumask.h | 6 +++--- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/topology.c | 2 +- kernel/smp.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- mm/slub.c | 2 +- 17 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ffe089942ac4..9f7195a5773e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -690,7 +690,7 @@ void __init smp_init_cpus(void) acpi_parse_gic_cpu_interface, 0); if (cpu_count > nr_cpu_ids) - pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n", + pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n", cpu_count, nr_cpu_ids); if (!bootcpu_valid) { diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 70f073d6c3b2..2ff2b8a19f71 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -224,7 +224,7 @@ void __init allocate_pacas(void) paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); memset(paca, 0, paca_size); - printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", + printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n", paca_size, nr_cpu_ids, paca); allocate_lppacas(nr_cpu_ids, limit); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 7de73589d8e2..0ac741fae90e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -551,7 +551,7 @@ void __init smp_setup_cpu_maps(void) if (maxcpus > nr_cpu_ids) { printk(KERN_WARNING "Partition configured for %d cpus, " - "operating system maximum is %d.\n", + "operating system maximum is %u.\n", maxcpus, nr_cpu_ids); maxcpus = nr_cpu_ids; } else diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 44f3a25ca630..ebc244b08d67 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -511,13 +511,13 @@ static bool xive_parse_provisioning(struct device_node *np) static void xive_native_setup_pools(void) { /* Allocate a pool big enough */ - pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids); + pr_debug("XIVE: Allocating VP block for pool size %u\n", nr_cpu_ids); xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n"); - pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n", + pr_debug("XIVE: Pool VPs allocated at 0x%x for %u max CPUs\n", xive_pool_vps, nr_cpu_ids); } diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 443a70bccc1c..6becb96c60a0 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -1200,7 +1200,7 @@ static void __init validate_hv(void) * We use a struct cpumask for this, so it must be big enough. 
*/ if ((smp_height * smp_width) > nr_cpu_ids) - early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %d\n", + early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %u\n", smp_height, smp_width, nr_cpu_ids); #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7834f73efbf1..8315e2f517a7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2097,7 +2097,7 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " "Processor %d/0x%x and the rest are ignored.\n", nr_cpu_ids, nr_logical_cpuids, apicid); return -EINVAL; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 6e8fcb6f7e1e..28dafed6c682 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -168,7 +168,7 @@ void __init setup_per_cpu_areas(void) unsigned long delta; int rc; - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 54b9e89d4d6b..cd6622c3204e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1461,7 +1461,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 2c3b359b3536..321cd7b4d817 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -256,9 +256,9 @@ static ssize_t print_cpus_offline(struct device *dev, buf[n++] = ','; if (nr_cpu_ids == total_cpus-1) - n += snprintf(&buf[n], len - n, "%d", nr_cpu_ids); + n += snprintf(&buf[n], len - n, "%u", nr_cpu_ids); else - n += snprintf(&buf[n], len - n, "%d-%d", + n += snprintf(&buf[n], len - n, "%u-%d", nr_cpu_ids, total_cpus-1); } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 77a0335eb757..09ba494f8896 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -5465,7 +5465,7 @@ static int sdebug_driver_probe(struct device * dev) return error; } if (submit_queues > nr_cpu_ids) { - pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%d\n", + pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%u\n", my_name, submit_queues, nr_cpu_ids); submit_queues = nr_cpu_ids; } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4bf4479a3a80..68c5a8290275 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -32,15 +32,15 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 -#define nr_cpu_ids 1 +#define nr_cpu_ids 1U #else -extern int nr_cpu_ids; +extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. 
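The savings quoted in the changelog come from call sites of the two shapes it lists: allocations sized by nr_cpu_ids and comparisons or loops bounded by it. A user-space model with made-up names (the kernel sites use kmalloc() and the %u printk formats seen in the hunks above); once nr_cpu_ids is unsigned, the multiplication against a sizeof() and the loop bound are already in unsigned arithmetic, so no sign extension is needed:

	#include <stdio.h>
	#include <stdlib.h>

	static unsigned int nr_cpu_ids = 8;	/* stand-in for the kernel variable */

	struct per_cpu_thing { long stat; };

	int main(void)
	{
		struct per_cpu_thing *v = malloc(nr_cpu_ids * sizeof(*v));

		if (!v)
			return 1;
		for (unsigned int cpu = 0; cpu < nr_cpu_ids; cpu++)
			v[cpu].stat = 0;
		printf("allocated stats for %u possible CPUs\n", nr_cpu_ids);
		free(v);
		return 0;
	}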
*/ -#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) +#define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/mm/slub.c b/mm/slub.c index ddb04576b342..d39a5d3834b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4232,7 +4232,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); -- cgit v1.2.3-71-gd317 From e9ef073a0796e46c24f037237291efe56fc28ad9 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 8 Sep 2017 16:14:29 -0700 Subject: include: warn for inconsistent endian config 
definition We have seen some generic code use config parameter CONFIG_CPU_BIG_ENDIAN to decide the endianness. Here are the few examples. include/asm-generic/qrwlock.h drivers/of/base.c drivers/of/fdt.c drivers/tty/serial/earlycon.c drivers/tty/serial/serial_core.c Display warning if CPU_BIG_ENDIAN is not defined on big endian architecture and also warn if it defined on little endian architectures. Here is our original discussion https://lkml.org/lkml/2017/5/24/620 Link: http://lkml.kernel.org/r/1499358861-179979-4-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Suggested-by: Arnd Bergmann Acked-by: Geert Uytterhoeven Cc: "James E.J. Bottomley" Cc: Alexander Viro Cc: David S. Miller Cc: Greg KH Cc: Helge Deller Cc: Ingo Molnar Cc: Jonas Bonn Cc: Max Filippov Cc: Michael Ellerman (powerpc) Cc: Michal Simek Cc: Peter Zijlstra Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/byteorder/big_endian.h | 4 ++++ include/linux/byteorder/little_endian.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h index 392041475c72..ffd215988392 100644 --- a/include/linux/byteorder/big_endian.h +++ b/include/linux/byteorder/big_endian.h @@ -3,5 +3,9 @@ #include +#ifndef CONFIG_CPU_BIG_ENDIAN +#warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN +#endif + #include #endif /* _LINUX_BYTEORDER_BIG_ENDIAN_H */ diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h index 08057377aa23..ba910bb9aad0 100644 --- a/include/linux/byteorder/little_endian.h +++ b/include/linux/byteorder/little_endian.h @@ -3,5 +3,9 @@ #include +#ifdef CONFIG_CPU_BIG_ENDIAN +#warning inconsistent configuration, CONFIG_CPU_BIG_ENDIAN is set +#endif + #include #endif /* _LINUX_BYTEORDER_LITTLE_ENDIAN_H */ -- cgit v1.2.3-71-gd317 From c32ee3d9abd284b4fcaacc250b101f93829c7bae Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 8 Sep 2017 16:14:33 -0700 Subject: bitops: avoid integer overflow in GENMASK(_ULL) GENMASK(_ULL) performs a left-shift of ~0UL(L), which technically results in an integer overflow. clang raises a warning if the overflow occurs in a preprocessor expression. Clear the low-order bits through a substraction instead of the left-shift to avoid the overflow. (akpm: no change in .text size in my testing) Link: http://lkml.kernel.org/r/20170803212020.24939-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a83c822c35c2..8fbe259b197c 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -19,10 +19,11 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. 
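Both the old shift-based form and the new subtraction-based form produce identical masks for every valid low bit; the subtraction merely avoids shifting the all-ones constant that triggered the warning. A quick user-space check, assuming a 64-bit unsigned long long (the macro names below are local to the example):

	#include <stdio.h>

	#define BITS_PER_LONG_LONG 64

	/* Low-mask part of GENMASK_ULL, written both ways. */
	#define LOW_MASK_SHIFT(l)  ((~0ULL) << (l))
	#define LOW_MASK_SUB(l)    ((~0ULL) - (1ULL << (l)) + 1)

	#define GENMASK_ULL_NEW(h, l) \
		(LOW_MASK_SUB(l) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))

	int main(void)
	{
		for (unsigned int l = 0; l < 64; l++)
			if (LOW_MASK_SHIFT(l) != LOW_MASK_SUB(l))
				printf("mismatch at l=%u\n", l);

		printf("GENMASK_ULL(39, 21) = 0x%016llx\n", GENMASK_ULL_NEW(39, 21));
		return 0;
	}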
*/ #define GENMASK(h, l) \ - (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) #define GENMASK_ULL(h, l) \ - (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + (((~0ULL) - (1ULL << (l)) + 1) & \ + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); -- cgit v1.2.3-71-gd317 From cd9e61ed1eebbcd5dfad59475d41ec58d9b64b6a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:14:36 -0700 Subject: rbtree: cache leftmost node internally Patch series "rbtree: Cache leftmost node internally", v4. A series to extending rbtrees to internally cache the leftmost node such that we can have fast overlap check optimization for all interval tree users[1]. The benefits of this series are that: (i) Unify users that do internal leftmost node caching. (ii) Optimize all interval tree users. (iii) Convert at least two new users (epoll and procfs) to the new interface. This patch (of 16): Red-black tree semantics imply that nodes with smaller or greater (or equal for duplicates) keys always be to the left and right, respectively. For the kernel this is extremely evident when considering our rb_first() semantics. Enabling lookups for the smallest node in the tree in O(1) can save a good chunk of cycles in not having to walk down the tree each time. To this end there are a few core users that explicitly do this, such as the scheduler and rtmutexes. There is also the desire for interval trees to have this optimization allowing faster overlap checking. This patch introduces a new 'struct rb_root_cached' which is just the root with a cached pointer to the leftmost node. The reason why the regular rb_root was not extended instead of adding a new structure was that this allows the user to have the choice between memory footprint and actual tree performance. The new wrappers on top of the regular rb_root calls are: - rb_first_cached(cached_root) -- which is a fast replacement for rb_first. - rb_insert_color_cached(node, cached_root, new) - rb_erase_cached(node, cached_root) In addition, augmented cached interfaces are also added for basic insertion and deletion operations; which becomes important for the interval tree changes. With the exception of the inserts, which adds a bool for updating the new leftmost, the interfaces are kept the same. To this end, porting rb users to the cached version becomes really trivial, and keeping current rbtree semantics for users that don't care about the optimization requires zero overhead. 
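As a usage sketch (not taken from this series), porting a plain rb_root user to the cached flavour only means switching the root type and threading a leftmost hint through insertion; reads of the smallest element then become a pointer fetch:

	#include <linux/rbtree.h>

	/* A simple ordered set keyed by 'key', with O(1) access to the
	 * smallest element via rb_first_cached(). */
	struct item {
		struct rb_node node;
		unsigned long key;
	};

	static struct rb_root_cached itree = RB_ROOT_CACHED;

	static void item_insert(struct item *new)
	{
		struct rb_node **link = &itree.rb_root.rb_node, *parent = NULL;
		bool leftmost = true;

		while (*link) {
			struct item *cur = rb_entry(*link, struct item, node);

			parent = *link;
			if (new->key < cur->key) {
				link = &parent->rb_left;
			} else {
				link = &parent->rb_right;
				leftmost = false;	/* went right at least once */
			}
		}
		rb_link_node(&new->node, parent, link);
		rb_insert_color_cached(&new->node, &itree, leftmost);
	}

	static struct item *item_first(void)
	{
		struct rb_node *n = rb_first_cached(&itree);	/* O(1) */

		return n ? rb_entry(n, struct item, node) : NULL;
	}

	static void item_remove(struct item *it)
	{
		rb_erase_cached(&it->node, &itree);
	}

Erase needs no hint at all: the __rb_erase_augmented() change below compares the victim against the cached leftmost and, if they match, advances the cache with rb_next() before rebalancing.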
Link: http://lkml.kernel.org/r/20170719014603.19029-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Jan Kara Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 33 +++++++++++++++++++++++++++++++++ include/linux/rbtree.h | 21 +++++++++++++++++++++ include/linux/rbtree_augmented.h | 33 ++++++++++++++++++++++++++++++--- lib/rbtree.c | 34 +++++++++++++++++++++++++++++----- 4 files changed, 113 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index b8a8c70b0188..c42a21b99046 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -193,6 +193,39 @@ Example:: for (node = rb_first(&mytree); node; node = rb_next(node)) printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); +Cached rbtrees +-------------- + +Computing the leftmost (smallest) node is quite a common task for binary +search trees, such as for traversals or users relying on a the particular +order for their own logic. To this end, users can use 'struct rb_root_cached' +to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding +potentially expensive tree iterations. This is done at negligible runtime +overhead for maintanence; albeit larger memory footprint. + +Similar to the rb_root structure, cached rbtrees are initialized to be +empty via: + + struct rb_root_cached mytree = RB_ROOT_CACHED; + +Cached rbtree is simply a regular rb_root with an extra pointer to cache the +leftmost node. This allows rb_root_cached to exist wherever rb_root does, +which permits augmented trees to be supported as well as only a few extra +interfaces: + + struct rb_node *rb_first_cached(struct rb_root_cached *tree); + void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); + void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); + +Both insert and erase calls have their respective counterpart of augmented +trees: + + void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, + bool, struct rb_augment_callbacks *); + void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, + struct rb_augment_callbacks *); + + Support for Augmented rbtrees ----------------------------- diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e585018498d5..d574361943ea 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -44,10 +44,25 @@ struct rb_root { struct rb_node *rb_node; }; +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. 
+ */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) @@ -69,6 +84,12 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +extern void rb_insert_color_cached(struct rb_node *, + struct rb_root_cached *, bool); +extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 9702b6e183bc..6bfd2b581f75 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -41,7 +41,9 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, +extern void __rb_insert_augmented(struct rb_node *node, + struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. @@ -57,7 +59,16 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, augment->rotate); + __rb_insert_augmented(node, root, false, NULL, augment->rotate); +} + +static inline void +rb_insert_augmented_cached(struct rb_node *node, + struct rb_root_cached *root, bool newleft, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, &root->rb_root, + newleft, &root->rb_leftmost, augment->rotate); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -150,6 +161,7 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, + struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; @@ -157,6 +169,9 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, struct rb_node *parent, *rebalance; unsigned long pc; + if (leftmost && node == *leftmost) + *leftmost = rb_next(node); + if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) 
@@ -256,9 +271,21 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, + NULL, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } +static __always_inline void +rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, + augment); + if (rebalance) + __rb_erase_color(rebalance, &root->rb_root, augment->rotate); +} + #endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/lib/rbtree.c b/lib/rbtree.c index 4ba2828a67c0..d102d9d2ffaa 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -95,10 +95,14 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + if (newleft) + *leftmost = node; + while (true) { /* * Loop invariant: node is red @@ -434,19 +438,38 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, dummy_rotate); + __rb_insert(node, root, false, NULL, dummy_rotate); } EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, + NULL, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } EXPORT_SYMBOL(rb_erase); +void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, bool leftmost) +{ + __rb_insert(node, &root->rb_root, leftmost, + &root->rb_leftmost, dummy_rotate); +} +EXPORT_SYMBOL(rb_insert_color_cached); + +void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); +} +EXPORT_SYMBOL(rb_erase_cached); + /* * Augmented rbtree manipulation functions. * @@ -455,9 +478,10 @@ EXPORT_SYMBOL(rb_erase); */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, augment_rotate); + __rb_insert(node, root, newleft, leftmost, augment_rotate); } EXPORT_SYMBOL(__rb_insert_augmented); @@ -502,7 +526,7 @@ struct rb_node *rb_next(const struct rb_node *node) * as we can. */ if (node->rb_right) { - node = node->rb_right; + node = node->rb_right; while (node->rb_left) node=node->rb_left; return (struct rb_node *)node; @@ -534,7 +558,7 @@ struct rb_node *rb_prev(const struct rb_node *node) * as we can. 
*/ if (node->rb_left) { - node = node->rb_left; + node = node->rb_left; while (node->rb_right) node=node->rb_right; return (struct rb_node *)node; -- cgit v1.2.3-71-gd317 From a23ba907d5e65d6aeea3e59c82fda9cd206a7aad Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:01 -0700 Subject: locking/rtmutex: replace top-waiter and pi_waiters leftmost caching ... with the generic rbtree flavor instead. No changes in semantics whatsoever. Link: http://lkml.kernel.org/r/20170719014603.19029-10-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 5 ++--- include/linux/rtmutex.h | 11 +++++------ include/linux/sched.h | 3 +-- kernel/fork.c | 3 +-- kernel/locking/rtmutex-debug.c | 2 +- kernel/locking/rtmutex.c | 35 +++++++++++------------------------ kernel/locking/rtmutex_common.h | 12 ++++++------ 7 files changed, 27 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 0e849715e5be..3c07ace5b431 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -175,9 +175,8 @@ extern struct cred init_cred; #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = RB_ROOT, \ - .pi_top_task = NULL, \ - .pi_waiters_leftmost = NULL, + .pi_waiters = RB_ROOT_CACHED, \ + .pi_top_task = NULL, #else # define INIT_RT_MUTEXES(tsk) #endif diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 44fd002f7cd5..53fcbe9de7fd 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -22,18 +22,17 @@ extern int max_lock_depth; /* for sysctl */ * The rt_mutex structure * * @wait_lock: spinlock to protect the structure - * @waiters: rbtree root to enqueue waiters in priority order - * @waiters_leftmost: top waiter + * @waiters: rbtree root to enqueue waiters in priority order; + * caches top-waiter (leftmost node). 
* @owner: the mutex owner */ struct rt_mutex { raw_spinlock_t wait_lock; - struct rb_root waiters; - struct rb_node *waiters_leftmost; + struct rb_root_cached waiters; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - const char *name, *file; + const char *name, *file; int line; void *magic; #endif @@ -84,7 +83,7 @@ do { \ #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .waiters = RB_ROOT \ + , .waiters = RB_ROOT_CACHED \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} diff --git a/include/linux/sched.h b/include/linux/sched.h index 68b38335d33c..92fb8dd5a9e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -812,8 +812,7 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ - struct rb_root pi_waiters; - struct rb_node *pi_waiters_leftmost; + struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ diff --git a/kernel/fork.c b/kernel/fork.c index 2ccbbbfcb7b8..6f1b0af00bda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1462,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter 
*waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else -- cgit v1.2.3-71-gd317 From f808c13fd3738948e10196496959871130612b61 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:08 -0700 Subject: lib/interval_tree: fast overlap detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow interval trees to quickly check for overlaps to avoid unnecesary tree lookups in interval_tree_iter_first(). As of this patch, all interval tree flavors will require using a 'rb_root_cached' such that we can have the leftmost node easily available. While most users will make use of this feature, those with special functions (in addition to the generic insert, delete, search calls) will avoid using the cached option as they can do funky things with insertions -- for example, vma_interval_tree_insert_after(). 
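As a rough illustration of the converted interface (this sketch is not part of the patch; the tree and helper names are invented, and endpoints follow the interval-tree convention of being inclusive), a leftmost-cached interval tree is declared with RB_ROOT_CACHED and driven through the same insert/query calls, which now take a struct rb_root_cached:

    #include <linux/interval_tree.h>
    #include <linux/rbtree.h>

    static struct rb_root_cached example_itree = RB_ROOT_CACHED;

    /* Track [start, last] in the tree; endpoints are inclusive. */
    static void example_add_range(struct interval_tree_node *node,
                                  unsigned long start, unsigned long last)
    {
            node->start = start;
            node->last = last;
            interval_tree_insert(node, &example_itree);
    }

    /* Return true if any tracked range intersects [start, last]. */
    static bool example_overlaps(unsigned long start, unsigned long last)
    {
            /*
             * The cached leftmost node plus the root's subtree-last value
             * let non-overlapping queries be rejected without walking the
             * tree.
             */
            return interval_tree_iter_first(&example_itree, start, last) != NULL;
    }

Users that need the special insertion variants (such as vma_interval_tree_insert_after() above) keep operating on root->rb_root directly and simply forgo the cached fast path.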
[jglisse@redhat.com: fix deadlock from typo vm_lock_anon_vma()] Link: http://lkml.kernel.org/r/20170808225719.20723-1-jglisse@redhat.com Link: http://lkml.kernel.org/r/20170719014603.19029-12-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Jérôme Glisse Acked-by: Christian König Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Ledford Acked-by: Michael S. Tsirkin Cc: David Airlie Cc: Jason Wang Cc: Christian Benvenuti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/drm_mm.c | 19 +++++---- drivers/gpu/drm/drm_vma_manager.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 6 +-- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 8 ++-- drivers/gpu/drm/radeon/radeon_vm.c | 7 ++-- drivers/infiniband/core/umem_rbtree.c | 4 +- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/hw/hfi1/mmu_rb.c | 10 ++--- drivers/infiniband/hw/usnic/usnic_uiom.c | 6 +-- drivers/infiniband/hw/usnic/usnic_uiom.h | 2 +- .../infiniband/hw/usnic/usnic_uiom_interval_tree.c | 15 +++---- .../infiniband/hw/usnic/usnic_uiom_interval_tree.h | 12 +++--- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 2 +- fs/hugetlbfs/inode.c | 6 +-- fs/inode.c | 2 +- include/drm/drm_mm.h | 2 +- include/linux/fs.h | 4 +- include/linux/interval_tree.h | 8 ++-- include/linux/interval_tree_generic.h | 46 +++++++++++++++++----- include/linux/mm.h | 17 ++++---- include/linux/rmap.h | 4 +- include/rdma/ib_umem_odp.h | 11 ++++-- include/rdma/ib_verbs.h | 2 +- lib/interval_tree_test.c | 4 +- mm/interval_tree.c | 10 ++--- mm/memory.c | 4 +- mm/mmap.c | 10 ++--- mm/rmap.c | 4 +- 33 files changed, 145 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index e1cde6b80027..3b0f2ec6eec7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -51,7 +51,7 @@ struct amdgpu_mn { /* objects protected by lock */ struct mutex lock; - struct rb_root objects; + struct rb_root_cached objects; }; struct amdgpu_mn_node { @@ -76,8 +76,8 @@ static void amdgpu_mn_destroy(struct work_struct *work) mutex_lock(&adev->mn_lock); mutex_lock(&rmn->lock); hash_del(&rmn->node); - rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, - it.rb) { + rbtree_postorder_for_each_entry_safe(node, next_node, + &rmn->objects.rb_root, it.rb) { list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { bo->mn = NULL; list_del_init(&bo->mn_list); @@ -221,7 +221,7 @@ static struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) rmn->mm = mm; rmn->mn.ops = &amdgpu_mn_ops; mutex_init(&rmn->lock); - rmn->objects = RB_ROOT; + rmn->objects = RB_ROOT_CACHED; r = __mmu_notifier_register(&rmn->mn, mm); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 6b1343e5541d..b9a5a77eedaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2475,7 +2475,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, u64 flags; uint64_t init_pde_value = 0; - vm->va = RB_ROOT; + vm->va = RB_ROOT_CACHED; vm->client_id = atomic64_inc_return(&adev->vm_manager.client_counter); for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) vm->reserved_vmid[i] = NULL; @@ -2596,10 +2596,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, 
struct amdgpu_vm *vm) amd_sched_entity_fini(vm->entity.sched, &vm->entity); - if (!RB_EMPTY_ROOT(&vm->va)) { + if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { dev_err(adev->dev, "still active bo inside vm\n"); } - rbtree_postorder_for_each_entry_safe(mapping, tmp, &vm->va, rb) { + rbtree_postorder_for_each_entry_safe(mapping, tmp, + &vm->va.rb_root, rb) { list_del(&mapping->list); amdgpu_vm_it_remove(mapping, &vm->va); kfree(mapping); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index ba6691b58ee7..6716355403ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -118,7 +118,7 @@ struct amdgpu_vm_pt { struct amdgpu_vm { /* tree of virtual addresses mapped */ - struct rb_root va; + struct rb_root_cached va; /* protecting invalidated */ spinlock_t status_lock; diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index f794089d30ac..61a1c8ea74bc 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -169,7 +169,7 @@ INTERVAL_TREE_DEFINE(struct drm_mm_node, rb, struct drm_mm_node * __drm_mm_interval_first(const struct drm_mm *mm, u64 start, u64 last) { - return drm_mm_interval_tree_iter_first((struct rb_root *)&mm->interval_tree, + return drm_mm_interval_tree_iter_first((struct rb_root_cached *)&mm->interval_tree, start, last) ?: (struct drm_mm_node *)&mm->head_node; } EXPORT_SYMBOL(__drm_mm_interval_first); @@ -180,6 +180,7 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, struct drm_mm *mm = hole_node->mm; struct rb_node **link, *rb; struct drm_mm_node *parent; + bool leftmost = true; node->__subtree_last = LAST(node); @@ -196,9 +197,10 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, rb = &hole_node->rb; link = &hole_node->rb.rb_right; + leftmost = false; } else { rb = NULL; - link = &mm->interval_tree.rb_node; + link = &mm->interval_tree.rb_root.rb_node; } while (*link) { @@ -208,14 +210,15 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, parent->__subtree_last = node->__subtree_last; if (node->start < parent->start) link = &parent->rb.rb_left; - else + else { link = &parent->rb.rb_right; + leftmost = true; + } } rb_link_node(&node->rb, rb, link); - rb_insert_augmented(&node->rb, - &mm->interval_tree, - &drm_mm_interval_tree_augment); + rb_insert_augmented_cached(&node->rb, &mm->interval_tree, leftmost, + &drm_mm_interval_tree_augment); } #define RB_INSERT(root, member, expr) do { \ @@ -577,7 +580,7 @@ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) *new = *old; list_replace(&old->node_list, &new->node_list); - rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree); + rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root); if (drm_mm_hole_follows(old)) { list_replace(&old->hole_stack, &new->hole_stack); @@ -863,7 +866,7 @@ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size) mm->color_adjust = NULL; INIT_LIST_HEAD(&mm->hole_stack); - mm->interval_tree = RB_ROOT; + mm->interval_tree = RB_ROOT_CACHED; mm->holes_size = RB_ROOT; mm->holes_addr = RB_ROOT; diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index d9100b565198..28f1226576f8 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -147,7 +147,7 @@ struct drm_vma_offset_node *drm_vma_offset_lookup_locked(struct drm_vma_offset_m struct rb_node *iter; unsigned long offset; - iter = mgr->vm_addr_space_mm.interval_tree.rb_node; + 
iter = mgr->vm_addr_space_mm.interval_tree.rb_root.rb_node; best = NULL; while (likely(iter)) { diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index f152a38d7079..23fd18bd1b56 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -49,7 +49,7 @@ struct i915_mmu_notifier { spinlock_t lock; struct hlist_node node; struct mmu_notifier mn; - struct rb_root objects; + struct rb_root_cached objects; struct workqueue_struct *wq; }; @@ -123,7 +123,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, struct interval_tree_node *it; LIST_HEAD(cancelled); - if (RB_EMPTY_ROOT(&mn->objects)) + if (RB_EMPTY_ROOT(&mn->objects.rb_root)) return; /* interval ranges are inclusive, but invalidate range is exclusive */ @@ -172,7 +172,7 @@ i915_mmu_notifier_create(struct mm_struct *mm) spin_lock_init(&mn->lock); mn->mn.ops = &i915_gem_userptr_notifier; - mn->objects = RB_ROOT; + mn->objects = RB_ROOT_CACHED; mn->wq = alloc_workqueue("i915-userptr-release", WQ_UNBOUND, 0); if (mn->wq == NULL) { kfree(mn); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index ec63bc5e9de7..8cbaeec090c9 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -924,7 +924,7 @@ struct radeon_vm_id { struct radeon_vm { struct mutex mutex; - struct rb_root va; + struct rb_root_cached va; /* protecting invalidated and freed */ spinlock_t status_lock; diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index 896f2cf51e4e..1d62288b7ee3 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -50,7 +50,7 @@ struct radeon_mn { /* objects protected by lock */ struct mutex lock; - struct rb_root objects; + struct rb_root_cached objects; }; struct radeon_mn_node { @@ -75,8 +75,8 @@ static void radeon_mn_destroy(struct work_struct *work) mutex_lock(&rdev->mn_lock); mutex_lock(&rmn->lock); hash_del(&rmn->node); - rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, - it.rb) { + rbtree_postorder_for_each_entry_safe(node, next_node, + &rmn->objects.rb_root, it.rb) { interval_tree_remove(&node->it, &rmn->objects); list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { @@ -205,7 +205,7 @@ static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev) rmn->mm = mm; rmn->mn.ops = &radeon_mn_ops; mutex_init(&rmn->lock); - rmn->objects = RB_ROOT; + rmn->objects = RB_ROOT_CACHED; r = __mmu_notifier_register(&rmn->mn, mm); if (r) diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 5e82b408d522..e5c0e635e371 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -1185,7 +1185,7 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) vm->ids[i].last_id_use = NULL; } mutex_init(&vm->mutex); - vm->va = RB_ROOT; + vm->va = RB_ROOT_CACHED; spin_lock_init(&vm->status_lock); INIT_LIST_HEAD(&vm->invalidated); INIT_LIST_HEAD(&vm->freed); @@ -1232,10 +1232,11 @@ void radeon_vm_fini(struct radeon_device *rdev, struct radeon_vm *vm) struct radeon_bo_va *bo_va, *tmp; int i, r; - if (!RB_EMPTY_ROOT(&vm->va)) { + if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { dev_err(rdev->dev, "still active bo inside vm\n"); } - rbtree_postorder_for_each_entry_safe(bo_va, tmp, &vm->va, it.rb) { + rbtree_postorder_for_each_entry_safe(bo_va, tmp, + &vm->va.rb_root, it.rb) { interval_tree_remove(&bo_va->it, &vm->va); r = 
radeon_bo_reserve(bo_va->bo, false); if (!r) { diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c index d176597b4d78..fc801920e341 100644 --- a/drivers/infiniband/core/umem_rbtree.c +++ b/drivers/infiniband/core/umem_rbtree.c @@ -72,7 +72,7 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, /* @last is not a part of the interval. See comment for function * node_last. */ -int rbt_ib_umem_for_each_in_range(struct rb_root *root, +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 last, umem_call_back cb, void *cookie) @@ -95,7 +95,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root *root, } EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root, +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) { struct umem_odp_node *node; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e0cb99860934..4ab30d832ac5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -118,7 +118,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->closing = 0; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT; + ucontext->umem_tree = RB_ROOT_CACHED; init_rwsem(&ucontext->umem_rwsem); ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 2f0d285dc278..175002c046ed 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -54,7 +54,7 @@ struct mmu_rb_handler { struct mmu_notifier mn; - struct rb_root root; + struct rb_root_cached root; void *ops_arg; spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; @@ -108,7 +108,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, if (!handlr) return -ENOMEM; - handlr->root = RB_ROOT; + handlr->root = RB_ROOT_CACHED; handlr->ops = ops; handlr->ops_arg = ops_arg; INIT_HLIST_NODE(&handlr->mn.hlist); @@ -149,9 +149,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) INIT_LIST_HEAD(&del_list); spin_lock_irqsave(&handler->lock, flags); - while ((node = rb_first(&handler->root))) { + while ((node = rb_first_cached(&handler->root))) { rbnode = rb_entry(node, struct mmu_rb_node, node); - rb_erase(node, &handler->root); + rb_erase_cached(node, &handler->root); /* move from LRU list to delete list */ list_move(&rbnode->list, &del_list); } @@ -300,7 +300,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); - struct rb_root *root = &handler->root; + struct rb_root_cached *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; bool added = false; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index c49db7c33979..4381c0a9a873 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -227,7 +227,7 @@ static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, vpn_last = vpn_start + npages - 1; spin_lock(&pd->lock); - usnic_uiom_remove_interval(&pd->rb_root, vpn_start, + usnic_uiom_remove_interval(&pd->root, vpn_start, vpn_last, &rm_intervals); usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); @@ -379,7 +379,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, 
err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, (writable) ? IOMMU_WRITE : 0, IOMMU_WRITE, - &pd->rb_root, + &pd->root, &sorted_diff_intervals); if (err) { usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", @@ -395,7 +395,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, } - err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, + err = usnic_uiom_insert_interval(&pd->root, vpn_start, vpn_last, (writable) ? IOMMU_WRITE : 0); if (err) { usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h index 45ca7c1613a7..431efe4143f4 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -55,7 +55,7 @@ struct usnic_uiom_dev { struct usnic_uiom_pd { struct iommu_domain *domain; spinlock_t lock; - struct rb_root rb_root; + struct rb_root_cached root; struct list_head devs; int dev_cnt; }; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c index 42b4b4c4e452..d399523206c7 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -100,9 +100,9 @@ static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) } static void -find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, - unsigned long last, - struct list_head *list) +find_intervals_intersection_sorted(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *list) { struct usnic_uiom_interval_node *node; @@ -118,7 +118,7 @@ find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, int flags, int flag_mask, - struct rb_root *root, + struct rb_root_cached *root, struct list_head *diff_set) { struct usnic_uiom_interval_node *interval, *tmp; @@ -175,7 +175,7 @@ void usnic_uiom_put_interval_set(struct list_head *intervals) kfree(interval); } -int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, +int usnic_uiom_insert_interval(struct rb_root_cached *root, unsigned long start, unsigned long last, int flags) { struct usnic_uiom_interval_node *interval, *tmp; @@ -246,8 +246,9 @@ err_out: return err; } -void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, - unsigned long last, struct list_head *removed) +void usnic_uiom_remove_interval(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *removed) { struct usnic_uiom_interval_node *interval; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h index c0b0b876ab90..1d7fc3226bca 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -48,12 +48,12 @@ struct usnic_uiom_interval_node { extern void usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, - struct rb_root *root); + struct rb_root_cached *root); extern void usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, - struct rb_root *root); + struct rb_root_cached *root); extern struct usnic_uiom_interval_node * -usnic_uiom_interval_tree_iter_first(struct rb_root *root, +usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); 
extern struct usnic_uiom_interval_node * @@ -63,7 +63,7 @@ usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, * Inserts {start...last} into {root}. If there are overlaps, * nodes will be broken up and merged */ -int usnic_uiom_insert_interval(struct rb_root *root, +int usnic_uiom_insert_interval(struct rb_root_cached *root, unsigned long start, unsigned long last, int flags); /* @@ -71,7 +71,7 @@ int usnic_uiom_insert_interval(struct rb_root *root, * 'removed.' The caller is responsibile for freeing memory of nodes in * 'removed.' */ -void usnic_uiom_remove_interval(struct rb_root *root, +void usnic_uiom_remove_interval(struct rb_root_cached *root, unsigned long start, unsigned long last, struct list_head *removed); /* @@ -81,7 +81,7 @@ void usnic_uiom_remove_interval(struct rb_root *root, int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, int flags, int flag_mask, - struct rb_root *root, + struct rb_root_cached *root, struct list_head *diff_set); /* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ void usnic_uiom_put_interval_set(struct list_head *intervals); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9cb3f722dce1..d6dbb28245e6 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1271,7 +1271,7 @@ static struct vhost_umem *vhost_umem_alloc(void) if (!umem) return NULL; - umem->umem_tree = RB_ROOT; + umem->umem_tree = RB_ROOT_CACHED; umem->numem = 0; INIT_LIST_HEAD(&umem->umem_list); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index bb7c29b8b9fc..d59a9cc65f9d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -71,7 +71,7 @@ struct vhost_umem_node { }; struct vhost_umem { - struct rb_root umem_tree; + struct rb_root_cached umem_tree; struct list_head umem_list; int numem; }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c6f4b8f910f..59073e9f01a4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; @@ -498,7 +498,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); @@ -523,7 +523,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); diff --git a/fs/inode.c b/fs/inode.c index 6a1626e0edaf..210054157a49 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -353,7 +353,7 @@ void address_space_init_once(struct address_space *mapping) init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - mapping->i_mmap = RB_ROOT; + mapping->i_mmap = RB_ROOT_CACHED; } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index 49b292e98fec..8d10fc97801c 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -172,7 +172,7 @@ struct drm_mm { * 
according to the (increasing) start address of the memory node. */ struct drm_mm_node head_node; /* Keep an interval_tree for fast lookup of drm_mm_nodes by address. */ - struct rb_root interval_tree; + struct rb_root_cached interval_tree; struct rb_root holes_size; struct rb_root holes_addr; diff --git a/include/linux/fs.h b/include/linux/fs.h index 509434aaf5a4..6111976848ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -392,7 +392,7 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ - struct rb_root i_mmap; /* tree of private and shared mappings */ + struct rb_root_cached i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -487,7 +487,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap); + return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 724556aa3c95..202ee1283f4b 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -11,13 +11,15 @@ struct interval_tree_node { }; extern void -interval_tree_insert(struct interval_tree_node *node, struct rb_root *root); +interval_tree_insert(struct interval_tree_node *node, + struct rb_root_cached *root); extern void -interval_tree_remove(struct interval_tree_node *node, struct rb_root *root); +interval_tree_remove(struct interval_tree_node *node, + struct rb_root_cached *root); extern struct interval_tree_node * -interval_tree_iter_first(struct rb_root *root, +interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); extern struct interval_tree_node * diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 58370e1862ad..f096423c8cbd 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -65,11 +65,13 @@ RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ \ /* Insert / remove interval nodes from the tree */ \ \ -ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ +ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, \ + struct rb_root_cached *root) \ { \ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; \ + struct rb_node **link = &root->rb_root.rb_node, *rb_parent = NULL; \ ITTYPE start = ITSTART(node), last = ITLAST(node); \ ITSTRUCT *parent; \ + bool leftmost = true; \ \ while (*link) { \ rb_parent = *link; \ @@ -78,18 +80,22 @@ ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ parent->ITSUBTREE = last; \ if (start < ITSTART(parent)) \ link = &parent->ITRB.rb_left; \ - else \ + else { \ link = &parent->ITRB.rb_right; \ + leftmost = false; \ + } \ } \ \ node->ITSUBTREE = last; \ rb_link_node(&node->ITRB, rb_parent, link); \ - rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ + rb_insert_augmented_cached(&node->ITRB, root, \ + leftmost, &ITPREFIX ## _augment); \ } \ \ -ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \ +ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ + struct rb_root_cached *root) \ { \ - rb_erase_augmented(&node->ITRB, root, &ITPREFIX 
## _augment); \ + rb_erase_augmented_cached(&node->ITRB, root, &ITPREFIX ## _augment); \ } \ \ /* \ @@ -140,15 +146,35 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ } \ \ ITSTATIC ITSTRUCT * \ -ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \ +ITPREFIX ## _iter_first(struct rb_root_cached *root, \ + ITTYPE start, ITTYPE last) \ { \ - ITSTRUCT *node; \ + ITSTRUCT *node, *leftmost; \ \ - if (!root->rb_node) \ + if (!root->rb_root.rb_node) \ return NULL; \ - node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \ + \ + /* \ + * Fastpath range intersection/overlap between A: [a0, a1] and \ + * B: [b0, b1] is given by: \ + * \ + * a0 <= b1 && b0 <= a1 \ + * \ + * ... where A holds the lock range and B holds the smallest \ + * 'start' and largest 'last' in the tree. For the later, we \ + * rely on the root node, which by augmented interval tree \ + * property, holds the largest value in its last-in-subtree. \ + * This allows mitigating some of the tree walk overhead for \ + * for non-intersecting ranges, maintained and consulted in O(1). \ + */ \ + node = rb_entry(root->rb_root.rb_node, ITSTRUCT, ITRB); \ if (node->ITSUBTREE < start) \ return NULL; \ + \ + leftmost = rb_entry(root->rb_leftmost, ITSTRUCT, ITRB); \ + if (ITSTART(leftmost) > last) \ + return NULL; \ + \ return ITPREFIX ## _subtree_search(node, start, last); \ } \ \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 5195e272fc4a..f8c10d336e42 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2034,13 +2034,13 @@ extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, - struct rb_root *root); + struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root); + struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, - struct rb_root *root); -struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); @@ -2050,11 +2050,12 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root); + struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root); -struct anon_vma_chain *anon_vma_interval_tree_iter_first( - struct rb_root *root, unsigned long start, unsigned long last); + struct rb_root_cached *root); +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, + unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB diff --git a/include/linux/rmap.h b/include/linux/rmap.h index f8ca2e74b819..733d3d8181e2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -55,7 +55,9 @@ struct anon_vma { * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). 
*/ - struct rb_root rb_root; /* Interval tree of private "related" vmas */ + + /* Interval tree of private "related" vmas */ + struct rb_root_cached rb_root; }; /* diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index fb67554aabd6..5eb7f5bc8248 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -111,22 +111,25 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bound); -void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); -void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_insert(struct umem_odp_node *node, + struct rb_root_cached *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, + struct rb_root_cached *root); typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, void *cookie); /* * Call the callback on each ib_umem in the range. Returns the logical or of * the return values of the functions called. */ -int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, + u64 start, u64 end, umem_call_back cb, void *cookie); /* * Find first region intersecting with address range. * Return NULL if not found */ -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root, +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e6df68048517..bdb1279a415b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1457,7 +1457,7 @@ struct ib_ucontext { struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root umem_tree; + struct rb_root_cached umem_tree; /* * Protects .umem_rbroot and tree, as well as odp_mrs_count and * mmu notifiers registration. 
diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index df495fe81421..0e343fd29570 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -19,14 +19,14 @@ __param(bool, search_all, false, "Searches will iterate all nodes in the tree"); __param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); -static struct rb_root root = RB_ROOT; +static struct rb_root_cached root = RB_ROOT_CACHED; static struct interval_tree_node *nodes = NULL; static u32 *queries = NULL; static struct rnd_state rnd; static inline unsigned long -search(struct rb_root *root, unsigned long start, unsigned long last) +search(struct rb_root_cached *root, unsigned long start, unsigned long last) { struct interval_tree_node *node; unsigned long results = 0; diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f2c2492681bf..b47664358796 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -28,7 +28,7 @@ INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root) + struct rb_root_cached *root) { struct rb_node **link; struct vm_area_struct *parent; @@ -55,7 +55,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, node->shared.rb_subtree_last = last; rb_link_node(&node->shared.rb, &parent->shared.rb, link); - rb_insert_augmented(&node->shared.rb, root, + rb_insert_augmented(&node->shared.rb, &root->rb_root, &vma_interval_tree_augment); } @@ -74,7 +74,7 @@ INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, static inline, __anon_vma_interval_tree) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { #ifdef CONFIG_DEBUG_VM_RB node->cached_vma_start = avc_start_pgoff(node); @@ -84,13 +84,13 @@ void anon_vma_interval_tree_insert(struct anon_vma_chain *node, } void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { __anon_vma_interval_tree_remove(node, root); } struct anon_vma_chain * -anon_vma_interval_tree_iter_first(struct rb_root *root, +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_first(root, first, last); diff --git a/mm/memory.c b/mm/memory.c index 0bbc1d612a63..ec4e15494901 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2761,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct rb_root *root, +static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; @@ -2825,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } diff --git a/mm/mmap.c b/mm/mmap.c index 4c5981651407..680506faceae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -685,7 +685,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; - 
struct rb_root *root = NULL; + struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool start_changed = false, end_changed = false; @@ -3340,7 +3340,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -3356,7 +3356,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } @@ -3458,7 +3458,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -3472,7 +3472,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 0618cd85b862..b874c4761e84 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -391,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { + if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->degree--; continue; } @@ -425,7 +425,7 @@ static void anon_vma_ctor(void *data) init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); - anon_vma->rb_root = RB_ROOT; + anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) -- cgit v1.2.3-71-gd317 From f2686bb48618718c7141f117e2d607594e959bfc Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:12 -0700 Subject: lib/interval-tree: correct comment wrt generic flavor interval_tree.h _is_ the generic flavor. Link: http://lkml.kernel.org/r/20170719014603.19029-13-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interval_tree_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index f096423c8cbd..1f97ce26cccc 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -33,7 +33,7 @@ * ITSTATIC: 'static' or empty * ITPREFIX: prefix to use for the inline tree definitions * - * Note - before using this, please consider if non-generic version + * Note - before using this, please consider if generic version * (interval_tree.h) would work for you... 
*/ -- cgit v1.2.3-71-gd317 From 60ef690018b262ddcd0d51edf10e40710deb9c9f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 8 Sep 2017 16:15:41 -0700 Subject: bitmap: introduce BITMAP_FROM_U64() The macro is the compile-time analogue of bitmap_from_u64() with the same purpose: convert the 64-bit number to the properly ordered pair of 32-bit parts, suitable for filling the bitmap in 32-bit BE environment. Use it to make test_bitmap_parselist() correct for 32-bit BE ABIs. Tested on BE mips/qemu. [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/20170810172916.24144-1-ynorov@caviumnetworks.com Signed-off-by: Yury Norov Cc: Noam Camus Cc: Rasmus Villemoes Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 32 ++++++++++++++++++++++++++++++++ lib/test_bitmap.c | 47 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 64 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5797ca6fdfe2..700cf5f67118 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -360,6 +360,38 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } +/* + * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. + * + * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit + * integers in 32-bit environment, and 64-bit integers in 64-bit one. + * + * There are four combinations of endianness and length of the word in linux + * ABIs: LE64, BE64, LE32 and BE32. + * + * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in + * bitmaps and therefore don't require any special handling. + * + * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory + * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the + * other hand is represented as an array of 32-bit words and the position of + * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that + * word. For example, bit #42 is located at 10th position of 2nd word. + * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit + * values in memory as it usually does. But for BE we need to swap hi and lo + * words manually. + * + * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and + * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps + * hi and lo words, as is expected by bitmap. + */ +#if __BITS_PER_LONG == 64 +#define BITMAP_FROM_U64(n) (n) +#else +#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ + ((unsigned long) ((u64)(n) >> 32)) +#endif + /* * bitmap_from_u64 - Check and swap words within u64. 
* @mask: source bitmap diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 431c97fb1aa0..599c6713f2a2 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -175,24 +175,41 @@ struct test_bitmap_parselist{ const int flags; }; -static const unsigned long exp[] = {1, 2, 0x0000ffff, 0xffff0000, 0x55555555, - 0xaaaaaaaa, 0x11111111, 0x22222222, 0xffffffff, - 0xfffffffe, 0x3333333311111111, 0xffffffff77777777}; -static const unsigned long exp2[] = {0x3333333311111111, 0xffffffff77777777}; +static const unsigned long exp[] __initconst = { + BITMAP_FROM_U64(1), + BITMAP_FROM_U64(2), + BITMAP_FROM_U64(0x0000ffff), + BITMAP_FROM_U64(0xffff0000), + BITMAP_FROM_U64(0x55555555), + BITMAP_FROM_U64(0xaaaaaaaa), + BITMAP_FROM_U64(0x11111111), + BITMAP_FROM_U64(0x22222222), + BITMAP_FROM_U64(0xffffffff), + BITMAP_FROM_U64(0xfffffffe), + BITMAP_FROM_U64(0x3333333311111111), + BITMAP_FROM_U64(0xffffffff77777777) +}; + +static const unsigned long exp2[] __initconst = { + BITMAP_FROM_U64(0x3333333311111111), + BITMAP_FROM_U64(0xffffffff77777777) +}; static const struct test_bitmap_parselist parselist_tests[] __initconst = { +#define step (sizeof(u64) / sizeof(unsigned long)) + {0, "0", &exp[0], 8, 0}, - {0, "1", &exp[1], 8, 0}, - {0, "0-15", &exp[2], 32, 0}, - {0, "16-31", &exp[3], 32, 0}, - {0, "0-31:1/2", &exp[4], 32, 0}, - {0, "1-31:1/2", &exp[5], 32, 0}, - {0, "0-31:1/4", &exp[6], 32, 0}, - {0, "1-31:1/4", &exp[7], 32, 0}, - {0, "0-31:4/4", &exp[8], 32, 0}, - {0, "1-31:4/4", &exp[9], 32, 0}, - {0, "0-31:1/4,32-63:2/4", &exp[10], 64, 0}, - {0, "0-31:3/4,32-63:4/4", &exp[11], 64, 0}, + {0, "1", &exp[1 * step], 8, 0}, + {0, "0-15", &exp[2 * step], 32, 0}, + {0, "16-31", &exp[3 * step], 32, 0}, + {0, "0-31:1/2", &exp[4 * step], 32, 0}, + {0, "1-31:1/2", &exp[5 * step], 32, 0}, + {0, "0-31:1/4", &exp[6 * step], 32, 0}, + {0, "1-31:1/4", &exp[7 * step], 32, 0}, + {0, "0-31:4/4", &exp[8 * step], 32, 0}, + {0, "1-31:4/4", &exp[9 * step], 32, 0}, + {0, "0-31:1/4,32-63:2/4", &exp[10 * step], 64, 0}, + {0, "0-31:3/4,32-63:4/4", &exp[11 * step], 64, 0}, {0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4", exp2, 128, 0}, -- cgit v1.2.3-71-gd317 From 895a60728f83684e738cce34d7d4e64631505ae4 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:45 -0700 Subject: lib/rhashtable: fix comment on locks_mul default value As of commit 4cf0b354d92 ("rhashtable: avoid large lock-array allocations"), the default value for the locks multiplier was reduced from 128 to 32. Update the header file to reflect this. 
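For context, here is a minimal sketch of where this knob sits (not part of the patch; the entry structure and names are invented): locks_mul is an optional field of struct rhashtable_params, and leaving it zero selects the default multiplier that this patch re-documents as 32.

    #include <linux/rhashtable.h>

    struct example_entry {
            u32 key;
            struct rhash_head node;
    };

    static const struct rhashtable_params example_params = {
            .key_len        = sizeof(u32),
            .key_offset     = offsetof(struct example_entry, key),
            .head_offset    = offsetof(struct example_entry, node),
            .locks_mul      = 32,   /* explicit; 0 falls back to the default */
    };

    static int example_table_init(struct rhashtable *ht)
    {
            return rhashtable_init(ht, &example_params);
    }

Most users can omit .locks_mul entirely; it is mainly a tuning knob for the size of the per-cpu bucket-lock arrays, which is exactly what the referenced commit shrank.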
Link: http://lkml.kernel.org/r/20170815215401.30745-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Florian Westphal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7d56a7ea2b2e..361c08e35dbc 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -127,7 +127,7 @@ struct rhashtable; * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) + * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) * @automatic_shrinking: Enable automatic shrinking of tables * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) -- cgit v1.2.3-71-gd317 From 42f46148217865a545e129612075f3d828a2c4e4 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Sep 2017 16:16:24 -0700 Subject: autofs: fix AT_NO_AUTOMOUNT not being honored The fstatat(2) and statx() calls can pass the flag AT_NO_AUTOMOUNT which is meant to clear the LOOKUP_AUTOMOUNT flag and prevent triggering of an automount by the call. But this flag is unconditionally cleared for all stat family system calls except statx(). stat family system calls have always triggered mount requests for the negative dentry case in follow_automount() which is intended but prevents the fstatat(2) and statx() AT_NO_AUTOMOUNT case from being handled. In order to handle the AT_NO_AUTOMOUNT for both system calls the negative dentry case in follow_automount() needs to be changed to return ENOENT when the LOOKUP_AUTOMOUNT flag is clear (and the other required flags are clear). AFAICT this change doesn't have any noticable side effects and may, in some use cases (although I didn't see it in testing) prevent unnecessary callbacks to the automount daemon. It's also possible that a stat family call has been made with a path that is in the process of being mounted by some other process. But stat family calls should return the automount state of the path as it is "now" so it shouldn't wait for mount completion. This is the same semantic as the positive dentry case already handled. Link: http://lkml.kernel.org/r/150216641255.11652.4204561328197919771.stgit@pluto.themaw.net Fixes: deccf497d804a4c5fca ("Make stat/lstat/fstatat pass AT_NO_AUTOMOUNT to vfs_statx()") Signed-off-by: Ian Kent Cc: David Howells Cc: Colin Walters Cc: Ondrej Holy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 15 ++++++++++++--- include/linux/fs.h | 3 +-- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index ddb6a7c2b3d4..1180f9c58093 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) - return -EISDIR; + LOOKUP_OPEN | LOOKUP_CREATE | + LOOKUP_AUTOMOUNT))) { + /* Positive dentry that isn't meant to trigger an + * automount, EISDIR will allow it to be used, + * otherwise there's no mount here "now" so return + * ENOENT. 
+ */ + if (path->dentry->d_inode) + return -EISDIR; + else + return -ENOENT; + } if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6111976848ff..2d0e6748e46e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3043,8 +3043,7 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat) static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, - stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) { -- cgit v1.2.3-71-gd317 From c1f3fa2a4fde2818623b42e3f749bd478be5dec7 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 8 Sep 2017 16:17:08 -0700 Subject: kmod: split off umh headers into its own file In the future usermode helper users do not need to carry in all the of kmod headers declarations. Since kmod.h still includes umh.h this change has no functional changes, each umh user can be cleaned up separately later and with time. Link: http://lkml.kernel.org/r/20170810180618.22457-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: Guenter Roeck Cc: "Eric W. Biederman" Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + include/linux/kmod.h | 60 +-------------------------------------------- include/linux/umh.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 59 deletions(-) create mode 100644 include/linux/umh.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7aee06c1b775..ff3a349f24e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7462,6 +7462,7 @@ M: "Luis R. Rodriguez" L: linux-kernel@vger.kernel.org S: Maintained F: kernel/umh.c +F: include/linux/umh.h KERNEL VIRTUAL MACHINE (KVM) M: Paolo Bonzini diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 655082c88fd9..40c89ad4bea6 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -19,6 +19,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include #include #include #include @@ -44,63 +45,4 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; #define try_then_request_module(x, mod...) 
(x) #endif - -struct cred; -struct file; - -#define UMH_NO_WAIT 0 /* don't wait at all */ -#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ -#define UMH_WAIT_PROC 2 /* wait for the process to complete */ -#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ - -struct subprocess_info { - struct work_struct work; - struct completion *complete; - const char *path; - char **argv; - char **envp; - int wait; - int retval; - int (*init)(struct subprocess_info *info, struct cred *new); - void (*cleanup)(struct subprocess_info *info); - void *data; -} __randomize_layout; - -extern int -call_usermodehelper(const char *path, char **argv, char **envp, int wait); - -extern struct subprocess_info * -call_usermodehelper_setup(const char *path, char **argv, char **envp, - gfp_t gfp_mask, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data); - -extern int -call_usermodehelper_exec(struct subprocess_info *info, int wait); - -extern struct ctl_table usermodehelper_table[]; - -enum umh_disable_depth { - UMH_ENABLED = 0, - UMH_FREEZING, - UMH_DISABLED, -}; - -extern int __usermodehelper_disable(enum umh_disable_depth depth); -extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth); - -static inline int usermodehelper_disable(void) -{ - return __usermodehelper_disable(UMH_DISABLED); -} - -static inline void usermodehelper_enable(void) -{ - __usermodehelper_set_disable_depth(UMH_ENABLED); -} - -extern int usermodehelper_read_trylock(void); -extern long usermodehelper_read_lock_wait(long timeout); -extern void usermodehelper_read_unlock(void); - #endif /* __LINUX_KMOD_H__ */ diff --git a/include/linux/umh.h b/include/linux/umh.h new file mode 100644 index 000000000000..244aff638220 --- /dev/null +++ b/include/linux/umh.h @@ -0,0 +1,69 @@ +#ifndef __LINUX_UMH_H__ +#define __LINUX_UMH_H__ + +#include +#include +#include +#include +#include +#include + +struct cred; +struct file; + +#define UMH_NO_WAIT 0 /* don't wait at all */ +#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 2 /* wait for the process to complete */ +#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ + +struct subprocess_info { + struct work_struct work; + struct completion *complete; + const char *path; + char **argv; + char **envp; + int wait; + int retval; + int (*init)(struct subprocess_info *info, struct cred *new); + void (*cleanup)(struct subprocess_info *info); + void *data; +} __randomize_layout; + +extern int +call_usermodehelper(const char *path, char **argv, char **envp, int wait); + +extern struct subprocess_info * +call_usermodehelper_setup(const char *path, char **argv, char **envp, + gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data); + +extern int +call_usermodehelper_exec(struct subprocess_info *info, int wait); + +extern struct ctl_table usermodehelper_table[]; + +enum umh_disable_depth { + UMH_ENABLED = 0, + UMH_FREEZING, + UMH_DISABLED, +}; + +extern int __usermodehelper_disable(enum umh_disable_depth depth); +extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth); + +static inline int usermodehelper_disable(void) +{ + return __usermodehelper_disable(UMH_DISABLED); +} + +static inline void usermodehelper_enable(void) +{ + __usermodehelper_set_disable_depth(UMH_ENABLED); +} + +extern int usermodehelper_read_trylock(void); +extern long 
usermodehelper_read_lock_wait(long timeout); +extern void usermodehelper_read_unlock(void); + +#endif /* __LINUX_UMH_H__ */ -- cgit v1.2.3-71-gd317 From f22ef333c32cc683922d7e3361a83ebc31b2ac6d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:17:15 -0700 Subject: cpumask: make cpumask_next() out-of-line Every for_each_XXX_cpu() invocation calls cpumask_next() which is an inline function: static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); } However! find_next_bit() is regular out-of-line function which means "nr_cpu_ids" load and increment happen at the caller resulting in a lot of bloat x86_64 defconfig: add/remove: 3/0 grow/shrink: 8/373 up/down: 155/-5668 (-5513) x86_64 allyesconfig-ish: add/remove: 3/1 grow/shrink: 57/634 up/down: 3515/-28177 (-24662) !!! Some archs redefine find_next_bit() but it is OK: m68k inline but SMP is not supported arm out-of-line unicore32 out-of-line Function call will happen anyway, so move load and increment into callee. Link: http://lkml.kernel.org/r/20170824230010.GA1593@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 15 +-------------- lib/cpumask.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 68c5a8290275..cd415b733c2a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -178,20 +178,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } -/** - * cpumask_next - get the next cpu in a cpumask - * @n: the cpu prior to the place to search (ie. return will be > @n) - * @srcp: the cpumask pointer - * - * Returns >= nr_cpu_ids if no further cpus set. - */ -static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) -{ - /* -1 is a legal arg here. */ - if (n != -1) - cpumask_check(n); - return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); -} +unsigned int cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask diff --git a/lib/cpumask.c b/lib/cpumask.c index 4731a0895760..8b1a1bd77539 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -5,6 +5,22 @@ #include #include +/** + * cpumask_next - get the next cpu in a cpumask + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus set. + */ +unsigned int cpumask_next(int n, const struct cpumask *srcp) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); +} +EXPORT_SYMBOL(cpumask_next); + /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (ie. return will be > @n) -- cgit v1.2.3-71-gd317 From a2d818030135c293f878fbb772cf40e7a14c5acc Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 8 Sep 2017 16:17:19 -0700 Subject: drivers/pps: aesthetic tweaks to PPS-related content Collection of aesthetic adjustments to various PPS-related files, directories and Documentation, some quite minor just for the sake of consistency, including: * Updated example of pps device tree node (courtesy Rodolfo G.) 
* "PPS-API" -> "PPS API" * "pps_source_info_s" -> "pps_source_info" * "ktimer driver" -> "pps-ktimer driver" * "ppstest /dev/pps0" -> "ppstest /dev/pps1" to match example * Add missing PPS-related entries to MAINTAINERS file * Other trivialities Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708261048220.8106@localhost.localdomain Signed-off-by: Robert P. J. Day Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/pps/pps-gpio.txt | 8 +++- Documentation/pps/pps.txt | 44 +++++++++++----------- MAINTAINERS | 3 ++ include/linux/pps-gpio.h | 2 +- include/linux/pps_kernel.h | 16 ++++---- include/uapi/linux/pps.h | 4 +- kernel/time/timekeeping.c | 2 +- 7 files changed, 43 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt index 40bf9c3564a5..0de23b793657 100644 --- a/Documentation/devicetree/bindings/pps/pps-gpio.txt +++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt @@ -13,8 +13,12 @@ Optional properties: Example: pps { - compatible = "pps-gpio"; - gpios = <&gpio2 6 0>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pps>; + gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>; assert-falling-edge; + + compatible = "pps-gpio"; + status = "okay"; }; diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt index 1fdbd5447216..99f5d8c4c652 100644 --- a/Documentation/pps/pps.txt +++ b/Documentation/pps/pps.txt @@ -48,12 +48,12 @@ problem: time_pps_create(). This implies that the source has a /dev/... entry. This assumption is -ok for the serial and parallel port, where you can do something +OK for the serial and parallel port, where you can do something useful besides(!) the gathering of timestamps as it is the central -task for a PPS-API. But this assumption does not work for a single +task for a PPS API. But this assumption does not work for a single purpose GPIO line. In this case even basic file-related functionality (like read() and write()) makes no sense at all and should not be a -precondition for the use of a PPS-API. +precondition for the use of a PPS API. The problem can be simply solved if you consider that a PPS source is not always connected with a GPS data source. @@ -88,13 +88,13 @@ Coding example -------------- To register a PPS source into the kernel you should define a struct -pps_source_info_s as follows: +pps_source_info as follows: static struct pps_source_info pps_ktimer_info = { .name = "ktimer", .path = "", - .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \ - PPS_ECHOASSERT | \ + .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | + PPS_ECHOASSERT | PPS_CANWAIT | PPS_TSFMT_TSPEC, .echo = pps_ktimer_echo, .owner = THIS_MODULE, @@ -108,13 +108,13 @@ initialization routine as follows: The pps_register_source() prototype is: - int pps_register_source(struct pps_source_info_s *info, int default_params) + int pps_register_source(struct pps_source_info *info, int default_params) where "info" is a pointer to a structure that describes a particular PPS source, "default_params" tells the system what the initial default parameters for the device should be (it is obvious that these parameters must be a subset of ones defined in the struct -pps_source_info_s which describe the capabilities of the driver). +pps_source_info which describe the capabilities of the driver). 
Once you have registered a new PPS source into the system you can signal an assert event (for example in the interrupt handler routine) @@ -142,8 +142,10 @@ If the SYSFS filesystem is enabled in the kernel it provides a new class: Every directory is the ID of a PPS sources defined in the system and inside you find several files: - $ ls /sys/class/pps/pps0/ - assert clear echo mode name path subsystem@ uevent + $ ls -F /sys/class/pps/pps0/ + assert dev mode path subsystem@ + clear echo name power/ uevent + Inside each "assert" and "clear" file you can find the timestamp and a sequence number: @@ -154,32 +156,32 @@ sequence number: Where before the "#" is the timestamp in seconds; after it is the sequence number. Other files are: -* echo: reports if the PPS source has an echo function or not; + * echo: reports if the PPS source has an echo function or not; -* mode: reports available PPS functioning modes; + * mode: reports available PPS functioning modes; -* name: reports the PPS source's name; + * name: reports the PPS source's name; -* path: reports the PPS source's device path, that is the device the - PPS source is connected to (if it exists). + * path: reports the PPS source's device path, that is the device the + PPS source is connected to (if it exists). Testing the PPS support ----------------------- In order to test the PPS support even without specific hardware you can use -the ktimer driver (see the client subsection in the PPS configuration menu) +the pps-ktimer driver (see the client subsection in the PPS configuration menu) and the userland tools available in your distribution's pps-tools package, -http://linuxpps.org , or https://github.com/ago/pps-tools . +http://linuxpps.org , or https://github.com/redlab-i/pps-tools. -Once you have enabled the compilation of ktimer just modprobe it (if +Once you have enabled the compilation of pps-ktimer just modprobe it (if not statically compiled): - # modprobe ktimer + # modprobe pps-ktimer and the run ppstest as follow: - $ ./ppstest /dev/pps0 + $ ./ppstest /dev/pps1 trying PPS source "/dev/pps1" found PPS source "/dev/pps1" ok, found 1 source(s), now start fetching data... @@ -187,7 +189,7 @@ and the run ppstest as follow: source 0 - assert 1186592700.388931295, sequence: 365 - clear 0.000000000, sequence: 0 source 0 - assert 1186592701.389032765, sequence: 366 - clear 0.000000000, sequence: 0 -Please, note that to compile userland programs you need the file timepps.h . +Please note that to compile userland programs, you need the file timepps.h. This is available in the pps-tools repository mentioned above. 
diff --git a/MAINTAINERS b/MAINTAINERS index ff3a349f24e4..109c5d9a04c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10725,8 +10725,11 @@ W: http://wiki.enneenne.com/index.php/LinuxPPS_support L: linuxpps@ml.enneenne.com (subscribers-only) S: Maintained F: Documentation/pps/ +F: Documentation/devicetree/bindings/pps/pps-gpio.txt +F: Documentation/ABI/testing/sysfs-pps F: drivers/pps/ F: include/linux/pps*.h +F: include/uapi/linux/pps.h PPTP DRIVER M: Dmitry Kozlov diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h index 0035abe41b9a..56f35dd3d01d 100644 --- a/include/linux/pps-gpio.h +++ b/include/linux/pps-gpio.h @@ -29,4 +29,4 @@ struct pps_gpio_platform_data { const char *gpio_label; }; -#endif +#endif /* _PPS_GPIO_H */ diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 35ac903956c7..80a980cc8d95 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -22,7 +22,6 @@ #define LINUX_PPS_KERNEL_H #include - #include #include #include @@ -35,9 +34,9 @@ struct pps_device; /* The specific PPS source info */ struct pps_source_info { - char name[PPS_MAX_NAME_LEN]; /* simbolic name */ + char name[PPS_MAX_NAME_LEN]; /* symbolic name */ char path[PPS_MAX_NAME_LEN]; /* path of connected device */ - int mode; /* PPS's allowed mode */ + int mode; /* PPS allowed mode */ void (*echo)(struct pps_device *pps, int event, void *data); /* PPS echo function */ @@ -57,10 +56,10 @@ struct pps_event_time { struct pps_device { struct pps_source_info info; /* PSS source info */ - struct pps_kparams params; /* PPS's current params */ + struct pps_kparams params; /* PPS current params */ - __u32 assert_sequence; /* PPS' assert event seq # */ - __u32 clear_sequence; /* PPS' clear event seq # */ + __u32 assert_sequence; /* PPS assert event seq # */ + __u32 clear_sequence; /* PPS clear event seq # */ struct pps_ktime assert_tu; struct pps_ktime clear_tu; int current_mode; /* PPS mode at event time */ @@ -69,7 +68,7 @@ struct pps_device { wait_queue_head_t queue; /* PPS event queue */ unsigned int id; /* PPS source unique ID */ - void const *lookup_cookie; /* pps_lookup_dev only */ + void const *lookup_cookie; /* For pps_lookup_dev() only */ struct cdev cdev; struct device *dev; struct fasync_struct *async_queue; /* fasync method */ @@ -101,7 +100,7 @@ extern struct pps_device *pps_register_source( extern void pps_unregister_source(struct pps_device *pps); extern void pps_event(struct pps_device *pps, struct pps_event_time *ts, int event, void *data); -/* Look up a pps device by magic cookie */ +/* Look up a pps_device by magic cookie */ struct pps_device *pps_lookup_dev(void const *cookie); static inline void timespec_to_pps_ktime(struct pps_ktime *kt, @@ -132,4 +131,3 @@ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta } #endif /* LINUX_PPS_KERNEL_H */ - diff --git a/include/uapi/linux/pps.h b/include/uapi/linux/pps.h index c1cb3825a8bc..c29d6b791c08 100644 --- a/include/uapi/linux/pps.h +++ b/include/uapi/linux/pps.h @@ -95,8 +95,8 @@ struct pps_kparams { #define PPS_CAPTURECLEAR 0x02 /* capture clear events */ #define PPS_CAPTUREBOTH 0x03 /* capture assert and clear events */ -#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert ev. */ -#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear ev. */ +#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert event */ +#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear event */ #define PPS_CANWAIT 0x100 /* can we wait for an event? 
*/ #define PPS_CANPOLL 0x200 /* bit reserved for future use */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ea4fb315719..2cafb49aa65e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -#endif +#endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure -- cgit v1.2.3-71-gd317 From a2e0602c36ed9fe042714694dd5a889ecd8cb556 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 8 Sep 2017 16:17:38 -0700 Subject: ipc: convert ipc_namespace.count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/1499417992-3238-2-git-send-email-elena.reshetova@intel.com Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Alexey Dobriyan Cc: Serge Hallyn Cc: Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 5 +++-- ipc/msgutil.c | 2 +- ipc/namespace.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 65327ee0936b..e81445cc7c57 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -7,6 +7,7 @@ #include #include #include +#include struct user_namespace; @@ -19,7 +20,7 @@ struct ipc_ids { }; struct ipc_namespace { - atomic_t count; + refcount_t count; struct ipc_ids ids[3]; int sem_ctls[4]; @@ -118,7 +119,7 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - atomic_inc(&ns->count); + refcount_inc(&ns->count); return ns; } diff --git a/ipc/msgutil.c b/ipc/msgutil.c index bf74eaa5c39f..84598025a6ad 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -29,7 +29,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. 
*/ struct ipc_namespace init_ipc_ns = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/ipc/namespace.c b/ipc/namespace.c index b4d80f9f7246..7af6e6b883b9 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -50,7 +50,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail_free; ns->ns.ops = &ipcns_operations; - atomic_set(&ns->count, 1); + refcount_set(&ns->count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -144,7 +144,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (atomic_dec_and_lock(&ns->count, &mq_lock)) { + if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); mq_put_mnt(ns); -- cgit v1.2.3-71-gd317 From 9405c03ee778cd3e353e55fff6e16dfdd9609c02 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 8 Sep 2017 16:17:45 -0700 Subject: ipc: convert kern_ipc_perm.refcount from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/1499417992-3238-4-git-send-email-elena.reshetova@intel.com Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Alexey Dobriyan Cc: Serge Hallyn Cc: Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc.h | 3 ++- ipc/util.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index fadd579d577d..ae68980e9d48 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -4,6 +4,7 @@ #include #include #include +#include #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ @@ -22,7 +23,7 @@ struct kern_ipc_perm { void *security; struct rcu_head rcu; - atomic_t refcount; + refcount_t refcount; } ____cacheline_aligned_in_smp __randomize_layout; #endif /* _LINUX_IPC_H */ diff --git a/ipc/util.c b/ipc/util.c index 1a2cb02467ab..069bb22c9f64 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -232,7 +232,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) idr_preload(GFP_KERNEL); - atomic_set(&new->refcount, 1); + refcount_set(&new->refcount, 1); spin_lock_init(&new->lock); new->deleted = false; rcu_read_lock(); @@ -397,13 +397,13 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) int ipc_rcu_getref(struct kern_ipc_perm *ptr) { - return atomic_inc_not_zero(&ptr->refcount); + return refcount_inc_not_zero(&ptr->refcount); } void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)) { - if (!atomic_dec_and_test(&ptr->refcount)) + if (!refcount_dec_and_test(&ptr->refcount)) return; call_rcu(&ptr->rcu, func); -- cgit v1.2.3-71-gd317 From 0cfb6aee70bddbef6ec796b255f588ce0e126766 Mon Sep 17 00:00:00 2001 From: Guillaume Knispel Date: Fri, 8 Sep 2017 16:17:55 -0700 Subject: ipc: optimize semget/shmget/msgget for lots of keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ipc_findkey() used to scan all objects to look for the wanted key. 
This is slow when using a high number of keys. This change adds an rhashtable of kern_ipc_perm objects in ipc_ids, so that one lookup ceases to be O(n).

This change gives an 865% improvement in the reaim.jobs_per_min benchmark on a 56-thread Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz with 256G of memory [1]

Other (more micro) benchmark results, by the author:

On an i5 laptop, the following loop executed right after a reboot took, without and with this change:

    for (int i = 0, k = 0x424242; i < KEYS; ++i)
        semget(k++, 1, IPC_CREAT | 0600);

                        total            total       max single    max single
    KEYS              without             with     call without     call with
       1                  3.5              4.9 µs           3.5           4.9
      10                  7.6              8.6 µs           3.7           4.7
      32                 16.2             15.9 µs           4.3           5.3
     100                 72.9             41.8 µs           3.7           4.7
    1000              5,630.0            502.0 µs             *             *
   10000          1,340,000.0          7,240.0 µs             *             *
   31900         17,600,000.0         22,200.0 µs             *             *

    *: unreliable measure: high variance

The duration for a lookup-only usage was obtained by the same loop once the keys are present:

                        total            total       max single    max single
    KEYS              without             with     call without     call with
       1                  2.1              2.5 µs           2.1           2.5
      10                  4.5              4.8 µs           2.2           2.3
      32                 13.0             10.8 µs           2.3           2.8
     100                 82.9             25.1 µs             *           2.3
    1000              5,780.0            217.0 µs             *             *
   10000          1,470,000.0          2,520.0 µs             *             *
   31900         17,400,000.0          7,810.0 µs             *             *

Finally, executing each semget() in a new process gave, when still summing only the durations of these syscalls:

creation:
                        total            total
    KEYS              without             with
       1                  3.7              5.0 µs
      10                 32.9             36.7 µs
      32                125.0            109.0 µs
     100                523.0            353.0 µs
    1000             20,300.0          3,280.0 µs
   10000          2,470,000.0         46,700.0 µs
   31900         27,800,000.0        219,000.0 µs

lookup-only:
                        total            total
    KEYS              without             with
       1                  2.5              2.7 µs
      10                 25.4             24.4 µs
      32                106.0             72.6 µs
     100                591.0            352.0 µs
    1000             22,400.0          2,250.0 µs
   10000          2,510,000.0         25,700.0 µs
   31900         28,200,000.0        115,000.0 µs

[1] http://lkml.kernel.org/r/20170814060507.GE23258@yexl-desktop

Link: http://lkml.kernel.org/r/20170815194954.ck32ta2z35yuzpwp@debix Signed-off-by: Guillaume Knispel Reviewed-by: Marc Pardo Cc: Davidlohr Bueso Cc: Kees Cook Cc: Manfred Spraul Cc: Alexey Dobriyan Cc: "Eric W.
Biederman" Cc: "Peter Zijlstra (Intel)" Cc: Ingo Molnar Cc: Sebastian Andrzej Siewior Cc: Serge Hallyn Cc: Andrey Vagin Cc: Guillaume Knispel Cc: Marc Pardo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc.h | 3 ++ include/linux/ipc_namespace.h | 3 ++ ipc/msg.c | 10 +++-- ipc/namespace.c | 20 +++++++-- ipc/sem.c | 11 +++-- ipc/shm.c | 12 ++--- ipc/util.c | 100 ++++++++++++++++++++++++++++++++---------- ipc/util.h | 21 +++++---- 8 files changed, 130 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index ae68980e9d48..92a2ccff80c5 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -22,6 +23,8 @@ struct kern_ipc_perm { unsigned long seq; void *security; + struct rhash_head khtnode; + struct rcu_head rcu; refcount_t refcount; } ____cacheline_aligned_in_smp __randomize_layout; diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e81445cc7c57..83f0bf7a587d 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -8,15 +8,18 @@ #include #include #include +#include struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; + bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; int next_id; + struct rhashtable key_ht; }; struct ipc_namespace { diff --git a/ipc/msg.c b/ipc/msg.c index 2c38f10d1483..df82bc9a5531 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1011,7 +1011,7 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, } -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; @@ -1019,7 +1019,7 @@ void msg_init_ns(struct ipc_namespace *ns) atomic_set(&ns->msg_bytes, 0); atomic_set(&ns->msg_hdrs, 0); - ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return ipc_init_ids(&ns->ids[IPC_MSG_IDS]); } #ifdef CONFIG_IPC_NS @@ -1027,6 +1027,7 @@ void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); + rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); } #endif @@ -1058,11 +1059,12 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) } #endif -void __init msg_init(void) +int __init msg_init(void) { - msg_init_ns(&init_ipc_ns); + const int err = msg_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); + return err; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 7af6e6b883b9..fc850c526698 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -54,16 +54,28 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - err = mq_init_ns(ns); + err = sem_init_ns(ns); if (err) goto fail_put; + err = msg_init_ns(ns); + if (err) + goto fail_destroy_sem; + err = shm_init_ns(ns); + if (err) + goto fail_destroy_msg; - sem_init_ns(ns); - msg_init_ns(ns); - shm_init_ns(ns); + err = mq_init_ns(ns); + if (err) + goto fail_destroy_shm; return ns; +fail_destroy_shm: + shm_exit_ns(ns); +fail_destroy_msg: + msg_exit_ns(ns); +fail_destroy_sem: + sem_exit_ns(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); diff --git a/ipc/sem.c b/ipc/sem.c index 4c0cfaad560c..013c7981f3c7 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -183,14 +183,14 @@ static int 
sysvipc_sem_proc_show(struct seq_file *s, void *it); #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] -void sem_init_ns(struct ipc_namespace *ns) +int sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; - ipc_init_ids(&ns->ids[IPC_SEM_IDS]); + return ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS @@ -198,15 +198,18 @@ void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); + rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif -void __init sem_init(void) +int __init sem_init(void) { - sem_init_ns(&init_ipc_ns); + const int err = sem_init_ns(&init_ipc_ns); + ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); + return err; } /** diff --git a/ipc/shm.c b/ipc/shm.c index 8828b4c3a190..8fc97beb5234 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -72,14 +72,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif -void shm_init_ns(struct ipc_namespace *ns) +int shm_init_ns(struct ipc_namespace *ns) { ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_rmid_forced = 0; ns->shm_tot = 0; - ipc_init_ids(&shm_ids(ns)); + return ipc_init_ids(&shm_ids(ns)); } /* @@ -95,7 +95,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ - shp->shm_perm.key = IPC_PRIVATE; + ipc_set_key_private(&shm_ids(ns), &shp->shm_perm); shm_unlock(shp); } else shm_destroy(ns, shp); @@ -106,13 +106,15 @@ void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); + rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); } #endif static int __init ipc_ns_init(void) { - shm_init_ns(&init_ipc_ns); - return 0; + const int err = shm_init_ns(&init_ipc_ns); + WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err); + return err; } pure_initcall(ipc_ns_init); diff --git a/ipc/util.c b/ipc/util.c index 069bb22c9f64..78755873cc5b 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -83,27 +83,46 @@ struct ipc_proc_iface { */ static int __init ipc_init(void) { - sem_init(); - msg_init(); + int err_sem, err_msg; + + err_sem = sem_init(); + WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem); + err_msg = msg_init(); + WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg); shm_init(); - return 0; + + return err_msg ? err_msg : err_sem; } device_initcall(ipc_init); +static const struct rhashtable_params ipc_kht_params = { + .head_offset = offsetof(struct kern_ipc_perm, khtnode), + .key_offset = offsetof(struct kern_ipc_perm, key), + .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key), + .locks_mul = 1, + .automatic_shrinking = true, +}; + /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below IPCMNI) then initialise the ids idr. + * below IPCMNI) then initialise the keys hashtable and ids idr. 
*/ -void ipc_init_ids(struct ipc_ids *ids) +int ipc_init_ids(struct ipc_ids *ids) { + int err; ids->in_use = 0; ids->seq = 0; ids->next_id = -1; init_rwsem(&ids->rwsem); + err = rhashtable_init(&ids->key_ht, &ipc_kht_params); + if (err) + return err; idr_init(&ids->ipcs_idr); + ids->tables_initialized = true; + return 0; } #ifdef CONFIG_PROC_FS @@ -147,28 +166,20 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * - * Called with ipc_ids.rwsem held. + * Called with writer ipc_ids.rwsem held. */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { - struct kern_ipc_perm *ipc; - int next_id; - int total; - - for (total = 0, next_id = 0; total < ids->in_use; next_id++) { - ipc = idr_find(&ids->ipcs_idr, next_id); - - if (ipc == NULL) - continue; + struct kern_ipc_perm *ipcp = NULL; - if (ipc->key != key) { - total++; - continue; - } + if (likely(ids->tables_initialized)) + ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, + ipc_kht_params); + if (ipcp) { rcu_read_lock(); - ipc_lock_object(ipc); - return ipc; + ipc_lock_object(ipcp); + return ipcp; } return NULL; @@ -221,13 +232,13 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) { kuid_t euid; kgid_t egid; - int id; + int id, err; int next_id = ids->next_id; if (size > IPCMNI) size = IPCMNI; - if (ids->in_use >= size) + if (!ids->tables_initialized || ids->in_use >= size) return -ENOSPC; idr_preload(GFP_KERNEL); @@ -246,6 +257,15 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) (next_id < 0) ? 0 : ipcid_to_idx(next_id), 0, GFP_NOWAIT); idr_preload_end(); + + if (id >= 0 && new->key != IPC_PRIVATE) { + err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, + ipc_kht_params); + if (err < 0) { + idr_remove(&ids->ipcs_idr, id); + id = err; + } + } if (id < 0) { spin_unlock(&new->lock); rcu_read_unlock(); @@ -377,6 +397,20 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, return err; } +/** + * ipc_kht_remove - remove an ipc from the key hashtable + * @ids: ipc identifier set + * @ipcp: ipc perm structure containing the key to remove + * + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held + * before this function is called, and remain locked on the exit. + */ +static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) +{ + if (ipcp->key != IPC_PRIVATE) + rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params); +} /** * ipc_rmid - remove an ipc identifier @@ -391,10 +425,25 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) int lid = ipcid_to_idx(ipcp->id); idr_remove(&ids->ipcs_idr, lid); + ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; } +/** + * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE + * @ids: ipc identifier set + * @ipcp: ipc perm structure containing the key to modify + * + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held + * before this function is called, and remain locked on the exit. 
+ */ +void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) +{ + ipc_kht_remove(ids, ipcp); + ipcp->key = IPC_PRIVATE; +} + int ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); @@ -485,7 +534,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) } /** - * ipc_obtain_object + * ipc_obtain_object_idr * @ids: ipc identifier set * @id: ipc id to look for * @@ -499,6 +548,9 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) struct kern_ipc_perm *out; int lid = ipcid_to_idx(id); + if (unlikely(!ids->tables_initialized)) + return ERR_PTR(-EINVAL); + out = idr_find(&ids->ipcs_idr, lid); if (!out) return ERR_PTR(-EINVAL); diff --git a/ipc/util.h b/ipc/util.h index c692010e6f0a..80c9f51c3f07 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -15,8 +15,8 @@ #define SEQ_MULTIPLIER (IPCMNI) -void sem_init(void); -void msg_init(void); +int sem_init(void); +int msg_init(void); void shm_init(void); struct ipc_namespace; @@ -30,17 +30,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_SYSVIPC -void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); -void shm_init_ns(struct ipc_namespace *ns); +int sem_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); +int shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else -static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } -static inline void shm_init_ns(struct ipc_namespace *ns) { } +static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } +static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; } static inline void sem_exit_ns(struct ipc_namespace *ns) { } static inline void msg_exit_ns(struct ipc_namespace *ns) { } @@ -79,7 +79,7 @@ struct ipc_ops { struct seq_file; struct ipc_ids; -void ipc_init_ids(struct ipc_ids *); +int ipc_init_ids(struct ipc_ids *); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)); @@ -104,6 +104,9 @@ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); +/* must be called with both locks acquired. */ +void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); + /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); -- cgit v1.2.3-71-gd317
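For reference, the semget() micro-benchmark loop quoted in the changelog above can be wrapped into a small standalone userspace program roughly as follows. Only the loop itself comes from the changelog; the argument handling, timing, and cleanup harness around it is an illustration, not the author's original test code.

    /* Standalone sketch of the changelog's semget() micro-benchmark.
     * Build: gcc -O2 -o sembench sembench.c
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    int main(int argc, char **argv)
    {
            int KEYS = (argc > 1) ? atoi(argv[1]) : 1000;
            struct timespec t0, t1;

            clock_gettime(CLOCK_MONOTONIC, &t0);
            for (int i = 0, k = 0x424242; i < KEYS; ++i)
                    semget(k++, 1, IPC_CREAT | 0600);
            clock_gettime(CLOCK_MONOTONIC, &t1);

            printf("%d keys: %.1f us total\n", KEYS,
                   (t1.tv_sec - t0.tv_sec) * 1e6 +
                   (t1.tv_nsec - t0.tv_nsec) / 1e3);

            /* Remove the semaphore sets so repeated runs start from scratch. */
            for (int i = 0, k = 0x424242; i < KEYS; ++i) {
                    int id = semget(k++, 1, 0600);
                    if (id >= 0)
                            semctl(id, 0, IPC_RMID);
            }
            return 0;
    }

Skipping the cleanup loop and running the program a second time approximates the lookup-only measurement, since semget() with IPC_CREAT on an already existing key only has to find it.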