From b67bf49ce7aae72f63739abee6ac25f64bf20081 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:21:52 -0800 Subject: mm/munlock: delete FOLL_MLOCK and FOLL_POPULATE If counting page mlocks, we must not double-count: follow_page_pte() can tell if a page has already been Mlocked or not, but cannot tell if a pte has already been counted or not: that will have to be done when the pte is mapped in (which lru_cache_add_inactive_or_unevictable() already tracks for new anon pages, but there's no such tracking yet for others). Delete all the FOLL_MLOCK code - faulting in the missing pages will do all that is necessary, without special mlock_vma_page() calls from here. But then FOLL_POPULATE turns out to serve no purpose - it was there so that its absence would tell faultin_page() not to fault in a page when setting up VM_LOCKONFAULT areas; but if there's no special work needed here for mlock, then there's no work at all here for VM_LOCKONFAULT. Have I got that right? I've not looked into the history, but see that FOLL_POPULATE goes back before VM_LOCKONFAULT: did it serve a different purpose before? Ah, yes, it was used to skip the old stack guard page. And is it intentional that COW is not broken on existing pages when setting up a VM_LOCKONFAULT area? I can see that being argued either way, and have no reason to disagree with current behaviour. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..74ee50c2033b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2925,13 +2925,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ -- cgit v1.2.3-71-gd317 From 5c3f1f9cc4cbbf491233982b5975ae2d284de5df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: mm: remove the __KERNEL__ guard from <linux/mm.h> __KERNEL__ ifdefs don't make sense outside of include/uapi/.
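
[Note: a minimal sketch of why __KERNEL__ guards belong only in exported headers; the header and names below are hypothetical, not part of this series.]

	#ifndef _UAPI_EXAMPLE_H	/* hypothetical include/uapi/ header */
	#define _UAPI_EXAMPLE_H

	#include <linux/types.h>

	struct example_arg {	/* layout shared with userspace */
		__u32 flags;
	};

	#ifdef __KERNEL__
	/*
	 * Only the kernel build sees this; the exported copy of the header
	 * compiles with __KERNEL__ undefined. A header that is never
	 * exported is only ever built with __KERNEL__ defined, so such a
	 * guard there is dead weight.
	 */
	int example_handle(struct example_arg *arg);
	#endif

	#endif /* _UAPI_EXAMPLE_H */
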
Link: https://lkml.kernel.org/r/20220210072828.2930359-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian König Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 74ee50c2033b..2cca8cd30186 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3,9 +3,6 @@ #define _LINUX_MM_H #include - -#ifdef __KERNEL__ - #include #include #include @@ -3379,5 +3376,4 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif -#endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3-71-gd317 From 75e55d8a107edb2fd6e02b1fa8a81531209cda04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: mm: move free_devmap_managed_page to memremap.c free_devmap_managed_page() has nothing to do with the code in swap.c; move it to live with the rest of the code for devmap handling. Link: https://lkml.kernel.org/r/20220210072828.2930359-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian König Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 1 - mm/memremap.c | 21 +++++++++++++++++++++ mm/swap.c | 23 ----------------------- 3 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2cca8cd30186..a9d6473fc045 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1092,7 +1092,6 @@ static inline bool is_zone_movable_page(const struct page *page) } #ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool page_is_devmap_managed(struct page *page) diff --git a/mm/memremap.c b/mm/memremap.c index 5f04a0709e43..55d23e9f5c04 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -501,4 +501,25 @@ void free_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } + +void put_devmap_managed_page(struct page *page) +{ + int count; + + if (WARN_ON_ONCE(!page_is_devmap_managed(page))) + return; + + count = page_ref_dec_return(page); + + /* + * devmap page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page.
+ */ + if (count == 1) + free_devmap_managed_page(page); + else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index 842d5cd92cf6..e499df864ef7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1154,26 +1154,3 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void put_devmap_managed_page(struct page *page) -{ - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. - */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); -} -EXPORT_SYMBOL(put_devmap_managed_page); -#endif -- cgit v1.2.3-71-gd317 From 895749455f6054e0c7b40a6ec449a3ab6db51bdd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: mm: simplify freeing of devmap managed pages Make put_devmap_managed_page() return whether or not it took charge of the page, and remove the separate page_is_devmap_managed() helper. Link: https://lkml.kernel.org/r/20220210072828.2930359-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian König Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 34 ++++++++++------------------------ mm/memremap.c | 20 +++++++++----------- mm/swap.c | 10 +--------- 3 files changed, 20 insertions(+), 44 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index a9d6473fc045..8a59f0456149 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1094,33 +1094,24 @@ static inline bool is_zone_movable_page(const struct page *page) #ifdef CONFIG_DEV_PAGEMAP_OPS DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool page_is_devmap_managed(struct page *page) +bool __put_devmap_managed_page(struct page *page); +static inline bool put_devmap_managed_page(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; - switch (page->pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_FS_DAX: - return true; - default: - break; - } - return false; + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + return __put_devmap_managed_page(page); } -void put_devmap_managed_page(struct page *page); - #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool page_is_devmap_managed(struct page *page) +static inline bool put_devmap_managed_page(struct page *page) { return false; } - -static inline void put_devmap_managed_page(struct page *page) -{ -} #endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) @@ -1220,16 +1211,11 @@ static inline void put_page(struct page *page) struct folio *folio = page_folio(page); /* - * For devmap managed pages we need to catch refcount transition from - * 2 to 1, when refcount reach one it means the page is free and we - *
need to inform the device driver through callback. See - include/linux/memremap.h and HMM for details. + * For some devmap managed pages we need to catch refcount transition + * from 2 to 1: */ - if (page_is_devmap_managed(&folio->page)) { - put_devmap_managed_page(&folio->page); + if (put_devmap_managed_page(&folio->page)) return; - } - folio_put(folio); } diff --git a/mm/memremap.c b/mm/memremap.c index 55d23e9f5c04..f41233a67edb 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -502,24 +502,22 @@ void free_devmap_managed_page(struct page *page) page->pgmap->ops->page_free(page); } -void put_devmap_managed_page(struct page *page) +bool __put_devmap_managed_page(struct page *page) { - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - /* * devmap page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - if (count == 1) + switch (page_ref_dec_return(page)) { + case 1: free_devmap_managed_page(page); - else if (!count) + break; + case 0: __put_page(page); + break; + } + return true; } -EXPORT_SYMBOL(put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index e499df864ef7..db8d0eea13d7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -930,16 +930,8 @@ void release_pages(struct page **pages, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - /* - * ZONE_DEVICE pages that return 'false' from - * page_is_devmap_managed() do not require special - * processing, and instead, expect a call to - * put_page_testzero(). - */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (put_devmap_managed_page(page)) continue; - } if (put_page_testzero(page)) put_dev_pagemap(page->pgmap); continue; -- cgit v1.2.3-71-gd317 From dc90f0846df4870b6cc8528c31e5c60f18fb68be Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: mm: don't include <linux/memremap.h> in <linux/mm.h> Move the check for the actual pgmap types that need the free at refcount one behavior into the out of line helper, and thus avoid the need to pull memremap.h into mm.h.
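
[Note: after this change, code that wants the ZONE_DEVICE type predicates includes <linux/memremap.h> itself instead of inheriting it from <linux/mm.h>. A minimal sketch of such a caller; example_uses_fsdax_page() is a hypothetical name, not from this series.]

	#include <linux/memremap.h>
	#include <linux/mm.h>

	static bool example_uses_fsdax_page(const struct page *page)
	{
		/* page->pgmap is only meaningful for ZONE_DEVICE pages */
		return is_zone_device_page(page) &&
		       page->pgmap->type == MEMORY_DEVICE_FS_DAX;
	}
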
Link: https://lkml.kernel.org/r/20220210072828.2930359-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- arch/arm64/mm/mmu.c | 1 + arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + arch/powerpc/mm/book3s64/pgtable.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/drm_cache.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + drivers/gpu/drm/nouveau/nouveau_svm.c | 1 + drivers/infiniband/core/rw.c | 1 + drivers/nvdimm/pmem.h | 1 + drivers/nvme/host/pci.c | 1 + drivers/nvme/target/io-cmd-bdev.c | 1 + fs/fuse/virtio_fs.c | 1 + fs/proc/page.c | 1 + include/linux/memremap.h | 18 ++++++++++++++++++ include/linux/mm.h | 20 -------------------- lib/test_hmm.c | 1 + mm/memcontrol.c | 1 + mm/memremap.c | 6 +++++- 18 files changed, 38 insertions(+), 22 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acfae9b41cc8..580abae6c0b9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e414ca44839f..881951604227 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 79ce3c22a29d..052e6590f84f 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ea68f3b3a4e9..6d643b4b791d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index f19d9acbe959..50b8a088f763 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -27,11 +27,11 @@ /* * Authors: Thomas Hellström */ - #include #include #include #include +#include #include #include diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index e886a3b9e08c..a5cdfbe32b5e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -39,6 +39,7 @@ #include #include +#include #include /* diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 266809e511e2..090b9b47708c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -35,6 +35,7 @@ #include #include #include +#include #include struct nouveau_svm { diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 5a3bd41b331c..4d98f931a13d 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2016 HGST, a Western Digital Company. 
*/ +#include #include #include #include diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a8..1f51a2361429 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -3,6 +3,7 @@ #define __NVDIMM_PMEM_H__ #include #include +#include #include #include #include diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6a99ed680915..ab15bc72710d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 70ca9dfc1771..a141446db1be 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include "nvmet.h" diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 9d737904d07c..86b7dbb6a0d4 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde..a2873a617ae8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1fafcc38acba..514ab46f597e 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ + +#include #include #include #include @@ -129,6 +131,22 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } +static inline bool is_device_private_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; +} + +static inline bool is_pci_p2pdma_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_PCI_P2PDMA) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a59f0456149..cb8bee88e70c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -1101,9 +1100,6 @@ static inline bool put_devmap_managed_page(struct page *page) return false; if (!is_zone_device_page(page)) return false; - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) - return false; return __put_devmap_managed_page(page); } @@ -1114,22 +1110,6 @@ static inline bool put_devmap_managed_page(struct page *page) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool is_device_private_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool is_pci_p2pdma_page(const struct page *page) -{ - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; -} - /* 127: arbitrary random number, small enough to assemble well */ #define 
folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 396beee6b061..e5fc14ba71f3 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c78b9d3b9c04..2c5032294c9f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memremap.c b/mm/memremap.c index f41233a67edb..a0ece2344c2c 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -504,6 +504,10 @@ void free_devmap_managed_page(struct page *page) bool __put_devmap_managed_page(struct page *page) { + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + /* * devmap page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is -- cgit v1.2.3-71-gd317 From 27674ef6c73f0c9096a9827dc5d6ba9fc7808422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: mm: remove the extra ZONE_DEVICE struct page refcount ZONE_DEVICE struct pages have an extra reference count that complicates the code for put_page() and several places in the kernel that need to check the reference count to see that a page is not being used (gup, compaction, migration, etc.). Clean up the code so the reference count doesn't need to be treated specially for ZONE_DEVICE pages. Note that this excludes the special idle page wakeup for fsdax pages, which still happens at refcount 1. This is a separate issue and will be sorted out later. Given that only fsdax pages require the notification when the refcount hits 1 now, the DEV_PAGEMAP_OPS Kconfig symbol can go away and be replaced with a FS_DAX check for this hook in the put_page fastpath. Based on an earlier patch from Ralph Campbell .
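
[Note: the driver-visible effect is that a freshly allocated device page already carries its final reference. A sketch modeled on the driver hunks below; the mydev_* names are hypothetical.]

	static struct page *mydev_alloc_page(struct mydev_drvdata *mdev)
	{
		struct page *page = pfn_to_page(mydev_pick_free_pfn(mdev));

		page->zone_device_data = mdev;
		/*
		 * Before this patch a get_page() was needed here, because
		 * pages came back from the free path with an artificially
		 * elevated base refcount; now free_zone_device_page()
		 * resets the refcount to 1 when the page is freed.
		 */
		lock_page(page);
		return page;
	}
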
Link: https://lkml.kernel.org/r/20220210072828.2930359-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian König Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 - drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 - fs/Kconfig | 1 - include/linux/memremap.h | 12 +++---- include/linux/mm.h | 6 ++-- lib/test_hmm.c | 1 - mm/Kconfig | 4 --- mm/internal.h | 2 ++ mm/memcontrol.c | 11 ++---- mm/memremap.c | 57 +++++++++++--------------------- mm/migrate.c | 6 ---- mm/swap.c | 16 +++------ 13 files changed, 36 insertions(+), 83 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 881951604227..8cabdb39cbbc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -713,7 +713,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - get_page(dpage); lock_page(dpage); return dpage; out_clear: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index cb835f95a76e..e27ca3758762 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -225,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - get_page(page); lock_page(page); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a5cdfbe32b5e..7ba66ad68a8a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - get_page(page); lock_page(page); return page; } diff --git a/fs/Kconfig b/fs/Kconfig index 6c7dc1387beb..e9433bbc4801 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,7 +48,6 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 514ab46f597e..d6a114dd5ea8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -68,9 +68,9 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never - * reach 0 refcount unless there is a refcount bug. This allows the - * device driver to implement its own memory management.) + * Called once the page refcount reaches 0. The reference count will be + * reset to one by the core code after the method is called to prepare + * for handing out the page again.
*/ void (*page_free)(struct page *page); @@ -133,16 +133,14 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) static inline bool is_device_private_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && + return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8bee88e70c..0201d258c646 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1090,7 +1090,7 @@ static inline bool is_zone_movable_page(const struct page *page) return page_zonenum(page) == ZONE_MOVABLE; } -#ifdef CONFIG_DEV_PAGEMAP_OPS +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_page(struct page *page); @@ -1103,12 +1103,12 @@ static inline bool put_devmap_managed_page(struct page *page) return __put_devmap_managed_page(page); } -#else /* CONFIG_DEV_PAGEMAP_OPS */ +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page(struct page *page) { return false; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e5fc14ba71f3..cfe632047839 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -566,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) } dpage->zone_device_data = rpage; - get_page(dpage); lock_page(dpage); return dpage; diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f3..a1901ae6d062 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -776,9 +776,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config DEV_PAGEMAP_OPS - bool - # # Helpers to mirror range of the CPU page tables of a process into device page # tables. @@ -790,7 +787,6 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE - select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device diff --git a/mm/internal.h b/mm/internal.h index 450a2c8a43f3..3756dd5d2c92 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -735,4 +735,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +void free_zone_device_page(struct page *page); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c5032294c9f..8fef072dc1ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5503,17 +5503,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to - * a device and because they are not accessible by CPU they are store - * as special swap entry in the CPU page table. + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. 
*/ if (is_device_private_entry(ent)) { page = pfn_swap_entry_to_page(ent); - /* - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have - * a refcount of 1 when free (unlike normal page) - */ - if (!page_ref_add_unless(page, 1, 1)) + if (!get_page_unless_zero(page)) return NULL; return page; } diff --git a/mm/memremap.c b/mm/memremap.c index a0ece2344c2c..fef5734d5e49 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" static DEFINE_XARRAY(pgmap_array); @@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_DEV_PAGEMAP_OPS +#ifdef CONFIG_FS_DAX DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } #else @@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap) static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ static void pgmap_array_delete(struct range *range) { @@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) -{ - if (pfn % (1024 << pgmap->vmemmap_shift)) - cond_resched(); - return pfn + pgmap_vmemmap_nr(pgmap); -} - static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { return (pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ - pfn = pfn_next(map, pfn)) - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) void memunmap_pages(struct dev_pagemap *pgmap) { - unsigned long pfn; int i; percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) - for_each_device_pfn(pfn, pgmap, i) - put_page(pfn_to_page(pfn)); + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); wait_for_completion(&pgmap->done); percpu_ref_exit(&pgmap->ref); @@ -464,14 +450,10 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page) +void free_zone_device_page(struct page *page) { - /* notify page idle for dax */ - if (!is_device_private_page(page)) { - wake_up_var(&page->_refcount); + if (WARN_ON_ONCE(!is_device_private_page(page))) return; - } __ClearPageWaiters(page); @@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page) */ page->mapping = NULL; page->pgmap->ops->page_free(page); + + /* + * Reset the page count to 1 to prepare for handing out the page again. 
+ */ + set_page_count(page, 1); } +#ifdef CONFIG_FS_DAX bool __put_devmap_managed_page(struct page *page) { - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* - * devmap page refcounts are 1-based, rather than 0-based: if + * fsdax page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - switch (page_ref_dec_return(page)) { - case 1: - free_devmap_managed_page(page); - break; - case 0: - __put_page(page); - break; - } + if (page_ref_dec_return(page) == 1) + wake_up_var(&page->_refcount); return true; } EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index e7d0b68d5dcb..af0534de618a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -338,14 +338,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; - /* - * Device private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); - return expected_count; } diff --git a/mm/swap.c b/mm/swap.c index db8d0eea13d7..fc3b7989f5b2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -122,17 +122,9 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { - if (is_zone_device_page(page)) { - put_dev_pagemap(page->pgmap); - - /* - * The page belongs to the device that created pgmap. Do - * not return it to page allocator. - */ - return; - } - - if (unlikely(PageCompound(page))) + if (unlikely(is_zone_device_page(page))) + free_zone_device_page(page); + else if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); @@ -933,7 +925,7 @@ void release_pages(struct page **pages, int nr) if (put_devmap_managed_page(page)) continue; if (put_page_testzero(page)) - put_dev_pagemap(page->pgmap); + free_zone_device_page(page); continue; } -- cgit v1.2.3-71-gd317 From 4c65422901154766e5cee17875ed680366a4a141 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Jan 2022 13:45:25 -0500 Subject: mm/gup: Remove an assumption of a contiguous memmap This assumption needs the inverse of nth_page(), which is temporarily named page_nth() until it's renamed later in this series. 
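
[Note: page_nth() undoes nth_page(), so page_nth(head, nth_page(head, n)) == n whether or not the memmap is contiguous. A sketch of the arithmetic the gup.c hunk relies on; the helper name is illustrative.]

	static unsigned int pages_left_in_compound(struct page *head,
						   struct page *tail)
	{
		/*
		 * How many pages remain in this compound page starting at
		 * tail, without assuming tail == head + n.
		 */
		return compound_nr(head) - page_nth(head, tail);
	}
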
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 2 ++ mm/gup.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0201d258c646..e3f8755f65ed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -212,8 +212,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +#define page_nth(head, tail) (page_to_pfn(tail) - page_to_pfn(head)) #else #define nth_page(page,n) ((page) + (n)) +#define page_nth(head, tail) ((tail) - (head)) #endif /* to align the pointer to the (next) page boundary */ diff --git a/mm/gup.c b/mm/gup.c index d585aa06afb2..ad120f470735 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -261,8 +261,8 @@ static inline struct page *compound_range_next(struct page *start, next = nth_page(start, i); page = compound_head(next); if (PageHead(page)) - nr = min_t(unsigned int, - page + compound_nr(page) - next, npages - i); + nr = min_t(unsigned int, npages - i, + compound_nr(page) - page_nth(page, next)); *ntails = nr; return page; -- cgit v1.2.3-71-gd317 From 5232c63f46fdd779303527ec36c518cc1e9c6b4e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Jan 2022 16:46:43 -0500 Subject: mm: Make compound_pincount always available Move compound_pincount from the third page to the second page, which means it's available for all compound pages. That lets us delete hpage_pincount_available(). On 32-bit systems, there isn't enough space for both compound_pincount and compound_nr in the second page (it would collide with page->private, which is in use for pages in the swap cache), so revert the optimisation of storing both compound_order and compound_nr on 32-bit systems. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- Documentation/core-api/pin_user_pages.rst | 18 +++++++++--------- include/linux/mm.h | 21 ++++++++------------- include/linux/mm_types.h | 7 +++++-- mm/debug.c | 14 ++++---------- mm/gup.c | 20 +++++++++----------- mm/hugetlb.c | 4 ++++ mm/page_alloc.c | 3 +-- mm/rmap.c | 6 ++---- 8 files changed, 42 insertions(+), 51 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index fcf605be43d0..b18416f4500f 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -55,18 +55,18 @@ flags the caller provides. The caller is required to pass in a non-null struct pages* array, and the function then pins pages by incrementing each by a special value: GUP_PIN_COUNTING_BIAS. -For huge pages (and in fact, any compound page of more than 2 pages), the -GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting -is achieved, by using the 3rd struct page in the compound page. A new struct -page field, hpage_pinned_refcount, has been added in order to support this. +For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, +an exact form of pin counting is achieved, by using the 2nd struct page +in the compound page. A new struct page field, compound_pincount, has +been added in order to support this. 
This approach for compound pages avoids the counting upper limit problems that are discussed below. Those limitations would have been aggravated severely by huge pages, because each tail page adds a refcount to the head page. And in -fact, testing revealed that, without a separate hpage_pinned_refcount field, +fact, testing revealed that, without a separate compound_pincount field, page overflows were seen in some huge page stress tests. -This also means that huge pages and compound pages (of order > 1) do not suffer +This also means that huge pages and compound pages do not suffer from the false positives problem that is mentioned below.:: Function @@ -264,9 +264,9 @@ place.) Other diagnostics ================= -dump_page() has been enhanced slightly, to handle these new counting fields, and -to better report on compound pages in general. Specifically, for compound pages -with order > 1, the exact (hpage_pinned_refcount) pincount is reported. +dump_page() has been enhanced slightly, to handle these new counting +fields, and to better report on compound pages in general. Specifically, +for compound pages, the exact (compound_pincount) pincount is reported. References ========== diff --git a/include/linux/mm.h b/include/linux/mm.h index e3f8755f65ed..c64bd0b67d75 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -887,17 +887,6 @@ static inline void destroy_compound_page(struct page *page) compound_page_dtors[page[1].compound_dtor](page); } -static inline bool hpage_pincount_available(struct page *page) -{ - /* - * Can the page->hpage_pinned_refcount field be used? That field is in - * the 3rd page of the compound page, so the smallest (2-page) compound - * pages cannot support it. - */ - page = compound_head(page); - return PageCompound(page) && compound_order(page) > 1; -} - static inline int head_compound_pincount(struct page *head) { return atomic_read(compound_pincount_ptr(head)); @@ -905,7 +894,7 @@ static inline int head_compound_pincount(struct page *head) static inline int compound_pincount(struct page *page) { - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); + VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); return head_compound_pincount(page); } @@ -913,7 +902,9 @@ static inline int compound_pincount(struct page *page) static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; +#ifdef CONFIG_64BIT page[1].compound_nr = 1U << order; +#endif } /* Returns the number of pages in this potentially compound page. */ @@ -921,7 +912,11 @@ static inline unsigned long compound_nr(struct page *page) { if (!PageHead(page)) return 1; +#ifdef CONFIG_64BIT return page[1].compound_nr; +#else + return 1UL << compound_order(page); +#endif } /* Returns the number of bytes in this potentially compound page. 
*/ @@ -1269,7 +1264,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages); */ static inline bool page_maybe_dma_pinned(struct page *page) { - if (hpage_pincount_available(page)) + if (PageCompound(page)) return compound_pincount(page) > 0; /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 475bdb282769..0e274c9b934e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -135,11 +135,14 @@ struct page { unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; + atomic_t compound_pincount; +#ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ +#endif }; struct { /* Second tail page of compound page */ unsigned long _compound_pad_1; /* compound_head */ - atomic_t hpage_pinned_refcount; + unsigned long _compound_pad_2; /* For both global and memcg */ struct list_head deferred_list; }; @@ -300,7 +303,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page) static inline atomic_t *compound_pincount_ptr(struct page *page) { - return &page[2].hpage_pinned_refcount; + return &page[1].compound_pincount; } /* diff --git a/mm/debug.c b/mm/debug.c index bc9ac87f0e08..c4cf44266430 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -92,16 +92,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - if (hpage_pincount_available(page)) { - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", - head, compound_order(head), - head_compound_mapcount(head), - head_compound_pincount(head)); - } else { - pr_warn("head:%p order:%u compound_mapcount:%d\n", - head, compound_order(head), - head_compound_mapcount(head)); - } + pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + head, compound_order(head), + head_compound_mapcount(head), + head_compound_pincount(head)); } #ifdef CONFIG_MEMCG diff --git a/mm/gup.c b/mm/gup.c index 1809dc037a8e..56b6b01a430b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -99,12 +99,11 @@ retry: * * FOLL_GET: page's refcount will be incremented by @refs. * - * FOLL_PIN on compound pages that are > two pages long: page's refcount will - * be incremented by @refs, and page[2].hpage_pinned_refcount will be - * incremented by @refs * GUP_PIN_COUNTING_BIAS. + * FOLL_PIN on compound pages: page's refcount will be incremented by + * @refs, and page[1].compound_pincount will be incremented by @refs. * - * FOLL_PIN on normal pages, or compound pages that are two pages long: - * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. + * FOLL_PIN on normal pages: page's refcount will be incremented by + * @refs * GUP_PIN_COUNTING_BIAS. * * Return: head page (with refcount appropriately incremented) for success, or * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's @@ -135,16 +134,15 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, return NULL; /* - * When pinning a compound page of order > 1 (which is - * what hpage_pincount_available() checks for), use an - * exact count to track it. + * When pinning a compound page, use an exact count to + * track it. * * However, be sure to *also* increment the normal page * refcount field at least once, so that the page really * is pinned. That's why the refcount from the earlier * try_get_compound_head() is left intact. 
*/ - if (hpage_pincount_available(page)) + if (PageHead(page)) atomic_add(refs, compound_pincount_ptr(page)); else page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); @@ -166,7 +164,7 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) if (flags & FOLL_PIN) { mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, refs); - if (hpage_pincount_available(page)) + if (PageHead(page)) atomic_sub(refs, compound_pincount_ptr(page)); else refs *= GUP_PIN_COUNTING_BIAS; @@ -211,7 +209,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) * increment the normal page refcount field at least once, * so that the page really is pinned. */ - if (hpage_pincount_available(page)) { + if (PageHead(page)) { page_ref_add(page, 1); atomic_add(1, compound_pincount_ptr(page)); } else { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 43fb3155298e..785d6e340292 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1320,7 +1320,9 @@ static void __destroy_compound_gigantic_page(struct page *page, } set_compound_order(page, 0); +#ifdef CONFIG_64BIT page[1].compound_nr = 0; +#endif __ClearPageHead(page); } @@ -1812,7 +1814,9 @@ out_error: for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) __ClearPageReserved(p); set_compound_order(page, 0); +#ifdef CONFIG_64BIT page[1].compound_nr = 0; +#endif __ClearPageHead(page); return false; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d31..02283598fd14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -734,8 +734,7 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); } static void prep_compound_tail(struct page *head, int tail_idx) diff --git a/mm/rmap.c b/mm/rmap.c index c7921c102bc0..1a13d5d6cfc7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1219,8 +1219,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { @@ -2353,8 +2352,7 @@ void hugepage_add_new_anon_rmap(struct page *page, { BUG_ON(address < vma->vm_start || address >= vma->vm_end); atomic_set(compound_mapcount_ptr(page), 0); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); __page_set_anon_rmap(page, vma, address, 1); } -- cgit v1.2.3-71-gd317 From 3d11b225aeb184bc3dc9b4b27b302815a7c531aa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Dec 2021 18:28:58 -0500 Subject: mm: Add folio_pincount_ptr() This is the folio equivalent of compound_pincount_ptr(). 
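
[Note: a sketch of reading the exact pin count through the new helper; example_folio_pincount() is illustrative and only meaningful for large folios, which keep the count in their second page.]

	static int example_folio_pincount(struct folio *folio)
	{
		if (!folio_test_large(folio))
			return -EINVAL;	/* small folios use the biased refcount */
		return atomic_read(folio_pincount_ptr(folio));
	}
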
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c64bd0b67d75..c45739dfdd04 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1544,6 +1544,11 @@ static inline unsigned long folio_pfn(struct folio *folio) return page_to_pfn(&folio->page); } +static inline atomic_t *folio_pincount_ptr(struct folio *folio) +{ + return &folio_page(folio, 1)->compound_pincount; +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) -- cgit v1.2.3-71-gd317 From 0b90ddae13441c43a30d2e2689b8193a81891c92 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 Dec 2021 18:40:41 -0500 Subject: mm: Turn page_maybe_dma_pinned() into folio_maybe_dma_pinned() Replace three calls to compound_head() with one. This removes the last user of compound_pincount(), so remove that helper too. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 129 ++++++++++++++++++++++++++--------------------------- 1 file changed, 63 insertions(+), 66 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c45739dfdd04..35e453ac5c0f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -892,13 +892,6 @@ static inline int head_compound_pincount(struct page *head) return atomic_read(compound_pincount_ptr(head)); } -static inline int compound_pincount(struct page *page) -{ - VM_BUG_ON_PAGE(!PageCompound(page), page); - page = compound_head(page); - return head_compound_pincount(page); -} - static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; @@ -1236,70 +1229,11 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); -/** - * page_maybe_dma_pinned - Report if a page is pinned for DMA. - * @page: The page. - * - * This function checks if a page has been pinned via a call to - * a function in the pin_user_pages() family. - * - * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, - * because it means "definitely not pinned for DMA", but true means "probably - * pinned for DMA, but possibly a false positive due to having at least - * GUP_PIN_COUNTING_BIAS worth of normal page references". - * - * False positives are OK, because: a) it's unlikely for a page to get that many - * refcounts, and b) all the callers of this routine are expected to be able to - * deal gracefully with a false positive. - * - * For huge pages, the result will be exactly correct. That's because we have - * more tracking data available: the 3rd struct page in the compound page is - * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS - * scheme). - * - * For more information, please see Documentation/core-api/pin_user_pages.rst. - * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. 
- */ -static inline bool page_maybe_dma_pinned(struct page *page) -{ - if (PageCompound(page)) - return compound_pincount(page) > 0; - - /* - * page_ref_count() is signed. If that refcount overflows, then - * page_ref_count() returns a negative value, and callers will avoid - * further incrementing the refcount. - * - * Here, for that overflow case, use the signed bit to count a little - * bit higher via unsigned math, and thus still get an accurate result. - */ - return ((unsigned int)page_ref_count(compound_head(page))) >= - GUP_PIN_COUNTING_BIAS; -} - static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } -/* - * This should most likely only be called during fork() to see whether we - * should break the cow immediately for a page on the src mm. - */ -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) -{ - if (!is_cow_mapping(vma->vm_flags)) - return false; - - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) - return false; - - return page_maybe_dma_pinned(page); -} - #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1549,6 +1483,69 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) return &folio_page(folio, 1)->compound_pincount; } +/** + * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. + * @folio: The folio. + * + * This function checks if a folio has been pinned via a call to + * a function in the pin_user_pages() family. + * + * For small folios, the return value is partially fuzzy: false is not fuzzy, + * because it means "definitely not pinned for DMA", but true means "probably + * pinned for DMA, but possibly a false positive due to having at least + * GUP_PIN_COUNTING_BIAS worth of normal folio references". + * + * False positives are OK, because: a) it's unlikely for a folio to + * get that many refcounts, and b) all the callers of this routine are + * expected to be able to deal gracefully with a false positive. + * + * For large folios, the result will be exactly correct. That's because + * we have more tracking data available: the compound_pincount is used + * instead of the GUP_PIN_COUNTING_BIAS scheme. + * + * For more information, please see Documentation/core-api/pin_user_pages.rst. + * + * Return: True, if it is likely that the page has been "dma-pinned". + * False, if the page is definitely not dma-pinned. + */ +static inline bool folio_maybe_dma_pinned(struct folio *folio) +{ + if (folio_test_large(folio)) + return atomic_read(folio_pincount_ptr(folio)) > 0; + + /* + * folio_ref_count() is signed. If that refcount overflows, then + * folio_ref_count() returns a negative value, and callers will avoid + * further incrementing the refcount. + * + * Here, for that overflow case, use the sign bit to count a little + * bit higher via unsigned math, and thus still get an accurate result. + */ + return ((unsigned int)folio_ref_count(folio)) >= + GUP_PIN_COUNTING_BIAS; +} + +static inline bool page_maybe_dma_pinned(struct page *page) +{ + return folio_maybe_dma_pinned(page_folio(page)); +} + +/* + * This should most likely only be called during fork() to see whether we + * should break the cow immediately for a page on the src mm. 
+ */ +static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, + struct page *page) +{ + if (!is_cow_mapping(vma->vm_flags)) + return false; + + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + return false; + + return page_maybe_dma_pinned(page); +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) -- cgit v1.2.3-71-gd317 From 40fcc7fc2c3838f3afe07a3a72709b45566e6cdb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 29 Dec 2021 12:23:55 -0500 Subject: mm: Remove page_cache_add_speculative() and page_cache_get_speculative() These wrappers have no more callers, so delete them. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 7 +++---- include/linux/pagemap.h | 10 ---------- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35e453ac5c0f..b764057022c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1215,10 +1215,9 @@ static inline void put_page(struct page *page) * applications that don't have huge page reference counts, this won't be an * issue. * - * Locking: the lockless algorithm described in page_cache_get_speculative() - * and page_cache_gup_pin_speculative() provides safe operation for - * get_user_pages and page_mkclean and other calls that race to set up page - * table entries. + * Locking: the lockless algorithm described in folio_try_get_rcu() + * provides safe operation for get_user_pages(), page_mkclean() and + * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 270bf5136c34..cdb3f118603a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -283,16 +283,6 @@ static inline struct inode *folio_inode(struct folio *folio) return folio->mapping->host; } -static inline bool page_cache_add_speculative(struct page *page, int count) -{ - return folio_ref_try_add_rcu((struct folio *)page, count); -} - -static inline bool page_cache_get_speculative(struct page *page) -{ - return page_cache_add_speculative(page, 1); -} - /** * folio_attach_private - Attach private data to a folio. * @folio: Folio to attach data to. -- cgit v1.2.3-71-gd317 From 822951d84684d7a0c4f45e7231c960e7fe786d8f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 8 Jan 2022 00:15:04 -0500 Subject: mm/hugetlb: Use try_grab_folio() instead of try_grab_compound_head() follow_hugetlb_page() only cares about success or failure, so it doesn't need to know the type of the returned pointer, only whether it's NULL or not. 
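
[Note: since only NULL-ness matters to follow_hugetlb_page(), the call collapses to a boolean test; a sketch under that assumption, with example_grab() being a hypothetical wrapper.]

	static bool example_grab(struct page *page, int refs, unsigned int flags)
	{
		/* A NULL return means the refcount would have overflowed. */
		return try_grab_folio(page, refs, flags) != NULL;
	}
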
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 3 --- mm/gup.c | 2 +- mm/hugetlb.c | 7 +++---- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index b764057022c8..dca5c99395c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1124,9 +1124,6 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); - static inline __must_check bool try_get_page(struct page *page) { diff --git a/mm/gup.c b/mm/gup.c index cbbddcf8ff3f..014004102e26 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) return NULL; } -struct page *try_grab_compound_head(struct page *page, +static inline struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags) { return &try_grab_folio(page, refs, flags)->page; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 785d6e340292..10203f3b1ccf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6076,7 +6076,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pages) { /* - * try_grab_compound_head() should always succeed here, + * try_grab_folio() should always succeed here, * because: a) we hold the ptl lock, and b) we've just * checked that the huge page is present in the page * tables. If the huge page is present, then the tail @@ -6085,9 +6085,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * any way. So this page must be available at this * point, unless the page refcount overflowed: */ - if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], - refs, - flags))) { + if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, + flags))) { spin_unlock(ptl); remainder = 0; err = -ENOMEM; -- cgit v1.2.3-71-gd317 From 659508f9c936aa6e3aaf6e9cf6a4a8836b8f8355 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 23 Dec 2021 10:20:12 -0500 Subject: mm/gup: Turn compound_range_next() into gup_folio_range_next() Convert the only caller to work on folios instead of pages. This removes the last caller of put_compound_head(), so delete it. 
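
[Note: folio_page_idx() is the folio-flavoured rename of page_nth() from earlier in the series; it reports a page's offset within its folio without doing pointer subtraction on sparse memmaps. A hypothetical use:]

	static bool example_is_head_page(struct folio *folio, struct page *page)
	{
		return folio_page_idx(folio, page) == 0;
	}
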
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 4 ++-- mm/gup.c | 38 +++++++++++++++++--------------------- 2 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index dca5c99395c9..0d3f9057a807 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -212,10 +212,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -#define page_nth(head, tail) (page_to_pfn(tail) - page_to_pfn(head)) +#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) -#define page_nth(head, tail) ((tail) - (head)) +#define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ diff --git a/mm/gup.c b/mm/gup.c index 0bde28f0543f..5edd05df9c37 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -146,12 +146,6 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) folio_put_refs(folio, refs); } -static void put_compound_head(struct page *page, int refs, unsigned int flags) -{ - VM_BUG_ON_PAGE(PageTail(page), page); - gup_put_folio((struct folio *)page, refs, flags); -} - /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount * @page: pointer to page to be grabbed @@ -214,20 +208,19 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); -static inline struct page *compound_range_next(struct page *start, +static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *next, *page; + struct page *next = nth_page(start, i); + struct folio *folio = page_folio(next); unsigned int nr = 1; - next = nth_page(start, i); - page = compound_head(next); - if (PageHead(page)) + if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, - compound_nr(page) - page_nth(page, next)); + folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; - return page; + return folio; } static inline struct folio *gup_folio_next(struct page **list, @@ -335,15 +328,18 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { - unsigned long index; - struct page *head; - unsigned int ntails; + unsigned long i; + struct folio *folio; + unsigned int nr; - for (index = 0; index < npages; index += ntails) { - head = compound_range_next(page, npages, index, &ntails); - if (make_dirty && !PageDirty(head)) - set_page_dirty_lock(head); - put_compound_head(head, ntails, FOLL_PIN); + for (i = 0; i < npages; i += nr) { + folio = gup_folio_range_next(page, npages, i, &nr); + if (make_dirty && !folio_test_dirty(folio)) { + folio_lock(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + } + gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); -- cgit v1.2.3-71-gd317 From 536939ff516382b391a0039262e27fc80c7b3924 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Mar 2022 12:57:38 -0400 Subject: mm: Add three folio wrappers folio_is_zone_device() is equivalent to is_zone_device_page(), folio_is_device_private() is equivalent to is_device_private_page(), and folio_is_pinnable() is 
equivalent to is_pinnable_page(). All of these tests return the same result for every page in the folio, so we can just pass the head page of the folio to the page variant of the function. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/memremap.h | 5 +++++ include/linux/mm.h | 10 ++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d6a114dd5ea8..8af304f6b504 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -138,6 +138,11 @@ static inline bool is_device_private_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PRIVATE; } +static inline bool folio_is_device_private(const struct folio *folio) +{ + return is_device_private_page(&folio->page); +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d3f9057a807..2ca10c167f35 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1075,6 +1075,11 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool folio_is_zone_device(const struct folio *folio) +{ + return is_zone_device_page(&folio->page); +} + static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; @@ -1556,6 +1561,11 @@ static inline bool is_pinnable_page(struct page *page) } #endif +static inline bool folio_is_pinnable(struct folio *folio) +{ + return is_pinnable_page(&folio->page); +} + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); -- cgit v1.2.3-71-gd317 From 06d20bdb986815a75fb1addf34655756ba922e3a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Jan 2022 14:40:12 -0500 Subject: mm: Add lru_to_folio() Since page->lru occupies the same bytes as compound_head, any page on the LRU list must be a folio. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca10c167f35..a583b7375445 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -225,6 +225,10 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +static inline struct folio *lru_to_folio(struct list_head *head) +{ + return list_entry((head)->prev, struct folio, lru); +} void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); -- cgit v1.2.3-71-gd317 From d6c75dc22c755c567838f12f12a16f2a323ebd4e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 15:22:28 -0500 Subject: mm/truncate: Split invalidate_inode_page() into mapping_evict_folio() Some of the callers already have the address_space and can avoid calling folio_mapping() and checking if the folio was already truncated. Also add kernel-doc and fix the return type (in case we ever support folios larger than 4TB). 
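Schematically, the split leaves two entry points (a sketch of the call
shapes, not the patch itself):

/*
 * Sketch: internal callers that already know the mapping call
 * mapping_evict_folio() directly; the page-based wrapper derives the
 * mapping itself and handles a racing truncation by returning 0.
 */
long nr = mapping_evict_folio(mapping, folio);	/* folio locked, mapping known */
long nr2 = invalidate_inode_page(page);		/* page locked, mapping looked up */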
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/mm.h | 1 - mm/internal.h | 1 + mm/memory-failure.c | 4 ++-- mm/truncate.c | 34 +++++++++++++++++++++++----------- 4 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index a583b7375445..dede2eda4d7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1825,7 +1825,6 @@ extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_page(struct address_space *mapping, struct page *page); -int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index ade30a1e6682..9c1959fff477 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -95,6 +95,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); +long invalidate_inode_page(struct page *page); /** * folio_evictable - Test whether a folio is evictable. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97a9ed8f87a9..0b72a936b8dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2139,7 +2139,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) */ static int __soft_offline_page(struct page *page) { - int ret = 0; + long ret = 0; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); char const *msg_page[] = {"page", "hugepage"}; @@ -2196,7 +2196,7 @@ static int __soft_offline_page(struct page *page) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n", + pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n", pfn, msg_page[huge], ret, &page->flags); if (ret > 0) ret = -EBUSY; diff --git a/mm/truncate.c b/mm/truncate.c index 1d97c4cae6a0..2fb10735aab4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -273,18 +273,9 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) } EXPORT_SYMBOL(generic_error_remove_page); -/* - * Safely invalidate one page from its pagecache mapping. - * It only drops clean, unused pages. The page must be locked. - * - * Returns 1 if the page is successfully invalidated, otherwise 0. - */ -int invalidate_inode_page(struct page *page) +static long mapping_evict_folio(struct address_space *mapping, + struct folio *folio) { - struct folio *folio = page_folio(page); - struct address_space *mapping = folio_mapping(folio); - if (!mapping) - return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ @@ -297,6 +288,27 @@ int invalidate_inode_page(struct page *page) return remove_mapping(mapping, folio); } +/** + * invalidate_inode_page() - Remove an unused page from the pagecache. + * @page: The page to remove. + * + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. + * + * Context: Page must be locked. + * Return: The number of pages successfully removed. 
+ */ +long invalidate_inode_page(struct page *page) +{ + struct folio *folio = page_folio(page); + struct address_space *mapping = folio_mapping(folio); + + /* The page may have been truncated before it was locked */ + if (!mapping) + return 0; + return mapping_evict_folio(mapping, folio); +} + /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate -- cgit v1.2.3-71-gd317 From 74e8ee4708a8edabbbc7ab8c12ec24d7a561bb41 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Jan 2022 10:50:48 -0500 Subject: mm: Turn head_compound_mapcount() into folio_entire_mapcount() Adjust documentation to be more clear. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/mm.h | 17 +++++++++++------ mm/debug.c | 6 ++++-- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index dede2eda4d7f..70f0ca217962 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -776,21 +776,26 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -static inline int head_compound_mapcount(struct page *head) +/* + * How many times the entire folio is mapped as a single unit (eg by a + * PMD or PUD entry). This is probably not what you want, except for + * debugging purposes; look at folio_mapcount() or page_mapcount() + * instead. + */ +static inline int folio_entire_mapcount(struct folio *folio) { - return atomic_read(compound_mapcount_ptr(head)) + 1; + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(folio_mapcount_ptr(folio)) + 1; } /* * Mapcount of compound page as a whole, does not include mapped sub-pages. * - * Must be called only for compound pages or any their tail sub-pages. + * Must be called only for compound pages. */ static inline int compound_mapcount(struct page *page) { - VM_BUG_ON_PAGE(!PageCompound(page), page); - page = compound_head(page); - return head_compound_mapcount(page); + return folio_entire_mapcount(page_folio(page)); } /* diff --git a/mm/debug.c b/mm/debug.c index c4cf44266430..eeb7ea3ca292 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -48,7 +48,8 @@ const struct trace_print_flags vmaflag_names[] = { static void __dump_page(struct page *page) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); + struct page *head = &folio->page; struct address_space *mapping; bool compound = PageCompound(page); /* @@ -76,6 +77,7 @@ static void __dump_page(struct page *page) else mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS); head = page; + folio = (struct folio *)page; compound = false; } else { mapping = page_mapping(page); @@ -94,7 +96,7 @@ static void __dump_page(struct page *page) if (compound) { pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - head_compound_mapcount(head), + folio_entire_mapcount(folio), head_compound_pincount(head)); } -- cgit v1.2.3-71-gd317 From 4ba1119cd53166d853050ff1a9d76079cd8f8e06 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Jan 2022 16:33:26 -0500 Subject: mm: Add folio_mapcount() This implements the same algorithm as total_mapcount(), which is transformed into a wrapper function. 
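For orientation, the mapcount queries now line up as follows (editor's
summary, not part of the commit):

/*
 * folio_entire_mapcount(folio) - times the folio is mapped as one
 *                                unit (PMD/PUD entries only)
 * page_mapcount(page)          - times this particular page is
 *                                mapped, by entries of any size
 * folio_mapcount(folio)        - total number of times the folio is
 *                                mapped, the sum computed below
 */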
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/mm.h | 8 +++++++- mm/huge_memory.c | 24 ------------------------ mm/util.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 25 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 70f0ca217962..0d380dc26847 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -825,8 +825,14 @@ static inline int page_mapcount(struct page *page) return atomic_read(&page->_mapcount) + 1; } +int folio_mapcount(struct folio *folio); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int total_mapcount(struct page *page); +static inline int total_mapcount(struct page *page) +{ + return folio_mapcount(page_folio(page)); +} + int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9afca0122723..beebe4105659 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2465,30 +2465,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } -int total_mapcount(struct page *page) -{ - int i, compound, nr, ret; - - VM_BUG_ON_PAGE(PageTail(page), page); - - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - - compound = compound_mapcount(page); - nr = compound_nr(page); - if (PageHuge(page)) - return compound; - ret = compound; - for (i = 0; i < nr; i++) - ret += atomic_read(&page[i]._mapcount) + 1; - /* File pages has compound_mapcount included in _mapcount */ - if (!PageAnon(page)) - return ret - compound * nr; - if (PageDoubleMap(page)) - ret -= nr; - return ret; -} - /* * This calculates accurately how many mappings a transparent hugepage * has (unlike page_mapcount() which isn't fully accurate). This full diff --git a/mm/util.c b/mm/util.c index 7e43369064c8..b614f423aaa4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -740,6 +740,39 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +/** + * folio_mapcount() - Calculate the number of mappings of this folio. + * @folio: The folio. + * + * A large folio tracks both how many times the entire folio is mapped, + * and how many times each individual page in the folio is mapped. + * This function calculates the total number of times the folio is + * mapped. + * + * Return: The number of times this folio is mapped. + */ +int folio_mapcount(struct folio *folio) +{ + int i, compound, nr, ret; + + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) + 1; + + compound = folio_entire_mapcount(folio); + nr = folio_nr_pages(folio); + if (folio_test_hugetlb(folio)) + return compound; + ret = compound; + for (i = 0; i < nr; i++) + ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; + /* File pages has compound_mapcount included in _mapcount */ + if (!folio_test_anon(folio)) + return ret - compound * nr; + if (folio_test_double_map(folio)) + ret -= nr; + return ret; +} + /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. -- cgit v1.2.3-71-gd317 From e05b34539d008ab819388f699b25eae962ba24ac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 29 Jan 2022 11:52:52 -0500 Subject: mm: Turn page_anon_vma() into folio_anon_vma() Move the prototype from mm.h to mm/internal.h and convert all callers to pass a folio. 
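The per-call-site conversion follows a single pattern, sketched here (see
the ksm.c and rmap.c hunks below for the real instances):

/*
 * Sketch: resolve the folio once, then pass it down, instead of
 * letting page_anon_vma() repeat the page-to-folio lookup on every
 * call.
 */
struct folio *folio = page_folio(page);
struct anon_vma *anon_vma = folio_anon_vma(folio);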
Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 1 - mm/internal.h | 1 + mm/ksm.c | 3 ++- mm/rmap.c | 19 ++++++++++++------- mm/util.c | 3 +-- 5 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d380dc26847..a879c583f665 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1730,7 +1730,6 @@ static inline void *folio_address(const struct folio *folio) } extern void *page_rmapping(struct page *page); -extern struct anon_vma *page_anon_vma(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* diff --git a/mm/internal.h b/mm/internal.h index 6039acc780c0..2b2c2c4eb63a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -392,6 +392,7 @@ static inline bool is_data_mapping(vm_flags_t flags) void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev); void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); +struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); diff --git a/mm/ksm.c b/mm/ksm.c index ea82fef93a31..b25d545e0cd1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2554,7 +2554,8 @@ void __ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct anon_vma *anon_vma = page_anon_vma(page); + struct folio *folio = page_folio(page); + struct anon_vma *anon_vma = folio_anon_vma(folio); struct page *new_page; if (PageKsm(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index 64655d345234..09301aecf2fc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -737,8 +737,9 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - if (PageAnon(page)) { - struct anon_vma *page__anon_vma = page_anon_vma(page); + struct folio *folio = page_folio(page); + if (folio_test_anon(folio)) { + struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. @@ -748,7 +749,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; - } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { + } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } @@ -1103,6 +1104,7 @@ static void __page_set_anon_rmap(struct page *page, static void __page_check_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct folio *folio = page_folio(page); /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1114,7 +1116,8 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. 
*/ - VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); + VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, + folio); VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), page); } @@ -2177,6 +2180,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) static struct anon_vma *rmap_walk_anon_lock(struct page *page, struct rmap_walk_control *rwc) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma; if (rwc->anon_lock) @@ -2188,7 +2192,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ - anon_vma = page_anon_vma(page); + anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; @@ -2208,14 +2212,15 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; if (locked) { - anon_vma = page_anon_vma(page); + anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ - VM_BUG_ON_PAGE(!anon_vma, page); + VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(page, rwc); } diff --git a/mm/util.c b/mm/util.c index b614f423aaa4..13fc88ac8e70 100644 --- a/mm/util.c +++ b/mm/util.c @@ -679,9 +679,8 @@ bool folio_mapped(struct folio *folio) } EXPORT_SYMBOL(folio_mapped); -struct anon_vma *page_anon_vma(struct page *page) +struct anon_vma *folio_anon_vma(struct folio *folio) { - struct folio *folio = page_folio(page); unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) -- cgit v1.2.3-71-gd317 From 18788cfa236967741b83db1035ab24539e2a21bb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 May 2020 20:54:38 -0400 Subject: mm: Support arbitrary THP sizes For code which has not yet been converted from THP to folios, use the compound size of the page instead of assuming PTE or PMD size. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/huge_mm.h | 47 ----------------------------------------------- include/linux/mm.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 47 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e0348bca3d66..0734aff8fa19 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -250,30 +250,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, return NULL; } -/** - * thp_order - Order of a transparent huge page. - * @page: Head page of a transparent huge page. - */ -static inline unsigned int thp_order(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - if (PageHead(page)) - return HPAGE_PMD_ORDER; - return 0; -} - -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - if (PageHead(page)) - return HPAGE_PMD_NR; - return 1; -} - /** * folio_test_pmd_mappable - Can we map this folio with a PMD? 
* @folio: The folio to test @@ -336,18 +312,6 @@ static inline struct list_head *page_deferred_list(struct page *page) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) -static inline unsigned int thp_order(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return 0; -} - -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return 1; -} - static inline bool folio_test_pmd_mappable(struct folio *folio) { return false; @@ -489,15 +453,4 @@ static inline int split_folio_to_list(struct folio *folio, return split_huge_page_to_list(&folio->page, list); } -/** - * thp_size - Size of a transparent huge page. - * @page: Head page of a transparent huge page. - * - * Return: Number of bytes in this page. - */ -static inline unsigned long thp_size(struct page *page) -{ - return PAGE_SIZE << thp_order(page); -} - #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a879c583f665..c1966ad34142 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -939,6 +939,37 @@ static inline unsigned int page_shift(struct page *page) return PAGE_SHIFT + compound_order(page); } +/** + * thp_order - Order of a transparent huge page. + * @page: Head page of a transparent huge page. + */ +static inline unsigned int thp_order(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_order(page); +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_nr(page); +} + +/** + * thp_size - Size of a transparent huge page. + * @page: Head page of a transparent huge page. + * + * Return: Number of bytes in this page. + */ +static inline unsigned long thp_size(struct page *page) +{ + return PAGE_SIZE << thp_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU -- cgit v1.2.3-71-gd317
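As a closing illustration of this last change (an editor's sketch, assuming
an order-3 allocation; the old definitions would have reported
HPAGE_PMD_ORDER for any compound head):

/*
 * Sketch: with thp_order()/thp_nr_pages()/thp_size() now based on the
 * compound metadata, a non-PMD-sized compound page reports its real
 * geometry.
 */
struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, 3);

if (page) {
	WARN_ON(thp_order(page) != 3);		/* was HPAGE_PMD_ORDER */
	WARN_ON(thp_nr_pages(page) != 8);
	WARN_ON(thp_size(page) != 8 * PAGE_SIZE);
	__free_pages(page, 3);
}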