From aed877c2b4257a25b2429f165542f86125871071 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:15 +1100 Subject: [PATCH] device/dax: properly refcount device dax pages when mapping Device DAX pages are currently not reference counted when mapped, instead relying on the devmap PTE bit to ensure mapping code will not get/put references. This requires special handling in various page table walkers, particularly GUP, to manage references on the underlying pgmap to ensure the pages remain valid. However there is no reason these pages can't be refcounted properly at map time. Doning so eliminates the need for the devmap PTE bit, freeing up a precious PTE bit. It also simplifies GUP as it no longer needs to manage the special pgmap references and can instead just treat the pages normally as defined by vm_normal_page(). Link: https://lkml.kernel.org/r/968d3a8e9157e7492e85d065765c027e525f9fc9.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/dax/device.c | 15 +++++++++------ mm/memremap.c | 14 +++++++------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index bc871a34b9cd..328231cfb028 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -125,11 +125,12 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, 0); dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + return vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), + vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, @@ -168,11 +169,12 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, 0); dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_folio_pmd(vmf, page_folio(pfn_t_to_page(pfn)), + vmf->flags & FAULT_FLAG_WRITE); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD @@ -213,11 +215,12 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, 0); dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_folio_pud(vmf, page_folio(pfn_t_to_page(pfn)), + vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, diff --git a/mm/memremap.c b/mm/memremap.c index 9a8879bf1ae4..2aebc1b192da 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -460,11 +460,7 @@ void free_zone_device_folio(struct folio *folio) { struct dev_pagemap *pgmap = folio->pgmap; - if (WARN_ON_ONCE(!pgmap->ops)) - return; - - if (WARN_ON_ONCE(pgmap->type != MEMORY_DEVICE_FS_DAX && - !pgmap->ops->page_free)) + if (WARN_ON_ONCE(!pgmap)) return; mem_cgroup_uncharge(folio); @@ -494,12 +490,15 @@ void free_zone_device_folio(struct folio *folio) * zero which indicating the page has been removed from the file * system mapping. */ - if (pgmap->type != MEMORY_DEVICE_FS_DAX) + if (pgmap->type != MEMORY_DEVICE_FS_DAX && + pgmap->type != MEMORY_DEVICE_GENERIC) folio->mapping = NULL; switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_COHERENT: + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + break; pgmap->ops->page_free(folio_page(folio, 0)); put_dev_pagemap(pgmap); break; @@ -509,7 +508,6 @@ void free_zone_device_folio(struct folio *folio) * Reset the refcount to 1 to prepare for handing out the page * again. */ - pgmap->ops->page_free(folio_page(folio, 0)); folio_set_count(folio, 1); break; @@ -518,6 +516,8 @@ void free_zone_device_folio(struct folio *folio) break; case MEMORY_DEVICE_PCI_P2PDMA: + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + break; pgmap->ops->page_free(folio_page(folio, 0)); break; } -- 2.25.1