Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
Christoph Lameter <cl@linux-foundation.org>
Description:
- The shrink file is written when memory should be reclaimed from
- a cache. Empty partial slabs are freed and the partial list is
- sorted so the slabs with the fewest available objects are used
- first.
+ The shrink file is used to reclaim unused slab cache
+ memory from a cache. Empty per-cpu or partial slabs
+ are freed and the partial list is sorted so the slabs
+ with the fewest available objects are used first.
+ It only accepts a value of "1" on write for shrinking
+ the cache. Other input values are considered invalid.
+ Shrinking slab caches might be expensive and can
+ adversely impact other running applications, so it
+ should be used with care.
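
As a minimal usage sketch (not part of the ABI text above; the cache name "kmalloc-64" is only an example), writing "1" to the shrink file from C might look like this:

	#include <stdio.h>

	/* Illustrative only: write "1" to a slab cache's shrink file. */
	static int shrink_slab_cache(const char *cache)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path), "/sys/kernel/slab/%s/shrink", cache);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fputs("1", f);		/* any other value is rejected as invalid */
		return fclose(f);	/* e.g. shrink_slab_cache("kmalloc-64") */
	}
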
What: /sys/kernel/slab/cache/slab_size
Date: May 2007
memory.oom_control set/show oom controls.
memory.numa_stat show the number of memory usage per numa
node
-
memory.kmem.limit_in_bytes set/show hard limit for kernel memory
+ This knob is deprecated and shouldn't be
+ used. It is planned to be removed in the
+ foreseeable future.
memory.kmem.usage_in_bytes show current kernel memory allocation
memory.kmem.failcnt show the number of kernel memory usage
hits limits
enables the feature at boot time. By default, it is
disabled and the system will work mostly the same as a
kernel built without CONFIG_DEBUG_PAGEALLOC.
+ Note: to get the most out of debug_pagealloc error
+ reports, it is useful to also enable the page_owner
+ functionality.
on: enable the feature
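
An illustrative boot command line combining the two (assuming page_owner support is built into the kernel; this is an example, not part of the parameter description above):

	debug_pagealloc=on page_owner=on
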
debugpat [X86] Enable PAT debugging
and vice-versa 32-bit applications to call 64-bit mmap().
Required for applications doing different bitness syscalls.
+# This allows the use of a set of generic functions to determine the mmap
+# base address, giving priority to the top-down scheme only if the process
+# is not in legacy mode (compat task, unlimited stack size or
+# sysctl_legacy_va_layout).
+# An architecture that selects this option can provide its own version of:
+# - STACK_RND_MASK
+config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+ bool
+ depends on MMU
+ select ARCH_HAS_ELF_RANDOMIZE
+
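
For reference, a minimal sketch of the layout-selection logic this option factors out of the architectures below. It mirrors the per-arch arm/arm64/MIPS code removed later in this series; mmap_base() and arch_mmap_rnd() are the arch helpers shown in those hunks, and the exact generic implementation may differ:

	/* Sketch only: mirrors the per-arch arch_pick_mmap_layout() removed below. */
	static int mmap_is_legacy(struct rlimit *rlim_stack)
	{
		if (current->personality & ADDR_COMPAT_LAYOUT)
			return 1;

		if (rlim_stack->rlim_cur == RLIM_INFINITY)
			return 1;

		return sysctl_legacy_va_layout;
	}

	void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
	{
		unsigned long random_factor = 0UL;

		if (current->flags & PF_RANDOMIZE)
			random_factor = arch_mmap_rnd();

		if (mmap_is_legacy(rlim_stack)) {
			/* bottom-up: legacy personality, unlimited stack, or sysctl */
			mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
			mm->get_unmapped_area = arch_get_unmapped_area;
		} else {
			/* top-down: place the mmap base below the stack gap */
			mm->mmap_base = mmap_base(random_factor, rlim_stack);
			mm->get_unmapped_area = arch_get_unmapped_area_topdown;
		}
	}
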
config HAVE_COPY_THREAD_TLS
bool
help
free_page((unsigned long)pmd);
}
-#define check_pgt_cache() do { } while (0)
-
#endif /* _ALPHA_PGALLOC_H */
#include <asm-generic/pgtable.h>
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */
#define HAVE_ARCH_UNMAPPED_AREA
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte)
-#define check_pgt_cache() do { } while (0)
#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
#endif /* _ASM_ARC_PGALLOC_H */
/* to cope with aliasing VIPT cache */
#define HAVE_ARCH_UNMAPPED_AREA
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
#endif /* __ASSEMBLY__ */
#endif
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_IPC_PARSE_VERSION
select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
select BUILDTIME_EXTABLE_SORT if MMU
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#define check_pgt_cache() do { } while (0)
-
#ifdef CONFIG_MMU
#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
*/
extern unsigned int kobjsize(const void *objp);
-/*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init() do { } while (0)
-
/*
* All 32bit addresses are effectively valid for vmalloc...
* Sort of meaningless for non-VM targets.
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !__ASSEMBLY__ */
#endif /* CONFIG_MMU */
#endif
#endif
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
#endif
#endif /* __ASM_ARM_PROCESSOR_H */
return 0;
}
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- return randomize_page(mm->brk, 0x02000000);
-}
-
#ifdef CONFIG_MMU
#ifdef CONFIG_KUSER_HELPERS
/*
 * coherent with the kernel's mapping.
*/
if (!PageHighMem(page)) {
- size_t page_size = PAGE_SIZE << compound_order(page);
- __cpuc_flush_dcache_area(page_address(page), page_size);
+ __cpuc_flush_dcache_area(page_address(page), page_size(page));
} else {
unsigned long i;
if (cache_is_vipt_nonaliasing()) {
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
void *addr = kmap_atomic(page + i);
__cpuc_flush_dcache_area(addr, PAGE_SIZE);
kunmap_atomic(addr);
}
} else {
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
void *addr = kmap_high_get(page + i);
if (addr) {
__cpuc_flush_dcache_area(addr, PAGE_SIZE);
((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \
(((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
/*
* We need to ensure that shared mappings are correctly aligned to
* avoid aliasing issues with VIPT caches. We need to ensure that
return addr;
}
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long rnd;
-
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
- return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
-
/*
* You really shouldn't be using read() or write() on /dev/mem. This
* might go away in the future.
select ARCH_HAS_DMA_COHERENT_TO_PFN
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
- select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
select ARCH_HAS_UBSAN_SANITIZE_ALL
#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
-#define check_pgt_cache() do { } while (0)
-
#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
#if CONFIG_PGTABLE_LEVELS > 2
#include <asm-generic/pgtable.h>
-static inline void pgtable_cache_init(void) { }
-
/*
* On AArch64, the cache coherency is handled via the set_pte_at() function.
*/
"nop") : : "p" (ptr));
}
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
extern void __init minsigstksz_setup(void);
return sp & ~0xf;
}
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- if (is_compat_task())
- return randomize_page(mm->brk, SZ_32M);
- else
- return randomize_page(mm->brk, SZ_1G);
-}
-
/*
* Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY.
*/
struct page *page = pte_page(pte);
if (!test_and_set_bit(PG_dcache_clean, &page->flags))
- sync_icache_aliases(page_address(page),
- PAGE_SIZE << compound_order(page));
+ sync_icache_aliases(page_address(page), page_size(page));
}
EXPORT_SYMBOL_GPL(__sync_icache_dcache);
#include <asm/cputype.h>
-/*
- * Leave enough space between the mmap area and the stack to honour ulimit in
- * the face of randomisation.
- */
-#define MIN_GAP (SZ_128M)
-#define MAX_GAP (STACK_TOP/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
- if (test_thread_flag(TIF_32BIT))
- rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
- else
-#endif
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
- return rnd << PAGE_SHIFT;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
- unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
-
- /* Values close to RLIM_INFINITY can overflow. */
- if (gap + pad > gap)
- gap += pad;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(STACK_TOP - gap - rnd);
-}
-
-/*
- * This function, called very early during the creation of a new process VM
- * image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- /*
- * Fall back to the standard layout if the personality bit is set, or
- * if the expected stack growth is unlimited:
- */
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
-
/*
* You really shouldn't be using read() or write() on /dev/mem. This might go
* away in the future.
kmem_cache_free(pgd_cache, pgd);
}
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
{
if (PGD_SIZE == PAGE_SIZE)
return;
#define swapper_pg_dir ((pgd_t *) 0)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/*
 * c6x is !MMU, so define the simplest implementation
*/
tlb_remove_page(tlb, pte); \
} while (0)
-#define check_pgt_cache() do {} while (0)
-
extern void pagetable_init(void);
extern void pre_mmu_init(void);
extern void pre_trap_init(void);
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
#define kern_addr_valid(addr) (1)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do {} while (0)
-
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopud.h>
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
extern void paging_init(void);
#define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */
#define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */
extern unsigned int kobjsize(const void *objp);
extern int is_in_rom(unsigned long);
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/*
* All 32bit addresses are effectively valid for vmalloc...
* Sort of meaningless for non-VM targets.
#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
-#define check_pgt_cache() do {} while (0)
-
extern unsigned long long kmap_generation;
/*
#define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-/* I think this is in case we have page table caches; needed by init/main.c */
-#define pgtable_cache_init() do { } while (0)
-
/*
* Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is
* interpreted as swap information. The remaining free bits are interpreted as
# Makefile for Hexagon memory management subsystem
#
-obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o
+obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o
obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/init.h>
-
-void __init pgtable_cache_init(void)
-{
-}
config ZONE_DMA32
def_bool y
-config QUICKLIST
- bool
- default y
-
config MMU
bool
default y
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/threads.h>
-#include <linux/quicklist.h>
+
+#include <asm-generic/pgalloc.h>
#include <asm/mmu_context.h>
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
+ return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- quicklist_free(0, NULL, pgd);
+ free_page((unsigned long)pgd);
}
#if CONFIG_PGTABLE_LEVELS == 4
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
+ return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
- quicklist_free(0, NULL, pud);
+ free_page((unsigned long)pud);
}
#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud)
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
+ return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
- quicklist_free(0, NULL, pmd);
+ free_page((unsigned long)pmd);
}
#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd)
pmd_val(*pmd_entry) = __pa(pte);
}
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
- struct page *page;
- void *pg;
-
- pg = quicklist_alloc(0, GFP_KERNEL, NULL);
- if (!pg)
- return NULL;
- page = virt_to_page(pg);
- if (!pgtable_page_ctor(page)) {
- quicklist_free(0, NULL, pg);
- return NULL;
- }
- return page;
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_page_dtor(pte);
- quicklist_free_page(0, NULL, pte);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- quicklist_free(0, NULL, pte);
-}
-
-static inline void check_pgt_cache(void)
-{
- quicklist_trim(0, NULL, 25, 16);
-}
-
#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte)
#endif /* _ASM_IA64_PGALLOC_H */
#define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M
#define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/* These tell get_user_pages() that the first gate page is accessible from user-level. */
#define FIXADDR_USER_START GATE_ADDR
#ifdef HAVE_BUGGY_SEGREL
if (test_bit(PG_arch_1, &page->flags))
return; /* i-cache is already coherent with d-cache */
- flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page)));
+ flush_icache_range(addr, addr + page_size(page));
set_bit(PG_arch_1, &page->flags); /* mark page as clean */
}
#include <asm-generic/pgtable.h>
#endif /* !__ASSEMBLY__ */
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
-#define check_pgt_cache() do { } while (0)
-
#endif /* _M68K_PGTABLE_H */
*/
#define ZERO_PAGE(vaddr) (virt_to_page(0))
-/*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init() do { } while (0)
-
/*
* All 32bit addresses are effectively valid for vmalloc...
* Sort of meaningless for non-VM targets.
#include <asm-generic/pgtable.h>
-#define check_pgt_cache() do { } while (0)
-
#endif /* _M68KNOMMU_PGTABLE_H */
#include <asm/cache.h>
#include <asm/pgtable.h>
-#define PGDIR_ORDER 0
-
-/*
- * This is handled very differently on MicroBlaze since out page tables
- * are all 0's and I want to be able to use these zero'd pages elsewhere
- * as well - it gives us quite a speedup.
- * -- Cort
- */
-extern struct pgtable_cache_struct {
- unsigned long *pgd_cache;
- unsigned long *pte_cache;
- unsigned long pgtable_cache_sz;
-} quicklists;
-
-#define pgd_quicklist (quicklists.pgd_cache)
-#define pmd_quicklist ((unsigned long *)0)
-#define pte_quicklist (quicklists.pte_cache)
-#define pgtable_cache_size (quicklists.pgtable_cache_sz)
-
-extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */
-extern atomic_t zero_sz; /* # currently pre-zero'd pages */
-extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */
-extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */
-extern atomic_t zerototal; /* # pages zero'd over time */
-
-#define zero_quicklist (zero_cache)
-#define zero_cache_sz (zero_sz)
-#define zero_cache_calls (zeropage_calls)
-#define zero_cache_hits (zeropage_hits)
-#define zero_cache_total (zerototal)
-
-/*
- * return a pre-zero'd page from the list,
- * return NULL if none available -- Cort
- */
-extern unsigned long get_zero_page_fast(void);
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>
extern void __bad_pte(pmd_t *pmd);
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *get_pgd(void)
{
- pgd_t *ret;
-
- ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER);
- if (ret != NULL)
- clear_page(ret);
- return ret;
+ return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
}
-static inline pgd_t *get_pgd_fast(void)
-{
- unsigned long *ret;
-
- ret = pgd_quicklist;
- if (ret != NULL) {
- pgd_quicklist = (unsigned long *)(*ret);
- ret[0] = 0;
- pgtable_cache_size--;
- } else
- ret = (unsigned long *)get_pgd_slow();
- return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
- *(unsigned long **)pgd = pgd_quicklist;
- pgd_quicklist = (unsigned long *) pgd;
- pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+static inline void free_pgd(pgd_t *pgd)
{
free_page((unsigned long)pgd);
}
-#define pgd_free(mm, pgd) free_pgd_fast(pgd)
-#define pgd_alloc(mm) get_pgd_fast()
+#define pgd_free(mm, pgd) free_pgd(pgd)
+#define pgd_alloc(mm) get_pgd()
#define pmd_pgtable(pmd) pmd_page(pmd)
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
- struct page *ptepage;
-
-#ifdef CONFIG_HIGHPTE
- int flags = GFP_KERNEL | __GFP_HIGHMEM;
-#else
- int flags = GFP_KERNEL;
-#endif
-
- ptepage = alloc_pages(flags, 0);
- if (!ptepage)
- return NULL;
- clear_highpage(ptepage);
- if (!pgtable_page_ctor(ptepage)) {
- __free_page(ptepage);
- return NULL;
- }
- return ptepage;
-}
-
-static inline void pte_free_fast(pte_t *pte)
-{
- *(unsigned long **)pte = pte_quicklist;
- pte_quicklist = (unsigned long *) pte;
- pgtable_cache_size++;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free_slow(struct page *ptepage)
-{
- __free_page(ptepage);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
-{
- pgtable_page_dtor(ptepage);
- __free_page(ptepage);
-}
-
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte))
#define pmd_populate(mm, pmd, pte) \
#define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x)
#define pgd_populate(mm, pmd, pte) BUG()
-extern int do_check_pgt_cache(int, int);
-
#endif /* CONFIG_MMU */
-#define check_pgt_cache() do { } while (0)
-
#endif /* _ASM_MICROBLAZE_PGALLOC_H */
#define swapper_pg_dir ((pgd_t *) NULL)
-#define pgtable_cache_init() do {} while (0)
-
#define arch_enter_lazy_cpu_mode() do {} while (0)
#define pgprot_noncached_wc(prot) prot
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
#define kern_addr_valid(addr) (1)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
void do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code);
unsigned long ioremap_bot;
EXPORT_SYMBOL(ioremap_bot);
-#ifndef CONFIG_SMP
-struct pgtable_cache_struct quicklists;
-#endif
-
static void __iomem *__ioremap(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
select ARCH_32BIT_OFF_T if !64BIT
select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
select ARCH_CLOCKSOURCE_DATA
- select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_SUPPORTS_UPROBES
select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
#endif /* __PAGETABLE_PUD_FOLDED */
-#define check_pgt_cache() do { } while (0)
-
extern void pagetable_init(void);
#endif /* _ASM_PGALLOC_H */
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
#endif /* _ASM_PGTABLE_H */
extern unsigned int vced_count, vcei_count;
-/*
- * MIPS does have an arch_pick_mmap_layout()
- */
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
#ifdef CONFIG_32BIT
#ifdef CONFIG_KVM_GUEST
/* User space process size is limited to 1GB in KVM Guest Mode */
unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */
EXPORT_SYMBOL(shm_align_mask);
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
#define COLOUR_ALIGN(addr, pgoff) \
((((addr) + shm_align_mask) & ~shm_align_mask) + \
(((pgoff) << PAGE_SHIFT) & shm_align_mask))
addr0, len, pgoff, flags, DOWN);
}
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
- if (TASK_IS_32BIT_ADDR)
- rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
- else
-#endif /* CONFIG_COMPAT */
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
- return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
-
-static inline unsigned long brk_rnd(void)
-{
- unsigned long rnd = get_random_long();
-
- rnd = rnd << PAGE_SHIFT;
- /* 8MB for 32bit, 256MB for 64bit */
- if (TASK_IS_32BIT_ADDR)
- rnd = rnd & 0x7ffffful;
- else
- rnd = rnd & 0xffffffful;
-
- return rnd;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long base = mm->brk;
- unsigned long ret;
-
- ret = PAGE_ALIGN(base + brk_rnd());
-
- if (ret < mm->brk)
- return mm->brk;
-
- return ret;
-}
-
bool __virt_addr_valid(const volatile void *kaddr)
{
unsigned long vaddr = (unsigned long)kaddr;
extern pgd_t *pgd_alloc(struct mm_struct *mm);
extern void pgd_free(struct mm_struct *mm, pgd_t * pgd);
-#define check_pgt_cache() do { } while (0)
-
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
pgtable_t pte;
* into virtual address `from'
*/
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASMNDS32_PGTABLE_H */
tlb_remove_page((tlb), (pte)); \
} while (0)
-#define check_pgt_cache() do { } while (0)
-
#endif /* _ASM_NIOS2_PGALLOC_H */
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
-
extern void __init paging_init(void);
extern void __init mmu_init(void);
#define pmd_pgtable(pmd) pmd_page(pmd)
-#define check_pgt_cache() do { } while (0)
-
#endif
#include <asm-generic/pgtable.h>
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
typedef pte_t *pte_addr_t;
#endif /* __ASSEMBLY__ */
pmd_populate_kernel(mm, pmd, page_address(pte_page))
#define pmd_pgtable(pmd) pmd_page(pmd)
-#define check_pgt_cache() do { } while (0)
-
#endif
#define PTRS_PER_PTE (1UL << BITS_PER_PTE)
/* Definitions for 2nd level */
-#define pgtable_cache_init() do { } while (0)
-
#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE)
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
extern struct kmem_cache *pgtable_cache[];
#define PGT_CACHE(shift) pgtable_cache[shift]
-static inline void check_pgt_cache(void) { }
-
#ifdef CONFIG_PPC_BOOK3S
#include <asm/book3s/pgalloc.h>
#else
unsigned long vmalloc_to_phys(void *vmalloc_addr);
void pgtable_cache_add(unsigned int shift);
-void pgtable_cache_init(void);
#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32)
void mark_initmem_nx(void);
/*
 * If we try to do a HUGE PTE update after a withdraw is done,
* we will find the below NULL. This happens when we do
- * split_huge_page_pmd
+ * split_huge_pmd
*/
if (!hpte_slot_array)
return;
 * Allow the use of IOMMU pages larger than 64k. Only do that
* if we are backed by hugetlb.
*/
- if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
- struct page *head = compound_head(page);
-
- pageshift = compound_order(head) + PAGE_SHIFT;
- }
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
+ pageshift = page_shift(compound_head(page));
mem->pageshift = min(mem->pageshift, pageshift);
/*
* We don't need struct page reference any more, switch
BUG_ON(!PageCompound(page));
- for (i = 0; i < (1UL << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (!PageHighMem(page)) {
__flush_dcache_icache(page_address(page+i));
} else {
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select SPARSEMEM_STATIC if 32BIT
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
+ select HAVE_ARCH_MMAP_RND_BITS
+
+config ARCH_MMAP_RND_BITS_MIN
+ default 18 if 64BIT
+ default 8
+
+# max bits determined by the following formula:
+# VA_BITS - PAGE_SHIFT - 3
+config ARCH_MMAP_RND_BITS_MAX
+ default 24 if 64BIT # SV39 based
+ default 17
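
As a quick check of the formula in the comment above, assuming 4 KiB base pages (PAGE_SHIFT = 12):

	SV39 (64BIT, 39 VA bits):  39 - 12 - 3 = 24, matching the 64BIT default
	Sv32 (32BIT, 32 VA bits):  32 - 12 - 3 = 17, matching the default of 17
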
config MMU
def_bool y
tlb_remove_page((tlb), pte); \
} while (0)
-static inline void check_pgt_cache(void)
-{
-}
-
#endif /* _ASM_RISCV_PGALLOC_H */
extern void setup_bootmem(void);
extern void paging_init(void);
-static inline void pgtable_cache_init(void)
-{
- /* No page table caches to initialize */
-}
-
#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END (PAGE_OFFSET - 1)
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-/*
- * No page table caches to initialise
- */
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
-
#include <asm-generic/pgtable.h>
#endif /* _S390_PAGE_H */
#ifndef __ASM_SH_PGALLOC_H
#define __ASM_SH_PGALLOC_H
-#include <linux/quicklist.h>
#include <asm/page.h>
-
-#define QUICK_PT 0 /* Other page table pages that are zero on free */
+#include <asm-generic/pgalloc.h>
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
}
#define pmd_pgtable(pmd) pmd_page(pmd)
-/*
- * Allocate and free page tables.
- */
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
- return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
- struct page *page;
- void *pg;
-
- pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
- if (!pg)
- return NULL;
- page = virt_to_page(pg);
- if (!pgtable_page_ctor(page)) {
- quicklist_free(QUICK_PT, NULL, pg);
- return NULL;
- }
- return page;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- quicklist_free(QUICK_PT, NULL, pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_page_dtor(pte);
- quicklist_free_page(QUICK_PT, NULL, pte);
-}
-
#define __pte_free_tlb(tlb,pte,addr) \
do { \
pgtable_page_dtor(pte); \
} while (0);
#endif
-static inline void check_pgt_cache(void)
-{
- quicklist_trim(QUICK_PT, NULL, 25, 16);
-}
-
#endif /* __ASM_SH_PGALLOC_H */
#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
-/*
- * Initialise the page table caches
- */
-extern void pgtable_cache_init(void);
-
struct vm_area_struct;
struct mm_struct;
# SPDX-License-Identifier: GPL-2.0
menu "Memory management options"
-config QUICKLIST
- def_bool y
-
config MMU
bool "Support for memory management hardware"
depends on !CPU_SH2
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
}
-
-void pgtable_cache_init(void)
-{
-}
extern struct resource sparc_iomap;
-#define check_pgt_cache() do { } while (0)
-
pgd_t *get_pgd_fast(void);
static inline void free_pgd_fast(pgd_t *pgd)
{
#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD))
-#define check_pgt_cache() do { } while (0)
-
void pgtable_free(void *table, bool is_page);
#ifdef CONFIG_SMP
/* We provide our own get_unmapped_area to cope with VA holes for userland */
#define HAVE_ARCH_UNMAPPED_AREA
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !(_SPARC_PGTABLE_H) */
unsigned long);
#define HAVE_ARCH_FB_UNMAPPED_AREA
-void pgtable_cache_init(void);
void sun4v_register_fault_status(void);
void sun4v_ktsb_register(void);
void __init cheetah_ecache_flush_init(void);
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/vaddrs.h>
-#include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */
#include <asm/setup.h>
#include <asm/tlb.h>
#include <asm/prom.h>
#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x))
#endif
-#define check_pgt_cache() do { } while (0)
-
#endif
/* zero page used for uninitialized stuff */
extern unsigned long *empty_zero_page;
-#define pgtable_cache_init() do ; while (0)
-
/* Just any arbitrary offset to the start of the vmalloc VM area: the
 * current 8MB value just means that there will be an 8MB "hole" after the
* physical memory until the kernel virtual memory starts. That means that
#define __HAVE_ARCH_PTE_ALLOC_ONE
#include <asm-generic/pgalloc.h>
-#define check_pgt_cache() do { } while (0)
-
#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT)
#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT)
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !__ASSEMBLY__ */
#endif /* __UNICORE_PGTABLE_H__ */
extern pgd_t initial_page_table[1024];
extern pmd_t initial_pg_pmd[];
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
void paging_init(void);
void sync_initial_page_table(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define pgtable_cache_init() do { } while (0)
-#define check_pgt_cache() do { } while (0)
-
#define PAGE_AGP PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1
static struct kmem_cache *pgd_cache;
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
{
/*
 * When a PAE kernel is running as a Xen domain, it does not use
}
#else
-void __init pgd_cache_init(void)
-{
-}
-
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
# define swapper_pg_dir NULL
static inline void paging_init(void) { }
#endif
-static inline void pgtable_cache_init(void) { }
/*
* The pmd contains the kernel virtual address of the pte page.
invalidate_dtlb_entry(tlb_entry);
}
-#define check_pgt_cache() do { } while (0)
-
-
/*
* DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa
 * ISA and exist only for test purposes.
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
-static unsigned long get_memory_block_size(void)
-{
- unsigned long block_sz;
-
- block_sz = memory_block_size_bytes();
-
- /* Validate blk_sz is a power of 2 and not less than section size */
- if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
- WARN_ON(1);
- block_sz = MIN_MEMORY_BLOCK_SIZE;
- }
-
- return block_sz;
-}
-
/*
- * use this as the physical section index that this memsection
- * uses.
+ * Show the first physical section index (number) of this memory block.
*/
-
static ssize_t phys_index_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
}
/*
- * Show whether the section of memory is likely to be hot-removable
+ * Show whether the memory block is likely to be offlineable (or is already
+ * offline). Once offline, the memory block could be removed. The return
+ * value does, however, not indicate that there is a way to remove the
+ * memory block.
*/
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
char *buf)
static DEVICE_ATTR_RO(removable);
/*
- * Block size attribute stuff
+ * Show the memory block size (shared by all memory blocks).
*/
static ssize_t block_size_bytes_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%lx\n", get_memory_block_size());
+ return sprintf(buf, "%lx\n", memory_block_size_bytes());
}
static DEVICE_ATTR_RO(block_size_bytes);
return -ENOMEM;
mem->start_section_nr = block_id * sections_per_block;
- mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem->state = state;
start_pfn = section_nr_to_pfn(mem->start_section_nr);
mem->phys_device = arch_get_memory_phys_device(start_pfn);
+ mem->nid = NUMA_NO_NODE;
ret = register_memory(mem);
/*
* Initialize the sysfs support for memory devices...
*/
-int __init memory_dev_init(void)
+void __init memory_dev_init(void)
{
int ret;
int err;
unsigned long block_sz, nr;
+ /* Validate the configured memory block size */
+ block_sz = memory_block_size_bytes();
+ if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
+ panic("Memory block size not suitable: 0x%lx\n", block_sz);
+ sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+
ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
if (ret)
goto out;
- block_sz = get_memory_block_size();
- sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
-
/*
* Create entries for memory sections that were found
* during boot and have been initialized
out:
if (ret)
- printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
- return ret;
+ panic("%s() failed: %d\n", __func__, ret);
}
/**
"Node %d AnonHugePages: %8lu kB\n"
"Node %d ShmemHugePages: %8lu kB\n"
"Node %d ShmemPmdMapped: %8lu kB\n"
+ "Node %d FileHugePages: %8lu kB\n"
+ "Node %d FilePmdMapped: %8lu kB\n"
#endif
,
nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
HPAGE_PMD_NR),
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
+ HPAGE_PMD_NR),
+ nid, K(node_page_state(pgdat, NR_FILE_THPS) *
+ HPAGE_PMD_NR),
+ nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
HPAGE_PMD_NR)
#endif
);
static int register_mem_sect_under_node(struct memory_block *mem_blk,
void *arg)
{
+ unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
+ unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+ unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
int ret, nid = *(int *)arg;
- unsigned long pfn, sect_start_pfn, sect_end_pfn;
+ unsigned long pfn;
- mem_blk->nid = nid;
-
- sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
- sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
- sect_end_pfn += PAGES_PER_SECTION - 1;
- for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
int page_nid;
/*
if (page_nid != nid)
continue;
}
+
+ /*
+ * If this memory block spans multiple nodes, we only indicate
+ * the last processed node.
+ */
+ mem_blk->nid = nid;
+
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
&mem_blk->dev.kobj,
kobject_name(&mem_blk->dev.kobj));
}
/*
- * Unregister memory block device under all nodes that it spans.
- * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and can therefore never be removed.
*/
void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
- unsigned long pfn, sect_start_pfn, sect_end_pfn;
- static nodemask_t unlinked_nodes;
-
- nodes_clear(unlinked_nodes);
- sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
- sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
- for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
- int nid;
+ if (mem_blk->nid == NUMA_NO_NODE)
+ return;
- nid = get_nid_for_pfn(pfn);
- if (nid < 0)
- continue;
- if (!node_online(nid))
- continue;
- if (node_test_and_set(nid, unlinked_nodes))
- continue;
- sysfs_remove_link(&node_devices[nid]->dev.kobj,
- kobject_name(&mem_blk->dev.kobj));
- sysfs_remove_link(&mem_blk->dev.kobj,
- kobject_name(&node_devices[nid]->dev.kobj));
- }
+ sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
+ kobject_name(&mem_blk->dev.kobj));
+ sysfs_remove_link(&mem_blk->dev.kobj,
+ kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
bool merge;
if (page)
- pg_size <<= compound_order(page);
+ pg_size = page_size(page);
if (off < pg_size &&
skb_can_coalesce(skb, i, page, off)) {
merge = 1;
__GFP_NORETRY,
order);
if (page)
- pg_size <<=
- compound_order(page);
+ pg_size <<= order;
}
if (!page) {
page = alloc_page(gfp);
static void
via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
{
- struct page *page;
int i;
switch (vsg->state) {
kfree(vsg->desc_pages);
/* fall through */
case dr_via_pages_locked:
- for (i = 0; i < vsg->num_pages; ++i) {
- if (NULL != (page = vsg->pages[i])) {
- if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction))
- SetPageDirty(page);
- put_page(page);
- }
- }
+ put_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
+ (vsg->direction == DMA_FROM_DEVICE));
/* fall through */
case dr_via_pages_alloc:
vfree(vsg->pages);
for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
page = sg_page_iter_page(&sg_iter);
- if (umem->writable && dirty)
- put_user_pages_dirty_lock(&page, 1);
- else
- put_user_page(page);
+ put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
}
sg_free_table(&umem->sg_head);
void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
size_t npages, bool dirty)
{
- if (dirty)
- put_user_pages_dirty_lock(p, npages);
- else
- put_user_pages(p, npages);
+ put_user_pages_dirty_lock(p, npages, dirty);
if (mm) { /* during close after signal, mm can be NULL */
atomic64_sub(npages, &mm->pinned_vm);
static void __qib_release_user_pages(struct page **p, size_t num_pages,
int dirty)
{
- if (dirty)
- put_user_pages_dirty_lock(p, num_pages);
- else
- put_user_pages(p, num_pages);
+ put_user_pages_dirty_lock(p, num_pages, dirty);
}
/**
for_each_sg(chunk->page_list, sg, chunk->nents, i) {
page = sg_page(sg);
pa = sg_phys(sg);
- if (dirty)
- put_user_pages_dirty_lock(&page, 1);
- else
- put_user_page(page);
+ put_user_pages_dirty_lock(&page, 1, dirty);
usnic_dbg("pa: %pa\n", &pa);
}
kfree(chunk);
static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
bool dirty)
{
- struct page **p = chunk->plist;
-
- while (num_pages--) {
- if (!PageDirty(*p) && dirty)
- put_user_pages_dirty_lock(p, 1);
- else
- put_user_page(*p);
- p++;
- }
+ put_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
}
void siw_umem_release(struct siw_umem *umem, bool dirty)
if (!page)
goto free_pages;
list_add_tail(&page->lru, &pages);
- size_remaining -= PAGE_SIZE << compound_order(page);
+ size_remaining -= page_size(page);
max_order = compound_order(page);
i++;
}
sg = table->sgl;
list_for_each_entry_safe(page, tmp_page, &pages, lru) {
- sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0);
+ sg_set_page(sg, page, page_size(page), 0);
sg = sg_next(sg);
list_del(&page->lru);
}
page, off_in_page, tlen);
fr_len(fp) += tlen;
fp_skb(fp)->data_len += tlen;
- fp_skb(fp)->truesize +=
- PAGE_SIZE << compound_order(page);
+ fp_skb(fp)->truesize += page_size(page);
} else {
BUG_ON(!page);
from = kmap_atomic(page + (mem_off >> PAGE_SHIFT));
}
static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
- unsigned int page_shift)
+ unsigned int it_page_shift)
{
struct page *page;
unsigned long size = 0;
- if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
- return size == (1UL << page_shift);
+ if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
+ return size == (1UL << it_page_shift);
page = pfn_to_page(hpa >> PAGE_SHIFT);
/*
* a page we just found. Otherwise the hardware can get access to
 * a bigger memory chunk than it should.
*/
- return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+ return page_shift(compound_head(page)) >= it_page_shift;
}
static inline bool tce_groups_attached(struct tce_container *container)
* libraries. There is no binary dependent code anywhere else.
*/
-#ifndef STACK_RND_MASK
-#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
-#endif
-
-static unsigned long randomize_stack_top(unsigned long stack_top)
-{
- unsigned long random_variable = 0;
-
- if (current->flags & PF_RANDOMIZE) {
- random_variable = get_random_long();
- random_variable &= STACK_RND_MASK;
- random_variable <<= PAGE_SHIFT;
- }
-#ifdef CONFIG_STACK_GROWSUP
- return PAGE_ALIGN(stack_top) + random_variable;
-#else
- return PAGE_ALIGN(stack_top) - random_variable;
-#endif
-}
-
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memset(bhs[n]->b_data, 0, sb->s_blocksize);
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);
n++;
fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
de = (struct msdos_dir_entry *)bhs[0]->b_data;
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[0]);
/* filling the new directory slots ("." and ".." entries) */
memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
de[0].size = de[1].size = 0;
memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
set_buffer_uptodate(bhs[0]);
+ unlock_buffer(bhs[0]);
mark_buffer_dirty_inode(bhs[0], dir);
err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
/* fill the directory entry */
copy = min(size, sb->s_blocksize);
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memcpy(bhs[n]->b_data, slots, copy);
- slots += copy;
- size -= copy;
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);
+ slots += copy;
+ size -= copy;
if (!size)
break;
n++;
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(c_bh);
memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
set_buffer_uptodate(c_bh);
+ unlock_buffer(c_bh);
mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
if (sb->s_flags & SB_SYNCHRONOUS)
err = sync_dirty_buffer(c_bh);
mapping->flags = 0;
mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ atomic_set(&mapping->nr_thps, 0);
+#endif
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->writeback_index = 0;
}
page = virt_to_head_page(ptr);
- if (sz > (PAGE_SIZE << compound_order(page)))
+ if (sz > page_size(page))
return -EINVAL;
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_inode_add_write);
-EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
return 0;
}
-int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
-{
- return jbd2_journal_file_inode(handle, jinode,
- JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX);
-}
-
-int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
-{
- return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
- LLONG_MAX);
-}
-
int jbd2_journal_inode_ranged_write(handle_t *handle,
struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
{
struct buffer_head *data_alloc_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
+ struct ocfs2_journal *journal = osb->journal;
BUG_ON(inode_trylock(tl_inode));
goto out;
}
+ /* Appending to the truncate log (TA) and flushing the truncate log (TF)
+ * are two separate transactions. They can both be committed but not
+ * checkpointed. If a crash occurs then, both transactions will be
+ * replayed, with several clusters already released to the global bitmap.
+ * The truncate log replay then frees those clusters again, resulting in
+ * a cluster double free.
+ */
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
data_alloc_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
struct page *page, int zero, u64 *phys)
{
int ret, partial = 0;
+ loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
+ loff_t length = to - from;
ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
if (ret)
if (ret < 0)
mlog_errno(ret);
else if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
+ ret = ocfs2_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
if (ret < 0)
mlog_errno(ret);
}
if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
+ user_pos, user_len);
block_commit_write(tmppage, from, to);
}
}
if (page_has_buffers(tmppage)) {
- if (handle && ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(handle, inode);
+ if (handle && ocfs2_should_order_data(inode)) {
+ loff_t start_byte =
+ ((loff_t)tmppage->index << PAGE_SHIFT) +
+ from;
+ loff_t length = to - from;
+ ocfs2_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
+ }
block_commit_write(tmppage, from, to);
}
}
}
DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
-static struct dentry *blockcheck_debugfs_create(const char *name,
- struct dentry *parent,
- u64 *value)
-{
- return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
- &blockcheck_fops);
-}
-
static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
{
if (stats) {
static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
struct dentry *parent)
{
- stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+ struct dentry *dir;
+
+ dir = debugfs_create_dir("blockcheck", parent);
+ stats->b_debug_dir = dir;
+
+ debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir,
+ &stats->b_check_count, &blockcheck_fops);
- blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir,
- &stats->b_check_count);
+ debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir,
+ &stats->b_failure_count, &blockcheck_fops);
- blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir,
- &stats->b_failure_count);
+ debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir,
+ &stats->b_recover_count, &blockcheck_fops);
- blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir,
- &stats->b_recover_count);
}
#else
static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
unsigned int hr_region_num;
struct dentry *hr_debug_dir;
- struct dentry *hr_debug_livenodes;
- struct dentry *hr_debug_regnum;
- struct dentry *hr_debug_elapsed_time;
- struct dentry *hr_debug_pinned;
struct o2hb_debug_buf *hr_db_livenodes;
struct o2hb_debug_buf *hr_db_regnum;
struct o2hb_debug_buf *hr_db_elapsed_time;
kfree(o2hb_db_failedregions);
}
-static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
- struct o2hb_debug_buf **db, int db_len,
- int type, int size, int len, void *data)
+static void o2hb_debug_create(const char *name, struct dentry *dir,
+ struct o2hb_debug_buf **db, int db_len, int type,
+ int size, int len, void *data)
{
*db = kmalloc(db_len, GFP_KERNEL);
if (!*db)
- return NULL;
+ return;
(*db)->db_type = type;
(*db)->db_size = size;
(*db)->db_len = len;
(*db)->db_data = data;
- return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
- &o2hb_debug_fops);
+ debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops);
}
static void o2hb_debug_init(void)
kfree(reg->hr_slots);
- debugfs_remove(reg->hr_debug_livenodes);
- debugfs_remove(reg->hr_debug_regnum);
- debugfs_remove(reg->hr_debug_elapsed_time);
- debugfs_remove(reg->hr_debug_pinned);
- debugfs_remove(reg->hr_debug_dir);
+ debugfs_remove_recursive(reg->hr_debug_dir);
kfree(reg->hr_db_livenodes);
kfree(reg->hr_db_regnum);
kfree(reg->hr_db_elapsed_time);
: NULL;
}
-static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+static void o2hb_debug_region_init(struct o2hb_region *reg,
+ struct dentry *parent)
{
- int ret = -ENOMEM;
+ struct dentry *dir;
- reg->hr_debug_dir =
- debugfs_create_dir(config_item_name(®->hr_item), dir);
- if (!reg->hr_debug_dir) {
- mlog_errno(ret);
- goto bail;
- }
+ dir = debugfs_create_dir(config_item_name(®->hr_item), parent);
+ reg->hr_debug_dir = dir;
- reg->hr_debug_livenodes =
- o2hb_debug_create(O2HB_DEBUG_LIVENODES,
- reg->hr_debug_dir,
- &(reg->hr_db_livenodes),
- sizeof(*(reg->hr_db_livenodes)),
- O2HB_DB_TYPE_REGION_LIVENODES,
- sizeof(reg->hr_live_node_bitmap),
- O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_livenodes) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes),
+ sizeof(*(reg->hr_db_livenodes)),
+ O2HB_DB_TYPE_REGION_LIVENODES,
+ sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES,
+ reg);
- reg->hr_debug_regnum =
- o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
- reg->hr_debug_dir,
- &(reg->hr_db_regnum),
- sizeof(*(reg->hr_db_regnum)),
- O2HB_DB_TYPE_REGION_NUMBER,
- 0, O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_regnum) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum),
+ sizeof(*(reg->hr_db_regnum)),
+ O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg);
- reg->hr_debug_elapsed_time =
- o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
- reg->hr_debug_dir,
- &(reg->hr_db_elapsed_time),
- sizeof(*(reg->hr_db_elapsed_time)),
- O2HB_DB_TYPE_REGION_ELAPSED_TIME,
- 0, 0, reg);
- if (!reg->hr_debug_elapsed_time) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir,
+ &(reg->hr_db_elapsed_time),
+ sizeof(*(reg->hr_db_elapsed_time)),
+ O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg);
- reg->hr_debug_pinned =
- o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
- reg->hr_debug_dir,
- &(reg->hr_db_pinned),
- sizeof(*(reg->hr_db_pinned)),
- O2HB_DB_TYPE_REGION_PINNED,
- 0, 0, reg);
- if (!reg->hr_debug_pinned) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned),
+ sizeof(*(reg->hr_db_pinned)),
+ O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg);
- ret = 0;
-bail:
- return ret;
}
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
if (ret)
goto unregister_handler;
- ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
- if (ret) {
- config_item_put(®->hr_item);
- goto unregister_handler;
- }
+ o2hb_debug_region_init(reg, o2hb_debug_dir);
return ®->hr_item;
int i, j, num_used;
u32 major_hash;
struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
- struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+ struct ocfs2_dx_entry_list *orig_list, *tmp_list;
struct ocfs2_dx_entry *dx_entry;
tmp_list = &tmp_dx_leaf->dl_list;
orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
orig_list = &orig_dx_leaf->dl_list;
new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
- new_list = &new_dx_leaf->dl_list;
num_used = le16_to_cpu(orig_list->de_num_used);
atomic_t res_tot_count;
atomic_t res_cur_count;
- struct dlm_debug_ctxt *dlm_debug_ctxt;
struct dentry *dlm_debugfs_subroot;
/* NOTE: Next three are protected by dlm_domain_lock */
/* files in subroot */
void dlm_debug_init(struct dlm_ctxt *dlm)
{
- struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
/* for dumping dlm_ctxt */
- dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_state_fops);
+ debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_state_fops);
/* for dumping lockres */
- dc->debug_lockres_dentry =
- debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_lockres_fops);
+ debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops);
/* for dumping mles */
- dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_mle_fops);
+ debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops);
/* for dumping lockres on the purge list */
- dc->debug_purgelist_dentry =
- debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_purgelist_fops);
-}
-
-void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
- struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
- if (dc) {
- debugfs_remove(dc->debug_purgelist_dentry);
- debugfs_remove(dc->debug_mle_dentry);
- debugfs_remove(dc->debug_lockres_dentry);
- debugfs_remove(dc->debug_state_dentry);
- kfree(dc);
- dc = NULL;
- }
+ debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm,
+ &debug_purgelist_fops);
}
/* subroot - domain dir */
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
- dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
- GFP_KERNEL);
- if (!dlm->dlm_debug_ctxt) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
-
dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
dlm_debugfs_root);
- return 0;
}
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
- debugfs_remove(dlm->dlm_debugfs_subroot);
+ debugfs_remove_recursive(dlm->dlm_debugfs_subroot);
}
/* debugfs root */
#ifdef CONFIG_DEBUG_FS
-struct dlm_debug_ctxt {
- struct dentry *debug_state_dentry;
- struct dentry *debug_lockres_dentry;
- struct dentry *debug_mle_dentry;
- struct dentry *debug_purgelist_dentry;
-};
-
struct debug_lockres {
int dl_len;
char *dl_buf;
};
void dlm_debug_init(struct dlm_ctxt *dlm);
-void dlm_debug_shutdown(struct dlm_ctxt *dlm);
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_create_debugfs_root(void);
static inline void dlm_debug_init(struct dlm_ctxt *dlm)
{
}
-static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
-}
-static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
- return 0;
}
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
- dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
if (status) {
dlm_unregister_domain_handlers(dlm);
- dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
dlm->key = key;
dlm->node_num = o2nm_this_node();
- ret = dlm_create_debugfs_subroot(dlm);
- if (ret < 0)
- goto leave;
+ dlm_create_debugfs_subroot(dlm);
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
mlog(0, "context init: refcount %u\n",
kref_read(&dlm->dlm_refs));
+ ret = 0;
leave:
if (ret < 0 && dlm) {
if (dlm->master_hash)
enum dlm_status status;
int actions = 0;
int in_use;
- u8 owner;
+ u8 owner;
+ int recovery_wait = 0;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
- else
- lock->unlock_pending = 0;
-
+ else {
+ if (!lock->unlock_pending)
+ recovery_wait = 1;
+ else
+ lock->unlock_pending = 0;
+ }
}
/* get an extra ref on lock. if we are just switching
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ if (recovery_wait) {
+ spin_lock(&res->spinlock);
+ /* Unlock request will directly succeed after the owner dies, and
+ * the lock is already removed from the grant list. We have to wait
+ * for RECOVERING to finish or we miss the chance to purge it, since
+ * the removal is much faster than the RECOVERING process.
+ */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+ spin_unlock(&res->spinlock);
+ }
+
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
ocfs2_inode_unlock(inode, ex);
}
- if (local_bh)
- brelse(local_bh);
-
+ brelse(local_bh);
return status;
}
*level = 1;
if (ocfs2_should_update_atime(inode, vfsmnt))
ocfs2_update_inode_atime(inode, bh);
- if (bh)
- brelse(bh);
+ brelse(bh);
} else
*level = 0;
kref_init(&dlm_debug->d_refcnt);
INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
- dlm_debug->d_locking_state = NULL;
- dlm_debug->d_locking_filter = NULL;
dlm_debug->d_filter_secs = 0;
out:
return dlm_debug;
{
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- dlm_debug->d_locking_state = debugfs_create_file("locking_state",
- S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_dlm_debug_fops);
+ debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
+ osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
- dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter",
- 0600,
- osb->osb_debug_root,
- &dlm_debug->d_filter_secs);
+ debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
+ &dlm_debug->d_filter_secs);
}
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
{
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- if (dlm_debug) {
- debugfs_remove(dlm_debug->d_locking_state);
- debugfs_remove(dlm_debug->d_locking_filter);
+ if (dlm_debug)
ocfs2_put_dlm_debug(dlm_debug);
- }
}
int ocfs2_dlm_init(struct ocfs2_super *osb)
*extent_flags = rec->e_flags;
}
out:
- if (eb_bh)
- brelse(eb_bh);
+ brelse(eb_bh);
return ret;
}
* Thus, we need to explicitly order the zeroed pages.
*/
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
- struct buffer_head *di_bh)
+ struct buffer_head *di_bh,
+ loff_t start_byte,
+ loff_t length)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
goto out;
}
- ret = ocfs2_jbd2_file_inode(handle, inode);
+ ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
if (ret < 0) {
mlog_errno(ret);
goto out;
BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
- handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+ handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
+ abs_from,
+ abs_to - abs_from);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
struct buffer_head *di_bh = NULL;
- loff_t end;
/*
* We start with a read level meta lock and only jump to an ex
}
}
- end = pos + count;
-
ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
ocfs2_inode_unlock(inode, meta_level);
*/
mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
- "Inode %llu: system file state is ambigous\n",
+ "Inode %llu: system file state is ambiguous\n",
(unsigned long long)args->fi_blkno);
if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
-void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
* ocfs2_journal_access_*() unless you intend to
* manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
- * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
- * the current handle commits.
+ * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes
+ * out before the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
* previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+ return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
return ocfs2_extent_recs_per_gd(sb);
}
-static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
- unsigned int clusters_to_del,
- struct ocfs2_dinode *fe,
- struct ocfs2_extent_list *last_el)
+static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode,
+ loff_t start_byte, loff_t length)
{
- /* for dinode + all headers in this pass + update to next leaf */
- u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
- u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
- int credits = 1 + tree_depth + 1;
- int i;
-
- i = next_free - 1;
- BUG_ON(i < 0);
-
- /* We may be deleting metadata blocks, so metadata alloc dinode +
- one desc. block for each possible delete. */
- if (tree_depth && next_free == 1 &&
- ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
- credits += 1 + tree_depth;
-
- /* update to the truncate log. */
- credits += OCFS2_TRUNCATE_LOG_UPDATE;
-
- credits += ocfs2_quota_trans_credits(sb);
-
- return credits;
-}
-
-static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
-{
- return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
+ return jbd2_journal_inode_ranged_write(handle,
+ &OCFS2_I(inode)->ip_jinode,
+ start_byte, length);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
struct inode *inode = NULL;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
- struct ocfs2_dinode *di = NULL;
handle_t *handle = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
struct buffer_head *parent_di_bh = NULL;
goto leave;
}
- di = (struct ocfs2_dinode *)new_di_bh->b_data;
status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
&orphan_insert, orphan_dir, false);
if (status < 0) {
struct ocfs2_dlm_debug {
struct kref d_refcnt;
- struct dentry *d_locking_state;
- struct dentry *d_locking_filter;
u32 d_filter_secs;
struct list_head d_lockres_tracking;
};
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
- struct dentry *osb_ctxt;
wait_queue_head_t recovery_event;
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
- osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_osb_debug_fops);
+ debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
+ osb, &ocfs2_osb_debug_fops);
if (ocfs2_meta_ecc(osb))
ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
kset_unregister(osb->osb_dev_kset);
- debugfs_remove(osb->osb_ctxt);
-
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
- debugfs_remove(osb->osb_debug_root);
+ debugfs_remove_recursive(osb->osb_debug_root);
if (hangup_needed)
ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
return -EINVAL;
}
+
+ /*
+ * XXX: Huge page cache doesn't support writing yet. Drop all page
+ * cache for this file before processing writes.
+ */
+ if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
+ truncate_pagecache(inode, 0);
+
return 0;
cleanup_all:
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
-#include <linux/quicklist.h>
#include <linux/seq_file.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
global_zone_page_state(NR_KERNEL_STACK_KB));
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
-#ifdef CONFIG_QUICKLIST
- show_val_kb(m, "Quicklists: ", quicklist_total_size());
-#endif
show_val_kb(m, "NFS_Unstable: ",
global_node_page_state(NR_UNSTABLE_NFS));
global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "ShmemPmdMapped: ",
global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+ show_val_kb(m, "FileHugePages: ",
+ global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+ show_val_kb(m, "FilePmdMapped: ",
+ global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
#endif
#ifdef CONFIG_CMA
unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
+ unsigned long file_thp;
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked)
{
- int i, nr = compound ? 1 << compound_order(page) : 1;
+ int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
/*
else if (is_zone_device_page(page))
/* pass */;
else
- VM_BUG_ON_PAGE(1, page);
+ mss->file_thp += HPAGE_PMD_SIZE;
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
}
#else
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
__free_page(pte_page);
}
-#else /* CONFIG_MMU */
-
-/* This is enough for a nommu architecture */
-#define check_pgt_cache() do { } while (0)
-
#endif /* CONFIG_MMU */
#endif /* __ASM_GENERIC_PGALLOC_H */
* need this). If THP is not enabled, the pmd can't go away under the
* code even if MADV_DONTNEED runs, but if THP is enabled we need to
* run a pmd_trans_unstable before walking the ptes after
- * split_huge_page_pmd returns (because it may have run when the pmd
- * become null, but then a page fault can map in a THP and not a
- * regular page).
+ * split_huge_pmd returns (because it may have run when the pmd become
+ * null, but then a page fault can map in a THP and not a regular page).
*/
static inline int pmd_trans_unstable(pmd_t *pmd)
{
static inline void init_espfix_bsp(void) { }
#endif
-extern void __init pgd_cache_init(void);
+extern void __init pgtable_cache_init(void);
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
return false;
}
-/*
- * Compaction has backed off for some reason. It might be throttling or
- * lock contention. Retrying is still worthwhile.
- */
-static inline bool compaction_withdrawn(enum compact_result result)
+/* Compaction needs reclaim to be performed first, so it can continue. */
+static inline bool compaction_needs_reclaim(enum compact_result result)
{
/*
* Compaction backed off due to watermark checks for order-0
if (result == COMPACT_SKIPPED)
return true;
+ return false;
+}
+
+/*
+ * Compaction has backed off for some reason after doing some work or none
+ * at all. It might be throttling or lock contention. Retrying might still
+ * be worthwhile, but with a higher priority if allowed.
+ */
+static inline bool compaction_withdrawn(enum compact_result result)
+{
/*
* If compaction is deferred for high-order allocations, it is
* because sync compaction recently failed. If this is the case
return false;
}
+static inline bool compaction_needs_reclaim(enum compact_result result)
+{
+ return false;
+}
+
static inline bool compaction_withdrawn(enum compact_result result)
{
return true;
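To make the intended split concrete, a minimal sketch (hypothetical caller name; loosely mirrors the retry logic the page allocator builds on top of these predicates) of how the two helpers are meant to be consulted:

static bool should_retry_compaction_sketch(enum compact_result result,
					   bool *reclaim_first)
{
	/* COMPACT_SKIPPED means order-0 watermarks failed: reclaim, then retry. */
	if (compaction_needs_reclaim(result)) {
		*reclaim_first = true;
		return true;
	}

	*reclaim_first = false;

	/* Deferred, contended or throttled: retrying may still help,
	 * ideally at a higher compaction priority. */
	return compaction_withdrawn(result);
}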
* @i_pages: Cached pages.
* @gfp_mask: Memory allocation flags to use for allocating pages.
* @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
* @i_mmap: Tree of private and shared mappings.
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
* @nrpages: Number of page entries, protected by the i_pages lock.
struct xarray i_pages;
gfp_t gfp_mask;
atomic_t i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ /* number of thp, only for non-shmem files */
+ atomic_t nr_thps;
+#endif
struct rb_root_cached i_mmap;
struct rw_semaphore i_mmap_rwsem;
unsigned long nrpages;
return errseq_sample(&mapping->wb_err);
}
+static inline int filemap_nr_thps(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ return atomic_read(&mapping->nr_thps);
+#else
+ return 0;
+#endif
+}
+
+static inline void filemap_nr_thps_inc(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ atomic_inc(&mapping->nr_thps);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
+static inline void filemap_nr_thps_dec(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ atomic_dec(&mapping->nr_thps);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
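A minimal sketch (hypothetical helpers; non-shmem mapping assumed, since shmem accounts its huge pages separately) of the bookkeeping these accessors are meant for, paired with the open-for-write check added earlier in this excerpt:

static void file_thp_added_sketch(struct address_space *mapping)
{
	/* Called when khugepaged collapses read-only file pages into a THP. */
	filemap_nr_thps_inc(mapping);
}

static bool file_needs_thp_drop_sketch(struct file *file)
{
	/* Writes to file THPs are not supported yet, so a writer must
	 * drop the huge pages from the page cache first. */
	return (file->f_mode & FMODE_WRITE) &&
	       filemap_nr_thps(file->f_mapping) != 0;
}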
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * Global or memcg deferred list in the second tail pages is
+ * occupied by compound_head.
+ */
+ return &page[2].deferred_list;
+}
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
static inline struct hstate *page_hstate(struct page *page)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
- return size_to_hstate(PAGE_SIZE << compound_order(page));
+ return size_to_hstate(page_size(page));
}
static inline unsigned hstate_index_to_shift(unsigned index)
extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int jbd2_journal_force_commit(journal_t *);
extern int jbd2_journal_force_commit_nested(journal_t *);
-extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
-extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
extern int jbd2_journal_inode_ranged_write(handle_t *handle,
struct jbd2_inode *inode, loff_t start_byte,
loff_t length);
extern void __khugepaged_exit(struct mm_struct *mm);
extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags);
+#ifdef CONFIG_SHMEM
+extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
+#else
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+}
+#endif
#define khugepaged_enabled() \
(transparent_hugepage_flags & \
{
return 0;
}
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_KHUGEPAGED_H */
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
-#ifdef CONFIG_MEMCG_KMEM
struct memcg_shrinker_map __rcu *shrinker_map;
-#endif
+
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
struct list_head event_list;
spinlock_t event_list_lock;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct deferred_split deferred_split_queue;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
} while ((memcg = parent_mem_cgroup(memcg)));
return false;
}
+
+extern int memcg_expand_shrinker_maps(int new_id);
+
+extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id);
#else
#define mem_cgroup_sockets_enabled 0
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
{
return false;
}
+
+static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id)
+{
+}
#endif
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
return memcg ? memcg->kmemcg_id : -1;
}
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- int nid, int shrinker_id);
#else
static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
}
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- int nid, int shrinker_id) { }
#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
struct memory_block {
unsigned long start_section_nr;
- unsigned long end_section_nr;
unsigned long state; /* serialized by the dev->lock */
int section_count; /* serialized by mem_sysfs_mutex */
int online_type; /* for passing data to online routine */
#define IPC_CALLBACK_PRI 10
#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
-static inline int memory_dev_init(void)
+static inline void memory_dev_init(void)
{
- return 0;
+ return;
}
static inline int register_memory_notifier(struct notifier_block *nb)
{
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size);
void remove_memory_block_devices(unsigned long start, unsigned long size);
-extern int memory_dev_init(void);
+extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
extern int memory_isolate_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(struct mem_section *);
page[1].compound_order = order;
}
+/* Returns the number of pages in this potentially compound page. */
+static inline unsigned long compound_nr(struct page *page)
+{
+ return 1UL << compound_order(page);
+}
+
+/* Returns the number of bytes in this potentially compound page. */
+static inline unsigned long page_size(struct page *page)
+{
+ return PAGE_SIZE << compound_order(page);
+}
+
+/* Returns the base-2 logarithm of the number of bytes in this page. */
+static inline unsigned int page_shift(struct page *page)
+{
+ return PAGE_SHIFT + compound_order(page);
+}
+
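The three helpers describe the same page geometry; a small sketch (hypothetical function, kernel context assumed) of the invariants that hold for any page, compound or not:

static inline void page_geometry_sketch(struct page *page)
{
	unsigned long nr    = compound_nr(page);	/* pages in the (compound) page */
	unsigned long bytes = page_size(page);		/* bytes it covers */
	unsigned int  shift = page_shift(page);		/* log2 of those bytes */

	WARN_ON(bytes != nr << PAGE_SHIFT);
	WARN_ON(bytes != 1UL << shift);
}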
void free_compound_page(struct page *page);
#ifdef CONFIG_MMU
put_page(page);
}
-void put_user_pages_dirty(struct page **pages, unsigned long npages);
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages);
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty);
+
void put_user_pages(struct page **pages, unsigned long npages);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
+#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
+#else
+static inline bool can_do_mlock(void) { return false; }
+#endif
extern int user_shm_lock(size_t, struct user_struct *);
extern void user_shm_unlock(size_t, struct user_struct *);
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);
+unsigned long randomize_stack_top(unsigned long stack_top);
+
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
extern unsigned long mmap_region(struct file *file, unsigned long addr,
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
/*
* NOTE on FOLL_LONGTERM:
static inline void setup_nr_node_ids(void) {}
#endif
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+ return !memcmp_pages(page1, page2);
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
struct { /* Second tail page of compound page */
unsigned long _compound_pad_1; /* compound_head */
unsigned long _compound_pad_2;
+ /* For both global and memcg */
struct list_head deferred_list;
};
struct { /* Page table pages */
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
+ NR_FILE_THPS,
+ NR_FILE_PMDMAPPED,
NR_ANON_THPS,
NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_VMSCAN_WRITE,
extern struct page *mem_map;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct deferred_split {
+ spinlock_t split_queue_lock;
+ struct list_head split_queue;
+ unsigned long split_queue_len;
+};
+#endif
+
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- spinlock_t split_queue_lock;
- struct list_head split_queue;
- unsigned long split_queue_len;
+ struct deferred_split deferred_split_queue;
#endif
/* Fields commonly accessed by the page reclaim scanner */
enum page_ext_flags {
PAGE_EXT_OWNER,
+ PAGE_EXT_OWNER_ACTIVE,
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
mapping_gfp_mask(mapping));
}
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+ if (PageHuge(page))
+ return page;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ return page + (offset & (compound_nr(page) - 1));
+}
+
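A worked example of the index arithmetic (hypothetical numbers): for a non-hugetlb THP head covering page-cache indices 512..1023, compound_nr() is 512, so find_subpage(head, 515) yields head + 3 because 515 & 511 == 3; for a hugetlb page the head itself is returned regardless of the offset.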
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_QUICKLIST_H
-#define LINUX_QUICKLIST_H
-/*
- * Fast allocations and disposal of pages. Pages must be in the condition
- * as needed after allocation when they are freed. Per cpu lists of pages
- * are kept that only contain node local pages.
- *
- * (C) 2007, SGI. Christoph Lameter <cl@linux.com>
- */
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/percpu.h>
-
-#ifdef CONFIG_QUICKLIST
-
-struct quicklist {
- void *page;
- int nr_pages;
-};
-
-DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
-
-/*
- * The two key functions quicklist_alloc and quicklist_free are inline so
- * that they may be custom compiled for the platform.
- * Specifying a NULL ctor can remove constructor support. Specifying
- * a constant quicklist allows the determination of the exact address
- * in the per cpu area.
- *
- * The fast patch in quicklist_alloc touched only a per cpu cacheline and
- * the first cacheline of the page itself. There is minmal overhead involved.
- */
-static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
-{
- struct quicklist *q;
- void **p = NULL;
-
- q =&get_cpu_var(quicklist)[nr];
- p = q->page;
- if (likely(p)) {
- q->page = p[0];
- p[0] = NULL;
- q->nr_pages--;
- }
- put_cpu_var(quicklist);
- if (likely(p))
- return p;
-
- p = (void *)__get_free_page(flags | __GFP_ZERO);
- if (ctor && p)
- ctor(p);
- return p;
-}
-
-static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p,
- struct page *page)
-{
- struct quicklist *q;
-
- q = &get_cpu_var(quicklist)[nr];
- *(void **)p = q->page;
- q->page = p;
- q->nr_pages++;
- put_cpu_var(quicklist);
-}
-
-static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
-{
- __quicklist_free(nr, dtor, pp, virt_to_page(pp));
-}
-
-static inline void quicklist_free_page(int nr, void (*dtor)(void *),
- struct page *page)
-{
- __quicklist_free(nr, dtor, page_address(page), page);
-}
-
-void quicklist_trim(int nr, void (*dtor)(void *),
- unsigned long min_pages, unsigned long max_free);
-
-unsigned long quicklist_total_size(void);
-
-#else
-
-static inline unsigned long quicklist_total_size(void)
-{
- return 0;
-}
-
-#endif
-
-#endif /* LINUX_QUICKLIST_H */
-
/* These are for internal use */
struct list_head list;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
/* ID in shrinker_idr */
int id;
#endif
/* Flags */
#define SHRINKER_NUMA_AWARE (1 << 0)
#define SHRINKER_MEMCG_AWARE (1 << 1)
+/*
+ * This flag only makes sense together with SHRINKER_MEMCG_AWARE for now;
+ * a non-MEMCG_AWARE shrinker must not have it set.
+ */
+#define SHRINKER_NONSLAB (1 << 2)
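A minimal sketch (hypothetical names; callbacks stubbed out) of a registration that satisfies the constraint above, in the style of a memcg-aware shrinker that frees non-slab memory:

static unsigned long sketch_count(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	return 0;	/* nothing to reclaim in this sketch */
}

static unsigned long sketch_scan(struct shrinker *shrink,
				 struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static struct shrinker sketch_shrinker = {
	.count_objects	= sketch_count,
	.scan_objects	= sketch_scan,
	.seeks		= DEFAULT_SEEKS,
	/* NONSLAB is only valid together with MEMCG_AWARE. */
	.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB,
};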
extern int prealloc_shrinker(struct shrinker *shrinker);
extern void register_shrinker_prepared(struct shrinker *shrinker);
return __kmalloc_node(size, flags, node);
}
-struct memcg_cache_array {
- struct rcu_head rcu;
- struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache: Common to root and child caches. NULL for root, pointer to
- * the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches. This table is
- * used to index child cachces during allocation and cleared
- * early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children: List of all child caches. While the child caches are also
- * reachable through @memcg_caches, a child cache remains on
- * this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg: Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
- struct kmem_cache *root_cache;
- union {
- struct {
- struct memcg_cache_array __rcu *memcg_caches;
- struct list_head __root_caches_node;
- struct list_head children;
- bool dying;
- };
- struct {
- struct mem_cgroup *memcg;
- struct list_head children_node;
- struct list_head kmem_caches_node;
- struct percpu_ref refcnt;
-
- void (*work_fn)(struct kmem_cache *);
- union {
- struct rcu_head rcu_head;
- struct work_struct work;
- };
- };
- };
-};
-
int memcg_update_all_caches(int num_memcgs);
/**
unsigned long va_start;
unsigned long va_end;
- /*
- * Largest available free size in subtree.
- */
- unsigned long subtree_max_size;
- unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
- struct llist_node purge_list; /* "lazy purge" list */
- struct vm_struct *vm;
+
+ /*
+ * The following three variables can be packed, because
+ * a vmap_area object is always in one of the three states:
+ * 1) in "free" tree (root is free_vmap_area_root)
+ * 2) in "busy" tree (root is vmap_area_root)
+ * 3) in purge list (head is vmap_purge_list)
+ */
+ union {
+ unsigned long subtree_max_size; /* in "free" tree */
+ struct vm_struct *vm; /* in "busy" tree */
+ struct llist_node purge_list; /* in purge list */
+ };
};
/*
void zpool_destroy_pool(struct zpool *pool);
+bool zpool_malloc_support_movable(struct zpool *pool);
+
int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
struct zpool *zpool);
void (*destroy)(void *pool);
+ bool malloc_support_movable;
int (*malloc)(void *pool, size_t size, gfp_t gfp,
unsigned long *handle);
void (*free)(void *pool, unsigned long handle);
void __init __weak poking_init(void) { }
-void __init __weak pgd_cache_init(void) { }
+void __init __weak pgtable_cache_init(void) { }
bool initcall_debug;
core_param(initcall_debug, initcall_debug, bool, 0644);
report_meminit();
mem_init();
kmem_cache_init();
+ kmemleak_init();
pgtable_init();
debug_objects_mem_init();
vmalloc_init();
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
pti_init();
- pgd_cache_init();
}
void __init __weak arch_call_rest_init(void)
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
- mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
initrd_start = 0;
}
#endif
- kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
acpi_early_init();
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
+#include <linux/khugepaged.h>
#include <linux/uprobes.h>
*
* @vma: vma that holds the pte pointing to page
* @addr: address the old @page is mapped at
- * @page: the cowed page we are replacing by kpage
- * @kpage: the modified page we replace page by
+ * @old_page: the page we are replacing with @new_page
+ * @new_page: the modified page that replaces @old_page
*
- * Returns 0 on success, -EFAULT on failure.
+ * If @new_page is NULL, only unmap @old_page.
+ *
+ * Returns 0 on success, negative error code otherwise.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
- .page = old_page,
+ .page = compound_head(old_page),
.vma = vma,
.address = addr,
};
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
- VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
- false);
- if (err)
- return err;
+ if (new_page) {
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false);
+ if (err)
+ return err;
+ }
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (new_page)
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ if (new_page) {
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ } else
+ /* no new page, just dec_mm_counter for old_page */
+ dec_mm_counter(mm, MM_ANONPAGES);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ if (new_page)
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
struct page *old_page, *new_page;
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
+ bool orig_page_huge = false;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+ FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
ref_ctr_updated = 1;
}
+ ret = 0;
+ if (!is_register && !PageAnon(old_page))
+ goto put_old;
+
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (!is_register) {
+ struct page *orig_page;
+ pgoff_t index;
+
+ VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
+
+ index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+ index);
+
+ if (orig_page) {
+ if (PageUptodate(orig_page) &&
+ pages_identical(new_page, orig_page)) {
+ /* let go new_page */
+ put_page(new_page);
+ new_page = NULL;
+
+ if (PageCompound(orig_page))
+ orig_page_huge = true;
+ }
+ put_page(orig_page);
+ }
+ }
+
ret = __replace_page(vma, vaddr, old_page, new_page);
- put_page(new_page);
+ if (new_page)
+ put_page(new_page);
put_old:
put_page(old_page);
if (ret && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
+ /* try collapse pmd for compound page */
+ if (!ret && orig_page_huge)
+ collapse_pte_mapped_thp(mm, vaddr);
+
return ret;
}
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
false, &res)) {
- pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
- end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ pfn = PFN_UP(res.start);
+ end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
tick_nohz_idle_enter();
while (!need_resched()) {
- check_pgt_cache();
rmb();
local_irq_disable();
extern struct ctl_table firmware_config_table[];
#endif
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif
.proc_handler = proc_dointvec,
.extra1 = SYSCTL_ZERO,
},
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
{
.procname = "legacy_va_layout",
.data = &sysctl_legacy_va_layout,
In order to access the kmemleak file, debugfs needs to be
mounted (usually at /sys/kernel/debug).
-config DEBUG_KMEMLEAK_EARLY_LOG_SIZE
- int "Maximum kmemleak early log entries"
+config DEBUG_KMEMLEAK_MEM_POOL_SIZE
+ int "Kmemleak memory pool size"
depends on DEBUG_KMEMLEAK
- range 200 40000
- default 400
+ range 200 1000000
+ default 16000
help
Kmemleak must track all the memory allocations to avoid
reporting false positives. Since memory may be allocated or
- freed before kmemleak is initialised, an early log buffer is
- used to store these actions. If kmemleak reports "early log
- buffer exceeded", please increase this value.
+ freed before kmemleak is fully initialised, a static pool of
+ metadata objects is used to track such callbacks. After kmemleak
+ is fully initialised, this memory pool acts as an emergency
+ reserve if slab allocations fail.
config DEBUG_KMEMLEAK_TEST
tristate "Simple test for the kernel memory leak detector"
to 3TB of RAM with KASan enabled). This options allows to force
4-level paging instead.
+config KASAN_SW_TAGS_IDENTIFY
+ bool "Enable memory corruption identification"
+ depends on KASAN_SW_TAGS
+ help
+ This option enables best-effort identification of bug type
+ (use-after-free or out-of-bounds) at the cost of increased
+ memory consumption.
+
config TEST_KASAN
tristate "Module for testing KASAN for bug detection"
depends on m && KASAN
head = compound_head(page);
v += (page - head) << PAGE_SHIFT;
- if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head))))
+ if (likely(n <= v && v <= (page_size(head))))
return true;
WARN_ON(1);
return false;
*/
#include <linux/mm.h>
-#include <linux/quicklist.h>
#include <linux/cma.h>
void show_mem(unsigned int filter, nodemask_t *nodemask)
#ifdef CONFIG_CMA
printk("%lu pages cma reserved\n", totalcma_pages);
#endif
-#ifdef CONFIG_QUICKLIST
- printk("%lu pages in pagetable cache\n",
- quicklist_total_size());
-#endif
#ifdef CONFIG_MEMORY_FAILURE
printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
/*
* Note: test functions are marked noinline so that their names appear in
kfree(ptr2);
}
+static noinline void __init kfree_via_page(void)
+{
+ char *ptr;
+ size_t size = 8;
+ struct page *page;
+ unsigned long offset;
+
+ pr_info("invalid-free false positive (via page)\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ page = virt_to_page(ptr);
+ offset = offset_in_page(ptr);
+ kfree(page_address(page) + offset);
+}
+
+static noinline void __init kfree_via_phys(void)
+{
+ char *ptr;
+ size_t size = 8;
+ phys_addr_t phys;
+
+ pr_info("invalid-free false positive (via phys)\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ phys = virt_to_phys(ptr);
+ kfree(phys_to_virt(phys));
+}
+
static noinline void __init kmem_cache_oob(void)
{
char *p;
kmalloc_uaf();
kmalloc_uaf_memset();
kmalloc_uaf2();
+ kfree_via_page();
+ kfree_via_phys();
kmem_cache_oob();
memcg_accounted_kmem_cache();
kasan_stack_oob();
by default when ZONE_DMA or HIGHMEM is selected, but you
may say n to override this.
-config NR_QUICK
- int
- depends on QUICKLIST
- default "1"
-
config VIRT_TO_BUS
bool
help
config GUP_GET_PTE_LOW_HIGH
bool
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support for file THPs will be developed in the next few release
+ cycles.
+
config ARCH_HAS_PTE_SPECIAL
bool
Also, the state of page tracking structures is checked more often as
pages are being allocated and freed, as unexpected state changes
often happen for same reasons as memory corruption (e.g. double free,
- use-after-free).
+ use-after-free). The error reports for these checks can be augmented
+ with stack traces of the last allocation and freeing of the page,
+ when PAGE_OWNER is also selected and enabled on boot.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
+CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
* is safe to read and it's 0 for tail pages.
*/
if (unlikely(PageCompound(page))) {
- low_pfn += (1UL << compound_order(page)) - 1;
+ low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
}
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
-static isolate_migrate_t isolate_migratepages(struct zone *zone,
- struct compact_control *cc)
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
*/
low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
- if (block_start_pfn < zone->zone_start_pfn)
- block_start_pfn = zone->zone_start_pfn;
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
/*
* fast_find_migrateblock marks a pageblock skipped so to avoid
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
- page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
- zone);
+ page = pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone);
if (!page)
continue;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
cc->rescan = true;
}
- switch (isolate_migratepages(cc->zone, cc)) {
+ switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
{
enum compact_result ret;
struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.order = order,
.search_order = order,
.gfp_mask = gfp_mask,
if (capture)
current->capture_control = &capc;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
ret = compact_zone(&cc, &capc);
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
if (!populated_zone(zone))
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
compact_zone(&cc, NULL);
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
COMPACT_CONTINUE)
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
- cc.total_migrate_scanned = 0;
- cc.total_free_scanned = 0;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
if (kthread_should_stop())
return;
+
+ cc.zone = zone;
status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
/* hugetlb pages are represented by a single entry in the xarray */
if (!PageHuge(page)) {
xas_set_order(&xas, page->index, compound_order(page));
- nr = 1U << compound_order(page);
+ nr = compound_nr(page);
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
- } else {
- VM_BUG_ON_PAGE(PageTransHuge(page), page);
+ } else if (PageTransHuge(page)) {
+ __dec_node_page_state(page, NR_FILE_THPS);
+ filemap_nr_thps_dec(mapping);
}
/*
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ if (i >= pagevec_count(pvec))
break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index >
- pvec->pages[i]->index, page);
- continue;
- }
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular page or
+ * if the index is that of the last sub-page of this compound page.
+ */
+ if (page->index + compound_nr(page) - 1 == xas.xa_index)
i++;
- } else {
- VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
- != pvec->pages[i]->index, page);
- tail_pages--;
- }
xas_store(&xas, NULL);
total_pages++;
}
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- return (!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional);
+ if (dax_mapping(mapping))
+ return mapping->nrexceptional;
+
+ return mapping->nrpages;
}
int filemap_write_and_wait(struct address_space *mapping)
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
XA_STATE(xas, &mapping->i_pages, offset);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
repeat:
if (!page || xa_is_value(page))
goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/*
- * Has the page moved?
+ * Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
rcu_read_lock();
xas_for_each(&xas, page, ULONG_MAX) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
rcu_read_lock();
xas_for_each(&xas, page, end) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
if (xa_is_value(page))
break;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
rcu_read_lock();
xas_for_each_marked(&xas, page, end, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*index = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
goto out_retry;
/* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
/*
* We have a locked page in the page cache, now we need to check
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
if (xa_is_value(page))
goto next;
- head = compound_head(page);
-
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(head))
+ if (PageLocked(page))
goto next;
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto next;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto skip;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
unsigned int page_mask;
};
-typedef int (*set_dirty_func_t)(struct page *page);
-
-static void __put_user_pages_dirty(struct page **pages,
- unsigned long npages,
- set_dirty_func_t sdf)
-{
- unsigned long index;
-
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
-
- /*
- * Checking PageDirty at this point may race with
- * clear_page_dirty_for_io(), but that's OK. Two key cases:
- *
- * 1) This code sees the page as already dirty, so it skips
- * the call to sdf(). That could happen because
- * clear_page_dirty_for_io() called page_mkclean(),
- * followed by set_page_dirty(). However, now the page is
- * going to get written back, which meets the original
- * intention of setting it dirty, so all is well:
- * clear_page_dirty_for_io() goes on to call
- * TestClearPageDirty(), and write the page back.
- *
- * 2) This code sees the page as clean, so it calls sdf().
- * The page stays dirty, despite being written back, so it
- * gets written back again in the next writeback cycle.
- * This is harmless.
- */
- if (!PageDirty(page))
- sdf(page);
-
- put_user_page(page);
- }
-}
-
/**
- * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
- * @pages: array of pages to be marked dirty and released.
+ * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
+ * @make_dirty: whether to mark the pages dirty
*
* "gup-pinned page" refers to a page that has had one of the get_user_pages()
* variants called on that page.
*
* For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
+ * listed as clean. In any case, all pages are released using
+ * put_user_page(), via put_user_pages() in the non-dirty case.
*
* Please see the put_user_page() documentation for details.
*
- * set_page_dirty(), which does not lock the page, is used here.
- * Therefore, it is the caller's responsibility to ensure that this is
- * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty(), put_user_page().
*
*/
-void put_user_pages_dirty(struct page **pages, unsigned long npages)
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
{
- __put_user_pages_dirty(pages, npages, set_page_dirty);
-}
-EXPORT_SYMBOL(put_user_pages_dirty);
+ unsigned long index;
-/**
- * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
- * @pages: array of pages to be marked dirty and released.
- * @npages: number of pages in the @pages array.
- *
- * For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
- *
- * Please see the put_user_page() documentation for details.
- *
- * This is just like put_user_pages_dirty(), except that it invokes
- * set_page_dirty_lock(), instead of set_page_dirty().
- *
- */
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
-{
- __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+
+ if (!make_dirty) {
+ put_user_pages(pages, npages);
+ return;
+ }
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key
+ * cases:
+ *
+ * 1) This code sees the page as already dirty, so it
+ * skips the call to set_page_dirty(). That could happen
+ * because clear_page_dirty_for_io() called
+ * page_mkclean(), followed by set_page_dirty().
+ * However, now the page is going to get written back,
+ * which meets the original intention of setting it
+ * dirty, so all is well: clear_page_dirty_for_io() goes
+ * on to call TestClearPageDirty(), and write the page
+ * back.
+ *
+ * 2) This code sees the page as clean, so it calls
+ * set_page_dirty(). The page stays dirty, despite being
+ * written back, so it gets written back again in the
+ * next writeback cycle. This is harmless.
+ */
+ if (!PageDirty(page))
+ set_page_dirty_lock(page);
+ put_user_page(page);
+ }
}
EXPORT_SYMBOL(put_user_pages_dirty_lock);
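A sketch of the calling pattern the new signature enables (hypothetical function; error handling reduced to the essentials): pin with a get_user_pages variant, let the device write, then release and conditionally dirty in one call.

static int dma_to_user_sketch(unsigned long uaddr, struct page **pages,
			      int nr_pages)
{
	int pinned = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);

	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... device DMA writes into the pinned pages here ... */

	/* Release every pin; dirty the pages because the device wrote them. */
	put_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}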
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else {
+ } else if (flags & FOLL_SPLIT) {
if (unlikely(!try_get_page(page))) {
spin_unlock(ptl);
return ERR_PTR(-ENOMEM);
put_page(page);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
}
return ret ? ERR_PTR(ret) :
* gup may start from a tail page. Advance step by the left
* part.
*/
- step = (1 << compound_order(head)) - (pages[i] - head);
+ step = compound_nr(head) - (pages[i] - head);
/*
* If we get a page from the CMA zone, since we are going to
* be pinning these entries, we might as well move them out
return pmd;
}
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- /* ->lru in the tail pages is occupied by compound_head. */
- return &page[2].deferred_list;
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ if (memcg)
+ return &memcg->deferred_split_queue;
+ else
+ return &pgdat->deferred_split_queue;
+}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ return &pgdat->deferred_split_queue;
}
+#endif
void prep_transhuge_page(struct page *page)
{
struct page *head = compound_head(page);
pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
+ struct address_space *swap_cache = NULL;
+ unsigned long offset = 0;
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
+ if (PageAnon(head) && PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ offset = swp_offset(entry);
+ swap_cache = swap_address_space(entry);
+ xa_lock(&swap_cache->i_pages);
+ }
+
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages, offset + i,
+ head + i, 0);
}
}
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
- if (PageSwapCache(head))
+ if (PageSwapCache(head)) {
page_ref_add(head, 2);
- else
+ xa_unlock(&swap_cache->i_pages);
+ } else {
page_ref_inc(head);
+ }
} else {
/* Additional pin to page cache */
page_ref_add(head, 2);
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
}
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&pgdata->split_queue_lock);
+ spin_lock(&ds_queue->split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
if (mapping)
__dec_node_page_state(page, NR_SHMEM_THPS);
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
dump_page(page, "total_mapcount(head) > 0");
BUG();
}
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
void free_transhuge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
unsigned long flags;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ /*
+ * The try_to_unmap() in the page reclaim path might reach here too;
+ * this may cause a race condition that corrupts the deferred split
+ * queue. And, if page reclaim is already handling the same page, it
+ * is unnecessary to handle it again in the shrinker.
+ *
+ * Check PageSwapCache to determine if the page is being
+ * handled by page reclaim since THP swap would add the page into
+ * swap cache before calling try_to_unmap().
+ */
+ if (PageSwapCache(page))
+ return;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &pgdata->split_queue);
- pgdata->split_queue_len++;
+ list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ memcg_set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
+#endif
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
- return READ_ONCE(pgdata->split_queue_len);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+ return READ_ONCE(ds_queue->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &pgdata->split_queue) {
+ list_for_each_safe(pos, next, &ds_queue->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
} else {
/* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
break;
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
put_page(page);
}
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
- list_splice_tail(&list, &pgdata->split_queue);
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ list_splice_tail(&list, &ds_queue->split_queue);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
*/
- if (!split && list_empty(&pgdata->split_queue))
+ if (!split && list_empty(&ds_queue->split_queue))
return SHRINK_STOP;
return split;
}
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
};
#ifdef CONFIG_DEBUG_FS
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
+ bool alloc_try_hard = true;
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ /*
+ * By default we always try hard to allocate the page with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * a loop (to adjust global huge page counts) and previous allocation
+ * failed, do not continue to try hard on the same node. Use the
+ * node_alloc_noretry bitmap to manage this state information.
+ */
+ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+ alloc_try_hard = false;
+ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+ if (alloc_try_hard)
+ gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ /*
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page, this
+ * indicates an overall state change. Clear the bit so that we resume
+ * normal 'try hard' allocations.
+ */
+ if (node_alloc_noretry && page && !alloc_try_hard)
+ node_clear(nid, *node_alloc_noretry);
+
+ /*
+ * If we tried hard to get a page but failed, set bit so that
+ * subsequent attempts will not try as hard until there is an
+ * overall state change.
+ */
+ if (node_alloc_noretry && !page && alloc_try_hard)
+ node_set(nid, *node_alloc_noretry);
+
return page;
}
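A standalone sketch of the per-node retry state machine described in the comments above, using a plain bitmask in userspace C. All names here are illustrative; fake_alloc() merely stands in for the buddy allocator and pretends one node is depleted.

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long noretry_mask;      /* one bit per node, nodes < 64 */

    /* stand-in for alloc_buddy_huge_page(); pretend node 3 is depleted */
    static bool fake_alloc(int node, bool try_hard)
    {
            (void)try_hard;
            return node != 3;
    }

    static bool alloc_on_node(int node)
    {
            bool try_hard = !(noretry_mask & (1UL << node));
            bool ok = fake_alloc(node, try_hard);

            if (ok && !try_hard)
                    noretry_mask &= ~(1UL << node); /* recovered: try hard again */
            else if (!ok && try_hard)
                    noretry_mask |= 1UL << node;    /* hard attempt failed: back off */
            return ok;
    }

    int main(void)
    {
            int node, round;

            for (round = 0; round < 2; round++)
                    for (node = 0; node < 4; node++)
                            printf("node %d: %s (noretry mask %#lx)\n", node,
                                   alloc_on_node(node) ? "ok" : "failed",
                                   noretry_mask);
            return 0;
    }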
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
- nid, nmask);
+ nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+ node_alloc_noretry);
if (page)
break;
}
goto out_unlock;
spin_unlock(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ nodemask_t *node_alloc_noretry;
+
+ if (!hstate_is_gigantic(h)) {
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * Ignore errors as lower level routines can deal with
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
+ * time, we are likely in bigger trouble.
+ */
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+ GFP_KERNEL);
+ } else {
+ /* allocations done at boot time */
+ node_alloc_noretry = NULL;
+ }
+
+ /* bit mask controlling how hard we retry per-node allocations */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY]))
+ &node_states[N_MEMORY],
+ node_alloc_noretry))
break;
cond_resched();
}
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
+
+ kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
+
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * If we cannot allocate the bit mask, do not attempt to allocate
+ * the requested huge pages.
+ */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+ else
+ return -ENOMEM;
spin_lock(&hugetlb_lock);
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
/* Fall through to decrease pool */
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed,
+ node_alloc_noretry);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
+
return 0;
}
if (!page_hcg || page_hcg != h_cg)
goto out;
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
- .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
+ .cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
};
struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
const void *object)
{
- BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
return (void *)object + cache->kasan_info.alloc_meta_offset;
}
return (void *)object + cache->kasan_info.free_meta_offset;
}
+
+static void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
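The CONFIG_KASAN_SW_TAGS_IDENTIFY bookkeeping added above amounts to a small ring of the most recent frees keyed by pointer tag; a later bad access is reported as a use-after-free only when its tag matches a recorded free, and as out-of-bounds otherwise. A hedged userspace analogue follows; all names are illustrative and the "tags" are plain bytes rather than hardware pointer tags.

    #include <stdio.h>
    #include <stdint.h>

    #define NR_FREE_RECORDS 5

    struct free_record {
            uint8_t tag;            /* tag of the pointer that freed the object */
            const char *stack;      /* stand-in for a saved stack trace */
    };

    static struct free_record ring[NR_FREE_RECORDS];
    static unsigned int ring_idx;

    static void record_free(uint8_t tag, const char *stack)
    {
            ring[ring_idx].tag = tag;
            ring[ring_idx].stack = stack;
            ring_idx = (ring_idx + 1) % NR_FREE_RECORDS;
    }

    static const char *classify_access(uint8_t tag)
    {
            for (unsigned int i = 0; i < NR_FREE_RECORDS; i++)
                    if (ring[i].stack && ring[i].tag == tag)
                            return "use-after-free";
            return "out-of-bounds";
    }

    int main(void)
    {
            record_free(0x2a, "kfree+0x10 via some_alloc_path");
            printf("%s\n", classify_access(0x2a)); /* use-after-free */
            printf("%s\n", classify_access(0x17)); /* out-of-bounds */
            return 0;
    }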
void kasan_poison_slab(struct page *page)
{
unsigned long i;
- for (i = 0; i < (1 << compound_order(page)); i++)
+ for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- kasan_poison_shadow(page_address(page),
- PAGE_SIZE << compound_order(page),
+ kasan_poison_shadow(page_address(page), page_size(page),
KASAN_KMALLOC_REDZONE);
}
unlikely(!(cache->flags & SLAB_KASAN)))
return false;
- set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
+ kasan_set_free_info(cache, object, tag);
+
quarantine_put(get_free_info(cache, object), cache);
return IS_ENABLED(CONFIG_KASAN_GENERIC);
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_SHADOW_SCALE_SIZE);
- redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+ redzone_end = (unsigned long)ptr + page_size(page);
kasan_unpoison_shadow(ptr, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
kasan_report_invalid_free(ptr, ip);
return;
}
- kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
- KASAN_FREE_PAGE);
+ kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
} else {
__kasan_slab_free(page->slab_cache, ptr, ip, false);
}
depot_stack_handle_t stack;
};
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#define KASAN_NR_FREE_STACKS 5
+#else
+#define KASAN_NR_FREE_STACKS 1
+#endif
+
struct kasan_alloc_meta {
struct kasan_track alloc_track;
- struct kasan_track free_track;
+ struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
+ u8 free_track_idx;
+#endif
};
struct qlist_node {
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip);
+struct page *kasan_addr_to_page(const void *addr);
+
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
}
}
-static struct page *addr_to_page(const void *addr)
+struct page *kasan_addr_to_page(const void *addr)
{
if ((addr >= (void *)PAGE_OFFSET) &&
(addr < high_memory))
(void *)(object_addr + cache->object_size));
}
+static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
+
static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr)
+ const void *addr, u8 tag)
{
struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
if (cache->flags & SLAB_KASAN) {
+ struct kasan_track *free_track;
+
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
- print_track(&alloc_info->free_track, "Freed");
+ free_track = kasan_get_free_track(cache, object, tag);
+ print_track(free_track, "Freed");
pr_err("\n");
}
print_decoded_frame_descr(frame_descr);
}
-static void print_address_description(void *addr)
+static void print_address_description(void *addr, u8 tag)
{
- struct page *page = addr_to_page(addr);
+ struct page *page = kasan_addr_to_page(addr);
dump_stack();
pr_err("\n");
struct kmem_cache *cache = page->slab_cache;
void *object = nearest_obj(cache, page, addr);
- describe_object(cache, object, addr);
+ describe_object(cache, object, addr, tag);
}
if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
+ u8 tag = get_tag(object);
+ object = reset_tag(object);
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- print_tags(get_tag(object), reset_tag(object));
- object = reset_tag(object);
+ print_tags(tag, object);
pr_err("\n");
- print_address_description(object);
+ print_address_description(object, tag);
pr_err("\n");
print_shadow_for_address(object);
end_report(&flags);
pr_err("\n");
if (addr_has_shadow(untagged_addr)) {
- print_address_description(untagged_addr);
+ print_address_description(untagged_addr, get_tag(tagged_addr));
pr_err("\n");
print_shadow_for_address(info.first_bad_addr);
} else {
const char *get_bug_type(struct kasan_access_info *info)
{
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ struct kasan_alloc_meta *alloc_meta;
+ struct kmem_cache *cache;
+ struct page *page;
+ const void *addr;
+ void *object;
+ u8 tag;
+ int i;
+
+ tag = get_tag(info->access_addr);
+ addr = reset_tag(info->access_addr);
+ page = kasan_addr_to_page(addr);
+ if (page && PageSlab(page)) {
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, (void *)addr);
+ alloc_meta = get_alloc_info(cache, object);
+
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ return "out-of-bounds";
+ }
+
+#endif
return "invalid-access";
}
SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
+ SCAN_PAGE_HAS_PRIVATE,
};
#define CREATE_TRACE_POINTS
static struct kmem_cache *mm_slot_cache __read_mostly;
+#define MAX_PTE_MAPPED_THP 8
+
/**
* struct mm_slot - hash lookup from mm to mm_slot
* @hash: hash collision list
struct hlist_node hash;
struct list_head mm_node;
struct mm_struct *mm;
+
+ /* pte-mapped THP in this mm */
+ int nr_pte_mapped_thp;
+ unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
(vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
- if (shmem_file(vma->vm_file)) {
+
+ if (shmem_file(vma->vm_file) ||
+ (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ vma->vm_file &&
+ (vm_flags & VM_DENYWRITE))) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
unsigned long hstart, hend;
/*
- * khugepaged does not yet work on non-shmem files or special
- * mappings. And file-private shmem THP is not supported.
+ * For non-shmem files, khugepaged only supports read-only files.
+ * khugepaged does not yet work on special mappings, and
+ * file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
}
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then
+ * khugepaged should try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct mm_slot *mm_slot;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so that the THP can refault in
+ * as pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct page *hpage = NULL;
+ pte_t *start_pte, *pte;
+ pmd_t *pmd, _pmd;
+ spinlock_t *ptl;
+ int count = 0;
+ int i;
+
+ if (!vma || !vma->vm_file ||
+ vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+ return;
+
+ /*
+ * This vm_flags may not have VM_HUGEPAGE if the page was not
+ * collapsed by this mm. But we can still collapse if the page is
+ * a valid THP. Add the extra VM_HUGEPAGE so hugepage_vma_check()
+ * will not fail the vma for missing VM_HUGEPAGE.
+ */
+ if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+ return;
+
+ pmd = mm_find_pmd(mm, haddr);
+ if (!pmd)
+ return;
+
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+ /* step 1: check all mapped PTEs are to the right huge page */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ /* empty pte, skip */
+ if (pte_none(*pte))
+ continue;
+
+ /* page swapped out, abort */
+ if (!pte_present(*pte))
+ goto abort;
+
+ page = vm_normal_page(vma, addr, *pte);
+
+ if (!page || !PageCompound(page))
+ goto abort;
+
+ if (!hpage) {
+ hpage = compound_head(page);
+ /*
+ * The mapping of the THP should not change.
+ *
+ * Note that uprobe, debugger, or MAP_PRIVATE may
+ * change the page table, but the new page will
+ * not pass PageCompound() check.
+ */
+ if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+ goto abort;
+ }
+
+ /*
+ * Confirm the page maps to the correct subpage.
+ *
+ * Note that uprobe, debugger, or MAP_PRIVATE may change
+ * the page table, but the new page will not pass
+ * PageCompound() check.
+ */
+ if (WARN_ON(hpage + i != page))
+ goto abort;
+ count++;
+ }
+
+ /* step 2: adjust rmap */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ if (pte_none(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ page_remove_rmap(page, false);
+ }
+
+ pte_unmap_unlock(start_pte, ptl);
+
+ /* step 3: set proper refcount and mm_counters. */
+ if (hpage) {
+ page_ref_sub(hpage, count);
+ add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ }
+
+ /* step 4: collapse pmd */
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ return;
+
+abort:
+ pte_unmap_unlock(start_pte, ptl);
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+ int i;
+
+ if (likely(mm_slot->nr_pte_mapped_thp == 0))
+ return 0;
+
+ if (!down_write_trylock(&mm->mmap_sem))
+ return -EBUSY;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+ mm_slot->nr_pte_mapped_thp = 0;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- /* probably overkill */
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth taking
+ * down_write(mmap_sem) for, as the PMD-mapping is likely to be
+ * split later anyway.
+ *
+ * Note that the vma->anon_vma check is racy: it can be set up
+ * by the fault path after the check but before we take mmap_sem.
+ * But the page lock would prevent establishing any new ptes of
+ * the page, so we are safe.
+ *
+ * An alternative would be to drop the check, but to check that
+ * the page table is clear before calling pmdp_collapse_flush()
+ * under ptl. That has a higher chance of recovering the THP for
+ * the VMA, but also a higher cost.
+ */
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
continue;
/*
* We need exclusive mmap_sem to retract page table.
- * If trylock fails we would end up with pte-mapped THP after
- * re-fault. Not ideal, but it's more important to not disturb
- * the system too much.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_sem while holding page lock. Fault path does it in
+ * reverse order. Trylock is a way to avoid deadlock.
*/
if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
up_write(&vma->vm_mm->mmap_sem);
mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ } else {
+ /* Try again later */
+ khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
}
}
i_mmap_unlock_write(mapping);
}
/**
- * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
- * + swap in pages if necessary;
+ * + swap/gup in pages if necessary;
* + fill in gaps;
* + keep old pages around in case rollback is required;
* - if replacing succeeds:
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_shmem(struct mm_struct *mm,
- struct address_space *mapping, pgoff_t start,
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
struct page **hpage, int node)
{
+ struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
struct mem_cgroup *memcg;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
+ bool is_shmem = shmem_file(file);
+ VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
} while (1);
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (is_shmem)
+ __SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;
struct page *page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
- if (!page) {
- /*
- * Stop if extent has been truncated or hole-punched,
- * and is now completely empty.
- */
- if (index == start) {
- if (!xas_next_entry(&xas, end - 1)) {
- result = SCAN_TRUNCATED;
+ if (is_shmem) {
+ if (!page) {
+ /*
+ * Stop if extent has been truncated or
+ * hole-punched, and is now completely
+ * empty.
+ */
+ if (index == start) {
+ if (!xas_next_entry(&xas, end - 1)) {
+ result = SCAN_TRUNCATED;
+ goto xa_locked;
+ }
+ xas_set(&xas, index);
+ }
+ if (!shmem_charge(mapping->host, 1)) {
+ result = SCAN_FAIL;
goto xa_locked;
}
- xas_set(&xas, index);
+ xas_store(&xas, new_page);
+ nr_none++;
+ continue;
}
- if (!shmem_charge(mapping->host, 1)) {
- result = SCAN_FAIL;
+
+ if (xa_is_value(page) || !PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ /* swap in or instantiate fallocated page */
+ if (shmem_getpage(mapping->host, index, &page,
+ SGP_NOHUGE)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
goto xa_locked;
}
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
- nr_none++;
- continue;
- }
-
- if (xa_is_value(page) || !PageUptodate(page)) {
- xas_unlock_irq(&xas);
- /* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOHUGE)) {
+ } else { /* !is_shmem */
+ if (!page || xa_is_value(page)) {
+ xas_unlock_irq(&xas);
+ page_cache_sync_readahead(mapping, &file->f_ra,
+ file, index,
+ PAGE_SIZE);
+ /* drain pagevecs to help isolate_lru_page() */
+ lru_add_drain();
+ page = find_lock_page(mapping, index);
+ if (unlikely(page == NULL)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (!PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ wait_on_page_locked(page);
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto xa_unlocked;
+ }
+ get_page(page);
+ } else if (PageDirty(page)) {
result = SCAN_FAIL;
- goto xa_unlocked;
+ goto xa_locked;
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
}
- } else if (trylock_page(page)) {
- get_page(page);
- xas_unlock_irq(&xas);
- } else {
- result = SCAN_PAGE_LOCK;
- goto xa_locked;
}
/*
goto out_unlock;
}
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+ goto out_unlock;
+ }
+
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
goto xa_unlocked;
}
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ if (is_shmem)
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ else {
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+ filemap_nr_thps_inc(mapping);
+ }
+
if (nr_none) {
struct zone *zone = page_zone(new_page);
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+ if (is_shmem)
+ __mod_node_page_state(zone->zone_pgdat,
+ NR_SHMEM, nr_none);
}
xa_locked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+
+ if (is_shmem) {
+ set_page_dirty(new_page);
+ lru_cache_add_anon(new_page);
+ } else {
+ lru_cache_add_file(new_page);
+ }
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
- lru_cache_add_anon(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
+
+ if (is_shmem)
+ shmem_uncharge(mapping->host, nr_none);
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
/* TODO: tracepoints */
}
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
struct page *page = NULL;
+ struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
break;
}
- if (page_count(page) != 1 + page_mapcount(page)) {
+ if (page_count(page) !=
+ 1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
- collapse_shmem(mm, mapping, start, hpage, node);
+ collapse_file(mm, file, start, hpage, node);
}
}
/* TODO: tracepoints */
}
#else
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
BUILD_BUG();
}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ return 0;
+}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
+ khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = mm_slot->mm;
/*
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (shmem_file(vma->vm_file)) {
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- if (!shmem_huge_enabled(vma))
+
+ if (shmem_file(vma->vm_file)
+ && !shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
- khugepaged_scan_shmem(mm, file->f_mapping,
- pgoff, hpage);
+ khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
#define OBJECT_REPORTED (1 << 1)
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN (1 << 3)
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
+/* memory pool allocation */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
+static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled;
+static int kmemleak_enabled = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled;
+static int kmemleak_free_enabled = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
-/* enables or disables early logging of the memory operations */
-static int kmemleak_early_log = 1;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static bool kmemleak_verbose;
module_param_named(verbose, kmemleak_verbose, bool, 0600);
-/*
- * Early object allocation/freeing logging. Kmemleak is initialized after the
- * kernel allocator. However, both the kernel allocator and kmemleak may
- * allocate memory blocks which need to be tracked. Kmemleak defines an
- * arbitrary buffer to hold the allocation/freeing information before it is
- * fully initialized.
- */
-
-/* kmemleak operation type for early logging */
-enum {
- KMEMLEAK_ALLOC,
- KMEMLEAK_ALLOC_PERCPU,
- KMEMLEAK_FREE,
- KMEMLEAK_FREE_PART,
- KMEMLEAK_FREE_PERCPU,
- KMEMLEAK_NOT_LEAK,
- KMEMLEAK_IGNORE,
- KMEMLEAK_SCAN_AREA,
- KMEMLEAK_NO_SCAN,
- KMEMLEAK_SET_EXCESS_REF
-};
-
-/*
- * Structure holding the information passed to kmemleak callbacks during the
- * early logging.
- */
-struct early_log {
- int op_type; /* kmemleak operation type */
- int min_count; /* minimum reference count */
- const void *ptr; /* allocated/freed memory block */
- union {
- size_t size; /* memory block size */
- unsigned long excess_ref; /* surplus reference passing */
- };
- unsigned long trace[MAX_TRACE]; /* stack trace */
- unsigned int trace_len; /* stack trace length */
-};
-
-/* early logging buffer and current position */
-static struct early_log
- early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
-static int crt_early_log __initdata;
-
static void kmemleak_disable(void);
/*
return atomic_inc_not_zero(&object->use_count);
}
+/*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ /* try the slab allocator first */
+ if (object_cache) {
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ if (object)
+ return object;
+ }
+
+ /* slab allocation failed, try the memory pool */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ object = list_first_entry_or_null(&mem_pool_free_list,
+ typeof(*object), object_list);
+ if (object)
+ list_del(&object->object_list);
+ else if (mem_pool_free_count)
+ object = &mem_pool[--mem_pool_free_count];
+ else
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+
+ return object;
+}
+
+/*
+ * Return the object to either the slab allocator or the memory pool.
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
+ kmem_cache_free(object_cache, object);
+ return;
+ }
+
+ /* add the object to the memory pool free list */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ list_add(&object->object_list, &mem_pool_free_list);
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
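The allocation strategy above, reduced to a generic pattern: try the normal allocator first, fall back to a fixed static pool, and on free return pool objects to a free list rather than to the heap. The sketch below is a userspace illustration with invented names and no locking; it is not kmemleak code.

    #include <stdlib.h>

    #define POOL_SIZE 64

    struct obj { struct obj *next; int payload; };

    static struct obj pool[POOL_SIZE];
    static struct obj *pool_free_list;
    static int pool_unused = POOL_SIZE;

    static struct obj *obj_alloc(void)
    {
            struct obj *o = malloc(sizeof(*o));     /* try the normal allocator first */
            if (o)
                    return o;
            if (pool_free_list) {                   /* then recycled pool entries */
                    o = pool_free_list;
                    pool_free_list = o->next;
                    return o;
            }
            if (pool_unused)                        /* then never-used pool slots */
                    return &pool[--pool_unused];
            return NULL;                            /* pool exhausted */
    }

    static void obj_free(struct obj *o)
    {
            if (o < pool || o >= pool + POOL_SIZE) { /* came from the heap */
                    free(o);
                    return;
            }
            o->next = pool_free_list;               /* return to the pool free list */
            pool_free_list = o;
    }

    int main(void)
    {
            struct obj *a = obj_alloc();
            obj_free(a);
            return 0;
    }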
/*
* RCU callback to free a kmemleak_object.
*/
hlist_del(&area->node);
kmem_cache_free(scan_area_cache, area);
}
- kmem_cache_free(object_cache, object);
+ mem_pool_free(object);
}
/*
/* should only get here after delete_object was called */
WARN_ON(object->flags & OBJECT_ALLOCATED);
- call_rcu(&object->rcu, free_object_rcu);
+ /*
+ * It may be too early for the RCU callbacks; however, there is no
+ * concurrent object_list traversal when !object_cache and all objects
+ * came from the memory pool. Free the object directly.
+ */
+ if (object_cache)
+ call_rcu(&object->rcu, free_object_rcu);
+ else
+ free_object_rcu(&object->rcu);
}
/*
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
- object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ object = mem_pool_alloc(gfp);
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
/*
* Create one or two objects that may result from the memory block
* split. Note that partial freeing is only done by free_bootmem() and
- * this happens before kmemleak_init() is called. The path below is
- * only executed during early log recording in kmemleak_init(), so
- * GFP_KERNEL is enough.
+ * this happens before kmemleak_init() is called.
*/
start = object->pointer;
end = object->pointer + object->size;
{
unsigned long flags;
struct kmemleak_object *object;
- struct kmemleak_scan_area *area;
+ struct kmemleak_scan_area *area = NULL;
object = find_and_get_object(ptr, 1);
if (!object) {
return;
}
- area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- if (!area) {
- pr_warn("Cannot allocate a scan area\n");
- goto out;
- }
+ if (scan_area_cache)
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
spin_lock_irqsave(&object->lock, flags);
+ if (!area) {
+ pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+ /* mark the object for full scan to avoid false positives */
+ object->flags |= OBJECT_FULL_SCAN;
+ goto out_unlock;
+ }
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
hlist_add_head(&area->node, &object->area_list);
out_unlock:
spin_unlock_irqrestore(&object->lock, flags);
-out:
put_object(object);
}
put_object(object);
}
-/*
- * Log an early kmemleak_* call to the early_log buffer. These calls will be
- * processed later once kmemleak is fully initialized.
- */
-static void __init log_early(int op_type, const void *ptr, size_t size,
- int min_count)
-{
- unsigned long flags;
- struct early_log *log;
-
- if (kmemleak_error) {
- /* kmemleak stopped recording, just count the requests */
- crt_early_log++;
- return;
- }
-
- if (crt_early_log >= ARRAY_SIZE(early_log)) {
- crt_early_log++;
- kmemleak_disable();
- return;
- }
-
- /*
- * There is no need for locking since the kernel is still in UP mode
- * at this stage. Disabling the IRQs is enough.
- */
- local_irq_save(flags);
- log = &early_log[crt_early_log];
- log->op_type = op_type;
- log->ptr = ptr;
- log->size = size;
- log->min_count = min_count;
- log->trace_len = __save_stack_trace(log->trace);
- crt_early_log++;
- local_irq_restore(flags);
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc(struct early_log *log)
-{
- struct kmemleak_object *object;
- unsigned long flags;
- int i;
-
- if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
- return;
-
- /*
- * RCU locking needed to ensure object is not freed via put_object().
- */
- rcu_read_lock();
- object = create_object((unsigned long)log->ptr, log->size,
- log->min_count, GFP_ATOMIC);
- if (!object)
- goto out;
- spin_lock_irqsave(&object->lock, flags);
- for (i = 0; i < log->trace_len; i++)
- object->trace[i] = log->trace[i];
- object->trace_len = log->trace_len;
- spin_unlock_irqrestore(&object->lock, flags);
-out:
- rcu_read_unlock();
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc_percpu(struct early_log *log)
-{
- unsigned int cpu;
- const void __percpu *ptr = log->ptr;
-
- for_each_possible_cpu(cpu) {
- log->ptr = per_cpu_ptr(ptr, cpu);
- early_alloc(log);
- }
-}
-
/**
* kmemleak_alloc - register a newly allocated object
* @ptr: pointer to beginning of the object
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
create_object((unsigned long)area->addr, size, 2, gfp);
object_set_excess_ref((unsigned long)area,
(unsigned long)area->addr);
- } else if (kmemleak_early_log) {
- log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
- /* reusing early_log.size for storing area->addr */
- log_early(KMEMLEAK_SET_EXCESS_REF,
- area, (unsigned long)area->addr, 0);
}
}
EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
- if (hlist_empty(&object->area_list)) {
+ if (hlist_empty(&object->area_list) ||
+ object->flags & OBJECT_FULL_SCAN) {
void *start = (void *)object->pointer;
void *end = (void *)(object->pointer + object->size);
void *next;
/* stop any memory operation tracing */
kmemleak_enabled = 0;
- kmemleak_early_log = 0;
/* check whether it is too early for a kernel thread */
if (kmemleak_initialized)
}
early_param("kmemleak", kmemleak_boot_config);
-static void __init print_log_trace(struct early_log *log)
-{
- pr_notice("Early log backtrace:\n");
- stack_trace_print(log->trace, log->trace_len, 2);
-}
-
/*
* Kmemleak initialization.
*/
void __init kmemleak_init(void)
{
- int i;
- unsigned long flags;
-
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
kmemleak_disable();
}
#endif
+ if (kmemleak_error)
+ return;
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
- crt_early_log);
-
- /* the kernel is still in UP mode, so disabling the IRQs is enough */
- local_irq_save(flags);
- kmemleak_early_log = 0;
- if (kmemleak_error) {
- local_irq_restore(flags);
- return;
- } else {
- kmemleak_enabled = 1;
- kmemleak_free_enabled = 1;
- }
- local_irq_restore(flags);
-
/* register the data/bss sections */
create_object((unsigned long)_sdata, _edata - _sdata,
KMEMLEAK_GREY, GFP_ATOMIC);
create_object((unsigned long)__start_ro_after_init,
__end_ro_after_init - __start_ro_after_init,
KMEMLEAK_GREY, GFP_ATOMIC);
-
- /*
- * This is the point where tracking allocations is safe. Automatic
- * scanning is started during the late initcall. Add the early logged
- * callbacks to the kmemleak infrastructure.
- */
- for (i = 0; i < crt_early_log; i++) {
- struct early_log *log = &early_log[i];
-
- switch (log->op_type) {
- case KMEMLEAK_ALLOC:
- early_alloc(log);
- break;
- case KMEMLEAK_ALLOC_PERCPU:
- early_alloc_percpu(log);
- break;
- case KMEMLEAK_FREE:
- kmemleak_free(log->ptr);
- break;
- case KMEMLEAK_FREE_PART:
- kmemleak_free_part(log->ptr, log->size);
- break;
- case KMEMLEAK_FREE_PERCPU:
- kmemleak_free_percpu(log->ptr);
- break;
- case KMEMLEAK_NOT_LEAK:
- kmemleak_not_leak(log->ptr);
- break;
- case KMEMLEAK_IGNORE:
- kmemleak_ignore(log->ptr);
- break;
- case KMEMLEAK_SCAN_AREA:
- kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
- break;
- case KMEMLEAK_NO_SCAN:
- kmemleak_no_scan(log->ptr);
- break;
- case KMEMLEAK_SET_EXCESS_REF:
- object_set_excess_ref((unsigned long)log->ptr,
- log->excess_ref);
- break;
- default:
- kmemleak_warn("Unknown early log operation: %d\n",
- log->op_type);
- }
-
- if (kmemleak_warning) {
- print_log_trace(log);
- kmemleak_warning = 0;
- }
- }
}
/*
mutex_unlock(&scan_mutex);
}
- pr_info("Kernel memory leak detector initialized\n");
+ pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
+ mem_pool_free_count);
return 0;
}
return checksum;
}
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
}
goto out;
}
error = __split_vma(mm, vma, start, 1);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
if (end != vma->vm_end) {
goto out;
}
error = __split_vma(mm, vma, end, 0);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
success:
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
+
+out_convert_errno:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
out:
return error;
}
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
}
}
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
put_cpu();
mutex_unlock(&percpu_charge_mutex);
reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}
+/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these on either side of the
+ * exponentiation to maintain precision and scale to a reasonable number of
+ * jiffies (see the table below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ * proposed penalty in order to reduce it to a reasonable number of jiffies,
+ * and to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
+ unsigned long usage, high, clamped_high;
+ unsigned long pflags;
+ unsigned long penalty_jiffies, overage;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ *
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ goto out;
+
+ /*
+ * Prevent division by 0 in the overage calculation by acting as if it were
+ * a threshold of 1 page.
+ */
+ clamped_high = max(high, 1UL);
+
+ overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+ clamped_high);
+
+ penalty_jiffies = ((u64)overage * overage * HZ)
+ >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge batch is than that.
+ */
+ penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
}
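The delay table in the comment above can be reproduced directly from the formula: the overage ratio is scaled up by MEMCG_DELAY_PRECISION_SHIFT, squared, multiplied by the jiffy rate, scaled back down by both shifts, and finally clamped to two seconds. Below is a small userspace sketch of that arithmetic; it assumes 4 KiB pages, uses milliseconds in place of jiffies, and omits the per-task scaling by nr_pages / MEMCG_CHARGE_BATCH that the kernel applies afterwards.

    #include <stdio.h>
    #include <stdint.h>

    #define PRECISION_SHIFT 20      /* MEMCG_DELAY_PRECISION_SHIFT */
    #define SCALING_SHIFT   14      /* MEMCG_DELAY_SCALING_SHIFT */
    #define MAX_DELAY_MS    2000    /* the 2 second clamp */

    /* usage and high are in 4 KiB pages; returns the penalty in milliseconds */
    static unsigned long penalty_ms(unsigned long usage, unsigned long high)
    {
            uint64_t overage, penalty;

            if (usage <= high)
                    return 0;
            overage = ((uint64_t)(usage - high) << PRECISION_SHIFT) /
                      (high ? high : 1);
            penalty = (overage * overage * 1000) >>
                      (PRECISION_SHIFT + SCALING_SHIFT);
            return penalty > MAX_DELAY_MS ? MAX_DELAY_MS : (unsigned long)penalty;
    }

    int main(void)
    {
            unsigned long high = 100UL << 8;        /* 100 MiB of 4 KiB pages */
            unsigned long mb;

            /* prints the same values as the table in the comment above */
            for (mb = 100; mb <= 120; mb++)
                    printf("%4luM -> %4lu ms\n", mb, penalty_ms(mb << 8, high));
            return 0;
    }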
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
}
}
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
- mem_cgroup_id_get_many(memcg, 1);
-}
-
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
mem_cgroup_id_put_many(memcg, 1);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
__mod_memcg_state(to, NR_WRITEBACK, nr_pages);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && !list_empty(page_deferred_list(page))) {
+ spin_lock(&from->deferred_split_queue.split_queue_lock);
+ list_del_init(page_deferred_list(page));
+ from->deferred_split_queue.split_queue_len--;
+ spin_unlock(&from->deferred_split_queue.split_queue_lock);
+ }
+#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
/* caller should have done css_get */
page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && list_empty(page_deferred_list(page))) {
+ spin_lock(&to->deferred_split_queue.split_queue_lock);
+ list_add_tail(page_deferred_list(page),
+ &to->deferred_split_queue.split_queue);
+ to->deferred_split_queue.split_queue_len++;
+ spin_unlock(&to->deferred_split_queue.split_queue_lock);
+ }
+#endif
+
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
unsigned int nr_pages = 1;
if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
+ nr_pages = compound_nr(page);
ug->nr_huge += nr_pages;
}
if (PageAnon(page))
}
ug->pgpgout++;
} else {
- ug->nr_kmem += 1 << compound_order(page);
+ ug->nr_kmem += compound_nr(page);
__ClearPageKmemcg(page);
}
xas_for_each(xas, page, ULONG_MAX) {
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas->xa_index);
if (page_count(page) - page_mapcount(page) > 1)
xas_set_mark(xas, MEMFD_TAG_PINNED);
bool clear = true;
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas.xa_index);
if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
if (pte_none(ptent))
continue;
+ if (need_resched())
+ break;
+
if (pte_present(ptent)) {
struct page *page;
if (unlikely(details))
continue;
- entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry))
rss[MM_SWAPENTS]--;
else if (is_migration_entry(entry)) {
if (force_flush) {
force_flush = 0;
tlb_flush_mmu(tlb);
- if (addr != end)
- goto again;
+ }
+
+ if (addr != end) {
+ cond_resched();
+ goto again;
}
return addr;
#endif
}
-static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
-{
- unsigned long end = start + nr_pages;
- int order, onlined_pages = 0;
-
- while (start < end) {
- order = min(MAX_ORDER - 1,
- get_order(PFN_PHYS(end) - PFN_PHYS(start)));
- (*online_page_callback)(pfn_to_page(start), order);
-
- onlined_pages += (1UL << order);
- start += (1UL << order);
- }
- return onlined_pages;
-}
-
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long onlined_pages = *(unsigned long *)arg;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+ int order;
- if (PageReserved(pfn_to_page(start_pfn)))
- onlined_pages += online_pages_blocks(start_pfn, nr_pages);
+ /*
+ * Online the pages. The callback might decide to keep some pages
+ * PG_reserved (to add them to the buddy later), but we still account
+ * them as being online/belonging to this zone ("present").
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
+ order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
+ /* __free_pages_core() wants pfns to be aligned to the order */
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
+ order = 0;
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ }
- online_mem_sections(start_pfn, start_pfn + nr_pages);
+ /* mark all involved sections as online */
+ online_mem_sections(start_pfn, end_pfn);
- *(unsigned long *)arg = onlined_pages;
+ *(unsigned long *)arg += nr_pages;
return 0;
}
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- struct zone *zone;
-
- zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
- move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
- return zone;
-}
-
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
put_device(&mem->dev);
/* associate pfn range with the zone */
- zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ /* not a single memory resource was applicable */
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
goto failed_addition;
shuffle_zone(zone);
- if (onlined_pages) {
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
- }
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ else
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
- if (onlined_pages) {
- kswapd_run(nid);
- kcompactd_run(nid);
- }
+ kswapd_run(nid);
+ kcompactd_run(nid);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
- if (onlined_pages)
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &arg);
mem_hotplug_done();
return 0;
if (!pgdat)
return NULL;
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
arch_refresh_nodedata(nid, pgdat);
} else {
+ int cpu;
/*
* Reset the nr_zones, order and classzone_idx before reuse.
* Note that kswapd will init kswapd_classzone_idx properly
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
}
/* we can use NODE_DATA(nid) from here */
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
- pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
* The node we allocated has no zone fallback lists. For avoiding
head = compound_head(page);
if (page_huge_active(head))
return pfn;
- skip = (1 << compound_order(head)) - (page - head);
+ skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
return 0;
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ endpa = beginpa + memory_block_size_bytes() - 1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
{
/*
- * trigger BUG() is some memory is not offlined prior to calling this
+ * trigger BUG() if some memory is not offlined prior to calling this
* function
*/
if (try_remove_memory(nid, start, size))
if (nodes_empty(*new))
goto out_put;
- nodes_and(*new, *new, node_states[N_MEMORY]);
- if (nodes_empty(*new))
- goto out_put;
-
err = security_task_movememory(task);
if (err)
goto out_put;
for (i = 1; i < HPAGE_PMD_NR; i++) {
xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
}
}
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
if (isolate_lru_page(page))
pte_t pte;
pte = *ptep;
- pfn = pte_pfn(pte);
if (pte_none(pte)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
if (!pte_present(pte)) {
- mpfn = pfn = 0;
+ mpfn = 0;
/*
* Only care about unaddressable device page special
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
/* FIXME support THP */
if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = pfn = 0;
+ mpfn = 0;
goto next;
}
- pfn = page_to_pfn(page);
/*
* By getting a reference on the page we pin it and that blocks
if (S_ISBLK(inode->i_mode))
return MAX_LFS_FILESIZE;
+ if (S_ISSOCK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
/* Special "we do even unsigned file positions" case */
if (file->f_mode & FMODE_UNSIGNED_OFFSET)
return 0;
if (vma) {
*pprev = vma->vm_prev;
} else {
- struct rb_node *rb_node = mm->mm_rb.rb_node;
- *pprev = NULL;
- while (rb_node) {
- *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
- rb_node = rb_node->rb_right;
- }
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
tlb_flush_mmu(tlb);
- /* keep the page table cache within bounds */
- check_pgt_cache();
#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
tlb_batch_list_free(tlb);
#endif
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
/**
/**
 * oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, *oc->nodemask)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
if (cpuset_limited) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, cpuset_current_mems_allowed)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_CPUSET;
}
return CONSTRAINT_NONE;
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- message, task_pid_nr(victim), victim->comm,
- K(victim->mm->total_vm),
- K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)),
- K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ from_kuid(&init_user_ns, task_uid(victim)),
+ mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
task_unlock(victim);
/*
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
void free_compound_page(struct page *page)
{
+ mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page));
}
if (compaction_failed(compact_result))
goto check_priority;
+ /*
+ * compaction was skipped because there are not enough order-0 pages
+ * to work with, so we retry only if it looks like reclaim can help.
+ */
+ if (compaction_needs_reclaim(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
+
/*
* make sure the compaction wasn't deferred or didn't bail out early
* due to locks contention before we declare that we should give up.
- * But do not retry if the given zonelist is not suitable for
- * compaction.
+ * But the next retry should use a higher priority if allowed, so
+ * we don't just keep bailing out endlessly.
*/
if (compaction_withdrawn(compact_result)) {
- ret = compaction_zonelist_suitable(ac, order, alloc_flags);
- goto out;
+ goto check_priority;
}
/*
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
- spin_lock_init(&pgdat->split_queue_lock);
- INIT_LIST_HEAD(&pgdat->split_queue);
- pgdat->split_queue_len = 0;
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- skip_pages = (1 << compound_order(head)) - (page - head);
+ skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
continue;
}
short last_migrate_reason;
gfp_t gfp_mask;
depot_stack_handle_t handle;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ depot_stack_handle_t free_handle;
+#endif
};
static bool page_owner_disabled = true;
return (void *)page_ext + page_owner_ops.offset;
}
-void __reset_page_owner(struct page *page, unsigned int order)
-{
- int i;
- struct page_ext *page_ext;
-
- for (i = 0; i < (1 << order); i++) {
- page_ext = lookup_page_ext(page + i);
- if (unlikely(!page_ext))
- continue;
- __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
- }
-}
-
static inline bool check_recursive_alloc(unsigned long *entries,
unsigned int nr_entries,
unsigned long ip)
return handle;
}
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
- depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
+void __reset_page_owner(struct page *page, unsigned int order)
{
+ int i;
+ struct page_ext *page_ext;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ depot_stack_handle_t handle = 0;
struct page_owner *page_owner;
- page_owner = get_page_owner(page_ext);
- page_owner->handle = handle;
- page_owner->order = order;
- page_owner->gfp_mask = gfp_mask;
- page_owner->last_migrate_reason = -1;
+ if (debug_pagealloc_enabled())
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+#endif
- __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ for (i = 0; i < (1 << order); i++) {
+ page_ext = lookup_page_ext(page + i);
+ if (unlikely(!page_ext))
+ continue;
+ __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (debug_pagealloc_enabled()) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->free_handle = handle;
+ }
+#endif
+ }
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+ struct page_ext *page_ext, depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
+{
+ struct page_owner *page_owner;
+ int i;
+
+	for (i = 0; i < (1 << order); i++) {
+		/* step to each subpage's page_ext; the caller passed in the head page's */
+		if (i)
+			page_ext = lookup_page_ext(page + i);
+
+		page_owner = get_page_owner(page_ext);
+		page_owner->handle = handle;
+		page_owner->order = order;
+		page_owner->gfp_mask = gfp_mask;
+		page_owner->last_migrate_reason = -1;
+		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+		__set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+	}
}
noinline void __set_page_owner(struct page *page, unsigned int order,
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
- for (i = 1; i < (1 << order); i++)
- __copy_page_owner(page, page + i);
+ for (i = 1; i < (1 << order); i++) {
+ page_ext = lookup_page_ext(page + i);
+ page_owner = get_page_owner(page_ext);
+ page_owner->order = 0;
+ }
}
void __copy_page_owner(struct page *oldpage, struct page *newpage)
* the new page, which will be freed.
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (unlikely(!page_ext))
continue;
- if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
continue;
page_owner = get_page_owner(page_ext);
mt = gfpflags_to_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
- pr_alert("page_owner info is not active (free page?)\n");
+ pr_alert("page_owner info is not present (never set?)\n");
return;
}
+ if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ pr_alert("page_owner tracks the page as allocated\n");
+ else
+ pr_alert("page_owner tracks the page as freed\n");
+
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+
handle = READ_ONCE(page_owner->handle);
if (!handle) {
- pr_alert("page_owner info is not active (free page?)\n");
- return;
+ pr_alert("page_owner allocation stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ stack_trace_print(entries, nr_entries, 0);
}
- nr_entries = stack_depot_fetch(handle, &entries);
- pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
- stack_trace_print(entries, nr_entries, 0);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ handle = READ_ONCE(page_owner->free_handle);
+ if (!handle) {
+ pr_alert("page_owner free stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ pr_alert("page last free stack trace:\n");
+ stack_trace_print(entries, nr_entries, 0);
+ }
+#endif
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ /*
+ * Although we do have the info about past allocation of free
+ * pages, it's not relevant for current memory usage.
+ */
+ if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ continue;
+
page_owner = get_page_owner(page_ext);
+ /*
+ * Don't print "tail" pages of high-order allocations as that
+ * would inflate the stats.
+ */
+ if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+ continue;
+
/*
	 * Access to page_ext->handle isn't synchronized, so we
	 * should be careful when accessing it.
continue;
/* Found early allocated page */
- __set_page_owner_handle(page_ext, early_handle, 0, 0);
+ __set_page_owner_handle(page, page_ext, early_handle,
+ 0, 0);
count++;
}
cond_resched();
/*
* Page poisoning when enabled poisons each and every page
* that is freed to buddy. Thus no extra check is done to
- * see if a page was posioned.
+ * see if a page was poisoned.
*/
check_poison_mem(addr, PAGE_SIZE);
kunmap_atomic(addr);
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address,
- PAGE_SIZE << compound_order(page));
+ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
return false;
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Quicklist support.
- *
- * Quicklists are light weight lists of pages that have a defined state
- * on alloc and free. Pages must be in the quicklist specific defined state
- * (zero by default) when the page is freed. It seems that the initial idea
- * for such lists first came from Dave Miller and then various other people
- * improved on it.
- *
- * Copyright (C) 2007 SGI,
- * Christoph Lameter <cl@linux.com>
- * Generalized, added support for multiple lists and
- * constructors / destructors.
- */
-#include <linux/kernel.h>
-
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/quicklist.h>
-
-DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
-
-#define FRACTION_OF_NODE_MEM 16
-
-static unsigned long max_pages(unsigned long min_pages)
-{
- unsigned long node_free_pages, max;
- int node = numa_node_id();
- struct zone *zones = NODE_DATA(node)->node_zones;
- int num_cpus_on_node;
-
- node_free_pages =
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
-
- max = node_free_pages / FRACTION_OF_NODE_MEM;
-
- num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
- max /= num_cpus_on_node;
-
- return max(max, min_pages);
-}
-
-static long min_pages_to_free(struct quicklist *q,
- unsigned long min_pages, long max_free)
-{
- long pages_to_free;
-
- pages_to_free = q->nr_pages - max_pages(min_pages);
-
- return min(pages_to_free, max_free);
-}
-
-/*
- * Trim down the number of pages in the quicklist
- */
-void quicklist_trim(int nr, void (*dtor)(void *),
- unsigned long min_pages, unsigned long max_free)
-{
- long pages_to_free;
- struct quicklist *q;
-
- q = &get_cpu_var(quicklist)[nr];
- if (q->nr_pages > min_pages) {
- pages_to_free = min_pages_to_free(q, min_pages, max_free);
-
- while (pages_to_free > 0) {
- /*
- * We pass a gfp_t of 0 to quicklist_alloc here
- * because we will never call into the page allocator.
- */
- void *p = quicklist_alloc(nr, 0, NULL);
-
- if (dtor)
- dtor(p);
- free_page((unsigned long)p);
- pages_to_free--;
- }
- }
- put_cpu_var(quicklist);
-}
-
-unsigned long quicklist_total_size(void)
-{
- unsigned long count = 0;
- int cpu;
- struct quicklist *ql, *q;
-
- for_each_online_cpu(cpu) {
- ql = per_cpu(quicklist, cpu);
- for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
- count += q->nr_pages;
- }
- return count;
-}
-
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ min(vma->vm_end, address + page_size(page)));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart;
int ret = 0;
- cstart = address = pvmw.address;
+ address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
- cstart &= PMD_MASK;
ret = 1;
#else
/* unexpected pmd-mapped page? */
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address,
- min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ min(vma->vm_end, address + page_size(page)));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
- int nr = 1 << compound_order(page);
- hugetlb_count_sub(nr, mm);
+ hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
- unsigned long nr = 1UL << compound_order(page);
+ unsigned long nr = compound_nr(page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
if (xas_error(&xas))
goto unlock;
next:
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * fault_mm and fault_type are only supplied by shmem_fault:
+ * vmf and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
lru_cache_add_anon(page);
spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
+ info->alloced += compound_nr(page);
inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
struct page *head = compound_head(page);
int i;
- for (i = 0; i < (1 << compound_order(head)); i++) {
+ for (i = 0; i < compound_nr(head); i++) {
clear_highpage(head + i);
flush_dcache_page(head + i);
}
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
if (PageTransHuge(page)) {
unlock_page(page);
struct list_head list; /* List of all slab caches on the system */
};
+#else /* !CONFIG_SLOB */
+
+struct memcg_cache_array {
+ struct rcu_head rcu;
+ struct kmem_cache *entries[0];
+};
+
+/*
+ * This is the main placeholder for memcg-related information in kmem caches.
+ * Both the root cache and the child caches will have it. For the root cache,
+ * this will hold a dynamically allocated array large enough to hold
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
+ *
+ * Root and child caches hold different metadata.
+ *
+ * @root_cache: Common to root and child caches. NULL for root, pointer to
+ * the root cache for children.
+ *
+ * The following fields are specific to root caches.
+ *
+ * @memcg_caches: kmemcg ID indexed table of child caches. This table is
+ *		used to index child caches during allocation and cleared
+ * early during shutdown.
+ *
+ * @root_caches_node: List node for slab_root_caches list.
+ *
+ * @children: List of all child caches. While the child caches are also
+ * reachable through @memcg_caches, a child cache remains on
+ * this list until it is actually destroyed.
+ *
+ * The following fields are specific to child caches.
+ *
+ * @memcg: Pointer to the memcg this cache belongs to.
+ *
+ * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
+ */
+struct memcg_cache_params {
+ struct kmem_cache *root_cache;
+ union {
+ struct {
+ struct memcg_cache_array __rcu *memcg_caches;
+ struct list_head __root_caches_node;
+ struct list_head children;
+ bool dying;
+ };
+ struct {
+ struct mem_cgroup *memcg;
+ struct list_head children_node;
+ struct list_head kmem_caches_node;
+ struct percpu_ref refcnt;
+
+ void (*work_fn)(struct kmem_cache *);
+ union {
+ struct rcu_head rcu_head;
+ struct work_struct work;
+ };
+ };
+ };
+};
#endif /* CONFIG_SLOB */
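To make the table described above concrete, here is a minimal, illustrative sketch of how a root cache's @memcg_caches array is typically consulted, loosely modelled on the cache_from_memcg_idx() helper in mm/slab.h. The function name lookup_child_cache is made up for illustration, and it assumes the usual memcg_params member embedded in struct kmem_cache; it is not part of this series.

static struct kmem_cache *lookup_child_cache(struct kmem_cache *root, int idx)
{
        struct memcg_cache_array *arr;
        struct kmem_cache *c = NULL;

        if (idx < 0)
                return NULL;

        /* the array is RCU-protected; readers take no locks */
        rcu_read_lock();
        arr = rcu_dereference(root->memcg_params.memcg_caches);
        if (arr)
                c = READ_ONCE(arr->entries[idx]);
        rcu_read_unlock();

        return c;
}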
#ifdef CONFIG_SLAB
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
+void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
}
EXPORT_SYMBOL(kmem_cache_shrink);
+/**
+ * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
+ * @s: The cache pointer
+ */
+void kmem_cache_shrink_all(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+
+ if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
+ kmem_cache_shrink(s);
+ return;
+ }
+
+ get_online_cpus();
+ get_online_mems();
+ kasan_cache_shrink(s);
+ __kmem_cache_shrink(s);
+
+ /*
+ * We have to take the slab_mutex to protect from the memcg list
+ * modification.
+ */
+ mutex_lock(&slab_mutex);
+ for_each_memcg_cache(c, s) {
+ /*
+ * Don't need to shrink deactivated memcg caches.
+ */
+ if (s->flags & SLAB_DEACTIVATED)
+ continue;
+ kasan_cache_shrink(c);
+ __kmem_cache_shrink(c);
+ }
+ mutex_unlock(&slab_mutex);
+ put_online_mems();
+ put_online_cpus();
+}
+
bool slab_is_available(void)
{
return slab_state >= UP;
sp = virt_to_page(block);
if (unlikely(!PageSlab(sp)))
- return PAGE_SIZE << compound_order(sp);
+ return page_size(sp);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
return 1;
start = page_address(page);
- length = PAGE_SIZE << compound_order(page);
+ length = page_size(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
init_tracking(s, object);
}
-static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
if (!(s->flags & SLAB_POISON))
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, PAGE_SIZE << order);
+ memset(addr, POISON_INUSE, page_size(page));
metadata_access_disable();
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
-static inline void setup_page_debug(struct kmem_cache *s,
- void *addr, int order) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
- int idx, order;
+ int idx;
bool shuffle;
flags &= gfp_allowed_mask;
page->objects = oo_objects(oo);
- order = compound_order(page);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
start = page_address(page);
- setup_page_debug(s, start, order);
+ setup_page_debug(s, page, start);
shuffle = shuffle_freelist(s, page);
return tid + TID_STEP;
}
+#ifdef SLUB_DEBUG_CMPXCHG
static inline unsigned int tid_to_cpu(unsigned long tid)
{
return tid % TID_STEP;
{
return tid / TID_STEP;
}
+#endif
static inline unsigned int init_tid(int cpu)
{
if (unlikely(!PageSlab(page))) {
WARN_ON(!PageCompound(page));
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
return slab_ksize(page->slab_cache);
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink(s);
+ kmem_cache_shrink_all(s);
else
return -EINVAL;
return length;
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include "internal.h"
#include <asm/dma.h>
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+ WARN_ON(!sparsemap_buf || size == 0);
+ memblock_free_early(__pa(sparsemap_buf), size);
+}
+
static void __init sparse_buffer_init(unsigned long size, int nid)
{
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
unsigned long size = sparsemap_buf_end - sparsemap_buf;
if (sparsemap_buf && size > 0)
- memblock_free_early(__pa(sparsemap_buf), size);
+ sparse_buffer_free(size);
sparsemap_buf = NULL;
}
void *ptr = NULL;
if (sparsemap_buf) {
- ptr = PTR_ALIGN(sparsemap_buf, size);
+ ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
if (ptr + size > sparsemap_buf_end)
ptr = NULL;
- else
+ else {
+ /* Free redundant aligned space */
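+			/* (e.g. 0x1fd000 bytes when a 2 MiB request lands 0x3000 past an aligned boundary) */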
+ if ((unsigned long)(ptr - sparsemap_buf) > 0)
+ sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
sparsemap_buf = ptr + size;
+ }
}
return ptr;
}
*/
page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
- ms = __pfn_to_section(start_pfn);
+ ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
section_mark_present(ms);
{
int i;
- if (!memmap)
- return;
-
/*
* A further optimization is to have per section refcounted
* num_poisoned_pages. But that would need more space per memmap, so
for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
- atomic_long_sub(1, &num_poisoned_pages);
+ num_poisoned_pages_dec();
ClearPageHWPoison(&memmap[i]);
}
}
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
}
__ClearPageWaiters(page);
- mem_cgroup_uncharge(page);
}
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
+ mem_cgroup_uncharge(page);
free_unref_page(page);
}
del_page_from_lru_list(page, lruvec, lru + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
if (PageWriteback(page) || PageDirty(page)) {
/*
		 * It can make readahead confusing, but the race window
		 * is _really_ small and it's a non-critical problem.
*/
+ add_page_to_lru_list(page, lruvec, lru);
SetPageReclaim(page);
} else {
/*
		 * The page's writeback ended while it sat in the pagevec,
		 * so move the page to the tail of the inactive list.
*/
- list_move_tail(&page->lru, &lruvec->lists[lru]);
+ add_page_to_lru_list_tail(page, lruvec, lru);
__count_vm_event(PGROTATED);
}
get_page(page_tail);
list_add_tail(&page_tail->lru, list);
} else {
- struct list_head *list_head;
/*
* Head page has not yet been counted, as an hpage,
* so we must account for each subpage individually.
*
- * Use the standard add function to put page_tail on the list,
- * but then correct its position so they all end up in order.
+ * Put page_tail on the list at the correct position
+ * so they all end up in order.
*/
- add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
- list_head = page_tail->lru.prev;
- list_move_tail(&page_tail->lru, list_head);
+ add_page_to_lru_list_tail(page_tail, lruvec,
+ page_lru(page_tail));
}
if (!PageUnevictable(page))
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = 1UL << compound_order(page);
+ unsigned long i, nr = compound_nr(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
set_page_private(page + i, entry.val + i);
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
xas_next(&xas);
}
address_space->nrpages += nr;
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, NULL);
- VM_BUG_ON_PAGE(entry != page + i, entry);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
+#endif
+
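+/*
+ * With the default STACK_RND_MASK above and 4 KiB pages (PAGE_SHIFT == 12),
+ * randomize_stack_top() below can move the stack top by up to
+ * 0x7ff << 12 bytes, i.e. roughly 8 MiB of virtual address space.
+ */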
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+ unsigned long random_variable = 0;
+
+ if (current->flags & PF_RANDOMIZE) {
+ random_variable = get_random_long();
+ random_variable &= STACK_RND_MASK;
+ random_variable <<= PAGE_SHIFT;
+ }
+#ifdef CONFIG_STACK_GROWSUP
+ return PAGE_ALIGN(stack_top) + random_variable;
+#else
+ return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /* Is the current task 32bit ? */
+ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ if (is_compat_task())
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+ else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+ return rnd << PAGE_SHIFT;
+}
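+/*
+ * For scale: with mmap_rnd_bits == 28 (a common 64-bit default; the actual
+ * value comes from the architecture's Kconfig, not from this patch) and
+ * 4 KiB pages, the value returned above spans up to (1UL << 28) << 12 bytes,
+ * i.e. about 1 TiB of possible mmap base offset.
+ */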
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP (SZ_128M)
+#define MAX_GAP (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+ unsigned long pad = stack_guard_gap;
+
+ /* Account for stack randomization if necessary */
+ if (current->flags & PF_RANDOMIZE)
+ pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+ /* Values close to RLIM_INFINITY can overflow. */
+ if (gap + pad > gap)
+ gap += pad;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
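+/*
+ * Example with illustrative numbers: an 8 MiB stack rlimit plus the guard gap
+ * and randomization pad stays below MIN_GAP, so gap is raised to 128 MiB and
+ * the mmap area starts at PAGE_ALIGN(STACK_TOP - SZ_128M - rnd). An unlimited
+ * stack never reaches this point, since mmap_is_legacy() selects the
+ * bottom-up layout for RLIM_INFINITY.
+ */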
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ if (mmap_is_legacy(rlim_stack)) {
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
out:
return res;
}
+
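+/*
+ * Compare the contents of two pages byte by byte; returns 0 when they are
+ * identical (memcmp() semantics), non-zero otherwise.
+ */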
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
-#define VM_LAZY_FREE 0x02
-#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
va->va_start = addr;
va->va_end = addr + size;
- va->flags = 0;
+ va->vm = NULL;
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
- __free_vmap_area(va);
+ /*
+ * Finally insert or merge lazily-freed area. It is
+ * detached and there is no need to "unlink" it from
+ * anything.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
{
unsigned long nr_lazy;
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
if (WARN_ON_ONCE(!va))
continue;
- va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
- va->flags |= VM_VM_AREA;
spin_unlock(&vmap_area_lock);
}
struct vmap_area *va;
va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA)
- return va->vm;
+ if (!va)
+ return NULL;
- return NULL;
+ return va->vm;
}
/**
might_sleep();
- va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA) {
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr);
+ if (va && va->vm) {
struct vm_struct *vm = va->vm;
- spin_lock(&vmap_area_lock);
va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
kasan_free_shadow(vm);
return vm;
}
+
+ spin_unlock(&vmap_area_lock);
return NULL;
}
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
- area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
- area->pages = pages;
- if (!area->pages) {
+
+ if (!pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
+ area->pages = pages;
+ area->nr_pages = nr_pages;
+
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
}
}
+static void show_purge_info(struct seq_file *m)
+{
+ struct llist_node *head;
+ struct vmap_area *va;
+
+ head = READ_ONCE(vmap_purge_list.first);
+ if (head == NULL)
+ return;
+
+ llist_for_each_entry(va, head, purge_list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ }
+}
+
static int s_show(struct seq_file *m, void *p)
{
struct vmap_area *va;
va = list_entry(p, struct vmap_area, list);
/*
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
- * behalf of vmap area is being tear down or vm_map_ram allocation.
+	 * s_show can race with remove_vm_area(): a NULL ->vm means the vmap
+	 * area is being torn down or is a vm_map_ram allocation.
*/
- if (!(va->flags & VM_VM_AREA)) {
- seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+ if (!va->vm) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start,
- va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+ va->va_end - va->va_start);
return 0;
}
show_numa_info(m, v);
seq_putc(m, '\n');
+
+ /*
+	 * As a final step, dump "unpurged" areas. Note that
+	 * the entire "/proc/vmallocinfo" output will not be
+	 * address sorted, because the purge list is not
+	 * sorted.
+ */
+ if (list_is_last(&va->list, &vmap_area_list))
+ show_purge_info(m);
+
return 0;
}
*/
unsigned long vm_total_pages;
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_MEMCG_KMEM
-
+#ifdef CONFIG_MEMCG
/*
* We allow subsystems to populate their shrinker-related
* LRU lists before register_shrinker_prepared() is called
idr_remove(&shrinker_idr, id);
up_write(&shrinker_rwsem);
}
-#else /* CONFIG_MEMCG_KMEM */
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return 0;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-static void set_task_reclaim_state(struct task_struct *task,
- struct reclaim_state *rs)
-{
- /* Check for an overwrite */
- WARN_ON_ONCE(rs && task->reclaim_state);
-
- /* Check for the nulling of an already-nulled member */
- WARN_ON_ONCE(!rs && !task->reclaim_state);
- task->reclaim_state = rs;
-}
-
-#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
#else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
static bool global_reclaim(struct scan_control *sc)
{
return true;
return freed;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
unsigned long ret, freed = 0;
int i;
- if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ if (!mem_cgroup_online(memcg))
return 0;
if (!down_read_trylock(&shrinker_rwsem))
continue;
}
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_enabled() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
clear_bit(i, map->map);
up_read(&shrinker_rwsem);
return freed;
}
-#else /* CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG */
/**
* shrink_slab - shrink slab caches
VM_BUG_ON_PAGE(PageActive(page), page);
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
/* Account the number of base pages even though THP */
sc->nr_scanned += nr_pages;
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page))) {
- mem_cgroup_uncharge(page);
+ if (unlikely(PageTransHuge(page)))
(*get_compound_page_dtor(page))(page);
- } else
+ else
list_add(&page->lru, &free_pages);
continue;
VM_BUG_ON_PAGE(!PageLRU(page), page);
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&pgdat->lru_lock);
} else
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
- unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
if (!in_reclaim_compaction(sc))
return false;
- /* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
- /*
- * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
- * full LRU list has been scanned and we are still failing
- * to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
- } else {
- /*
- * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
- * fail without consequence, stop if we failed to reclaim
- * any pages from the last SWAP_CLUSTER_MAX number of
- * pages that were scanned. This will return to the
- * caller faster at the risk reclaim/compaction and
- * the resulting allocation attempt fails
- */
- if (!nr_reclaimed)
- return false;
- }
-
/*
- * If we have not reclaimed enough pages for compaction and the
- * inactive lists are large enough, continue reclaiming
+ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+ * number of pages that were scanned. This will return to the caller
+ * with the risk reclaim/compaction and the resulting allocation attempt
+ * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
+ * allocations through requiring that the full LRU list has been scanned
+ * first, by assuming that zero delta of sc->nr_scanned means full LRU
+ * scan, but that approximation was wrong, and there were corner cases
+ * where always a non-zero amount of pages were scanned.
*/
- pages_for_compaction = compact_gap(sc->order);
- inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
- inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
- if (sc->nr_reclaimed < pages_for_compaction &&
- inactive_lru_pages > pages_for_compaction)
- return true;
+ if (!nr_reclaimed)
+ return false;
/* If compaction would go ahead or the allocation would succeed, stop */
for (z = 0; z <= sc->reclaim_idx; z++) {
;
}
}
- return true;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
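+	 * (compact_gap(order) evaluates to 2UL << order, so an order-9 THP-sized
+	 * request keeps reclaiming while the inactive lists exceed 1024 pages,
+	 * i.e. 4 MiB with 4 KiB pages.)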
+ */
+ pages_for_compaction = compact_gap(sc->order);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ return inactive_lru_pages > pages_for_compaction;
}
static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .pgdat = pgdat,
- .priority = sc->priority,
- };
unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ memcg = mem_cgroup_iter(root, NULL, NULL);
do {
unsigned long lru_pages;
unsigned long reclaimed;
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- /*
- * Kswapd have to scan all memory cgroups to fulfill
- * the overall scan target for the node.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!current_is_kswapd() &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc));
+ sc));
/*
* Kswapd gives up on balancing particular nodes after too
"nr_shmem",
"nr_shmem_hugepages",
"nr_shmem_pmdmapped",
+ "nr_file_hugepages",
+ "nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
"nr_unstable",
"nr_vmscan_write",
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/wait.h>
#include <linux/zpool.h>
#include <linux/magic.h>
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
* @inode: inode for z3fold pseudo filesystem
- * @destroying: bool to stop migration once we start destruction
- * @isolated: int to count the number of pages currently in isolation
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
- struct wait_queue_head isolate_wait;
struct work_struct work;
struct inode *inode;
- bool destroying;
- int isolated;
};
/*
}
/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page,
+static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_pool *pool, gfp_t gfp)
{
struct z3fold_header *zhdr = page_address(page);
- struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp);
-
- if (!slots)
- return NULL;
+ struct z3fold_buddy_slots *slots;
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
+ if (headless)
+ return zhdr;
+
+ slots = alloc_slots(pool, gfp);
+ if (!slots)
+ return NULL;
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
*/
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+static unsigned long __encode_handle(struct z3fold_header *zhdr,
+ struct z3fold_buddy_slots *slots,
+ enum buddy bud)
{
- struct z3fold_buddy_slots *slots;
unsigned long h = (unsigned long)zhdr;
int idx = 0;
if (bud == LAST)
h |= (zhdr->last_chunks << BUDDY_SHIFT);
- slots = zhdr->slots;
slots->slot[idx] = h;
return (unsigned long)&slots->slot[idx];
}
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return __encode_handle(zhdr, zhdr->slots, bud);
+}
+
/* Returns the z3fold page where a given handle is stored */
static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
}
if (unlikely(PageIsolated(page) ||
+ test_bit(PAGE_CLAIMED, &page->private) ||
test_bit(PAGE_STALE, &page->private))) {
z3fold_page_unlock(zhdr);
return;
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
- init_waitqueue_head(&pool->isolate_wait);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
if (!pool->unbuddied)
goto out_pool;
return NULL;
}
-static bool pool_isolated_are_drained(struct z3fold_pool *pool)
-{
- bool ret;
-
- spin_lock(&pool->lock);
- ret = pool->isolated == 0;
- spin_unlock(&pool->lock);
- return ret;
-}
/**
* z3fold_destroy_pool() - destroys an existing z3fold pool
* @pool: the z3fold pool to be destroyed
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
kmem_cache_destroy(pool->c_handle);
- /*
- * We set pool-> destroying under lock to ensure that
- * z3fold_page_isolate() sees any changes to destroying. This way we
- * avoid the need for any memory barriers.
- */
-
- spin_lock(&pool->lock);
- pool->destroying = true;
- spin_unlock(&pool->lock);
-
- /*
- * We need to ensure that no pages are being migrated while we destroy
- * these workqueues, as migration can queue work on either of the
- * workqueues.
- */
- wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool));
/*
* We need to destroy pool->compact_wq before pool->release_wq,
if (!page)
return -ENOMEM;
- zhdr = init_z3fold_page(page, pool, gfp);
+ zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
if (!zhdr) {
__free_page(page);
return -ENOMEM;
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
struct list_head *pos;
+ struct z3fold_buddy_slots slots;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
/* this bit could have been set by free, in which case
* we pass over to the next page in the pool.
*/
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ page = NULL;
continue;
+ }
- if (unlikely(PageIsolated(page)))
+ if (unlikely(PageIsolated(page))) {
+ clear_bit(PAGE_CLAIMED, &page->private);
+ page = NULL;
continue;
+ }
+ zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
break;
- zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr)) {
+ clear_bit(PAGE_CLAIMED, &page->private);
zhdr = NULL;
continue; /* can't evict at this point */
}
if (!test_bit(PAGE_HEADLESS, &page->private)) {
/*
- * We need encode the handles before unlocking, since
- * we can race with free that will set
- * (first|last)_chunks to 0
+			 * We need to encode the handles before unlocking, and
+ * use our local slots structure because z3fold_free
+ * can zero out zhdr->slots and we can't do much
+ * about that
*/
first_handle = 0;
last_handle = 0;
middle_handle = 0;
if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
+ first_handle = __encode_handle(zhdr, &slots,
+ FIRST);
if (zhdr->middle_chunks)
- middle_handle = encode_handle(zhdr, MIDDLE);
+ middle_handle = __encode_handle(zhdr, &slots,
+ MIDDLE);
if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
+ last_handle = __encode_handle(zhdr, &slots,
+ LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
*/
z3fold_page_unlock(zhdr);
} else {
- first_handle = encode_handle(zhdr, HEADLESS);
+ first_handle = __encode_handle(zhdr, &slots, HEADLESS);
last_handle = middle_handle = 0;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
} else {
z3fold_page_lock(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
}
		/* We started off locked so we need to lock the pool back */
return atomic64_read(&pool->pages_nr);
}
-/*
- * z3fold_dec_isolated() expects to be called while pool->lock is held.
- */
-static void z3fold_dec_isolated(struct z3fold_pool *pool)
-{
- assert_spin_locked(&pool->lock);
- VM_BUG_ON(pool->isolated <= 0);
- pool->isolated--;
-
- /*
- * If we have no more isolated pages, we have to see if
- * z3fold_destroy_pool() is waiting for a signal.
- */
- if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
- wake_up_all(&pool->isolate_wait);
-}
-
-static void z3fold_inc_isolated(struct z3fold_pool *pool)
-{
- pool->isolated++;
-}
-
static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
struct z3fold_header *zhdr;
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
- if (test_bit(PAGE_HEADLESS, &page->private))
+ if (test_bit(PAGE_HEADLESS, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private))
return false;
zhdr = page_address(page);
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
list_del(&page->lru);
- /*
- * We need to check for destruction while holding pool->lock, as
- * otherwise destruction could see 0 isolated pages, and
- * proceed.
- */
- if (unlikely(pool->destroying)) {
- spin_unlock(&pool->lock);
- /*
- * If this page isn't stale, somebody else holds a
- * reference to it. Let't drop our refcount so that they
- * can call the release logic.
- */
- if (unlikely(kref_put(&zhdr->refcount,
- release_z3fold_page_locked))) {
- /*
- * If we get here we have kref problems, so we
- * should freak out.
- */
- WARN(1, "Z3fold is experiencing kref problems\n");
- z3fold_page_unlock(zhdr);
- return false;
- }
- z3fold_page_unlock(zhdr);
- return false;
- }
-
-
- z3fold_inc_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
return true;
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
- spin_lock(&pool->lock);
- z3fold_dec_isolated(pool);
- spin_unlock(&pool->lock);
-
page_mapcount_reset(page);
put_page(page);
return 0;
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
- spin_lock(&pool->lock);
- z3fold_dec_isolated(pool);
- spin_unlock(&pool->lock);
return;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
- z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
}
return zpool->driver->type;
}
+/**
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns whether the zpool supports allocating movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if the zpool supports allocating movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+ return zpool->driver->malloc_support_movable;
+}
+
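As a rough illustration of the intended use (a sketch only, not part of the patch; example_store, zpool and size are hypothetical names), a caller checks the helper before widening its GFP mask, which is the same pattern the zswap hunk later in this series follows:

	/* sketch: allocate from a zpool, asking for movable pages only when supported */
	static int example_store(struct zpool *zpool, size_t size, unsigned long *handle)
	{
		gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

		/* widen the mask only if the backend can migrate its allocations */
		if (zpool_malloc_support_movable(zpool))
			gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;

		return zpool_malloc(zpool, size, gfp, handle);
	}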
/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
}
static struct zpool_driver zs_zpool_driver = {
- .type = "zsmalloc",
- .owner = THIS_MODULE,
- .create = zs_zpool_create,
- .destroy = zs_zpool_destroy,
- .malloc = zs_zpool_malloc,
- .free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .type = "zsmalloc",
+ .owner = THIS_MODULE,
+ .create = zs_zpool_create,
+ .destroy = zs_zpool_destroy,
+ .malloc_support_movable = true,
+ .malloc = zs_zpool_malloc,
+ .free = zs_zpool_free,
+ .map = zs_zpool_map,
+ .unmap = zs_zpool_unmap,
+ .total_size = zs_zpool_total_size,
};
MODULE_ALIAS("zpool-zsmalloc");
return zspage->inuse;
}
-static inline void set_zspage_inuse(struct zspage *zspage, int val)
-{
- zspage->inuse = val;
-}
static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
/* extract swpentry from data */
zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
swpentry = zhdr->swpentry; /* here */
- zpool_unmap_handle(pool, handle);
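+	/* keep the handle mapped; the compressed data is read in place below and unmapped at the "end" label */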
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
+ zpool_unmap_handle(pool, handle);
return 0;
}
spin_unlock(&tree->lock);
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
- ZPOOL_MM_RO) + sizeof(struct zswap_header);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
dst = kmap_atomic(page);
tfm = *get_cpu_ptr(entry->pool->tfm);
ret = crypto_comp_decompress(tfm, src, entry->length,
dst, &dlen);
put_cpu_ptr(entry->pool->tfm);
kunmap_atomic(dst);
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
spin_unlock(&tree->lock);
end:
+ zpool_unmap_handle(pool, handle);
return ret;
}
char *buf;
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+ gfp_t gfp;
/* THP isn't supported */
if (PageTransHuge(page)) {
/* store */
hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
- __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
- &handle);
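+	/* request highmem/movable pages only when the zpool backend can migrate its allocations */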
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(entry->pool->zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
- unsigned int i;
-
- for (i = 0; i < umem->npgs; i++) {
- struct page *page = umem->pgs[i];
-
- set_page_dirty_lock(page);
- put_page(page);
- }
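+	/* mark the pinned pages dirty and release them in a single call */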
+ put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
kfree(umem->pgs);
umem->pgs = NULL;
/* Matches the smp_wmb() in xsk_init_queue */
smp_rmb();
qpg = virt_to_head_page(q->ring);
- if (size > (PAGE_SIZE << compound_order(qpg)))
+ if (size > page_size(qpg))
return -EINVAL;
pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
datafile_d_y = .$(datafile_y).d
AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)"
+# The clean rules do not have CONFIG_INITRAMFS_COMPRESSION available, so clean
+# up all possible compression formats.
+clean-files += initramfs_data.cpio*
# Generate builtin.o based on initramfs_data.o
obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o