keepinitrd [HW,ARM]
- kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
+ kernelcore= [KNL,X86,IA-64,PPC]
+ Format: nn[KMGTPE] | "mirror"
+ This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations. The requested amount is
spread evenly throughout all nodes in the system. The
remaining memory in each node is used for Movable pages.
Note that allocations like PTEs-from-HighMem still
use the HighMem zone if it exists, and the Normal
zone if it does not.
+ Instead of specifying the amount of memory (nn[KMGTPE]),
+ you can specify the "mirror" option. If "mirror" is
+ specified, mirrored (reliable) memory is used for
+ non-movable allocations and the remaining memory is used
+ for Movable pages. nn[KMGTPE] and "mirror" are mutually
+ exclusive and cannot be specified at the same time.
+
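 For example (illustrative sizes), booting with "kernelcore=512M"
 requests 512 MiB of memory for non-movable kernel allocations,
 while "kernelcore=mirror" uses mirrored (reliable) memory for
 non-movable allocations instead of a fixed amount.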
kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
we can turn it on.
on: enable the feature
+ page_poison= [KNL] Boot-time parameter changing the state of
+ poisoning on the buddy allocator.
+ off: turn off poisoning
+ on: turn on poisoning
+
panic= [KNL] Kernel behaviour on panic: delay <timeout>
timeout > 0: seconds before rebooting
timeout = 0: wait forever
5.2. How to online memory
------------
-Even if the memory is hot-added, it is not at ready-to-use state.
-For using newly added memory, you have to "online" the memory block.
+When the memory is hot-added, the kernel decides whether or not to "online"
+it according to the policy which can be read from the "auto_online_blocks" file:
-For onlining, you have to write "online" to the memory block's state file as:
+% cat /sys/devices/system/memory/auto_online_blocks
+
+The default is "offline", which means the newly added memory is not in a
+ready-to-use state and you have to "online" the newly added memory blocks
+manually. Automatic onlining can be requested by writing "online" to the
+"auto_online_blocks" file:
+
+% echo online > /sys/devices/system/memory/auto_online_blocks
+
+This sets a global policy and impacts all memory blocks that will subsequently
+be hotplugged. Currently offline blocks keep their state. It is possible, under
+certain circumstances, that some memory blocks will be added but will fail to
+online. User space tools can check their "state" files
+(/sys/devices/system/memory/memoryXXX/state) and try to online them manually.
+
+If automatic onlining wasn't requested, failed, or some memory block was
+offlined, it is possible to change an individual block's state by writing to its
+"state" file:
% echo online > /sys/devices/system/memory/memoryXXX/state
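
For instance, user space can walk the memory blocks and online any that are
still offline (a sketch that assumes the standard sysfs layout shown above):

% for i in /sys/devices/system/memory/memory*/state; do \
    [ "`cat "$i"`" = offline ] && echo online > "$i"; done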
Passed by reference.
+Flags bitfields such as page flags, gfp_flags and vma_flags:
+
+ %pGp referenced|uptodate|lru|active|private
+ %pGg GFP_USER|GFP_DMA32|GFP_NOWARN
+ %pGv read|exec|mayread|maywrite|mayexec|denywrite
+
+ For printing flags bitfields as a collection of symbolic constants that
+ would construct the value. The type of flags is given by the third
+ character. Currently supported are [p]age flags, [v]ma_flags (both
+ expect unsigned long *) and [g]fp_flags (expects gfp_t *). The flag
+ names and print order depend on the particular type.
+
+ Note that this format should not be used directly in TP_printk() part
+ of a tracepoint. Instead, use the show_*_flags() functions from
+ <trace/events/mmflags.h>.
+
+ Passed by reference.
+
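+ A minimal usage sketch, assuming a struct page *page, a gfp_t gfp_mask
+ and a struct vm_area_struct *vma are in scope (the flags value is
+ passed by reference, as noted above):
+
+	printk("%pGp\n", &page->flags);
+	printk("%pGg\n", &gfp_mask);
+	printk("%pGv\n", &vma->vm_flags);
+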
Network device features:
%pNF 0x000000000000c000
boot option, the runtime overhead is marginal. If disabled at runtime, it
doesn't require memory to store owner information, so there is no runtime
memory overhead. Page owner inserts just two unlikely branches into
-the page allocator hotpath and if it returns false then allocation is
-done like as the kernel without page owner. These two unlikely branches
-would not affect to allocation performance. Following is the kernel's
-code size change due to this facility.
+the page allocator hotpath and, if not enabled, allocation is done as in a
+kernel without page owner. These two unlikely branches should not affect
+allocation performance, especially if the static keys jump label patching
+functionality is available. Following is the kernel's code size change due
+to this facility.
- Without page owner
text data bss dec hex filename
Enable options only for select slabs
Possible debug options are
- F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry
- SLAB legacy issues)
+ F Sanity checks on (enables SLAB_CONSISTENCY_CHECKS
+ Sorry SLAB legacy issues)
Z Red zoning
P Poisoning (object and padding)
U User tracking (free and alloc)
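
For example, to enable only sanity checks and red zoning for the dentry
cache (an illustrative combination of the options listed above):

	slub_debug=FZ,dentry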
unsigned long);
#define HAVE_ARCH_FB_UNMAPPED_AREA
+#define pgprot_writecombine pgprot_noncached
+
#include <asm-generic/pgtable.h>
#endif /* _BLACKFIN_PGTABLE_H */
void __init zone_sizes_init(void)
{
unsigned long zones_size[MAX_NR_ZONES] = {0, };
- unsigned long max_dma;
- unsigned long low;
unsigned long start_pfn;
#ifdef CONFIG_MMU
- start_pfn = START_PFN(0);
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- low = MAX_LOW_PFN(0);
-
- if (low < max_dma){
- zones_size[ZONE_DMA] = low - start_pfn;
- zones_size[ZONE_NORMAL] = 0;
- } else {
- zones_size[ZONE_DMA] = low - start_pfn;
- zones_size[ZONE_NORMAL] = low - max_dma;
+ {
+ unsigned long low;
+ unsigned long max_dma;
+
+ start_pfn = START_PFN(0);
+ max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ low = MAX_LOW_PFN(0);
+
+ if (low < max_dma) {
+ zones_size[ZONE_DMA] = low - start_pfn;
+ zones_size[ZONE_NORMAL] = 0;
+ } else {
+ zones_size[ZONE_DMA] = low - start_pfn;
+ zones_size[ZONE_NORMAL] = low - max_dma;
+ }
}
#else
zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
#include <linux/export.h>
#include <linux/kdebug.h>
#include <linux/ptrace.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <asm/processor.h>
#ifdef CONFIG_SMP
printk("SMP ");
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
+ if (debug_pagealloc_enabled())
+ printk("DEBUG_PAGEALLOC");
printk("\n");
notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV);
print_modules();
pgd_populate(&init_mm, pg_dir, pu_dir);
}
pu_dir = pud_offset(pg_dir, address);
-#ifndef CONFIG_DEBUG_PAGEALLOC
if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
- !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) {
+ !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
+ !debug_pagealloc_enabled()) {
pud_val(*pu_dir) = __pa(address) |
_REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
(ro ? _REGION_ENTRY_PROTECT : 0);
address += PUD_SIZE;
continue;
}
-#endif
if (pud_none(*pu_dir)) {
pm_dir = vmem_pmd_alloc();
if (!pm_dir)
pud_populate(&init_mm, pu_dir, pm_dir);
}
pm_dir = pmd_offset(pu_dir, address);
-#ifndef CONFIG_DEBUG_PAGEALLOC
if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
- !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) {
+ !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
+ !debug_pagealloc_enabled()) {
pmd_val(*pm_dir) = __pa(address) |
_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
_SEGMENT_ENTRY_YOUNG |
address += PMD_SIZE;
continue;
}
-#endif
if (pmd_none(*pm_dir)) {
pt_dir = vmem_pte_alloc(address);
if (!pt_dir)
#ifdef CONFIG_SMP
printk("SMP ");
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC ");
-#endif
+ if (debug_pagealloc_enabled())
+ printk("DEBUG_PAGEALLOC ");
#ifdef CONFIG_KASAN
printk("KASAN");
#endif
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK)
/*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
+ * use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (cpu_has_pse)
+ if (cpu_has_pse && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
#endif
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
- begin, end - 1);
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- /*
- * We just marked the kernel text read only above, now that
- * we are going to free part of that, we need to make that
- * writeable and non-executable first.
- */
- set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+ if (debug_pagealloc_enabled()) {
+ pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+ } else {
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable and non-executable first.
+ */
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
- free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
-#endif
+ free_reserved_area((void *)begin, (void *)end,
+ POISON_FREE_INITMEM, what);
+ }
}
void free_initmem(void)
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-# define debug_pagealloc 1
-#else
-# define debug_pagealloc 0
-#endif
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
{
struct page *base;
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
return -ENOMEM;
if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
return ret;
kfree(p);
}
+static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct hd_struct *part = dev_to_part(dev);
+
+ add_uevent_var(env, "PARTN=%u", part->partno);
+ if (part->info && part->info->volname[0])
+ add_uevent_var(env, "PARTNAME=%s", part->info->volname);
+ return 0;
+}
+
struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
+ .uevent = part_uevent,
};
static void delete_partition_rcu_cb(struct rcu_head *head)
static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
-static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX],
- acpi_cstate);
+static
+DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate);
static int disabled_by_idle_boot_param(void)
{
return ret;
}
-static int memory_block_change_state(struct memory_block *mem,
+int memory_block_change_state(struct memory_block *mem,
unsigned long to_state, unsigned long from_state_req)
{
int ret = 0;
static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+/*
+ * Memory auto online policy.
+ */
+
+static ssize_t
+show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ if (memhp_auto_online)
+ return sprintf(buf, "online\n");
+ else
+ return sprintf(buf, "offline\n");
+}
+
+static ssize_t
+store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (sysfs_streq(buf, "online"))
+ memhp_auto_online = true;
+ else if (sysfs_streq(buf, "offline"))
+ memhp_auto_online = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
+ store_auto_online_blocks);
+
/*
* Some architectures will have custom drivers to do this, and
* will not need to do it from userspace. The fake hot-add code
#endif
&dev_attr_block_size_bytes.attr,
+ &dev_attr_auto_online_blocks.attr,
NULL
};
*/
#include <linux/types.h>
-static bool verbose = 0;
+static int verbose = 0;
static int major = PD_MAJOR;
static char *name = PD_NAME;
static int cluster = 64;
static DEFINE_MUTEX(pd_mutex);
static DEFINE_SPINLOCK(pd_lock);
-module_param(verbose, bool, 0);
+module_param(verbose, int, 0);
module_param(major, int, 0);
module_param(name, charp, 0);
module_param(cluster, int, 0);
*/
-static bool verbose = 0;
+static int verbose = 0;
static int major = PT_MAJOR;
static char *name = PT_NAME;
static int disable = 0;
#include <asm/uaccess.h>
-module_param(verbose, bool, 0);
+module_param(verbose, int, 0);
module_param(major, int, 0);
module_param(name, charp, 0);
module_param_array(drive0, int, NULL, 0);
Memory could be hotplugged in following steps:
- 1) dom0: xl mem-max <domU> <maxmem>
+ 1) target domain: ensure that memory auto online policy is in
+ effect by checking /sys/devices/system/memory/auto_online_blocks
+ file (should be 'online').
+
+ 2) control domain: xl mem-max <target-domain> <maxmem>
where <maxmem> is >= requested memory size,
- 2) dom0: xl mem-set <domU> <memory>
+ 3) control domain: xl mem-set <target-domain> <memory>
where <memory> is requested memory size; alternatively memory
could be added by writing proper value to
/sys/devices/system/xen_memory/xen_memory0/target or
- /sys/devices/system/xen_memory/xen_memory0/target_kb on dumU,
+ /sys/devices/system/xen_memory/xen_memory0/target_kb on the
+ target domain.
- 3) domU: for i in /sys/devices/system/memory/memory*/state; do \
- [ "`cat "$i"`" = offline ] && echo online > "$i"; done
+ Alternatively, if memory auto onlining was not requested at step 1,
+ the newly added memory can be manually onlined in the target domain
+ by doing the following:
- Memory could be onlined automatically on domU by adding following line to udev rules:
+ for i in /sys/devices/system/memory/memory*/state; do \
+ [ "`cat "$i"`" = offline ] && echo online > "$i"; done
- SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
+ or by adding the following line to udev rules:
- In that case step 3 should be omitted.
+ SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
int "Hotplugged memory limit (in GiB) for a PV guest"
}
#endif
- rc = add_memory_resource(nid, resource);
+ /*
+ * add_memory_resource() will call online_pages() which in its turn
+ * will call xen_online_page() callback causing deadlock if we don't
+ * release balloon_mutex here. Unlocking here is safe because the
+ * callers drop the mutex before trying again.
+ */
+ mutex_unlock(&balloon_mutex);
+ rc = add_memory_resource(nid, resource, memhp_auto_online);
+ mutex_lock(&balloon_mutex);
+
if (rc) {
pr_warn("Cannot add additional memory (%i)\n", rc);
goto err;
/* Find the first set bit in a evtchn mask */
#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
-static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD],
- cpu_evtchn_mask);
+#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)
+
+static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask);
static unsigned evtchn_2l_max_channels(void)
{
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
- * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
/* Internal header file for autofs */
#include <linux/mount.h>
#include <linux/namei.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* #define DEBUG */
-#define DPRINTK(fmt, ...) \
- pr_debug("pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...) \
- printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...) \
- printk(KERN_ERR "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
- inode structures. Each file in the filesystem has an instance of this
- structure. It holds a reference to the dentry, so dentries are never
- flushed while the file exists. All name lookups are dealt with at the
- dentry level, although the filesystem can interfere in the validation
- process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
struct autofs_info {
struct dentry *dentry;
struct inode *inode;
kgid_t gid;
};
-#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
* for expiry, so RCU_walk is
* not permitted
}
/* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
int is_autofs4_dentry(struct dentry *);
int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *,
- struct autofs_packet_expire __user *);
+ struct autofs_sb_info *,
+ struct autofs_packet_expire __user *);
int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int when);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *, int __user *);
+ struct autofs_sb_info *, int __user *);
struct dentry *autofs4_expire_direct(struct super_block *sb,
struct vfsmount *mnt,
struct autofs_sb_info *sbi, int how);
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
}
- return;
}
static inline void autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static inline void autofs4_del_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!list_empty(&ino->expiring))
list_del_init(&ino->expiring);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
extern void autofs4_kill_sb(struct super_block *);
{
int err = 0;
- if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
- (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
- AUTOFS_WARN("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)",
- AUTOFS_DEV_IOCTL_VERSION_MAJOR,
- AUTOFS_DEV_IOCTL_VERSION_MINOR,
- param->ver_major, param->ver_minor, cmd);
+ if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+ (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+ pr_warn("ioctl control interface version mismatch: "
+ "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+ AUTOFS_DEV_IOCTL_VERSION_MINOR,
+ param->ver_major, param->ver_minor, cmd);
err = -EINVAL;
}
* Copy parameter control struct, including a possible path allocated
* at the end of the struct.
*/
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+ copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
{
kfree(param);
- return;
}
/*
err = check_dev_ioctl_version(cmd, param);
if (err) {
- AUTOFS_WARN("invalid device control module version "
- "supplied for cmd(0x%08x)", cmd);
+ pr_warn("invalid device control module version "
+ "supplied for cmd(0x%08x)\n", cmd);
goto out;
}
if (param->size > sizeof(*param)) {
err = invalid_str(param->path, param->size - sizeof(*param));
if (err) {
- AUTOFS_WARN(
- "path string terminator missing for cmd(0x%08x)",
+ pr_warn(
+ "path string terminator missing for cmd(0x%08x)\n",
cmd);
goto out;
}
err = check_name(param->path);
if (err) {
- AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
- cmd);
+ pr_warn("invalid path supplied for cmd(0x%08x)\n",
+ cmd);
goto out;
}
}
void *data)
{
struct path path;
- int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+ int err;
+
+ err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
if (err)
return err;
err = -ENOENT;
static int test_by_type(struct path *path, void *p)
{
struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
return ino && ino->sbi->type & *(unsigned *)p;
}
new_pid = get_task_pid(current, PIDTYPE_PGID);
if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
- AUTOFS_WARN("Not allowed to change PID namespace");
+ pr_warn("not allowed to change PID namespace\n");
err = -EINVAL;
goto out;
}
err = 0;
autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
- param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
- param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+ param->requester.uid =
+ from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid =
+ from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
}
/* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+ struct autofs_dev_ioctl __user *user)
{
struct autofs_dev_ioctl *param;
struct file *fp;
fn = lookup_dev_ioctl(cmd);
if (!fn) {
- AUTOFS_WARN("unknown command 0x%08x", command);
+ pr_warn("unknown command 0x%08x\n", command);
return -ENOTTY;
}
static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
{
int err;
+
err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
return (long) err;
}
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
- .name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .name = AUTOFS_DEVICE_NAME,
+ .fops = &_dev_ioctl_fops
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
r = misc_register(&_autofs_dev_ioctl_misc);
if (r) {
- AUTOFS_ERROR("misc_register failed for control device");
+ pr_err("misc_register failed for control device\n");
return r;
}
void autofs_dev_ioctl_exit(void)
{
misc_deregister(&_autofs_dev_ioctl_misc);
- return;
}
-
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
/* Check if a dentry can be expired */
static inline int autofs4_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+ unsigned long timeout, int do_now)
{
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct path path = {.mnt = mnt, .dentry = dentry};
int status = 1;
- DPRINTK("dentry %p %pd", dentry, dentry);
+ pr_debug("dentry %p %pd\n", dentry, dentry);
path_get(&path);
/* Update the expiry counter if fs is busy */
if (!may_umount_tree(path.mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
}
status = 0;
done:
- DPRINTK("returning = %d", status);
+ pr_debug("returning = %d\n", status);
path_put(&path);
return status;
}
* Calculate and dget next entry in the subdirs list under root.
*/
static struct dentry *get_next_positive_subdir(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
* Calculate and dget next entry in top down tree traversal.
*/
static struct dentry *get_next_positive_dentry(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
* autofs submounts.
*/
static int autofs4_direct_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
if (ino)
ino->last_used = jiffies;
return 1;
return 0;
}
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
* The tree is not busy iff no mountpoints are busy
*/
static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *top_ino = autofs4_dentry_ino(top);
struct dentry *p;
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* Negative dentry - give up */
if (!simple_positive(top))
p = NULL;
while ((p = get_next_positive_dentry(p, top))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
/*
* Is someone visiting anywhere in the subtree ?
{
struct dentry *p;
- DPRINTK("parent %p %pd", parent, parent);
+ pr_debug("parent %p %pd\n", parent, parent);
p = NULL;
while ((p = get_next_positive_dentry(p, parent))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
if (d_mountpoint(p)) {
/* Can we umount this guy */
* offset (autofs-5.0+).
*/
if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+ pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
}
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
- DPRINTK("checking symlink %p %pd", dentry, dentry);
+ pr_debug("checking symlink %p %pd\n", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
} else {
/* Path walk currently on this dentry? */
struct dentry *expired;
+
ino_count = atomic_read(&ino->count) + 1;
if (d_count(dentry) > ino_count)
return NULL;
return NULL;
found:
- DPRINTK("returning %p %pd", expired, expired);
+ pr_debug("returning %p %pd\n", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
smp_mb();
ino->flags &= ~AUTOFS_INF_NO_RCU;
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
- DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+ pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
status = autofs4_wait(sbi, dentry, NFY_NONE);
wait_for_completion(&ino->expire_complete);
- DPRINTK("expire done status=%d", status);
+ pr_debug("expire done status=%d\n", status);
if (d_unhashed(dentry))
return -EAGAIN;
/* Perform an expiry operation */
int autofs4_expire_run(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- struct autofs_packet_expire __user *pkt_p)
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
struct autofs_info *ino;
struct dentry *dentry;
int ret = 0;
- memset(&pkt,0,sizeof pkt);
+ memset(&pkt, 0, sizeof(pkt));
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = autofs_ptype_expire;
- if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+ dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+ if (!dentry)
return -EAGAIN;
pkt.len = dentry->d_name.len;
pkt.name[pkt.len] = '\0';
dput(dentry);
- if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+ if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
spin_lock(&sbi->fs_lock);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
/* This is synchronous because it makes the daemon a
- little easier */
+ * little easier
+ */
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
return ret;
}
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
- more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/module.h>
#include <linux/init.h>
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/kernel.h>
#include <linux/slab.h>
struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
{
- struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+ struct autofs_info *ino;
+
+ ino = kzalloc(sizeof(*ino), GFP_KERNEL);
if (ino) {
INIT_LIST_HEAD(&ino->active);
INIT_LIST_HEAD(&ino->expiring);
put_pid(sbi->oz_pgrp);
}
- DPRINTK("shutting down");
+ pr_debug("shutting down\n");
kill_litter_super(sb);
if (sbi)
kfree_rcu(sbi, rcu);
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (sbi->pipe)
+ seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+ else
+ seq_printf(m, ",pipe_ino=-1");
+#endif
return 0;
}
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
int autofs4_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * root_inode;
- struct dentry * root;
- struct file * pipe;
+ struct inode *root_inode;
+ struct dentry *root;
+ struct file *pipe;
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
- DPRINTK("starting up, sbi = %p",sbi);
+ pr_debug("starting up, sbi = %p\n", sbi);
s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
&pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
&sbi->max_proto)) {
- printk("autofs: called with bogus options\n");
+ pr_err("called with bogus options\n");
goto fail_dput;
}
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
- pr_warn("autofs: could not find process group %d\n",
+ pr_err("could not find process group %d\n",
pgrp);
goto fail_dput;
}
/* Couldn't this be tested earlier? */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- printk("autofs: kernel does not match daemon version "
+ pr_err("kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
goto fail_dput;
}
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+ pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
- printk("autofs: could not open pipe file descriptor\n");
+ pr_err("could not open pipe file descriptor\n");
goto fail_dput;
}
ret = autofs_prepare_pipe(pipe);
*/
s->s_root = root;
return 0;
-
+
/*
* Failure ... clean up.
*/
fail_fput:
- printk("autofs: pipe file descriptor does not contain proper ops\n");
+ pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
/* fall through */
fail_dput:
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/capability.h>
#include <linux/errno.h>
#include "autofs_i.h"
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+ unsigned int, unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+ struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
static void autofs4_add_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!ino->active_count) {
ino->active_count++;
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static void autofs4_del_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
ino->active_count--;
}
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+ pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
struct autofs_info *ino = autofs4_dentry_ino(de);
struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
- DPRINTK("releasing %p", de);
+ pr_debug("releasing %p\n", de);
if (!ino)
return;
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- DPRINTK("waiting for mount name=%pd", dentry);
+ pr_debug("waiting for mount name=%pd\n", dentry);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
- DPRINTK("mount wait done status=%d", status);
+ pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
return status;
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
struct autofs_info *ino;
- struct dentry *new = d_lookup(parent, &dentry->d_name);
+ struct dentry *new;
+
+ new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
ino = autofs4_dentry_ino(new);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never triggers a mount. */
if (autofs4_oz_mode(sbi))
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
* a mount-trap.
*/
struct inode *inode;
+
if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
return 0;
if (d_mountpoint(dentry))
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
struct dentry *active;
- DPRINTK("name = %pd", dentry);
+ pr_debug("name = %pd\n", dentry);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
sbi = autofs4_sbi(dir->i_sb);
- DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
- autofs4_oz_mode(sbi));
+ pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+ current->pid, task_pgrp_nr(current), sbi->catatonic,
+ autofs4_oz_mode(sbi));
active = autofs4_lookup_active(dentry);
- if (active) {
+ if (active)
return active;
- } else {
+ else {
/*
* A dentry that is not within the root can never trigger a
* mount operation, unless the directory already exists, so we
return ERR_PTR(-ENOENT);
/* Mark entries in the root as mount triggers */
- if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+ if (IS_ROOT(dentry->d_parent) &&
+ autofs_type_indirect(sbi->type))
__managed_dentry_set_managed(dentry);
ino = autofs4_new_ino(sbi);
size_t size = strlen(symname);
char *cp;
- DPRINTK("%s <- %pd", symname, dentry);
+ pr_debug("%s <- %pd\n", symname, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
+
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (IS_ROOT(parent->d_parent))
return;
managed_dentry_clear_managed(parent);
- return;
}
static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
if (d_child->next == &parent->d_subdirs &&
d_child->prev == &parent->d_subdirs)
managed_dentry_set_managed(parent);
- return;
}
static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
- DPRINTK("dentry %p, removing %pd", dentry, dentry);
+
+ pr_debug("dentry %p, removing %pd\n", dentry, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int autofs4_dir_mkdir(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
- DPRINTK("dentry %p, creating %pd", dentry, dentry);
+ pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
- compat_ulong_t __user *p)
+ compat_ulong_t __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
+
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > UINT_MAX/HZ)
sbi->exp_timeout = 0;
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
#endif
static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
- unsigned long __user *p)
+ unsigned long __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
+
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > ULONG_MAX/HZ)
sbi->exp_timeout = 0;
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
/* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->version, p);
}
/* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->sub_version, p);
}
if (may_umount(mnt))
status = 1;
- DPRINTK("returning %d", status);
+ pr_debug("returning %d\n", status);
status = put_user(status, p);
}
/* Identify autofs4_dentries - this is so we can tell if there's
- an extra dentry refcount or not. We only hold a refcount on the
- dentry if its non-negative (ie, d_inode != NULL)
-*/
+ * an extra dentry refcount or not. We only hold a refcount on the
+ * dentry if its non-negative (ie, d_inode != NULL)
+ */
int is_autofs4_dentry(struct dentry *dentry)
{
return dentry && d_really_is_positive(dentry) &&
struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
void __user *p = (void __user *)arg;
- DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
- cmd,arg,sbi,task_pgrp_nr(current));
+ pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
+ cmd, arg, sbi, task_pgrp_nr(current));
if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
-
+
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
-
- switch(cmd) {
+
+ switch (cmd) {
case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
autofs4_catatonic_mode(sbi);
return 0;
/* return a single thing to expire */
case AUTOFS_IOC_EXPIRE:
- return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_run(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
/* same as above, but can send multiple expires through pipe */
case AUTOFS_IOC_EXPIRE_MULTI:
- return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_multi(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
default:
- return -ENOSYS;
+ return -EINVAL;
}
}
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg)
+ unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
int ret;
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
else
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
- (unsigned long)compat_ptr(arg));
+ (unsigned long) compat_ptr(arg));
return ret;
}
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+
if (!dentry)
return ERR_PTR(-ECHILD);
sbi = autofs4_sbi(dentry->d_sb);
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/slab.h>
#include <linux/time.h>
#include "autofs_i.h"
/* We make this a static variable rather than a part of the superblock; it
- is better if we don't reassign numbers easily even across filesystems */
+ * is better if we don't reassign numbers easily even across filesystems
+ */
static autofs_wqt_t autofs4_next_wait_queue = 1;
/* These are the signals we allow interrupting a pending mount */
return;
}
- DPRINTK("entering catatonic mode");
+ pr_debug("entering catatonic mode\n");
sbi->catatonic = 1;
wq = sbi->queues;
set_fs(KERNEL_DS);
mutex_lock(&sbi->pipe_mutex);
- while (bytes &&
- (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
+ while (bytes && wr > 0) {
data += wr;
bytes -= wr;
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
/* Keep the currently executing process from receiving a
- SIGPIPE unless it was already supposed to get one */
+ * SIGPIPE unless it was already supposed to get one
+ */
if (wr == -EPIPE && !sigpipe) {
spin_lock_irqsave(¤t->sighand->siglock, flags);
sigdelset(¤t->pending.signal, SIGPIPE);
return (bytes > 0);
}
-
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
struct file *pipe = NULL;
size_t pktsz;
- DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+ pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
+ (unsigned long) wq->wait_queue_token,
+ wq->name.len, wq->name.name, type);
- memset(&pkt,0,sizeof pkt); /* For security reasons */
+ memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
}
case autofs_ptype_expire_multi:
{
- struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+ struct autofs_packet_expire_multi *ep =
+ &pkt.v4_pkt.expire_multi;
pktsz = sizeof(*ep);
break;
}
default:
- printk("autofs4_notify_daemon: bad type %d!\n", type);
+ pr_warn("bad type %d!\n", type);
mutex_unlock(&sbi->wq_mutex);
return;
}
if (wq->name.hash == qstr->hash &&
wq->name.len == qstr->len &&
wq->name.name &&
- !memcmp(wq->name.name, qstr->name, qstr->len))
+ !memcmp(wq->name.name, qstr->name, qstr->len))
break;
}
return wq;
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
struct qstr *qstr,
- struct dentry*dentry, enum autofs_notify notify)
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct autofs_info *ino;
* continue on and create a new request.
*/
if (!IS_ROOT(dentry)) {
- if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+ if (d_unhashed(dentry) &&
+ d_really_is_positive(dentry)) {
struct dentry *parent = dentry->d_parent;
+
new = d_lookup(parent, &dentry->d_name);
if (new)
dentry = new;
return 1;
}
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
- enum autofs_notify notify)
+int autofs4_wait(struct autofs_sb_info *sbi,
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct qstr qstr;
if (!wq) {
/* Create a new wait queue */
- wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
kfree(qstr.name);
mutex_unlock(&sbi->wq_mutex);
autofs_ptype_expire_indirect;
}
- DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
- /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+ /*
+ * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+ */
autofs4_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
- DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
}
*/
if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
- sigset_t oldset;
+ unsigned long shutdown_sigs_mask;
unsigned long irqflags;
+ sigset_t oldset;
spin_lock_irqsave(¤t->sighand->siglock, irqflags);
oldset = current->blocked;
- siginitsetinv(¤t->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]);
+ shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
+ siginitsetinv(¤t->blocked, shutdown_sigs_mask);
recalc_sigpending();
spin_unlock_irqrestore(¤t->sighand->siglock, irqflags);
recalc_sigpending();
spin_unlock_irqrestore(¤t->sighand->siglock, irqflags);
} else {
- DPRINTK("skipped sleeping");
+ pr_debug("skipped sleeping\n");
}
status = wq->status;
return 0;
}
-
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
*
- * The caller must hold mem_cgroup_begin_page_stat() lock.
+ * The caller must hold lock_page_memcg().
*/
static void __set_page_dirty(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, int warn)
+ int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
- struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, memcg, 1);
+ __set_page_dirty(page, mapping, 1);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
- struct mem_cgroup *memcg;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, memcg, 0);
+ __set_page_dirty(page, mapping, 0);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
map_bh.b_state = 0;
map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
- unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
quorum, failed);
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long flags;
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
#define DLM_LOCK_RES_DROPPING_REF 0x00000040
#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+ DLM_DEREF_LOCKRES_DONE = 522,
};
struct dlm_reco_node_data
* };
*
* from ../cluster/tcp.h
- * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
+ * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
* (roughly 4080 bytes)
* and sizeof(dlm_migratable_lockres) = 112 bytes
* and sizeof(dlm_migratable_lock) = 16 bytes
/* from above, 128 bytes
* for some undetermined future use */
-#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
+#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
DLM_MIG_LOCKRES_MAX_LEN)
struct dlm_create_lock
u8 name[O2NM_MAX_NAME_LEN];
};
+enum {
+ DLM_DEREF_RESPONSE_DONE = 0,
+ DLM_DEREF_RESPONSE_INPROG = 1,
+};
+
+struct dlm_deref_lockres_done {
+ u32 pad1;
+ u16 pad2;
+ u8 node_idx;
+ u8 namelen;
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
status = DLM_RECOVERING;
else if (res->state & DLM_LOCK_RES_MIGRATING)
status = DLM_MIGRATING;
void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
{
__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING|
DLM_LOCK_RES_MIGRATING));
}
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ * New in version 1.3:
+ * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
+ * refmap is cleared
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 2,
+ .pv_minor = 3,
};
#define DLM_DOMAIN_BACKOFF_MS 200
unsigned int map_size)
{
int status, tmpstat;
- unsigned int node;
+ int node;
if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
sizeof(unsigned long))) {
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ sizeof(struct dlm_deref_lockres_done),
+ dlm_deref_lockres_done_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
dlm_print_one_lock_resource(res);
BUG();
}
- return ret;
+ return ret ? ret : r;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
- ret = 0;
+ ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
- return 0;
+ return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
return ret;
}
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ struct dlm_deref_lockres_done *deref
+ = (struct dlm_deref_lockres_done *)msg->buf;
+ struct dlm_lock_resource *res = NULL;
+ char *name;
+ unsigned int namelen;
+ int ret = -EINVAL;
+ u8 node;
+ unsigned int hash;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ name = deref->name;
+ namelen = deref->namelen;
+ node = deref->node_idx;
+
+ if (namelen > DLM_LOCKID_NAME_MAX) {
+ mlog(ML_ERROR, "Invalid name length!");
+ goto done;
+ }
+ if (deref->node_idx >= O2NM_MAX_NODES) {
+ mlog(ML_ERROR, "Invalid node number: %u\n", node);
+ goto done;
+ }
+
+ hash = dlm_lockid_hash(name, namelen);
+
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+ if (!res) {
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+ dlm->name, namelen, name);
+ goto done;
+ }
+
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /* lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+
+ dlm_lockres_put(res);
+
+ spin_unlock(&dlm->spinlock);
+
+done:
+ dlm_put(dlm);
+ return ret;
+}
+
+static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, u8 node)
+{
+ struct dlm_deref_lockres_done deref;
+ int ret = 0, r;
+ const char *lockname;
+ unsigned int namelen;
+
+ lockname = res->lockname.name;
+ namelen = res->lockname.len;
+ BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ memset(&deref, 0, sizeof(deref));
+ deref.node_idx = dlm->node_num;
+ deref.namelen = namelen;
+ memcpy(deref.name, lockname, namelen);
+
+ ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ &deref, sizeof(deref), node, &r);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
+ " to node %u\n", dlm->name, namelen,
+ lockname, ret, node);
+ } else if (r < 0) {
+ /* ignore the error */
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, node, r);
+ dlm_print_one_lock_resource(res);
+ }
+}
+
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
}
spin_unlock(&res->spinlock);
+ dlm_drop_lockres_ref_done(dlm, res, node);
+
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
return 0;
/* delay migration when the lockres is in RECOVERING state */
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
if (res->owner != dlm->node_num)
* and RECOVERY flag changed when it completes. */
hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(0, "%s: node is attempting to migrate "
+ "lockres %.*s, but marked as dropping "
+ " ref!\n", dlm->name,
+ mres->lockname_len, mres->lockname);
+ ret = -EINVAL;
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ dlm_lockres_put(res);
+ goto leave;
+ }
+
if (mres->flags & DLM_MRES_RECOVERY) {
res->state |= DLM_LOCK_RES_RECOVERING;
} else {
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, bucket, hash_node) {
+ if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ }
+
if (!(res->state & DLM_LOCK_RES_RECOVERING))
continue;
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
+ res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "%s: res %.*s, Skip "
- "recovery as it is being freed\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- } else
- dlm_move_lockres_to_recovery_list(dlm,
- res);
-
+ mlog(0, "%s:%.*s: owned by "
+ "dead node %u, this node was "
+ "dropping its ref when it died. "
+ "continue, dropping the flag.\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ }
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
/* Another node has this resource with this node as the master */
dlm->purge_count--;
}
+ if (!master && ret != 0) {
+ mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
- DLM_LOCK_RES_RECOVERING)) {
+ DLM_LOCK_RES_RECOVERING |
+ DLM_LOCK_RES_RECOVERY_WAITING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
struct ocfs2_recovery_map *rm = osb->recovery_map;
struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
int i, out = 0;
+ unsigned long flags;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
cconn->cc_version.pv_minor);
}
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
out += snprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
osb->blocked_lock_count, osb->dc_wake_sequence,
osb->dc_work_sequence);
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
loff_t end_offset;
loff_t offset;
int newly_dirty;
- struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
#define _LINUX_AUTO_DEV_IOCTL_H
#include <linux/auto_fs.h>
-
-#ifdef __KERNEL__
#include <linux/string.h>
-#else
-#include <string.h>
-#endif /* __KERNEL__ */
#define AUTOFS_DEVICE_NAME "autofs"
in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
in->size = sizeof(struct autofs_dev_ioctl);
in->ioctlfd = -1;
- return;
}
/*
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * linux/include/linux/auto_fs.h
- *
- * Copyright 1997 Transmeta Corporation - All Rights Reserved
+/*
+ * Copyright 1997 Transmeta Corporation - All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
#ifndef _LINUX_AUTO_FS_H
#define _LINUX_AUTO_FS_H
#endif /* CONFIG_FAULT_INJECTION */
#ifdef CONFIG_FAILSLAB
-extern bool should_failslab(size_t size, gfp_t gfpflags, unsigned long flags);
+extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags);
#else
-static inline bool should_failslab(size_t size, gfp_t gfpflags,
- unsigned long flags)
+static inline bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
return false;
}
struct vm_area_struct;
+/*
+ * In case of changes, please don't forget to update
+ * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
+ */
+
/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA 0x01u
#define ___GFP_HIGHMEM 0x02u
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
-#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */
+#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
void page_alloc_init_late(void);
-#else
-static inline void page_alloc_init_late(void)
-{
-}
-#endif
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
#include <linux/eventfd.h>
#include <linux/mmzone.h>
#include <linux/writeback.h>
+#include <linux/page-flags.h>
struct mem_cgroup;
struct page;
};
#ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT 16
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
struct mem_cgroup_stat_cpu {
long count[MEMCG_NR_STAT];
unsigned long events[MEMCG_NR_EVENTS];
extern struct mem_cgroup *root_mem_cgroup;
+static inline bool mem_cgroup_disabled(void)
+{
+ return !cgroup_subsys_enabled(memory_cgrp_subsys);
+}
+
/**
* mem_cgroup_events - count memory events against a cgroup
* @memcg: the memory cgroup
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id, &memory_cgrp_subsys);
+ return mem_cgroup_from_css(css);
+}
+
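As an editorial aside (not part of the patch): a minimal caller sketch for the new mem_cgroup_from_id() helper, assuming the id was recorded earlier (for example in a swap record) and following the rcu_read_lock()/css_tryget() rule from the comment above:

	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);	/* id stashed earlier by the caller */
	/* the css may be going away; pin it before use, as the comment requires */
	if (memcg && !css_tryget(&memcg->css))
		memcg = NULL;
	rcu_read_unlock();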
/**
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
ino_t page_cgroup_ino(struct page *page);
-static inline bool mem_cgroup_disabled(void)
-{
- return !cgroup_subsys_enabled(memory_cgrp_subsys);
-}
-
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
extern int do_swap_account;
#endif
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
+void lock_page_memcg(struct page *page);
+void unlock_page_memcg(struct page *page);
/**
* mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
+ * @page: the page
* @idx: page state item to account
* @val: number of pages (positive or negative)
*
- * See mem_cgroup_begin_page_stat() for locking requirements.
+ * The @page must be locked or the caller must use lock_page_memcg()
+ * to prevent double accounting when the page is concurrently being
+ * moved to another memcg:
+ *
+ * lock_page(page) or lock_page_memcg(page)
+ * if (TestClearPageState(page))
+ * mem_cgroup_update_page_stat(page, state, -1);
+ * unlock_page(page) or unlock_page_memcg(page)
*/
-static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_stat_index idx, int val)
{
- VM_BUG_ON(!rcu_read_lock_held());
+ VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
+ if (page->mem_cgroup)
+ this_cpu_add(page->mem_cgroup->stat->count[idx], val);
}
-static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
- mem_cgroup_update_page_stat(memcg, idx, 1);
+ mem_cgroup_update_page_stat(page, idx, 1);
}
-static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_dec_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
- mem_cgroup_update_page_stat(memcg, idx, -1);
+ mem_cgroup_update_page_stat(page, idx, -1);
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
#endif
#else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT 0
+#define MEM_CGROUP_ID_MAX 0
+
struct mem_cgroup;
+static inline bool mem_cgroup_disabled(void)
+{
+ return true;
+}
+
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr)
{
}
-static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
+static inline void mem_cgroup_migrate(struct page *old, struct page *new)
{
}
{
}
-static inline bool mem_cgroup_disabled(void)
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
- return true;
+ return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ WARN_ON_ONCE(id);
+ /* XXX: This should always return root_mem_cgroup */
+ return NULL;
}
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
}
-static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+static inline void lock_page_memcg(struct page *page)
{
- return NULL;
}
-static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+static inline void unlock_page_memcg(struct page *page)
{
}
return false;
}
-static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
}
-static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
+static inline void mem_cgroup_dec_page_stat(struct page *page,
enum mem_cgroup_stat_index idx)
{
}
void __memcg_kmem_uncharge(struct page *page, int order);
/*
- * helper for acessing a memcg's index. It will be used as an index in the
+ * helper for accessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
* will return -1 when this is not a kmem-limited memcg.
*/
extern int register_memory_isolate_notifier(struct notifier_block *nb);
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
extern int register_new_memory(int, struct mem_section *);
+extern int memory_block_change_state(struct memory_block *mem,
+ unsigned long to_state,
+ unsigned long from_state_req);
#ifdef CONFIG_MEMORY_HOTREMOVE
extern int unregister_memory_section(struct mem_section *);
#endif
extern int try_online_node(int nid);
+extern bool memhp_auto_online;
+
#ifdef CONFIG_MEMORY_HOTREMOVE
extern bool is_pageblock_removable_nolock(struct page *page);
extern int arch_remove_memory(u64 start, u64 size);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
+extern void set_zone_contiguous(struct zone *zone);
+extern void clear_zone_contiguous(struct zone *zone);
+
#else /* ! CONFIG_MEMORY_HOTPLUG */
/*
* Stub functions for when hotplug is off
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *));
extern int add_memory(int nid, u64 start, u64 size);
-extern int add_memory_resource(int nid, struct resource *resource);
+extern int add_memory_resource(int nid, struct resource *resource, bool online);
extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
bool for_device);
extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
MR_SYSCALL, /* also applies to cpusets */
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
- MR_CMA
+ MR_CMA,
+ MR_TYPES
};
+/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
+extern char *migrate_reason_names[MR_TYPES];
+
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
{
return page->mem_cgroup;
}
-
-static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
-{
- page->mem_cgroup = memcg;
-}
#else
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
-
-static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
-{
-}
#endif
/*
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg);
+void account_page_dirtied(struct page *page, struct address_space *mapping);
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb);
+ struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
void cancel_dirty_page(struct page *page);
unsigned long size, pte_fn_t fn, void *data);
+#ifdef CONFIG_PAGE_POISONING
+extern bool page_poisoning_enabled(void);
+extern void kernel_poison_pages(struct page *page, int numpages, int enable);
+extern bool page_is_poisoned(struct page *page);
+#else
+static inline bool page_poisoning_enabled(void) { return false; }
+static inline void kernel_poison_pages(struct page *page, int numpages,
+ int enable) { }
+static inline bool page_is_poisoned(struct page *page) { return false; }
+#endif
+
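A hedged sketch (editorial, not from the patch) of how the allocator paths are expected to call these hooks, assuming, as in mm/page_poison.c, that enable == 1 means check-and-clear on allocation and enable == 0 means fill on free:

	/* free path: fill the pages with PAGE_POISON */
	if (page_poisoning_enabled())
		kernel_poison_pages(page, 1 << order, 0);

	/* allocation path: verify (unless PAGE_POISONING_NO_SANITY) and clear */
	if (page_poisoning_enabled())
		kernel_poison_pages(page, 1 << order, 1);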
#ifdef CONFIG_DEBUG_PAGEALLOC
extern bool _debug_pagealloc_enabled;
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
}
#ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page);
-#endif /* CONFIG_HIBERNATION */
-#else
+#endif /* CONFIG_HIBERNATION */
+#else /* CONFIG_DEBUG_PAGEALLOC */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {}
#ifdef CONFIG_HIBERNATION
static inline bool kernel_page_present(struct page *page) { return true; }
-#endif /* CONFIG_HIBERNATION */
-#endif
+#endif /* CONFIG_HIBERNATION */
+static inline bool debug_pagealloc_enabled(void)
+{
+ return false;
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
struct mm_struct;
extern void dump_page(struct page *page, const char *reason);
-extern void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags);
+extern void __dump_page(struct page *page, const char *reason);
void dump_vma(const struct vm_area_struct *vma);
void dump_mm(const struct mm_struct *mm);
MIGRATE_TYPES
};
+/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
+extern char * const migratetype_names[MIGRATE_TYPES];
+
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#else
};
struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
+ struct list_head lists[NR_LRU_LISTS];
+ struct zone_reclaim_stat reclaim_stat;
+ /* Evictions & activations on the inactive file list */
+ atomic_long_t inactive_age;
#ifdef CONFIG_MEMCG
- struct zone *zone;
+ struct zone *zone;
#endif
};
spinlock_t lru_lock;
struct lruvec lruvec;
- /* Evictions & activations on the inactive file list */
- atomic_long_t inactive_age;
-
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
bool compact_blockskip_flush;
#endif
+ bool contiguous;
+
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
#endif
}
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+
#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
#else
unsigned int order;
gfp_t gfp_mask;
unsigned int nr_entries;
+ int last_migrate_reason;
unsigned long trace_entries[8];
#endif
};
#ifndef __LINUX_PAGE_OWNER_H
#define __LINUX_PAGE_OWNER_H
+#include <linux/jump_label.h>
+
#ifdef CONFIG_PAGE_OWNER
-extern bool page_owner_inited;
+extern struct static_key_false page_owner_inited;
extern struct page_ext_operations page_owner_ops;
extern void __reset_page_owner(struct page *page, unsigned int order);
extern void __set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask);
extern gfp_t __get_page_owner_gfp(struct page *page);
+extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
+extern void __set_page_owner_migrate_reason(struct page *page, int reason);
+extern void __dump_page_owner(struct page *page);
static inline void reset_page_owner(struct page *page, unsigned int order)
{
- if (likely(!page_owner_inited))
- return;
-
- __reset_page_owner(page, order);
+ if (static_branch_unlikely(&page_owner_inited))
+ __reset_page_owner(page, order);
}
static inline void set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask)
{
- if (likely(!page_owner_inited))
- return;
-
- __set_page_owner(page, order, gfp_mask);
+ if (static_branch_unlikely(&page_owner_inited))
+ __set_page_owner(page, order, gfp_mask);
}
static inline gfp_t get_page_owner_gfp(struct page *page)
{
- if (likely(!page_owner_inited))
+ if (static_branch_unlikely(&page_owner_inited))
+ return __get_page_owner_gfp(page);
+ else
return 0;
-
- return __get_page_owner_gfp(page);
+}
+static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __copy_page_owner(oldpage, newpage);
+}
+static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __set_page_owner_migrate_reason(page, reason);
+}
+static inline void dump_page_owner(struct page *page)
+{
+ if (static_branch_unlikely(&page_owner_inited))
+ __dump_page_owner(page);
}
#else
static inline void reset_page_owner(struct page *page, unsigned int order)
{
return 0;
}
-
+static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+}
+static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+{
+}
+static inline void dump_page_owner(struct page *page)
+{
+}
#endif /* CONFIG_PAGE_OWNER */
#endif /* __LINUX_PAGE_OWNER_H */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg);
+extern void __delete_from_page_cache(struct page *page, void *shadow);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
/********** mm/debug-pagealloc.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
#define PAGE_POISON 0xaa
+#endif
/********** mm/page_alloc.c ************/
* Flags to pass to kmem_cache_create().
* The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
*/
-#define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */
+#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */
#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */
void kmem_cache_free(struct kmem_cache *, void *);
/*
- * Bulk allocation and freeing operations. These are accellerated in an
+ * Bulk allocation and freeing operations. These are accelerated in an
* allocator specific way to avoid taking locks repeatedly or building
* metadata structures unnecessarily.
*
void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+/*
+ * Caller must not use kfree_bulk() on memory not originally allocated
+ * by kmalloc(), because the SLOB allocator cannot handle this.
+ */
+static __always_inline void kfree_bulk(size_t size, void **p)
+{
+ kmem_cache_free_bulk(NULL, size, p);
+}
+
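For illustration only (not part of the patch), a minimal kfree_bulk() usage sketch; all objects must come from kmalloc(), per the comment above:

	void *objs[8];
	size_t i;

	for (i = 0; i < ARRAY_SIZE(objs); i++)
		objs[i] = kmalloc(64, GFP_KERNEL);	/* error handling omitted */

	/* ... use the objects ... */

	kfree_bulk(ARRAY_SIZE(objs), objs);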
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ atomic_t store_user_clean;
+#endif
/*
* If debugging is enabled, then the allocator can add additional
int reserved; /* Reserved bytes at the end of slabs */
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
+ int red_left_pad; /* Left redzone padding size */
#ifdef CONFIG_SYSFS
struct kobject kobj; /* For sysfs */
#endif
struct dentry;
struct bpf_prog;
-struct trace_print_flags {
- unsigned long mask;
- const char *name;
-};
-
-struct trace_print_flags_u64 {
- unsigned long long mask;
- const char *name;
-};
-
const char *trace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
const struct trace_print_flags *flag_array);
/*
* File can be included directly by headers that only want to access
- * tracepoint->key to guard out of line trace calls. Otherwise
- * linux/tracepoint.h should be used.
+ * tracepoint->key to guard out of line trace calls, or the definition of
+ * trace_print_flags{_u64}. Otherwise linux/tracepoint.h should be used.
*/
#include <linux/atomic.h>
#include <linux/static_key.h>
+struct trace_print_flags {
+ unsigned long mask;
+ const char *name;
+};
+
+struct trace_print_flags_u64 {
+ unsigned long long mask;
+ const char *name;
+};
+
struct tracepoint_func {
void *func;
void *data;
#include <linux/writeback.h>
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
struct btrfs_root;
struct btrfs_fs_info;
#include <linux/types.h>
#include <linux/list.h>
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
#define COMPACTION_STATUS \
EM( COMPACT_DEFERRED, "deferred") \
+++ /dev/null
-/*
- * The order of these masks is important. Matching masks will be seen
- * first and the left over flags will end up showing by themselves.
- *
- * For example, if we have GFP_KERNEL before GFP_USER we wil get:
- *
- * GFP_KERNEL|GFP_HARDWALL
- *
- * Thus most bits set go first.
- */
-#define show_gfp_flags(flags) \
- (flags) ? __print_flags(flags, "|", \
- {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
- {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \
- {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
- {(unsigned long)GFP_USER, "GFP_USER"}, \
- {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
- {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
- {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
- {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
- {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
- {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
- {(unsigned long)__GFP_ATOMIC, "GFP_ATOMIC"}, \
- {(unsigned long)__GFP_IO, "GFP_IO"}, \
- {(unsigned long)__GFP_COLD, "GFP_COLD"}, \
- {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
- {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \
- {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \
- {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \
- {(unsigned long)__GFP_COMP, "GFP_COMP"}, \
- {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
- {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
- {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \
- {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
- {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
- {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
- {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
- {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
- {(unsigned long)__GFP_DIRECT_RECLAIM, "GFP_DIRECT_RECLAIM"}, \
- {(unsigned long)__GFP_KSWAPD_RECLAIM, "GFP_KSWAPD_RECLAIM"}, \
- {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
- ) : "GFP_NOWAIT"
-
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
-
#define SCAN_STATUS \
EM( SCAN_FAIL, "failed") \
EM( SCAN_SUCCEED, "succeeded") \
#include <linux/types.h>
#include <linux/tracepoint.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
DECLARE_EVENT_CLASS(kmem_alloc,
--- /dev/null
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we will get:
+ *
+ * GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+
+#define __def_gfpflag_names \
+ {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
+ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\
+ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
+ {(unsigned long)GFP_USER, "GFP_USER"}, \
+ {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
+ {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \
+ {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
+ {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
+ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
+ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
+ {(unsigned long)GFP_NOWAIT, "GFP_NOWAIT"}, \
+ {(unsigned long)GFP_DMA, "GFP_DMA"}, \
+ {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \
+ {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \
+ {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \
+ {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \
+ {(unsigned long)__GFP_IO, "__GFP_IO"}, \
+ {(unsigned long)__GFP_FS, "__GFP_FS"}, \
+ {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \
+ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \
+ {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \
+ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \
+ {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \
+ {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \
+ {(unsigned long)__GFP_ZERO, "__GFP_ZERO"}, \
+ {(unsigned long)__GFP_NOMEMALLOC, "__GFP_NOMEMALLOC"}, \
+ {(unsigned long)__GFP_MEMALLOC, "__GFP_MEMALLOC"}, \
+ {(unsigned long)__GFP_HARDWALL, "__GFP_HARDWALL"}, \
+ {(unsigned long)__GFP_THISNODE, "__GFP_THISNODE"}, \
+ {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \
+ {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \
+ {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \
+ {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \
+ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \
+ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
+ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\
+ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\
+ {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \
+
+#define show_gfp_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_gfpflag_names \
+ ) : "none"
+
+#ifdef CONFIG_MMU
+#define IF_HAVE_PG_MLOCK(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_MLOCK(flag,string)
+#endif
+
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+#define IF_HAVE_PG_UNCACHED(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_UNCACHED(flag,string)
+#endif
+
+#ifdef CONFIG_MEMORY_FAILURE
+#define IF_HAVE_PG_HWPOISON(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_HWPOISON(flag,string)
+#endif
+
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+#define IF_HAVE_PG_IDLE(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_IDLE(flag,string)
+#endif
+
+#define __def_pageflag_names \
+ {1UL << PG_locked, "locked" }, \
+ {1UL << PG_error, "error" }, \
+ {1UL << PG_referenced, "referenced" }, \
+ {1UL << PG_uptodate, "uptodate" }, \
+ {1UL << PG_dirty, "dirty" }, \
+ {1UL << PG_lru, "lru" }, \
+ {1UL << PG_active, "active" }, \
+ {1UL << PG_slab, "slab" }, \
+ {1UL << PG_owner_priv_1, "owner_priv_1" }, \
+ {1UL << PG_arch_1, "arch_1" }, \
+ {1UL << PG_reserved, "reserved" }, \
+ {1UL << PG_private, "private" }, \
+ {1UL << PG_private_2, "private_2" }, \
+ {1UL << PG_writeback, "writeback" }, \
+ {1UL << PG_head, "head" }, \
+ {1UL << PG_swapcache, "swapcache" }, \
+ {1UL << PG_mappedtodisk, "mappedtodisk" }, \
+ {1UL << PG_reclaim, "reclaim" }, \
+ {1UL << PG_swapbacked, "swapbacked" }, \
+ {1UL << PG_unevictable, "unevictable" } \
+IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
+IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
+IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
+IF_HAVE_PG_IDLE(PG_young, "young" ) \
+IF_HAVE_PG_IDLE(PG_idle, "idle" )
+
+#define show_page_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_pageflag_names \
+ ) : "none"
+
+#if defined(CONFIG_X86)
+#define __VM_ARCH_SPECIFIC {VM_PAT, "pat" }
+#elif defined(CONFIG_PPC)
+#define __VM_ARCH_SPECIFIC {VM_SAO, "sao" }
+#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
+#define __VM_ARCH_SPECIFIC {VM_GROWSUP, "growsup" }
+#elif !defined(CONFIG_MMU)
+#define __VM_ARCH_SPECIFIC {VM_MAPPED_COPY,"mappedcopy" }
+#else
+#define __VM_ARCH_SPECIFIC {VM_ARCH_1, "arch_1" }
+#endif
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
+#else
+#define IF_HAVE_VM_SOFTDIRTY(flag,name)
+#endif
+
+#define __def_vmaflag_names \
+ {VM_READ, "read" }, \
+ {VM_WRITE, "write" }, \
+ {VM_EXEC, "exec" }, \
+ {VM_SHARED, "shared" }, \
+ {VM_MAYREAD, "mayread" }, \
+ {VM_MAYWRITE, "maywrite" }, \
+ {VM_MAYEXEC, "mayexec" }, \
+ {VM_MAYSHARE, "mayshare" }, \
+ {VM_GROWSDOWN, "growsdown" }, \
+ {VM_PFNMAP, "pfnmap" }, \
+ {VM_DENYWRITE, "denywrite" }, \
+ {VM_LOCKONFAULT, "lockonfault" }, \
+ {VM_LOCKED, "locked" }, \
+ {VM_IO, "io" }, \
+ {VM_SEQ_READ, "seqread" }, \
+ {VM_RAND_READ, "randread" }, \
+ {VM_DONTCOPY, "dontcopy" }, \
+ {VM_DONTEXPAND, "dontexpand" }, \
+ {VM_ACCOUNT, "account" }, \
+ {VM_NORESERVE, "noreserve" }, \
+ {VM_HUGETLB, "hugetlb" }, \
+ __VM_ARCH_SPECIFIC , \
+ {VM_DONTDUMP, "dontdump" }, \
+IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
+ {VM_MIXEDMAP, "mixedmap" }, \
+ {VM_HUGEPAGE, "hugepage" }, \
+ {VM_NOHUGEPAGE, "nohugepage" }, \
+ {VM_MERGEABLE, "mergeable" } \
+
+#define show_vma_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ __def_vmaflag_names \
+ ) : "none"
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
-#include <trace/events/gfpflags.h>
+#include <trace/events/mmflags.h>
#define RECLAIM_WB_ANON 0x0001u
#define RECLAIM_WB_FILE 0x0002u
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * linux/include/linux/auto_fs.h
- *
+/*
* Copyright 1997 Transmeta Corporation - All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
struct autofs_packet_missing {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_token;
int len;
char name[NAME_MAX+1];
};
char name[NAME_MAX+1];
};
-#define AUTOFS_IOC_READY _IO(0x93,0x60)
-#define AUTOFS_IOC_FAIL _IO(0x93,0x61)
-#define AUTOFS_IOC_CATATONIC _IO(0x93,0x62)
-#define AUTOFS_IOC_PROTOVER _IOR(0x93,0x63,int)
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,compat_ulong_t)
-#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93,0x64,unsigned long)
-#define AUTOFS_IOC_EXPIRE _IOR(0x93,0x65,struct autofs_packet_expire)
+#define AUTOFS_IOC_READY _IO(0x93, 0x60)
+#define AUTOFS_IOC_FAIL _IO(0x93, 0x61)
+#define AUTOFS_IOC_CATATONIC _IO(0x93, 0x62)
+#define AUTOFS_IOC_PROTOVER _IOR(0x93, 0x63, int)
+#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t)
+#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long)
+#define AUTOFS_IOC_EXPIRE _IOR(0x93, 0x65, struct autofs_packet_expire)
#endif /* _UAPI_LINUX_AUTO_FS_H */
-/* -*- c -*-
- * linux/include/linux/auto_fs4.h
- *
+/*
* Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
*
* This file is part of the Linux kernel and is made available under
static inline void set_autofs_type_indirect(unsigned int *type)
{
*type = AUTOFS_TYPE_INDIRECT;
- return;
}
static inline unsigned int autofs_type_indirect(unsigned int type)
static inline void set_autofs_type_direct(unsigned int *type)
{
*type = AUTOFS_TYPE_DIRECT;
- return;
}
static inline unsigned int autofs_type_direct(unsigned int type)
static inline void set_autofs_type_offset(unsigned int *type)
{
*type = AUTOFS_TYPE_OFFSET;
- return;
}
static inline unsigned int autofs_type_offset(unsigned int type)
static inline void set_autofs_type_any(unsigned int *type)
{
*type = AUTOFS_TYPE_ANY;
- return;
}
static inline unsigned int autofs_type_any(unsigned int type)
/* v4 multi expire (via pipe) */
struct autofs_packet_expire_multi {
struct autofs_packet_hdr hdr;
- autofs_wqt_t wait_queue_token;
+ autofs_wqt_t wait_queue_token;
int len;
char name[NAME_MAX+1];
};
autofs_packet_expire_direct_t expire_direct;
};
-#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int)
+#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93, 0x66, int)
#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI
#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int)
-#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93,0x70,int)
-
+#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93, 0x67, int)
+#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93, 0x70, int)
#endif /* _LINUX_AUTO_FS4_H */
Say N unless you really need all symbols.
+config KALLSYMS_ABSOLUTE_PERCPU
+ bool
+ default X86_64 && SMP
+
+config KALLSYMS_BASE_RELATIVE
+ bool
+ depends on KALLSYMS
+ default !IA64 && !(TILE && 64BIT)
+ help
+ Instead of emitting them as absolute values in the native word size,
+ emit the symbol references in the kallsyms table as 32-bit entries,
+ each containing a relative value in the range [base, base + U32_MAX]
+ or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either
+ an absolute value in the range [0, S32_MAX] or a relative value in the
+ range [base, base + S32_MAX], where base is the lowest relative symbol
+ address encountered in the image.
+
+ On 64-bit builds, this reduces the size of the address table by 50%,
+ but more importantly, it results in entries whose values are build
+ time constants, and no relocation pass is required at runtime to fix
+ up the entries based on the runtime load address of the kernel.
+
config PRINTK
default y
bool "Enable support for printk" if EXPERT
static bool __init_or_module initcall_blacklisted(initcall_t fn)
{
- struct list_head *tmp;
struct blacklist_entry *entry;
char *fn_name;
if (!fn_name)
return false;
- list_for_each(tmp, &blacklisted_initcalls) {
- entry = list_entry(tmp, struct blacklist_entry, next);
+ list_for_each_entry(entry, &blacklisted_initcalls, next) {
if (!strcmp(fn_name, entry->buf)) {
pr_debug("initcall %s blacklisted\n", fn_name);
kfree(fn_name);
* during the second link stage.
*/
extern const unsigned long kallsyms_addresses[] __weak;
+extern const int kallsyms_offsets[] __weak;
extern const u8 kallsyms_names[] __weak;
/*
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
+extern const unsigned long kallsyms_relative_base
+__attribute__((weak, section(".rodata")));
+
extern const u8 kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
return name - kallsyms_names;
}
+static unsigned long kallsyms_sym_address(int idx)
+{
+ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
+ return kallsyms_addresses[idx];
+
+ /* values are unsigned offsets if --absolute-percpu is not in effect */
+ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+
+ /* ...otherwise, positive offsets are absolute values */
+ if (kallsyms_offsets[idx] >= 0)
+ return kallsyms_offsets[idx];
+
+ /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
+ return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
+}
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
- return kallsyms_addresses[i];
+ return kallsyms_sym_address(i);
}
return module_kallsyms_lookup_name(name);
}
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+ ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
}
unsigned long i, low, high, mid;
/* This kernel should never have been booted. */
- BUG_ON(!kallsyms_addresses);
+ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
+ BUG_ON(!kallsyms_addresses);
+ else
+ BUG_ON(!kallsyms_offsets);
/* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
while (high - low > 1) {
mid = low + (high - low) / 2;
- if (kallsyms_addresses[mid] <= addr)
+ if (kallsyms_sym_address(mid) <= addr)
low = mid;
else
high = mid;
* Search for the first aliased symbol. Aliased
* symbols are symbols with the same address.
*/
- while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low))
--low;
- symbol_start = kallsyms_addresses[low];
+ symbol_start = kallsyms_sym_address(low);
/* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
- if (kallsyms_addresses[i] > symbol_start) {
- symbol_end = kallsyms_addresses[i];
+ if (kallsyms_sym_address(i) > symbol_start) {
+ symbol_end = kallsyms_sym_address(i);
break;
}
}
unsigned off = iter->nameoff;
iter->module_name[0] = '\0';
- iter->value = kallsyms_addresses[iter->pos];
+ iter->value = kallsyms_sym_address(iter->pos);
iter->type = kallsyms_get_symbol_type(off);
}
#ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
- cpu_lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
static inline u64 lockstat_clock(void)
{
/*
* 'memmap_start' is the virtual address for the first "struct
* page" in this range of the vmemmap array. In the case of
- * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple
+ * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
* pointer arithmetic, so we can perform this to_vmem_altmap()
* conversion without concern for the initialization state of
* the struct page fields.
struct dev_pagemap *pgmap;
/*
- * Uncoditionally retrieve a dev_pagemap associated with the
+ * Unconditionally retrieve a dev_pagemap associated with the
* given physical address, this is only for use in the
* arch_{add|remove}_memory() for setting up and tearing down
* the memmap.
return nohibernate_setup(str);
}
+static int __init page_poison_nohibernate_setup(char *str)
+{
+#ifdef CONFIG_PAGE_POISONING_ZERO
+ /*
+ * The zeroing option for page poison skips the checks on alloc.
+ * Since hibernation doesn't save free pages, there's no way to
+ * guarantee the pages will still be zeroed.
+ */
+ if (!strcmp(str, "on")) {
+ pr_info("Disabling hibernation due to page poisoning\n");
+ return nohibernate_setup(str);
+ }
+#endif
+ return 1;
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
__setup("kaslr", kaslr_nohibernate_setup);
+__setup("page_poison=", page_poison_nohibernate_setup);
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
- rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
- rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
#include <linux/socket.h>
#include <linux/in.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+
#define BUF_SIZE 256
#define PAD_SIZE 16
#define FILL_CHAR '$'
{
}
+static void __init
+flags(void)
+{
+ unsigned long flags;
+ gfp_t gfp;
+ char *cmp_buffer;
+
+ flags = 0;
+ test("", "%pGp", &flags);
+
+ /* Page flags should filter the zone id */
+ flags = 1UL << NR_PAGEFLAGS;
+ test("", "%pGp", &flags);
+
+ flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru
+ | 1UL << PG_active | 1UL << PG_swapbacked;
+ test("uptodate|dirty|lru|active|swapbacked", "%pGp", &flags);
+
+
+ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC
+ | VM_DENYWRITE;
+ test("read|exec|mayread|maywrite|mayexec|denywrite", "%pGv", &flags);
+
+ gfp = GFP_TRANSHUGE;
+ test("GFP_TRANSHUGE", "%pGg", &gfp);
+
+ gfp = GFP_ATOMIC|__GFP_DMA;
+ test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp);
+
+ gfp = __GFP_ATOMIC;
+ test("__GFP_ATOMIC", "%pGg", &gfp);
+
+ cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
+ if (!cmp_buffer)
+ return;
+
+ /* Any flags not translated by the table should remain numeric */
+ gfp = ~__GFP_BITS_MASK;
+ snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp);
+ test(cmp_buffer, "%pGg", &gfp);
+
+ snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx",
+ (unsigned long) gfp);
+ gfp |= __GFP_ATOMIC;
+ test(cmp_buffer, "%pGg", &gfp);
+
+ kfree(cmp_buffer);
+}
+
static void __init
test_pointer(void)
{
struct_clk();
bitmap();
netdev_features();
+ flags();
}
static int __init
#include <linux/blkdev.h>
#endif
+#include "../mm/internal.h" /* For the trace_print_flags arrays */
+
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/sections.h> /* for dereference_function_descriptor() */
#include <asm/byteorder.h> /* cpu_to_le16 */
}
}
+static
+char *format_flags(char *buf, char *end, unsigned long flags,
+ const struct trace_print_flags *names)
+{
+ unsigned long mask;
+ const struct printf_spec strspec = {
+ .field_width = -1,
+ .precision = -1,
+ };
+ const struct printf_spec numspec = {
+ .flags = SPECIAL|SMALL,
+ .field_width = -1,
+ .precision = -1,
+ .base = 16,
+ };
+
+ for ( ; flags && names->name; names++) {
+ mask = names->mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ buf = string(buf, end, names->name, strspec);
+
+ flags &= ~mask;
+ if (flags) {
+ if (buf < end)
+ *buf = '|';
+ buf++;
+ }
+ }
+
+ if (flags)
+ buf = number(buf, end, flags, numspec);
+
+ return buf;
+}
+
+static noinline_for_stack
+char *flags_string(char *buf, char *end, void *flags_ptr, const char *fmt)
+{
+ unsigned long flags;
+ const struct trace_print_flags *names;
+
+ switch (fmt[1]) {
+ case 'p':
+ flags = *(unsigned long *)flags_ptr;
+ /* Remove zone id */
+ flags &= (1UL << NR_PAGEFLAGS) - 1;
+ names = pageflag_names;
+ break;
+ case 'v':
+ flags = *(unsigned long *)flags_ptr;
+ names = vmaflag_names;
+ break;
+ case 'g':
+ flags = *(gfp_t *)flags_ptr;
+ names = gfpflag_names;
+ break;
+ default:
+ WARN_ONCE(1, "Unsupported flags modifier: %c\n", fmt[1]);
+ return buf;
+ }
+
+ return format_flags(buf, end, flags, names);
+}
+
int kptr_restrict __read_mostly;
/*
* - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'Cr' For a clock, it prints the current rate of the clock
+ * - 'G' For flags to be printed as a collection of symbolic strings that would
+ * construct the specific value. Supported flags given by option:
+ * p page flags (see struct page) given as pointer to unsigned long
+ * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
+ * v vma flags (VM_*) given as pointer to unsigned long
*
* ** Please update also Documentation/printk-formats.txt when making changes **
*
return bdev_name(buf, end, ptr, spec, fmt);
#endif
+ case 'G':
+ return flags_string(buf, end, ptr, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
- This results in a large slowdown, but helps to find certain types
- of memory corruption.
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ default n
+ depends on DEBUG_PAGEALLOC
+ ---help---
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
config PAGE_POISONING
- bool
+ bool "Poison pages after freeing"
+ select PAGE_EXTENSION
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ ---help---
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact.
+
+ Note that "poison" here is not the same thing as the "HWPoison"
+ for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+ If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ ---help---
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of random data"
+ depends on PAGE_POISONING
+ ---help---
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+ Enabling page poisoning with this option will disable hibernation.
+
+ If unsure, say N
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
- unsigned long end_pfn, struct zone *zone)
-{
- struct page *start_page;
- struct page *end_page;
-
- /* end_pfn is one past the range we are checking */
- end_pfn--;
-
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
- return NULL;
-
- start_page = pfn_to_page(start_pfn);
-
- if (page_zone(start_page) != zone)
- return NULL;
-
- end_page = pfn_to_page(end_pfn);
-
- /* This gives a shorter code than deriving page_zone(end_page) */
- if (page_zone_id(start_page) != page_zone_id(end_page))
- return NULL;
-
- return start_page;
-}
-
#ifdef CONFIG_COMPACTION
/* Do not skip compaction more than 64 times */
{
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ zone->compact_cached_free_pfn =
+ round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
}
/*
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn;
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
LIST_HEAD(freelist);
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
}
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, block_end_pfn;
+ unsigned long pfn, block_start_pfn, block_end_pfn;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, end_pfn);
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
continue;
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
- unsigned long low_pfn, end_pfn;
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
+ block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < zone->zone_start_pfn)
+ block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
- for (; end_pfn <= cc->free_pfn;
- low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+ for (; block_end_pfn <= cc->free_pfn;
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
&& compact_should_abort(cc))
break;
- page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
if (!page)
continue;
/* Perform the isolation */
isolate_start_pfn = low_pfn;
- low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
- isolate_mode);
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
*/
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
- cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
zone->compact_cached_free_pfn = cc->free_pfn;
}
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+++ /dev/null
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/page_ext.h>
-#include <linux/poison.h>
-#include <linux/ratelimit.h>
-
-static bool page_poisoning_enabled __read_mostly;
-
-static bool need_page_poisoning(void)
-{
- if (!debug_pagealloc_enabled())
- return false;
-
- return true;
-}
-
-static void init_page_poisoning(void)
-{
- if (!debug_pagealloc_enabled())
- return;
-
- page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
- .need = need_page_poisoning,
- .init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline bool page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static void poison_page(struct page *page)
-{
- void *addr = kmap_atomic(page);
-
- set_page_poison(page);
- memset(addr, PAGE_POISON, PAGE_SIZE);
- kunmap_atomic(addr);
-}
-
-static void poison_pages(struct page *page, int n)
-{
- int i;
-
- for (i = 0; i < n; i++)
- poison_page(page + i);
-}
-
-static bool single_bit_flip(unsigned char a, unsigned char b)
-{
- unsigned char error = a ^ b;
-
- return error && !(error & (error - 1));
-}
-
-static void check_poison_mem(unsigned char *mem, size_t bytes)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
- unsigned char *start;
- unsigned char *end;
-
- start = memchr_inv(mem, PAGE_POISON, bytes);
- if (!start)
- return;
-
- for (end = mem + bytes - 1; end > start; end--) {
- if (*end != PAGE_POISON)
- break;
- }
-
- if (!__ratelimit(&ratelimit))
- return;
- else if (start == end && single_bit_flip(*start, PAGE_POISON))
- printk(KERN_ERR "pagealloc: single bit error\n");
- else
- printk(KERN_ERR "pagealloc: memory corruption\n");
-
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
- end - start + 1, 1);
- dump_stack();
-}
-
-static void unpoison_page(struct page *page)
-{
- void *addr;
-
- if (!page_poison(page))
- return;
-
- addr = kmap_atomic(page);
- check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
- kunmap_atomic(addr);
-}
-
-static void unpoison_pages(struct page *page, int n)
-{
- int i;
-
- for (i = 0; i < n; i++)
- unpoison_page(page + i);
-}
-
-void __kernel_map_pages(struct page *page, int numpages, int enable)
-{
- if (!page_poisoning_enabled)
- return;
-
- if (enable)
- unpoison_pages(page, numpages);
- else
- poison_pages(page, numpages);
-}
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
- {1UL << PG_locked, "locked" },
- {1UL << PG_error, "error" },
- {1UL << PG_referenced, "referenced" },
- {1UL << PG_uptodate, "uptodate" },
- {1UL << PG_dirty, "dirty" },
- {1UL << PG_lru, "lru" },
- {1UL << PG_active, "active" },
- {1UL << PG_slab, "slab" },
- {1UL << PG_owner_priv_1, "owner_priv_1" },
- {1UL << PG_arch_1, "arch_1" },
- {1UL << PG_reserved, "reserved" },
- {1UL << PG_private, "private" },
- {1UL << PG_private_2, "private_2" },
- {1UL << PG_writeback, "writeback" },
- {1UL << PG_head, "head" },
- {1UL << PG_swapcache, "swapcache" },
- {1UL << PG_mappedtodisk, "mappedtodisk" },
- {1UL << PG_reclaim, "reclaim" },
- {1UL << PG_swapbacked, "swapbacked" },
- {1UL << PG_unevictable, "unevictable" },
-#ifdef CONFIG_MMU
- {1UL << PG_mlocked, "mlocked" },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- {1UL << PG_uncached, "uncached" },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {1UL << PG_hwpoison, "hwpoison" },
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
- {1UL << PG_young, "young" },
- {1UL << PG_idle, "idle" },
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
};
-static void dump_flags(unsigned long flags,
- const struct trace_print_flags *names, int count)
-{
- const char *delim = "";
- unsigned long mask;
- int i;
-
- pr_emerg("flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
-
- for (i = 0; i < count && flags; i++) {
-
- mask = names[i].mask;
- if ((flags & mask) != mask)
- continue;
-
- flags &= ~mask;
- pr_cont("%s%s", delim, names[i].name);
- delim = "|";
- }
+const struct trace_print_flags pageflag_names[] = {
+ __def_pageflag_names,
+ {0, NULL}
+};
- /* check for left over flags */
- if (flags)
- pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names,
+ {0, NULL}
+};
- pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+ __def_vmaflag_names,
+ {0, NULL}
+};
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
- dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+ pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_flags(page->flags & badflags,
- pageflag_names, ARRAY_SIZE(pageflag_names));
- }
+
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
void dump_page(struct page *page, const char *reason)
{
- dump_page_badflags(page, reason, 0);
+ __dump_page(page, reason);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
-static const struct trace_print_flags vmaflags_names[] = {
- {VM_READ, "read" },
- {VM_WRITE, "write" },
- {VM_EXEC, "exec" },
- {VM_SHARED, "shared" },
- {VM_MAYREAD, "mayread" },
- {VM_MAYWRITE, "maywrite" },
- {VM_MAYEXEC, "mayexec" },
- {VM_MAYSHARE, "mayshare" },
- {VM_GROWSDOWN, "growsdown" },
- {VM_PFNMAP, "pfnmap" },
- {VM_DENYWRITE, "denywrite" },
- {VM_LOCKONFAULT, "lockonfault" },
- {VM_LOCKED, "locked" },
- {VM_IO, "io" },
- {VM_SEQ_READ, "seqread" },
- {VM_RAND_READ, "randread" },
- {VM_DONTCOPY, "dontcopy" },
- {VM_DONTEXPAND, "dontexpand" },
- {VM_ACCOUNT, "account" },
- {VM_NORESERVE, "noreserve" },
- {VM_HUGETLB, "hugetlb" },
-#if defined(CONFIG_X86)
- {VM_PAT, "pat" },
-#elif defined(CONFIG_PPC)
- {VM_SAO, "sao" },
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
- {VM_GROWSUP, "growsup" },
-#elif !defined(CONFIG_MMU)
- {VM_MAPPED_COPY, "mappedcopy" },
-#else
- {VM_ARCH_1, "arch_1" },
-#endif
- {VM_DONTDUMP, "dontdump" },
-#ifdef CONFIG_MEM_SOFT_DIRTY
- {VM_SOFTDIRTY, "softdirty" },
-#endif
- {VM_MIXEDMAP, "mixedmap" },
- {VM_HUGEPAGE, "hugepage" },
- {VM_NOHUGEPAGE, "nohugepage" },
- {VM_MERGEABLE, "mergeable" },
-};
-
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %p start %p end %p\n"
"next %p prev %p mm %p\n"
"prot %lx anon_vma %p vm_ops %p\n"
- "pgoff %lx file %p private_data %p\n",
+ "pgoff %lx file %p private_data %p\n"
+ "flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
- vma->vm_file, vma->vm_private_data);
- dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+ vma->vm_file, vma->vm_private_data,
+ vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
#endif
- "%s", /* This is here to hold the comma */
+ "def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
#ifdef CONFIG_MMU
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm->tlb_flush_pending,
#endif
- "" /* This is here to not have a comma! */
- );
-
- dump_flags(mm->def_flags, vmaflags_names,
- ARRAY_SIZE(vmaflags_names));
+ mm->def_flags, &mm->def_flags
+ );
}
#endif /* CONFIG_DEBUG_VM */
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
static struct {
struct fault_attr attr;
.cache_filter = false,
};
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ /* No fault-injection for bootstrap cache */
+ if (unlikely(s == kmem_cache))
+ return false;
+
if (gfpflags & __GFP_NOFAIL)
return false;
if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
return false;
- if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
- return should_fail(&failslab.attr, size);
+ return should_fail(&failslab.attr, s->object_size);
}
static int __init setup_failslab(char *str)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, memcg,
- inode_to_wb(mapping->host));
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/**
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct mem_cgroup *memcg;
unsigned long flags;
void (*freepage)(struct page *);
freepage = mapping->a_ops->freepage;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
- struct mem_cgroup *memcg;
unsigned long flags;
pgoff_t offset = old->index;
new->mapping = mapping;
new->index = offset;
- memcg = mem_cgroup_begin_page_stat(old);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(old, NULL, memcg);
+ __delete_from_page_cache(old, NULL);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
- mem_cgroup_replace_page(old, new);
+ mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
index, last_index - index);
}
if (!PageUptodate(page)) {
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessary
+ * serialisations and why it's safe.
+ */
+ wait_on_page_locked_killable(page);
+ if (PageUptodate(page))
+ goto page_ok;
+
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
return page;
}
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
/* Presumably ENOMEM for radix tree node */
return ERR_PTR(err);
}
+
+filler:
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
- page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
+ return ERR_PTR(err);
}
- }
- return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-{
- struct page *page;
- int err;
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
-retry:
- page = __read_cache_page(mapping, index, filler, data, gfp);
- if (IS_ERR(page))
- return page;
+ /*
+ * Page is not up to date and may be locked due to one of the following
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if
+ * the page is truncated, the data is still valid if PageUptodate as
+ * it's a read vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
+ /* Distinguish between all the cases under the safety of the lock */
lock_page(page);
+
+ /* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
- goto retry;
+ goto repeat;
}
+
+ /* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
- err = filler(data, page);
- if (err < 0) {
- page_cache_release(page);
- return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- }
+ goto filler;
+
out:
mark_page_accessed(page);
return page;
}
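
The scheme explained in the comment above boils down to: wait for the filler to finish without taking the page lock, and only lock when the outcome is ambiguous. A rough userspace analogy (hypothetical names; C11 atomics stand in for the uptodate bit and a mutex for the page lock), not the kernel code:

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static atomic_bool uptodate;			/* analogue of PageUptodate */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static int data;

static void *filler(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&page_lock);
	data = 42;				/* the "read" filling the page */
	atomic_store(&uptodate, true);
	pthread_mutex_unlock(&page_lock);
	return NULL;
}

static int reader(void)
{
	/* Optimistic path: wait for completion without touching the lock. */
	while (!atomic_load(&uptodate))
		sched_yield();

	/*
	 * Only the failure/truncation cases would need page_lock here to
	 * tell them apart; a successful fill needs no extra serialisation.
	 */
	return data;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, filler, NULL);
	printf("%d\n", reader());
	pthread_join(&t, NULL);
	return 0;
}

The point of the analogy is the one the comment makes: serialising every waiter on the lock adds wakeups and contention without adding any guarantee that checking the completion flag does not already provide.
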
}
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
- int mapcount;
struct page *page_tail = head + tail;
- mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is implemented in C not
* using locked ops. spin_unlock on x86 sometime uses locked ops
* because of PPro errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_add().
+ * it's safer to use atomic_inc().
*/
- atomic_add(mapcount + 1, &page_tail->_count);
-
+ atomic_inc(&page_tail->_count);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
lru_add_page_tail(head, page_tail, lruvec, list);
-
- return mapcount;
}
static void __split_huge_page(struct page *page, struct list_head *list)
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- int i, tail_mapcount;
+ int i;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- tail_mapcount = 0;
for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
- tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
- atomic_sub(tail_mapcount, &head->_count);
+ __split_huge_page_tail(head, i, lruvec, list);
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
/*
* The set of flags that only affect watermark checking and reclaim
return page_idx ^ (1 << order);
}
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ if (zone->contiguous)
+ return pfn_to_page(start_pfn);
+
+ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
#endif /* __MM_INTERNAL_H */
void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
size_t size)
{
+ if (unlikely(!object)) /* Skip object if allocation failed */
+ return;
+
/*
* Has already been memset(), which initializes the shadow for us
* as well.
}
pr_info("Injecting memory failure for page %#lx at %#lx\n",
page_to_pfn(p), start);
- /* Ignore return value for now */
- memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ if (ret)
+ return ret;
}
return 0;
}
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_FREE - the application marks pages in the given range as lazy free,
+ * where actual purges are postponed until memory pressure happens.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_HWPOISON - trigger memory error handler as if the given memory range
+ * were corrupted by unrecoverable hardware memory failure.
+ * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ * MADV_HUGEPAGE - the application wants to back the given range by transparent
+ * huge pages in the future. Existing pages might be coalesced and
+ * new pages might be allocated as THP.
+ * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ * transparent huge pages so the existing pages will not be
+ * coalesced into THP and new pages will not be allocated as THP.
+ * MADV_DONTDUMP - the application wants to prevent pages in the given range
+ * from being included in its core dump.
+ * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
*
* return values:
* zero - success
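
A hedged userspace illustration of the lazy-free hint documented above (mapping size is arbitrary; MADV_FREE needs kernel 4.5+ and matching libc headers, so the sketch falls back to MADV_DONTNEED when the constant is absent):

#define _DEFAULT_SOURCE
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	size_t len = 1 << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xab, len);			/* use the memory */

#ifdef MADV_FREE
	/* Lazy: pages are only reclaimed under memory pressure; writing
	 * to them again before that simply cancels the hint. */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");
#else
	/* Eager: contents are dropped now; the next touch faults in
	 * fresh zeroed pages. */
	if (madvise(buf, len, MADV_DONTNEED))
		perror("madvise(MADV_DONTNEED)");
#endif

	munmap(buf, len);
	return 0;
}
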
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.memory;
-
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.reserved;
-
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
}
/**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
*
- * memcg = mem_cgroup_begin_page_stat(page);
- * if (TestClearPageState(page))
- * mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page from being uncharged.
- * E.g. end-writeback clearing PageWriteback(), which allows
- * migration to go ahead and uncharge the page before the
- * account transaction might be complete.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return NULL;
+ return;
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
/*
* When charge migration first begins, we can have locked and
* unlocked page stat updates happening concurrently. Track
- * the task who has the lock for mem_cgroup_end_page_stat().
+ * the task who has the lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return memcg;
+ return;
}
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
/**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
VM_BUG_ON(compound && !PageTransHuge(page));
/*
- * Prevent mem_cgroup_replace_page() from looking at
+ * Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
*/
ret = -EBUSY;
lru_add_drain_all();
/*
- * Signal mem_cgroup_begin_page_stat() to take the memcg's
- * move_lock while we're moving its pages to another memcg.
- * Then wait for already started RCU-only updates to finish.
+ * Signal lock_page_memcg() to take the memcg's move_lock
+ * while we're moving its pages to another memcg. Then wait
+ * for already started RCU-only updates to finish.
*/
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
}
/**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
*
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
*
* Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
*/
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, true);
+ commit_charge(newpage, memcg, false);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
#undef lru
#undef swapbacked
#undef head
-#undef tail
-#undef compound
#undef slab
#undef reserved
unsigned long end = addr + size;
int err;
- BUG_ON(addr >= end);
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
- pgoff_t pgoff = (((address & PAGE_MASK)
- - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ pgoff_t pgoff = linear_page_index(vma, address);
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+bool memhp_auto_online;
+EXPORT_SYMBOL_GPL(memhp_auto_online);
+
void get_online_mems(void)
{
might_sleep();
int start_sec, end_sec;
struct vmem_altmap *altmap;
+ clear_zone_contiguous(zone);
+
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
if (altmap->base_pfn != phys_start_pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
altmap->alloc = 0;
}
err = 0;
}
vmemmap_populate_print_last();
-
+out:
+ set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
}
}
+ clear_zone_contiguous(zone);
+
/*
* We can only remove entire sections
*/
if (ret)
break;
}
+
+ set_zone_contiguous(zone);
+
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
return zone_default;
}
+static int online_memory_block(struct memory_block *mem, void *arg)
+{
+ return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
pg_data_t *pgdat = NULL;
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ /* online pages if requested */
+ if (online)
+ walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
+ NULL, online_memory_block);
+
goto out;
error:
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, memhp_auto_online);
if (ret < 0)
release_memory_resource(res);
return ret;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (!is_vm_hugetlb_page(vma) &&
+ (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ !(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <asm/tlbflush.h>
return -EAGAIN;
/* No turning back from here */
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
* Now we know that no one else is looking at the page:
* no turning back from here.
*/
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
return -EAGAIN;
}
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
+
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
+
return MIGRATEPAGE_SUCCESS;
}
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
+
+ mem_cgroup_migrate(page, newpage);
}
/************************************************************
* page is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- set_page_memcg(page, NULL);
if (!PageAnon(page))
page->mapping = NULL;
}
}
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
put_new_page = NULL;
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
}
mlock_migrate_page(new_page, page);
- set_page_memcg(new_page, page_memcg(page));
- set_page_memcg(page, NULL);
page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
+ "oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
+
cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
+ unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
- step >>= dirty_ratelimit / (2 * step + 1);
- /*
- * Limit the tracking speed to avoid overshooting.
- */
- step = (step + 7) / 8;
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
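
The rewrite above exists because the old `step >>= dirty_ratelimit / (2 * step + 1)` can compute a shift count of BITS_PER_LONG or more, and shifting an unsigned long by its full width (or more) is undefined behaviour in C. A minimal sketch of the same guard, with an illustrative helper name rather than the kernel code:

#include <limits.h>

#define BITS_PER_ULONG (sizeof(unsigned long) * CHAR_BIT)

/*
 * Scale 'step' down by 2^shift and then by 8, rounding up, without ever
 * performing an out-of-range shift (e.g. x >> 64 on a 64-bit long).
 */
unsigned long scaled_step(unsigned long step, unsigned long shift)
{
	if (shift >= BITS_PER_ULONG)
		return 0;
	return ((step >> shift) + 7) / 8;	/* DIV_ROUND_UP(step >> shift, 8) */
}
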
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg)
+void account_page_dirtied(struct page *page, struct address_space *mapping)
{
struct inode *inode = mapping->host;
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb)
+ struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
if (!mapping) {
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 1;
}
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
}
return 1;
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, memcg, wb);
+ account_page_cleaned(page, mapping, wb);
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
} else {
ClearPageDirty(page);
}
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
/*
* always locked coming in here, so we get the desired
* exclusion.
*/
- memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
return ret;
}
return TestClearPageDirty(page);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
#endif
};
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
goto out;
}
if (nr_unshown) {
- printk(KERN_ALERT
+ pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
- printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ __dump_page(page, reason);
+ bad_flags &= page->flags;
+ if (bad_flags)
+ pr_alert("bad because of flags: %#lx(%pGp)\n",
+ bad_flags, &bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
return true;
return __free_pages_boot_core(page, pfn, order);
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration or free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zone's range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
pgdat_init_report_one_done();
return 0;
}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
+ struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int nid;
/* There will be num_node_state(N_MEMORY) threads */
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
+#endif
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
return 0;
}
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+ return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled() && poisoned;
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
int alloc_flags)
{
int i;
+ bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (unlikely(check_new_page(p)))
return 1;
+ if (poisoned)
+ poisoned &= page_is_poisoned(p);
}
set_page_private(page, 0);
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
- if (gfp_flags & __GFP_ZERO)
+ if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
- current->comm, order, gfp_mask);
-
+ pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+ current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ struct memblock_region *r = NULL, *tmp;
+#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (context != MEMMAP_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+ break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * If not mirrored_kernelcore and ZONE_MOVABLE exists, the range
+ * from zone_movable_pfn[nid] to the end of each node should be
+ * ZONE_MOVABLE, not ZONE_NORMAL. Skip it.
+ */
+ if (!mirrored_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
- if (!early_pfn_in_nid(pfn, nid))
+
+ /*
+ * Check the memblock attribute set by firmware, which can affect
+ * the kernel memory layout. If zone==ZONE_MOVABLE but the memory
+ * is mirrored, it's an overlapped memmap init. Skip it.
+ */
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
continue;
- if (!update_defer_init(pgdat, pfn, end_pfn,
- &nr_initialised))
- break;
+ }
}
+#endif
+not_early:
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}
/*
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (mirrored_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
+ unsigned int zone;
+
+ *zone_start_pfn = node_start_pfn;
+ for (zone = 0; zone < zone_type; zone++)
+ *zone_start_pfn += zones_size[zone];
+
+ *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
return zones_size[zone_type];
}
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+ start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
goto out2;
}
+ /*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
/*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (unlikely(!base))
return NULL;
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (!section->page_ext)
return NULL;
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
#include "internal.h"
static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static void init_early_allocated_pages(void);
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->last_migrate_reason = reason;
+}
+
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
return page_ext->gfp_mask;
}
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ int i;
+
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->nr_entries = old_ext->nr_entries;
+
+ for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
+ new_ext->trace_entries[i] = old_ext->trace_entries[i];
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overall stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+}
+
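
A minimal sketch of the intended caller, assuming the page migration copy path
is wired up to a copy_page_owner() wrapper elsewhere in the series (the
function name and call site below are illustrative, not part of the hunks
above):

	/* hypothetical caller: preserve owner info when migrating a page */
	static void migrate_copy_owner_sketch(struct page *oldpage, struct page *newpage)
	{
		if (static_branch_unlikely(&page_owner_inited))
			__copy_page_owner(oldpage, newpage);
	}
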
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask 0x%x\n",
- page_ext->order, page_ext->gfp_mask);
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_ext->order, page_ext->gfp_mask,
+ &page_ext->gfp_mask);
if (ret >= count)
goto err;
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
- PageLocked(page) ? "K" : " ",
- PageError(page) ? "E" : " ",
- PageReferenced(page) ? "R" : " ",
- PageUptodate(page) ? "U" : " ",
- PageDirty(page) ? "D" : " ",
- PageLRU(page) ? "L" : " ",
- PageActive(page) ? "A" : " ",
- PageSlab(page) ? "S" : " ",
- PageWriteback(page) ? "W" : " ",
- PageCompound(page) ? "C" : " ",
- PageSwapCache(page) ? "B" : " ",
- PageMappedToDisk(page) ? "M" : " ");
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
if (ret >= count)
goto err;
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
+ gfp_t gfp_mask = page_ext->gfp_mask;
+ int mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ pr_alert("page allocated via order %u, migratetype %s, "
+ "gfp_mask %#x(%pGg)\n", page_ext->order,
+ migratetype_names[mt], gfp_mask, &gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct page *page;
struct page_ext *page_ext;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
--- /dev/null
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/page_ext.h>
+#include <linux/poison.h>
+#include <linux/ratelimit.h>
+
+static bool __page_poisoning_enabled __read_mostly;
+static bool want_page_poisoning __read_mostly;
+
+static int early_page_poison_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ want_page_poisoning = true;
+ else if (strcmp(buf, "off") == 0)
+ want_page_poisoning = false;
+
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+bool page_poisoning_enabled(void)
+{
+ return __page_poisoning_enabled;
+}
+
+static bool need_page_poisoning(void)
+{
+ return want_page_poisoning;
+}
+
+static void init_page_poisoning(void)
+{
+ /*
+ * On some arches, page poisoning provides the debug page alloc
+ * implementation. If either of those options is enabled, enable poisoning.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
+ if (!want_page_poisoning && !debug_pagealloc_enabled())
+ return;
+ } else {
+ if (!want_page_poisoning)
+ return;
+ }
+
+ __page_poisoning_enabled = true;
+}
+
+struct page_ext_operations page_poisoning_ops = {
+ .need = need_page_poisoning,
+ .init = init_page_poisoning,
+};
+
+static inline void set_page_poison(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+}
+
+static inline void clear_page_poison(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+}
+
+bool page_is_poisoned(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ if (!page_ext)
+ return false;
+
+ return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+}
+
+static void poison_page(struct page *page)
+{
+ void *addr = kmap_atomic(page);
+
+ set_page_poison(page);
+ memset(addr, PAGE_POISON, PAGE_SIZE);
+ kunmap_atomic(addr);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+ unsigned char error = a ^ b;
+
+ return error && !(error & (error - 1));
+}
+
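
The power-of-two trick used below is easy to sanity check in isolation; a
standalone user-space sketch (not part of the patch, poison byte values are
illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	/* same test as single_bit_flip(): the XOR isolates the differing bits
	 * and is a power of two exactly when a single bit differs */
	static bool one_bit_differs(unsigned char a, unsigned char b)
	{
		unsigned char error = a ^ b;

		return error && !(error & (error - 1));
	}

	int main(void)
	{
		printf("%d\n", one_bit_differs(0xaa, 0xab));	/* 1: single bit error */
		printf("%d\n", one_bit_differs(0xaa, 0x55));	/* 0: generic corruption */
		return 0;
	}
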
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
+ unsigned char *start;
+ unsigned char *end;
+
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
+ return;
+
+ for (end = mem + bytes - 1; end > start; end--) {
+ if (*end != PAGE_POISON)
+ break;
+ }
+
+ if (!__ratelimit(&ratelimit))
+ return;
+ else if (start == end && single_bit_flip(*start, PAGE_POISON))
+ pr_err("pagealloc: single bit error\n");
+ else
+ pr_err("pagealloc: memory corruption\n");
+
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+ end - start + 1, 1);
+ dump_stack();
+}
+
+static void unpoison_page(struct page *page)
+{
+ void *addr;
+
+ if (!page_is_poisoned(page))
+ return;
+
+ addr = kmap_atomic(page);
+ check_poison_mem(addr, PAGE_SIZE);
+ clear_page_poison(page);
+ kunmap_atomic(addr);
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ unpoison_page(page + i);
+}
+
+void kernel_poison_pages(struct page *page, int numpages, int enable)
+{
+ if (!page_poisoning_enabled())
+ return;
+
+ if (enable)
+ unpoison_pages(page, numpages);
+ else
+ poison_pages(page, numpages);
+}
+
+#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ /* This function does nothing; all the work is done via the poison pages */
+}
+#endif
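
For orientation, a minimal sketch of how the buddy allocator is expected to
drive these hooks (the actual call sites live in other hunks of the series and
are assumed here; the sketch relies only on kernel_poison_pages() above and
would need the usual mm headers):

	/* assumed free-path hook: fill the freed pages with PAGE_POISON */
	static void free_path_sketch(struct page *page, unsigned int order)
	{
		kernel_poison_pages(page, 1 << order, 0);
	}

	/* assumed allocation-path hook: verify the pattern survived, then clear it */
	static void alloc_path_sketch(struct page *page, unsigned int order)
	{
		kernel_poison_pages(page, 1 << order, 1);
	}
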
*/
void page_add_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
-/*
- * true if a page was allocated from pfmemalloc reserves for network-based
- * swap
- */
-static bool pfmemalloc_active __read_mostly;
-
/*
* struct array_cache
*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- *
- * Entries should not be directly dereferenced as
- * entries belonging to slabs marked pfmemalloc will
- * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
struct array_cache ac;
};
-#define SLAB_OBJ_PFMEMALLOC 1
-static inline bool is_obj_pfmemalloc(void *objp)
-{
- return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
-}
-
-static inline void set_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
- return;
-}
-
-static inline void clear_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
-}
-
-/*
- * bootstrap: The caches do not work without cpuarrays anymore, but the
- * cpuarrays are allocated from the generic caches...
- */
-#define BOOT_CPUCACHE_ENTRIES 1
-struct arraycache_init {
- struct array_cache cache;
- void *entries[BOOT_CPUCACHE_ENTRIES];
-};
-
/*
* Need this for bootstrapping a per node allocator.
*/
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
+#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
#define CFLGS_OFF_SLAB (0x80000000UL)
+#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
-#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
#define BATCHREFILL_LIMIT 16
/*
#endif
-#define OBJECT_FREE (0)
-#define OBJECT_ACTIVE (1)
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
-static void set_obj_status(struct page *page, int idx, int val)
+static inline bool is_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
- status[idx] = val;
+ return atomic_read(&cachep->store_user_clean) == 1;
}
-static inline unsigned int get_obj_status(struct page *page, int idx)
+static inline void set_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
+ atomic_set(&cachep->store_user_clean, 1);
+}
- return status[idx];
+static inline void set_store_user_dirty(struct kmem_cache *cachep)
+{
+ if (is_store_user_clean(cachep))
+ atomic_set(&cachep->store_user_clean, 0);
}
#else
-static inline void set_obj_status(struct page *page, int idx, int val) {}
+static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
#endif
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
return this_cpu_ptr(cachep->cpu_cache);
}
-static size_t calculate_freelist_size(int nr_objs, size_t align)
-{
- size_t freelist_size;
-
- freelist_size = nr_objs * sizeof(freelist_idx_t);
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size += nr_objs * sizeof(char);
-
- if (align)
- freelist_size = ALIGN(freelist_size, align);
-
- return freelist_size;
-}
-
-static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
- size_t idx_size, size_t align)
-{
- int nr_objs;
- size_t remained_size;
- size_t freelist_size;
- int extra_space = 0;
-
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- extra_space = sizeof(char);
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = slab_size / (buffer_size + idx_size + extra_space);
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- remained_size = slab_size - nr_objs * buffer_size;
- freelist_size = calculate_freelist_size(nr_objs, align);
- if (remained_size < freelist_size)
- nr_objs--;
-
- return nr_objs;
-}
-
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
-static void cache_estimate(unsigned long gfporder, size_t buffer_size,
- size_t align, int flags, size_t *left_over,
- unsigned int *num)
+static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
+ unsigned long flags, size_t *left_over)
{
- int nr_objs;
- size_t mgmt_size;
+ unsigned int num;
size_t slab_size = PAGE_SIZE << gfporder;
/*
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
- * - Padding to respect alignment of @align
* - @buffer_size bytes for each object
+ * - One freelist_idx_t for each object
+ *
+ * We don't need to consider alignment of freelist because
+ * freelist will be at the end of slab page. The objects will be
+ * at the correct alignment.
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
- if (flags & CFLGS_OFF_SLAB) {
- mgmt_size = 0;
- nr_objs = slab_size / buffer_size;
-
+ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
+ num = slab_size / buffer_size;
+ *left_over = slab_size % buffer_size;
} else {
- nr_objs = calculate_nr_objs(slab_size, buffer_size,
- sizeof(freelist_idx_t), align);
- mgmt_size = calculate_freelist_size(nr_objs, align);
+ num = slab_size / (buffer_size + sizeof(freelist_idx_t));
+ *left_over = slab_size %
+ (buffer_size + sizeof(freelist_idx_t));
}
- *num = nr_objs;
- *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
+
+ return num;
}
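
(Worked example, assuming a 4KB page, gfporder = 0, a 256-byte object and a
one-byte freelist_idx_t: the on-slab case gives num = 4096 / (256 + 1) = 15
objects and *left_over = 4096 - 15 * 257 = 241 bytes, while the
off-slab/objfreelist case gives num = 4096 / 256 = 16 with no per-object index
overhead on the slab page itself.)
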
#if DEBUG
return ac;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
-{
- return PageSlabPfmemalloc(page);
-}
-
-/* Clears pfmemalloc_active if no slabs have pfmalloc set */
-static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
- struct array_cache *ac)
-{
- struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
- struct page *page;
- unsigned long flags;
-
- if (!pfmemalloc_active)
- return;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- pfmemalloc_active = false;
-out:
- spin_unlock_irqrestore(&n->list_lock, flags);
-}
-
-static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
- gfp_t flags, bool force_refill)
+static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
- int i;
- void *objp = ac->entry[--ac->avail];
-
- /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
- if (unlikely(is_obj_pfmemalloc(objp))) {
- struct kmem_cache_node *n;
-
- if (gfp_pfmemalloc_allowed(flags)) {
- clear_obj_pfmemalloc(&objp);
- return objp;
- }
-
- /* The caller cannot use PFMEMALLOC objects, find another one */
- for (i = 0; i < ac->avail; i++) {
- /* If a !PFMEMALLOC object is found, swap them */
- if (!is_obj_pfmemalloc(ac->entry[i])) {
- objp = ac->entry[i];
- ac->entry[i] = ac->entry[ac->avail];
- ac->entry[ac->avail] = objp;
- return objp;
- }
- }
-
- /*
- * If there are empty slabs on the slabs_free list and we are
- * being forced to refill the cache, mark this one !pfmemalloc.
- */
- n = get_node(cachep, numa_mem_id());
- if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
- clear_obj_pfmemalloc(&objp);
- recheck_pfmemalloc_active(cachep, ac);
- return objp;
- }
-
- /* No !PFMEMALLOC objects available */
- ac->avail++;
- objp = NULL;
- }
-
- return objp;
-}
-
-static inline void *ac_get_obj(struct kmem_cache *cachep,
- struct array_cache *ac, gfp_t flags, bool force_refill)
-{
- void *objp;
-
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_get_obj(cachep, ac, flags, force_refill);
- else
- objp = ac->entry[--ac->avail];
-
- return objp;
-}
-
-static noinline void *__ac_put_obj(struct kmem_cache *cachep,
- struct array_cache *ac, void *objp)
-{
- if (unlikely(pfmemalloc_active)) {
- /* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
- if (PageSlabPfmemalloc(page))
- set_obj_pfmemalloc(&objp);
- }
+ struct kmem_cache_node *n;
+ int page_node;
+ LIST_HEAD(list);
- return objp;
-}
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
-static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
-{
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_put_obj(cachep, ac, objp);
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
- ac->entry[ac->avail++] = objp;
+ slabs_destroy(cachep, &list);
}
/*
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac_put_obj(cachep, ac, objp);
+ ac->entry[ac->avail++] = objp;
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
return;
- printk(KERN_WARNING
- "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nodeid, gfpflags);
- printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+ pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nodeid, gfpflags, &gfpflags);
+ pr_warn(" cache: %s, object size: %d, order: %d\n",
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
- printk(KERN_WARNING
- " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
}
return NULL;
}
- /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (page_is_pfmemalloc(page))
- pfmemalloc_active = true;
-
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
+
__SetPageSlab(page);
- if (page_is_pfmemalloc(page))
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
}
#if DEBUG
+static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+{
+ if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+ return false;
+}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
}
*addr++ = 0x87654321;
}
+
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller)
+{
+ if (!is_debug_pagealloc_cache(cachep))
+ return;
+
+ if (caller)
+ store_stackinfo(cachep, objp, caller);
+
+ kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+}
+
+#else
+static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller) {}
+
#endif
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
int size, i;
int lines = 0;
+ if (is_debug_pagealloc_cache(cachep))
+ return;
+
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
struct page *page)
{
int i;
+
+ if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, page->freelist - obj_offset(cachep),
+ POISON_FREE);
+ }
+
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->size % PAGE_SIZE == 0 &&
- OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
+ size_t size, unsigned long flags)
{
- unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
unsigned int num;
size_t remainder;
- cache_estimate(gfporder, size, align, flags, &remainder, &num);
+ num = cache_estimate(gfporder, size, flags, &remainder);
if (!num)
continue;
break;
if (flags & CFLGS_OFF_SLAB) {
- size_t freelist_size_per_obj = sizeof(freelist_idx_t);
+ struct kmem_cache *freelist_cache;
+ size_t freelist_size;
+
+ freelist_size = num * sizeof(freelist_idx_t);
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+
/*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
+ * Needed to avoid possible looping condition
+ * in cache_grow()
*/
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size_per_obj += sizeof(char);
- offslab_limit = size;
- offslab_limit /= freelist_size_per_obj;
+ if (OFF_SLAB(freelist_cache))
+ continue;
- if (num > offslab_limit)
- break;
+ /* check if off slab has enough benefit */
+ if (freelist_cache->size > cachep->size / 2)
+ continue;
}
/* Found something acceptable - save it away */
return cachep;
}
+static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
+ return false;
+
+ left = calculate_slab_order(cachep, size,
+ flags | CFLGS_OBJFREELIST_SLAB);
+ if (!cachep->num)
+ return false;
+
+ if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
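
(The fit check above just ensures the index array can live inside one free
object: illustratively, 15 objects with a one-byte freelist_idx_t need only 15
bytes of indexes, which easily fits in a 256-byte object, so the slab can keep
its freelist in one of its own free objects instead of a separate management
area.)
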
+static bool set_off_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * Always use on-slab management when SLAB_NOLEAKTRACE
+ * to avoid recursive calls into kmemleak.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ return false;
+
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
+ if (!cachep->num)
+ return false;
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (left >= cachep->num * sizeof(freelist_idx_t))
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_on_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ left = calculate_slab_order(cachep, size, flags);
+ if (!cachep->num)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
#endif
/*
* 4) Store it.
*/
cachep->align = ralign;
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
if (slab_is_available())
gfp = GFP_KERNEL;
else
size += BYTES_PER_WORD;
}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- /*
- * To activate debug pagealloc, off-slab management is necessary
- * requirement. In early phase of initialization, small sized slab
- * doesn't get initialized so it would not be possible. So, we need
- * to check size >= 256. It guarantees that all necessary small
- * sized slab is initialized in current slab initialization sequence.
- */
- if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
- size >= 256 && cachep->object_size > cache_line_size() &&
- ALIGN(size, cachep->align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
- size = PAGE_SIZE;
- }
-#endif
#endif
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do
- * it too early on. Always use on-slab management when
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
- */
- if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
-
size = ALIGN(size, cachep->align);
/*
* We should restrict the number of objects in a slab to implement
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
- left_over = calculate_slab_order(cachep, size, cachep->align, flags);
-
- if (!cachep->num)
- return -E2BIG;
-
- freelist_size = calculate_freelist_size(cachep->num, cachep->align);
-
+#if DEBUG
/*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
+ * To activate debug pagealloc, off-slab management is a necessary
+ * requirement. In the early phase of initialization, small sized slabs
+ * don't get initialized, so it would not be possible then. So, we need
+ * to check size >= 256. That guarantees that all the necessary small
+ * sized slabs are initialized in the current slab initialization sequence.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
- flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+
+ if (set_off_slab_cache(cachep, tmp_size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ cachep->obj_offset += tmp_size - size;
+ size = tmp_size;
+ goto done;
+ }
+ }
}
+#endif
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- freelist_size = calculate_freelist_size(cachep->num, 0);
+ if (set_objfreelist_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OBJFREELIST_SLAB;
+ goto done;
+ }
-#ifdef CONFIG_PAGE_POISONING
- /* If we're going to use the generic kernel_map_pages()
- * poisoning, then it's going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-#endif
+ if (set_off_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ goto done;
}
- cachep->colour_off = cache_line_size();
- /* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < cachep->align)
- cachep->colour_off = cachep->align;
- cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ if (set_on_slab_cache(cachep, size, flags))
+ goto done;
+
+ return -E2BIG;
+
+done:
+ cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
- if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
- /*
- * This is a possibility for one of the kmalloc_{dma,}_caches.
- * But since we go off slab only for object size greater than
- * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
- * in ascending order,this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+#if DEBUG
+ /*
+ * If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
+ (cachep->flags & SLAB_POISON) &&
+ is_debug_pagealloc_cache(cachep))
+ cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
+
+ if (OFF_SLAB(cachep)) {
+ cachep->freelist_cache =
+ kmalloc_slab(cachep->freelist_size, 0u);
}
err = setup_cpu_cache(cachep, gfp);
}
page = list_entry(p, struct page, lru);
-#if DEBUG
- BUG_ON(page->active);
-#endif
list_del(&page->lru);
/*
* Safe to drop the lock. The slab is no longer linked
void *freelist;
void *addr = page_address(page);
- if (OFF_SLAB(cachep)) {
+ page->s_mem = addr + colour_off;
+ page->active = 0;
+
+ if (OBJFREELIST_SLAB(cachep))
+ freelist = NULL;
+ else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
if (!freelist)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ /* We will use the last bytes of the slab page for the freelist */
+ freelist = addr + (PAGE_SIZE << cachep->gfporder) -
+ cachep->freelist_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
+
return freelist;
}
((freelist_idx_t *)(page->freelist))[idx] = val;
}
-static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
{
+#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
-#if DEBUG
- /* need to poison the objs? */
- if (cachep->flags & SLAB_POISON)
- poison_obj(cachep, objp, POISON_FREE);
+
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
- if ((cachep->size % PAGE_SIZE) == 0 &&
- OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
-#else
- if (cachep->ctor)
- cachep->ctor(objp);
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0, 0);
+ }
+ }
#endif
- set_obj_status(page, i, OBJECT_FREE);
+}
+
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+
+ cache_init_objs_debug(cachep, page);
+
+ if (OBJFREELIST_SLAB(cachep)) {
+ page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ obj_offset(cachep);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ /* constructor could break poison info */
+ if (DEBUG == 0 && cachep->ctor)
+ cachep->ctor(index_to_obj(cachep, page, i));
+
set_free_obj(page, i, i);
}
}
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
- int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
void *objp;
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
+
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ if (cachep->flags & SLAB_STORE_USER)
+ set_store_user_dirty(cachep);
#endif
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
- void *objp, int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
unsigned int i;
- /* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
-
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
}
#endif
page->active--;
+ if (!page->freelist)
+ page->freelist = objp + obj_offset(cachep);
+
set_free_obj(page, page->active, objnr);
}
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (OFF_SLAB(cachep) && !freelist)
goto opps1;
slab_map_pages(cachep, page, freelist);
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER)
+ if (cachep->flags & SLAB_STORE_USER) {
+ set_store_user_dirty(cachep);
*dbg_userword(cachep, objp) = (void *)caller;
+ }
objnr = obj_to_index(cachep, page, objp);
BUG_ON(objnr >= cachep->num);
BUG_ON(objp != index_to_obj(cachep, page, objnr));
- set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, caller);
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
- } else {
- poison_obj(cachep, objp, POISON_FREE);
- }
-#else
poison_obj(cachep, objp, POISON_FREE);
-#endif
+ slab_kernel_map(cachep, objp, 0, caller);
}
return objp;
}
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
-static struct page *get_first_slab(struct kmem_cache_node *n)
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list)
+{
+#if DEBUG
+ void *next = *list;
+ void *objp;
+
+ while (next) {
+ objp = next - obj_offset(cachep);
+ next = *(void **)next;
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#endif
+}
+
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list)
+{
+ /* move slabp to correct slabp list: */
+ list_del(&page->lru);
+ if (page->active == cachep->num) {
+ list_add(&page->lru, &n->slabs_full);
+ if (OBJFREELIST_SLAB(cachep)) {
+#if DEBUG
+ /* Poisoning will be done without holding the lock */
+ if (cachep->flags & SLAB_POISON) {
+ void **objp = page->freelist;
+
+ *objp = *list;
+ *list = objp;
+ }
+#endif
+ page->freelist = NULL;
+ }
+ } else
+ list_add(&page->lru, &n->slabs_partial);
+}
+
+/* Try to find non-pfmemalloc slab if needed */
+static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
+ struct page *page, bool pfmemalloc)
+{
+ if (!page)
+ return NULL;
+
+ if (pfmemalloc)
+ return page;
+
+ if (!PageSlabPfmemalloc(page))
+ return page;
+
+ /* No need to keep pfmemalloc slab if we have enough free objects */
+ if (n->free_objects > n->free_limit) {
+ ClearPageSlabPfmemalloc(page);
+ return page;
+ }
+
+ /* Move pfmemalloc slab to the end of list to speed up next search */
+ list_del(&page->lru);
+ if (!page->active)
+ list_add_tail(&page->lru, &n->slabs_free);
+ else
+ list_add_tail(&page->lru, &n->slabs_partial);
+
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ list_for_each_entry(page, &n->slabs_free, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ return NULL;
+}
+
+static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
struct page, lru);
}
+ if (sk_memalloc_socks())
+ return get_valid_first_slab(n, page, pfmemalloc);
+
return page;
}
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
+static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, gfp_t flags)
+{
+ struct page *page;
+ void *obj;
+ void *list = NULL;
+
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, true);
+ if (!page) {
+ spin_unlock(&n->list_lock);
+ return NULL;
+ }
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+ return obj;
+}
+
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
+ void *list = NULL;
check_irq_off();
node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
+
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
while (batchcount > 0) {
struct page *page;
/* Get slab alloc is to come from. */
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
- node));
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
}
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
if (unlikely(!ac->avail)) {
int x;
-force_grow:
+
+ /* Check if we can use obj in pfmemalloc slab */
+ if (sk_memalloc_socks()) {
+ void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
+
+ if (obj)
+ return obj;
+ }
+
x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
node = numa_mem_id();
/* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
+ if (!x && ac->avail == 0)
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
}
ac->touched = 1;
- return ac_get_obj(cachep, ac, flags, force_refill);
+ return ac->entry[--ac->avail];
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
- struct page *page;
-
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
- page = virt_to_head_page(objp);
- set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
-static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
- if (unlikely(cachep == kmem_cache))
- return false;
-
- return should_failslab(cachep->object_size, flags, cachep->flags);
-}
-
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
- bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
- objp = ac_get_obj(cachep, ac, flags, false);
+ objp = ac->entry[--ac->avail];
- /*
- * Allow for the possibility all avail objects are not allowed
- * by the current flags
- */
- if (objp) {
- STATS_INC_ALLOCHIT(cachep);
- goto out;
- }
- force_refill = true;
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
}
STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags, force_refill);
+ objp = cache_alloc_refill(cachep, flags);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
struct page *page;
struct kmem_cache_node *n;
void *obj;
+ void *list = NULL;
int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
retry:
check_irq_off();
spin_lock(&n->list_lock);
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
BUG_ON(page->active == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
+ obj = slab_get_obj(cachep, page);
n->free_objects--;
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
goto done;
must_grow:
int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
- flags);
- if (likely(ptr)) {
- kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(ptr, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && ptr)
+ memset(ptr, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &ptr);
return ptr;
}
void *objp;
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
- flags);
prefetchw(objp);
- if (likely(objp)) {
- kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(objp, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && objp)
+ memset(objp, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &objp);
return objp;
}
void *objp;
struct page *page;
- clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
page = virt_to_head_page(objp);
list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
LIST_HEAD(list);
batchcount = ac->batchcount;
-#if DEBUG
- BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
+
check_irq_off();
n = get_node(cachep, node);
spin_lock(&n->list_lock);
cache_flusharray(cachep, ac);
}
- ac_put_obj(cachep, ac, objp);
+ if (sk_memalloc_socks()) {
+ struct page *page = virt_to_head_page(objp);
+
+ if (unlikely(PageSlabPfmemalloc(page))) {
+ cache_free_pfmemalloc(cachep, page, objp);
+ return;
+ }
+ }
+
+ ac->entry[ac->avail++] = objp;
}
/**
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+static __always_inline void
+cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, unsigned long caller)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ s = slab_pre_alloc_hook(s, flags);
+ if (!s)
+ return 0;
+
+ cache_alloc_debugcheck_before(s, flags);
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = __do_cache_alloc(s, flags);
+
+ if (unlikely(!objp))
+ goto error;
+ p[i] = objp;
+ }
+ local_irq_enable();
+
+ cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
+
+ /* Clear memory outside IRQ disabled section */
+ if (unlikely(flags & __GFP_ZERO))
+ for (i = 0; i < size; i++)
+ memset(p[i], 0, s->object_size);
+
+ slab_post_alloc_hook(s, flags, size, p);
+ /* FIXME: Trace call missing. Christoph would like a bulk variant */
+ return size;
+error:
+ local_irq_enable();
+ cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
+ slab_post_alloc_hook(s, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+{
+ struct kmem_cache *s;
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = p[i];
+
+ if (!orig_s) /* called via kfree_bulk */
+ s = virt_to_cache(objp);
+ else
+ s = cache_from_obj(orig_s, objp);
+
+ debug_check_no_locks_freed(objp, s->object_size);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, s->object_size);
+
+ __cache_free(s, objp, _RET_IP_);
+ }
+ local_irq_enable();
+
+ /* FIXME: add tracing */
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
struct page *page)
{
void *p;
- int i;
+ int i, j;
+ unsigned long v;
if (n[0] == n[1])
return;
for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- if (get_obj_status(page, i) != OBJECT_ACTIVE)
+ bool active = true;
+
+ for (j = page->active; j < c->num; j++) {
+ if (get_free_obj(page, j) == i) {
+ active = false;
+ break;
+ }
+ }
+
+ if (!active)
+ continue;
+
+ /*
+ * probe_kernel_read() is used for DEBUG_PAGEALLOC. The page table
+ * mapping is only established at actual object allocation, so we
+ * could otherwise mistakenly access an unmapped object in the cpu
+ * cache.
+ */
+ if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
continue;
- if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+ if (!add_caller(n, v))
return;
}
}
if (!(cachep->flags & SLAB_RED_ZONE))
return 0;
- /* OK, we can do it */
+ /*
+ * Set store_user_clean and start to grab stored user information
+ * for all objects on this cache. If any alloc/free request comes in
+ * during the processing, the information would be wrong, so restart
+ * the whole processing.
+ */
+ do {
+ set_store_user_clean(cachep);
+ drain_cpu_caches(cachep);
- x[1] = 0;
+ x[1] = 0;
- for_each_kmem_cache_node(cachep, node, n) {
+ for_each_kmem_cache_node(cachep, node, n) {
- check_irq_on();
- spin_lock_irq(&n->list_lock);
+ check_irq_on();
+ spin_lock_irq(&n->list_lock);
+
+ list_for_each_entry(page, &n->slabs_full, lru)
+ handle_slab(x, cachep, page);
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ handle_slab(x, cachep, page);
+ spin_unlock_irq(&n->list_lock);
+ }
+ } while (!is_store_user_clean(cachep));
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
name = cachep->name;
if (x[0] == x[1]) {
/* Increase the buffer size */
#endif
#include <linux/memcontrol.h>
+#include <linux/fault-inject.h>
+#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#elif defined(CONFIG_SLUB_DEBUG)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DEBUG_FREE)
+ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif
/*
* Generic implementation of bulk operations
* These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the objecct listed
+ * perform optimizations. In that case segments of the object listed
* may be allocated or freed using these operations.
*/
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
* to not do even the assignment. In that case, slab_equal_or_root
* will also be a constant.
*/
- if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ if (!memcg_kmem_enabled() &&
+ !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
return s;
page = virt_to_head_page(x);
return s;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB
+ return s->object_size;
+
+#else /* CONFIG_SLUB */
+# ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+# endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+#endif
+}
+
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(gfpflags_allow_blocking(flags));
+
+ if (should_failslab(s, flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
+{
+ size_t i;
+
+ flags &= gfp_allowed_mask;
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+ kmemleak_alloc_recursive(object, s->object_size, 1,
+ s->flags, flags);
+ kasan_slab_alloc(s, object);
+ }
+ memcg_kmem_put_cache(s);
+}
+
#ifndef CONFIG_SLOB
/*
* The slab lists for all objects.
{
size_t i;
- for (i = 0; i < nr; i++)
- kmem_cache_free(s, p[i]);
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
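
(Illustrative numbers: with SLAB_RED_ZONE and, say, a 16-byte red_left_pad, an
object whose slot starts at offset 0 in the slab is handed out at offset 16;
the 16 bytes in front of it become a left redzone that catches underflow
writes into this object as well as overflows running past the previous one.)
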
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
*/
#define MAX_PARTIAL 10
-#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
+/*
+ * These debug flags cannot use CMPXCHG because there might be consistency
+ * issues when checking or reading debug information
+ */
+#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
+ SLAB_TRACE)
+
+
/*
* Debugging flags that require metadata to be stored in the slab. These get
* disabled when slub_debug=O is used and a cache's min order increases with
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
- for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
- __p += (__s)->size)
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = (__addr), __idx = 1; __idx <= __objects;\
- __p += (__s)->size, __idx++)
+ for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+ __idx <= __objects; \
+ __p += (__s)->size, __idx++)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->object_size;
-
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
-}
-
static inline int order_objects(int order, unsigned long size, int reserved)
{
return ((PAGE_SIZE << order) - reserved) / size;
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
+ if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
init_tracking(s, object);
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
+static inline int alloc_consistency_checks(struct kmem_cache *s,
struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
- goto bad;
+ return 0;
if (!check_valid_pointer(s, page, object)) {
object_err(s, page, object, "Freelist Pointer check fails");
- goto bad;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_INACTIVE))
- goto bad;
+ return 0;
+
+ return 1;
+}
+
+static noinline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page,
+ void *object, unsigned long addr)
+{
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!alloc_consistency_checks(s, page, object, addr))
+ goto bad;
+ }
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
return 0;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr, unsigned long *flags)
+static inline int free_consistency_checks(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- void *object = head;
- int cnt = 0;
-
- spin_lock_irqsave(&n->list_lock, *flags);
- slab_lock(page);
-
- if (!check_slab(s, page))
- goto fail;
-
-next_object:
- cnt++;
-
if (!check_valid_pointer(s, page, object)) {
slab_err(s, page, "Invalid object pointer 0x%p", object);
- goto fail;
+ return 0;
}
if (on_freelist(s, page, object)) {
object_err(s, page, object, "Object already free");
- goto fail;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
- goto out;
+ return 0;
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
} else
object_err(s, page, object,
"page slab pointer corrupt.");
- goto fail;
+ return 0;
+ }
+ return 1;
+}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
+ unsigned long uninitialized_var(flags);
+ int ret = 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ slab_lock(page);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, page))
+ goto out;
+ }
+
+next_object:
+ cnt++;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, page, object, addr))
+ goto out;
}
if (s->flags & SLAB_STORE_USER)
object = get_freepointer(s, object);
goto next_object;
}
+ ret = 1;
+
out:
if (cnt != bulk_cnt)
slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);
slab_unlock(page);
- /*
- * Keep node_lock to preserve integrity
- * until the object is actually freed
- */
- return n;
-
-fail:
- slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, *flags);
- slab_fix(s, "Object at 0x%p not freed", object);
- return NULL;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ if (!ret)
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return ret;
}
static int __init setup_slub_debug(char *str)
for (; *str && *str != ','; str++) {
switch (tolower(*str)) {
case 'f':
- slub_debug |= SLAB_DEBUG_FREE;
+ slub_debug |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
slub_debug |= SLAB_RED_ZONE;
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
-static inline struct kmem_cache_node *free_debug_processing(
+static inline int free_debug_processing(
struct kmem_cache *s, struct page *page,
void *head, void *tail, int bulk_cnt,
- unsigned long addr, unsigned long *flags) { return NULL; }
+ unsigned long addr) { return 0; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
kasan_kfree_large(x);
}
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
-{
- flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
- might_sleep_if(gfpflags_allow_blocking(flags));
-
- if (should_failslab(s->object_size, flags, s->flags))
- return NULL;
-
- return memcg_kmem_get_cache(s, flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
-{
- size_t i;
-
- flags &= gfp_allowed_mask;
- for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- kmemleak_alloc_recursive(object, s->object_size, 1,
- s->flags, flags);
- kasan_slab_alloc(s, object);
- }
- memcg_kmem_put_cache(s);
-}
-
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
set_freepointer(s, p, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
int order = compound_order(page);
int pages = 1 << order;
- if (kmem_cache_debug(s)) {
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
void *p;
slab_pad_check(s, page);
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
return;
- pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nid, gfpflags);
+ pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nid, gfpflags, &gfpflags);
pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
s->name, s->object_size, s->size, oo_order(s->oo),
oo_order(s->min));
stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s) &&
- !(n = free_debug_processing(s, page, head, tail, cnt,
- addr, &flags)))
+ !free_debug_processing(s, page, head, tail, cnt, addr))
return;
do {
void *tail;
void *freelist;
int cnt;
+ struct kmem_cache *s;
};
/*
* synchronization primitive. Look ahead in the array is limited due
* to performance reasons.
*/
-static int build_detached_freelist(struct kmem_cache *s, size_t size,
- void **p, struct detached_freelist *df)
+static inline
+int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
{
size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
+ struct page *page;
/* Always re-init detached_freelist */
df->page = NULL;
do {
object = p[--size];
+ /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
} while (!object && size);
if (!object)
return 0;
+ page = virt_to_head_page(object);
+ if (!s) {
+ /* Handle kalloc'ed objects */
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ __free_kmem_pages(page, compound_order(page));
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+ /* Derive kmem_cache from object */
+ df->s = page->slab_cache;
+ } else {
+ df->s = cache_from_obj(s, object); /* Support for memcg */
+ }
+
/* Start new detached freelist */
- set_freepointer(s, object, NULL);
- df->page = virt_to_head_page(object);
+ df->page = page;
+ set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
p[size] = NULL; /* mark object processed */
/* df->page is always set at this point */
if (df->page == virt_to_head_page(object)) {
/* Opportunity build freelist */
- set_freepointer(s, object, df->freelist);
+ set_freepointer(df->s, object, df->freelist);
df->freelist = object;
df->cnt++;
p[size] = NULL; /* mark object processed */
return first_skipped_index;
}
-
/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
if (WARN_ON(!size))
return;
do {
struct detached_freelist df;
- struct kmem_cache *s;
-
- /* Support for memcg */
- s = cache_from_obj(orig_s, p[size - 1]);
size = build_detached_freelist(s, size, p, &df);
if (unlikely(!df.page))
continue;
- slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
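The detached-freelist rework above only changes how kmem_cache_free_bulk() groups objects internally (per slab page, with the owning cache now carried in struct detached_freelist); the caller-visible bulk API is unchanged. A rough, hypothetical kernel caller, for illustration only:

	/* Hypothetical user of the bulk slab API; error handling kept minimal. */
	#include <linux/slab.h>

	#define NR_OBJS 16

	static int demo_bulk(void)
	{
		struct kmem_cache *cachep;	/* hypothetical cache */
		void *objs[NR_OBJS];

		cachep = kmem_cache_create("demo_cache", 256, 0, 0, NULL);
		if (!cachep)
			return -ENOMEM;

		/* Returns 0 on failure; on success all NR_OBJS were allocated. */
		if (!kmem_cache_alloc_bulk(cachep, GFP_KERNEL, NR_OBJS, objs)) {
			kmem_cache_destroy(cachep);
			return -ENOMEM;
		}

		/* ... use the objects ... */

		/*
		 * The objects may span several slab pages; build_detached_freelist()
		 * batches them per page before hitting the slow path.
		 */
		kmem_cache_free_bulk(cachep, NR_OBJS, objs);
		kmem_cache_destroy(cachep);
		return 0;
	}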
*/
size += 2 * sizeof(struct track);
- if (flags & SLAB_RED_ZONE)
+ if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
* of the object.
*/
size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
+ if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
#endif
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
static ssize_t sanity_checks_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- s->flags &= ~SLAB_DEBUG_FREE;
+ s->flags &= ~SLAB_CONSISTENCY_CHECKS;
if (buf[0] == '1') {
s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_DEBUG_FREE;
+ s->flags |= SLAB_CONSISTENCY_CHECKS;
}
return length;
}
s->flags &= ~SLAB_RED_ZONE;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_RED_ZONE;
}
calculate_sizes(s, -1);
s->flags &= ~SLAB_POISON;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_POISON;
}
calculate_sizes(s, -1);
*p++ = 'd';
if (s->flags & SLAB_RECLAIM_ACCOUNT)
*p++ = 'a';
- if (s->flags & SLAB_DEBUG_FREE)
+ if (s->flags & SLAB_CONSISTENCY_CHECKS)
*p++ = 'F';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg;
unsigned long flags;
if (page->mapping != mapping)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
return 1;
failed:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
{
unsigned long nr;
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_ISOLATED_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON) +
- zone_page_state(zone, NR_ISOLATED_ANON);
+ nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
return nr;
}
bool zone_reclaimable(struct zone *zone)
{
- return zone_page_state(zone, NR_PAGES_SCANNED) <
+ return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
{
size_t size = sizeof(*shrinker->nr_deferred);
- /*
- * If we only have one possible node in the system anyway, save
- * ourselves the trouble and disable NUMA aware behavior. This way we
- * will save memory and some small loop time later.
- */
- if (nr_node_ids == 1)
- shrinker->flags &= ~SHRINKER_NUMA_AWARE;
-
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
bool reclaimed)
{
unsigned long flags;
- struct mem_cgroup *memcg;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow, memcg);
+ __delete_from_page_cache(page, shadow);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
cannot_free:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Movable",
- "Reclaimable",
- "HighAtomic",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
* refault distance will immediately activate the refaulting page.
*/
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+ ZONES_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
+ eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow,
- struct zone **zone,
- unsigned long *distance)
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+ unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- unsigned long eviction;
- unsigned long refault;
- unsigned long mask;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
- eviction = entry;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
- *zone = NODE_DATA(nid)->node_zones + zid;
-
- refault = atomic_long_read(&(*zone)->inactive_age);
- mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
- RADIX_TREE_EXCEPTIONAL_SHIFT);
- /*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
- *
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
- */
- *distance = (refault - eviction) & mask;
+ *memcgidp = memcgid;
+ *zonep = NODE_DATA(nid)->node_zones + zid;
+ *evictionp = entry << bucket_order;
}
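A stand-alone round trip of the same bit layout may make the packing easier to follow. The shift widths below are illustrative stand-ins for the config-dependent kernel constants, and the arithmetic assumes 64-bit longs, as the kernel code does:

	/* Userspace sketch of pack_shadow()/unpack_shadow() with made-up widths. */
	#include <stdio.h>
	#include <assert.h>

	#define EXCEPTIONAL_SHIFT	2	/* stands in for RADIX_TREE_EXCEPTIONAL_SHIFT */
	#define ZONES_SHIFT		2
	#define NODES_SHIFT		6
	#define MEMCG_ID_SHIFT		16
	#define BUCKET_ORDER		0	/* usually 0 on 64-bit */

	static unsigned long pack(int memcgid, int nid, int zid, unsigned long eviction)
	{
		eviction >>= BUCKET_ORDER;
		eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
		eviction = (eviction << NODES_SHIFT) | nid;
		eviction = (eviction << ZONES_SHIFT) | zid;
		return (eviction << EXCEPTIONAL_SHIFT) | 0x2;	/* exceptional-entry tag */
	}

	static void unpack(unsigned long entry, int *memcgid, int *nid, int *zid,
			   unsigned long *eviction)
	{
		entry >>= EXCEPTIONAL_SHIFT;
		*zid = entry & ((1UL << ZONES_SHIFT) - 1);
		entry >>= ZONES_SHIFT;
		*nid = entry & ((1UL << NODES_SHIFT) - 1);
		entry >>= NODES_SHIFT;
		*memcgid = entry & ((1UL << MEMCG_ID_SHIFT) - 1);
		entry >>= MEMCG_ID_SHIFT;
		*eviction = entry << BUCKET_ORDER;
	}

	int main(void)
	{
		int memcgid, nid, zid;
		unsigned long eviction;

		unpack(pack(123, 3, 1, 987654), &memcgid, &nid, &zid, &eviction);
		assert(memcgid == 123 && nid == 3 && zid == 1 && eviction == 987654);
		printf("memcg=%d node=%d zone=%d eviction=%lu\n",
		       memcgid, nid, zid, eviction);
		return 0;
	}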
/**
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ unsigned long refault;
struct zone *zone;
+ int memcgid;
+
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
+
+ rcu_read_lock();
+ /*
+ * Look up the memcg associated with the stored ID. It might
+ * have been deleted since the page's eviction.
+ *
+ * Note that in rare events the ID could have been recycled
+ * for a new cgroup that refaults a shared page. This is
+ * impossible to tell from the available data. However, this
+ * should be a rare and limited disturbance, and activations
+ * are always speculative anyway. Ultimately, it's the aging
+ * algorithm's job to shake out the minimum access frequency
+ * for the active cache.
+ *
+ * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+ * would be better if the root_mem_cgroup existed in all
+ * configurations instead.
+ */
+ memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
+
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases.
+ *
+ * There is a special case: usually, shadow entries have a
+ * short lifetime and are either refaulted or reclaimed along
+ * with the inode before they get too old. But it is not
+ * impossible for the inactive_age to lap a shadow entry in
+ * the field, which can then result in a false small
+ * refault distance, leading to a false activation should this
+ * old entry actually refault again. However, earlier kernels
+ * used to deactivate unconditionally with *every* reclaim
+ * invocation for the longest time, so the occasional
+ * inappropriate activation leading to pressure on the active
+ * list is not a problem.
+ */
+ refault_distance = (refault - eviction) & EVICTION_MASK;
- unpack_shadow(shadow, &zone, &refault_distance);
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
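The wraparound case the comment describes is plain modular arithmetic on unsigned values. A minimal stand-alone demonstration (the timestamp width is chosen arbitrarily, and 64-bit longs are assumed):

	/* Demonstration of the overflow-safe distance computation above. */
	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical 34-bit timestamp space standing in for EVICTION_MASK. */
		unsigned long mask = (1UL << 34) - 1;
		/* Eviction happened just before the timestamp wrapped ... */
		unsigned long eviction = mask - 5;
		/* ... and the refault is observed just after the wrap. */
		unsigned long refault = 10;

		/* Unsigned subtraction plus the mask still gives the true distance. */
		printf("distance=%lu\n", (refault - eviction) & mask);	/* prints 16 */
		return 0;
	}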
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ lock_page_memcg(page);
+ /*
+ * Filter non-memcg pages here, e.g. unmap can call
+ * mark_page_accessed() on VDSO pages.
+ *
+ * XXX: See workingset_refault() - this should return
+ * root_mem_cgroup even for !CONFIG_MEMCG.
+ */
+ if (!mem_cgroup_disabled() && !page_memcg(page))
+ goto out;
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+out:
+ unlock_page_memcg(page);
}
/*
static int __init workingset_init(void)
{
+ unsigned int timestamp_bits;
+ unsigned int max_order;
int ret;
+ BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+ * memory (totalram_pages/2). However, memory hotplug may add
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ max_order = fls_long(totalram_pages - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;
+ printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+ timestamp_bits, max_order, bucket_order);
+
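To make the sizing concrete, the same computation with assumed values: a 32-bit kernel, a hypothetical EVICTION_SHIFT of 22, and roughly 4 GiB of RAM. On 64-bit kernels the timestamp bits normally cover totalram_pages and bucket_order stays 0.

	/* Worked example of the bucket_order sizing; all constants are assumed. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int bits_per_long = 32;		/* assume a 32-bit kernel      */
		unsigned int eviction_shift = 22;		/* hypothetical EVICTION_SHIFT */
		unsigned long totalram_pages = 1UL << 20;	/* ~4 GiB of 4 KiB pages       */

		unsigned int timestamp_bits = bits_per_long - eviction_shift;	/* 10 */

		unsigned int max_order = 0;		/* fls_long(totalram_pages - 1) */
		while ((1UL << max_order) < totalram_pages)
			max_order++;			/* -> 20 */

		unsigned int bucket_order = max_order > timestamp_bits ?
						max_order - timestamp_bits : 0;

		/* Evictions then get grouped into buckets of 2^bucket_order = 1024. */
		printf("timestamp_bits=%u max_order=%u bucket_order=%u\n",
		       timestamp_bits, max_order, bucket_order);
		return 0;
	}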
ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
9000 - 60,
};
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv6_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
__be16 sport, __be16 dport, u32 count, int c)
unsigned long r_offset;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
- rds_page_remainders);
+static
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
#ignore lines not being added
next if ($line =~ /^[^\+]/);
+# check for declarations of signed or unsigned without int
+ while ($line =~ m{($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) {
+ my $type = $1;
+ my $var = $2;
+ $var = "" if (!defined $var);
+ if ($type =~ /^(?:(?:$Storage|$Inline|$Attribute)\s+)*((?:un)?signed)((?:\s*\*)*)\s*$/) {
+ my $sign = $1;
+ my $pointer = $2;
+
+ $pointer = "" if (!defined $pointer);
+
+ if (WARN("UNSPECIFIED_INT",
+ "Prefer '" . trim($sign) . " int" . rtrim($pointer) . "' to bare use of '$sign" . rtrim($pointer) . "'\n" . $herecurr) &&
+ $fix) {
+ my $decl = trim($sign) . " int ";
+ my $comp_pointer = $pointer;
+ $comp_pointer =~ s/\s//g;
+ $decl .= $comp_pointer;
+ $decl = rtrim($decl) if ($var eq "");
+ $fixed[$fixlinenr] =~ s@\b$sign\s*\Q$pointer\E\s*$var\b@$decl$var@;
+ }
+ }
+ }
+
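As an illustration of what the new UNSPECIFIED_INT check reacts to (the declarations below are hypothetical, not taken from the patch):

	/* Bare 'signed'/'unsigned' declarations are now flagged on added lines. */

	unsigned	nr_items;	/* warned: "Prefer 'unsigned int' to bare use of 'unsigned'" */
	signed *	deltas;		/* warned: "Prefer 'signed int *' to bare use of 'signed *'" */

	unsigned int	nr_items_ok;	/* preferred spelling, not flagged */
	signed int	*deltas_ok;	/* preferred spelling, not flagged */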
# TEST: allow direct testing of the type matcher.
if ($dbg_type) {
if ($line =~ /^.\s*$Declare\s*$/) {
## }
#need space before brace following if, while, etc
- if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) ||
+ if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\)\{/) ||
$line =~ /do\{/) {
if (ERROR("SPACING",
"space required before the open brace '{'\n" . $herecurr) &&
{
}
+ # Make asm volatile uses seem like a generic function
+ $dstat =~ s/\b_*asm_*\s+_*volatile_*\b/asm_volatile/g;
+
my $exceptions = qr{
$Declare|
module_param_named|
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
+#include <limits.h>
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
unsigned int len;
unsigned int start_pos;
unsigned char *sym;
+ unsigned int percpu_absolute;
};
struct addr_range {
};
static unsigned long long _text;
+static unsigned long long relative_base;
static struct addr_range text_ranges[] = {
{ "_stext", "_etext" },
{ "_sinittext", "_einittext" },
static int absolute_percpu = 0;
static char symbol_prefix_char = '\0';
static unsigned long long kernel_start_addr = 0;
+static int base_relative = 0;
int token_profit[0x10000];
fprintf(stderr, "Usage: kallsyms [--all-symbols] "
"[--symbol-prefix=<prefix char>] "
"[--page-offset=<CONFIG_PAGE_OFFSET>] "
- "< in.map > out.S\n");
+ "[--base-relative] < in.map > out.S\n");
exit(1);
}
strcpy((char *)s->sym + 1, str);
s->sym[0] = stype;
+ s->percpu_absolute = 0;
+
/* Record if we've found __per_cpu_start/end. */
check_symbol_range(sym, s->addr, &percpu_range, 1);
*/
static char *special_symbols[] = {
"kallsyms_addresses",
+ "kallsyms_offsets",
+ "kallsyms_relative_base",
"kallsyms_num_syms",
"kallsyms_names",
"kallsyms_markers",
static int symbol_absolute(struct sym_entry *s)
{
- return toupper(s->sym[0]) == 'A';
+ return s->percpu_absolute;
}
static void write_src(void)
printf("\t.section .rodata, \"a\"\n");
- /* Provide proper symbols relocatability by their '_text'
- * relativeness. The symbol names cannot be used to construct
- * normal symbol references as the list of symbols contains
- * symbols that are declared static and are private to their
- * .o files. This prevents .tmp_kallsyms.o or any other
- * object from referencing them.
+ /* Provide proper symbols relocatability by their relativeness
+ * to a fixed anchor point in the runtime image, either '_text'
+ * for absolute address tables, in which case the linker will
+ * emit the final addresses at build time. Otherwise, use the
+ * offset relative to the lowest value encountered of all relative
+ * symbols, and emit non-relocatable fixed offsets that will be fixed
+ * up at runtime.
+ *
+ * The symbol names cannot be used to construct normal symbol
+ * references as the list of symbols contains symbols that are
+ * declared static and are private to their .o files. This prevents
+ * .tmp_kallsyms.o or any other object from referencing them.
*/
- output_label("kallsyms_addresses");
+ if (!base_relative)
+ output_label("kallsyms_addresses");
+ else
+ output_label("kallsyms_offsets");
+
for (i = 0; i < table_cnt; i++) {
- if (!symbol_absolute(&table[i])) {
+ if (base_relative) {
+ long long offset;
+ int overflow;
+
+ if (!absolute_percpu) {
+ offset = table[i].addr - relative_base;
+ overflow = (offset < 0 || offset > UINT_MAX);
+ } else if (symbol_absolute(&table[i])) {
+ offset = table[i].addr;
+ overflow = (offset < 0 || offset > INT_MAX);
+ } else {
+ offset = relative_base - table[i].addr - 1;
+ overflow = (offset < INT_MIN || offset >= 0);
+ }
+ if (overflow) {
+ fprintf(stderr, "kallsyms failure: "
+ "%s symbol value %#llx out of range in relative mode\n",
+ symbol_absolute(&table[i]) ? "absolute" : "relative",
+ table[i].addr);
+ exit(EXIT_FAILURE);
+ }
+ printf("\t.long\t%#x\n", (int)offset);
+ } else if (!symbol_absolute(&table[i])) {
if (_text <= table[i].addr)
printf("\tPTR\t_text + %#llx\n",
table[i].addr - _text);
}
printf("\n");
+ if (base_relative) {
+ output_label("kallsyms_relative_base");
+ printf("\tPTR\t_text - %#llx\n", _text - relative_base);
+ printf("\n");
+ }
+
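For reference, the runtime side has to invert this encoding when resolving symbols. A simplified stand-alone sketch modelled on kernel/kallsyms.c (the sample offsets are made up and assume --absolute-percpu is in effect):

	/*
	 * Userspace sketch of turning the emitted offsets back into addresses.
	 * Negative entries are relative to kallsyms_relative_base; with
	 * --absolute-percpu, non-negative entries are absolute values.
	 */
	#include <stdio.h>
	#include <stdint.h>

	static const uint64_t relative_base = 0xffffffff81000000ULL;	/* sample */
	static const int32_t offsets[] = {
		0x2000,			/* absolute per-CPU value 0x2000         */
		-1 - 0x1000,		/* text symbol at relative_base + 0x1000 */
		-1 - 0x1f00,		/* text symbol at relative_base + 0x1f00 */
	};

	static uint64_t sym_address(int idx)
	{
		if (offsets[idx] >= 0)			/* absolute (per-CPU) symbol */
			return offsets[idx];
		return relative_base - 1 - offsets[idx];	/* relative symbol */
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++)
			printf("%d: %#llx\n", i, (unsigned long long)sym_address(i));
		return 0;
	}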
output_label("kallsyms_num_syms");
printf("\tPTR\t%d\n", table_cnt);
printf("\n");
unsigned int i;
for (i = 0; i < table_cnt; i++)
- if (symbol_in_range(&table[i], &percpu_range, 1))
+ if (symbol_in_range(&table[i], &percpu_range, 1)) {
+ /*
+ * Keep the 'A' override for percpu symbols to
+ * ensure consistent behavior compared to older
+ * versions of this tool.
+ */
table[i].sym[0] = 'A';
+ table[i].percpu_absolute = 1;
+ }
+}
+
+/* find the minimum non-absolute symbol address */
+static void record_relative_base(void)
+{
+ unsigned int i;
+
+ relative_base = -1ULL;
+ for (i = 0; i < table_cnt; i++)
+ if (!symbol_absolute(&table[i]) &&
+ table[i].addr < relative_base)
+ relative_base = table[i].addr;
}
int main(int argc, char **argv)
} else if (strncmp(argv[i], "--page-offset=", 14) == 0) {
const char *p = &argv[i][14];
kernel_start_addr = strtoull(p, NULL, 16);
- } else
+ } else if (strcmp(argv[i], "--base-relative") == 0)
+ base_relative = 1;
+ else
usage();
}
} else if (argc != 1)
read_map(stdin);
if (absolute_percpu)
make_percpus_absolute();
+ if (base_relative)
+ record_relative_base();
sort_symbols();
optimize_token_table();
write_src();
kallsymopt="${kallsymopt} --page-offset=$CONFIG_PAGE_OFFSET"
fi
- if [ -n "${CONFIG_X86_64}" ]; then
+ if [ -n "${CONFIG_KALLSYMS_ABSOLUTE_PERCPU}" ]; then
kallsymopt="${kallsymopt} --absolute-percpu"
fi
+ if [ -n "${CONFIG_KALLSYMS_BASE_RELATIVE}" ]; then
+ kallsymopt="${kallsymopt} --base-relative"
+ fi
+
local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \
${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}"
'kallsyms_names' => 1,
'kallsyms_num_syms' => 1,
'kallsyms_addresses'=> 1,
+ 'kallsyms_offsets' => 1,
+ 'kallsyms_relative_base'=> 1,
'__this_module' => 1,
'_etext' => 1,
'_edata' => 1,
return fa->flags - fb->flags;
}
-/* see include/trace/events/gfpflags.h */
+/* see include/trace/events/mmflags.h */
static const struct {
const char *original;
const char *compact;
{ "GFP_HIGHUSER", "HU" },
{ "GFP_USER", "U" },
{ "GFP_TEMPORARY", "TMP" },
+ { "GFP_KERNEL_ACCOUNT", "KAC" },
{ "GFP_KERNEL", "K" },
{ "GFP_NOFS", "NF" },
{ "GFP_ATOMIC", "A" },
{ "GFP_NOIO", "NI" },
- { "GFP_HIGH", "H" },
- { "GFP_WAIT", "W" },
- { "GFP_IO", "I" },
- { "GFP_COLD", "CO" },
- { "GFP_NOWARN", "NWR" },
- { "GFP_REPEAT", "R" },
- { "GFP_NOFAIL", "NF" },
- { "GFP_NORETRY", "NR" },
- { "GFP_COMP", "C" },
- { "GFP_ZERO", "Z" },
- { "GFP_NOMEMALLOC", "NMA" },
- { "GFP_MEMALLOC", "MA" },
- { "GFP_HARDWALL", "HW" },
- { "GFP_THISNODE", "TN" },
- { "GFP_RECLAIMABLE", "RC" },
- { "GFP_MOVABLE", "M" },
- { "GFP_NOTRACK", "NT" },
- { "GFP_NO_KSWAPD", "NK" },
- { "GFP_OTHER_NODE", "ON" },
{ "GFP_NOWAIT", "NW" },
+ { "GFP_DMA", "D" },
+ { "__GFP_HIGHMEM", "HM" },
+ { "GFP_DMA32", "D32" },
+ { "__GFP_HIGH", "H" },
+ { "__GFP_ATOMIC", "_A" },
+ { "__GFP_IO", "I" },
+ { "__GFP_FS", "F" },
+ { "__GFP_COLD", "CO" },
+ { "__GFP_NOWARN", "NWR" },
+ { "__GFP_REPEAT", "R" },
+ { "__GFP_NOFAIL", "NF" },
+ { "__GFP_NORETRY", "NR" },
+ { "__GFP_COMP", "C" },
+ { "__GFP_ZERO", "Z" },
+ { "__GFP_NOMEMALLOC", "NMA" },
+ { "__GFP_MEMALLOC", "MA" },
+ { "__GFP_HARDWALL", "HW" },
+ { "__GFP_THISNODE", "TN" },
+ { "__GFP_RECLAIMABLE", "RC" },
+ { "__GFP_MOVABLE", "M" },
+ { "__GFP_ACCOUNT", "AC" },
+ { "__GFP_NOTRACK", "NT" },
+ { "__GFP_WRITE", "WR" },
+ { "__GFP_RECLAIM", "R" },
+ { "__GFP_DIRECT_RECLAIM", "DR" },
+ { "__GFP_KSWAPD_RECLAIM", "KR" },
+ { "__GFP_OTHER_NODE", "ON" },
};
static size_t max_gfp_len;
"\nValid debug options (FZPUT may be combined)\n"
"a / A Switch on all debug options (=FZUP)\n"
"- Switch off all debug options\n"
- "f / F Sanity Checks (SLAB_DEBUG_FREE)\n"
+ "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n"
"z / Z Redzoning\n"
"p / P Poisoning\n"
"u / U Tracking\n"