[PATCH] mm: do not check unpopulated zones for draining and counter updates
[linux-2.6-block.git] / mm / vmstat.c

/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat)
{
	struct zone *zones = pgdat->node_zones;
	int i;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		*active += zones[i].nr_active;
		*inactive += zones[i].nr_inactive;
		*free += zones[i].free_pages;
	}
}

void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for_each_online_pgdat(pgdat) {
		unsigned long l, m, n;
		__get_zone_counts(&l, &m, &n, pgdat);
		*active += l;
		*inactive += m;
		*free += n;
	}
}

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
	int cpu = 0;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	cpu = first_cpu(*cpumask);
	while (cpu < NR_CPUS) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		cpu = next_cpu(cpu, *cpumask);

		if (cpu < NR_CPUS)
			prefetch(&per_cpu(vm_event_states, cpu));

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */
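
/*
 * Editor's illustrative note (not part of the original file): if the CPU
 * going away had, say, pswpin == 5 in its per-cpu vm_event_states,
 * vm_events_fold_cpu() adds those 5 events to the current processor via
 * count_vm_events() and zeroes the dead CPU's slot, so the totals that
 * all_vm_events() reports are unchanged by the hot-unplug. A hypothetical
 * caller (disabled sketch below, names invented) would be a CPU hotplug
 * notifier acting on CPU_DEAD:
 */
#if 0
static int example_cpu_dead_callback(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	if (action == CPU_DEAD)
		vm_events_fold_cpu((unsigned long)hcpu); /* fold dead CPU */
	return NOTIFY_OK;
}
#endif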

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
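
/*
 * Editor's illustrative walk-through (not part of the original file):
 * with two online CPUs and a 1.8 GB zone, mem = 1.8 GB / 128 MB ~= 14,
 * so 1 + fls(14) = 5 and fls(2) = 2, giving threshold = 2 * 2 * 5 = 20,
 * which is the "1-2 GB" row in the table above. The min() cap keeps the
 * result at or below 125 even on very large machines.
 */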

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_zone(zone) {

		if (!zone->present_pages)
			continue;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			zone_pcp(zone, cpu)->stat_threshold = threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
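
/*
 * Editor's illustrative usage sketch (not part of the original file):
 * callers that already run with interrupts disabled, e.g. under a lock
 * taken with spin_lock_irqsave(), can use __mod_zone_page_state()
 * directly; everyone else goes through the irq-safe wrapper above. The
 * helper name below is invented and NR_FILE_MAPPED is assumed to be one
 * of this era's zone_stat_item values (the "nr_mapped" counter).
 */
#if 0
static void example_account_mapped(struct page *page, int nr_pages)
{
	/* interrupt state unknown here: use the wrapper that saves flags */
	mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr_pages);
}
#endif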

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}
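
/*
 * Editor's illustrative note (not part of the original file): with a
 * stat_threshold of 32, the per-cpu differential may climb to 33 before
 * anything global happens; at that point 33 + 16 is folded into the
 * global counter and the differential is reset to -16. The global count
 * temporarily overshoots by 16, the local differential compensates
 * exactly, and for a steadily increasing counter the next fold is 49
 * increments away instead of 33, roughly halving the global updates.
 */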

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < - pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	unsigned long flags;

	for_each_zone(zone) {
		struct per_cpu_pageset *pcp;

		if (!populated_zone(zone))
			continue;

		pcp = zone_pcp(zone, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (pcp->vm_stat_diff[i]) {
				local_irq_save(flags);
				zone_page_state_add(pcp->vm_stat_diff[i],
					zone, i);
				pcp->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
			}
	}
}

static void __refresh_cpu_vm_stats(void *dummy)
{
	refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that the result remains approximate: the counters can still
 * change while concurrent processes run, but folding the per-cpu
 * differentials makes the global values less inaccurate.
 */
void refresh_vm_stats(void)
{
	on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);

#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
	if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
	}
	if (z->zone_pgdat == NODE_DATA(numa_node_id()))
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
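
/*
 * Editor's illustrative note (not part of the original file): for an
 * allocation whose zonelist prefers node 0 but which is satisfied from a
 * zone on node 1, the zone on node 1 counts NUMA_MISS and the preferred
 * zone on node 0 counts NUMA_FOREIGN. Independently of that, if the
 * allocating CPU happens to sit on node 1, the satisfying zone also
 * counts NUMA_LOCAL, otherwise NUMA_OTHER.
 */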
#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}
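
/*
 * Editor's illustrative note (not part of the original file): the loop
 * above emits one /proc/buddyinfo line per populated zone, e.g. (values
 * made up):
 *
 *   Node 0, zone   Normal    217     84     31     12      4      1 ...
 *
 * with one column of free-block counts per allocation order.
 */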

struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx)

static char *vmstat_text[] = {
	/* Zoned VM counters */
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_slab",
	"nr_page_table_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_bounce",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
#endif
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		seq_printf(m,
			   "\n  pages free     %lu"
			   "\n        min      %lu"
			   "\n        low      %lu"
			   "\n        high     %lu"
			   "\n        active   %lu"
			   "\n        inactive %lu"
			   "\n        scanned  %lu (a: %lu i: %lu)"
			   "\n        spanned  %lu"
			   "\n        present  %lu",
			   zone->free_pages,
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->nr_active,
			   zone->nr_inactive,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
					zone_page_state(zone, i));

		seq_printf(m,
			   "\n        protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n  pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				if (pageset->pcp[j].count)
					break;
			}
			if (j == ARRAY_SIZE(pageset->pcp))
				continue;
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n    cpu: %i pcp: %i"
					   "\n              count: %i"
					   "\n              high:  %i"
					   "\n              batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_SMP
			seq_printf(m, "\n  vm stats threshold: %d",
					pageset->stat_threshold);
#endif
		}
		seq_printf(m,
			   "\n  all_unreclaimable: %u"
			   "\n  prev_priority:     %i"
			   "\n  temp_priority:     %i"
			   "\n  start_pfn:         %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->temp_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}
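
/*
 * Editor's illustrative note (not part of the original file): each
 * /proc/vmstat line is emitted by vmstat_show() as "<name> <value>",
 * e.g. (values made up):
 *
 *   nr_dirty 112
 *   pgpgin 846724
 */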

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };

int __init setup_vmstat(void)
{
	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);
	return 0;
}
module_init(setup_vmstat)
#endif