[linux-2.6-block.git] / kernel / profile.c

/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	William Irwin, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
 */

#include <linux/module.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/profile.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <asm/sections.h>
#include <asm/semaphore.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

/*
 * One entry of a per-cpu pending-hit hashtable: a profile buffer slot
 * index together with the number of hits not yet added to that slot.
 */
struct profile_hit {
	u32 pc;		/* prof_buffer index (already shifted, not a raw address) */
	u32 hits;	/* hits pending for that index */
};
/* log2 of the number of entries in one hashtable probe group */
#define PROFILE_GRPSHIFT	3
/* entries scanned linearly within one probe group */
#define PROFILE_GRPSZ		(1 << PROFILE_GRPSHIFT)
/* entries in one per-cpu hashtable (one page worth of struct profile_hit) */
#define NR_PROFILE_HIT		(PAGE_SIZE/sizeof(struct profile_hit))
/* probe groups in one per-cpu hashtable */
#define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)

/* Oprofile timer tick hook */
static int (*timer_hook)(struct pt_regs *) __read_mostly;

/* Global hit counters, indexed by (text address - _stext) >> prof_shift */
static atomic_t *prof_buffer;
/* Number of counters in prof_buffer, and the address-to-index shift */
static unsigned long prof_len, prof_shift;

/* Profiling type currently enabled; 0 means profiling is off */
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

/* CPUs whose ticks profile_tick() accounts; exposed via prof_cpu_mask in /proc */
static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
/* Per-cpu pair of pending-hit hashtables */
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
/* Per-cpu index (0 or 1) of the hashtable profile_hits() currently fills */
static DEFINE_PER_CPU(int, cpu_profile_flip);
/* Serializes flip requests */
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */

static int __init profile_setup(char * str)
{
	static char __initdata schedstr[] = "schedule";
	static char __initdata sleepstr[] = "sleep";
	static char __initdata kvmstr[] = "kvm";
	int par;

	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
		prof_on = SLEEP_PROFILING;
		if (str[strlen(sleepstr)] == ',')
			str += strlen(sleepstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel sleep profiling enabled (shift: %ld)\n",
			prof_shift);
#else
		printk(KERN_WARNING
			"kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
		prof_on = SCHED_PROFILING;
		if (str[strlen(schedstr)] == ',')
			str += strlen(schedstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel schedule profiling enabled (shift: %ld)\n",
			prof_shift);
	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
		prof_on = KVM_PROFILING;
		if (str[strlen(kvmstr)] == ',')
			str += strlen(kvmstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel KVM profiling enabled (shift: %ld)\n",
			prof_shift);
	} else if (get_option(&str, &par)) {
		prof_shift = par;
		prof_on = CPU_PROFILING;
		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
			prof_shift);
	}
	return 1;
}
__setup("profile=", profile_setup);


/*
 * Size and allocate the global hit-count buffer at boot.
 *
 * Does nothing unless profiling was enabled on the command line. If the
 * configured shift leaves no slots at all, profiling is switched off
 * instead of being left with an empty buffer.
 */
void __init profile_init(void)
{
	if (!prof_on)
		return;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;
	if (!prof_len) {
		/*
		 * With zero slots, "prof_len - 1" in profile_hits() would
		 * wrap and stop bounding the index into prof_buffer.
		 */
		printk(KERN_WARNING
			"profiling shift: %lu too large, profiling disabled\n",
			prof_shift);
		prof_on = 0;
		return;
	}
	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}

/* Profile event notifications */
 
#ifdef CONFIG_PROFILING
 
/* Blocking chain run by profile_task_exit() */
static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
/* Atomic chain run by profile_handoff_task() */
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
/* Blocking chain run by profile_munmap() */
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
 
/* Run the task-exit notifier chain, passing @task to each callback. */
void profile_task_exit(struct task_struct *task)
{
	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}
 
/*
 * Run the task-free notifier chain for @task.
 *
 * Returns 1 when the chain result is NOTIFY_OK, otherwise 0.
 */
int profile_handoff_task(struct task_struct *task)
{
	int status = atomic_notifier_call_chain(&task_free_notifier, 0, task);

	if (status == NOTIFY_OK)
		return 1;
	return 0;
}

/* Run the munmap notifier chain, passing @addr (cast to a pointer) as data. */
void profile_munmap(unsigned long addr)
{
	void *payload = (void *)addr;

	blocking_notifier_call_chain(&munmap_notifier, 0, payload);
}

/*
 * Add @n to the task-free notifier chain.
 * Returns the result of the chain registration.
 */
int task_handoff_register(struct notifier_block *n)
{
	int status = atomic_notifier_chain_register(&task_free_notifier, n);

	return status;
}

/*
 * Remove @n from the task-free notifier chain.
 * Returns the result of the chain unregistration.
 */
int task_handoff_unregister(struct notifier_block *n)
{
	int status = atomic_notifier_chain_unregister(&task_free_notifier, n);

	return status;
}

/*
 * Subscribe @n to the profile event selected by @type.
 *
 * Returns the chain registration result for PROFILE_TASK_EXIT and
 * PROFILE_MUNMAP, or -EINVAL for any other type.
 */
int profile_event_register(enum profile_type type, struct notifier_block *n)
{
	if (type == PROFILE_TASK_EXIT)
		return blocking_notifier_chain_register(&task_exit_notifier, n);
	if (type == PROFILE_MUNMAP)
		return blocking_notifier_chain_register(&munmap_notifier, n);
	return -EINVAL;
}

 
/*
 * Unsubscribe @n from the profile event selected by @type.
 *
 * Returns the chain unregistration result for PROFILE_TASK_EXIT and
 * PROFILE_MUNMAP, or -EINVAL for any other type.
 */
int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
	if (type == PROFILE_TASK_EXIT)
		return blocking_notifier_chain_unregister(&task_exit_notifier, n);
	if (type == PROFILE_MUNMAP)
		return blocking_notifier_chain_unregister(&munmap_notifier, n);
	return -EINVAL;
}

/*
 * Install @hook as the timer tick hook.
 *
 * Returns 0 on success, or -EBUSY if a hook is already installed.
 */
int register_timer_hook(int (*hook)(struct pt_regs *))
{
	if (!timer_hook) {
		timer_hook = hook;
		return 0;
	}
	return -EBUSY;
}

/*
 * Remove the timer tick hook.
 *
 * Warns if @hook is not the hook currently installed, but clears the
 * installed hook either way, then waits via synchronize_sched() before
 * returning.
 */
void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
	WARN_ON(hook != timer_hook);
	timer_hook = NULL;
	/* make sure all CPUs see the NULL hook */
	synchronize_sched();  /* Allow ongoing interrupts to complete. */
}

/* GPL-only exports: timer hook, task handoff and profile event notifier APIs */
EXPORT_SYMBOL_GPL(register_timer_hook);
EXPORT_SYMBOL_GPL(unregister_timer_hook);
EXPORT_SYMBOL_GPL(task_handoff_register);
EXPORT_SYMBOL_GPL(task_handoff_unregister);
EXPORT_SYMBOL_GPL(profile_event_register);
EXPORT_SYMBOL_GPL(profile_event_unregister);

#endif /* CONFIG_PROFILING */


#ifdef CONFIG_SMP
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * purpose of the second hashtable is to avoid the cacheline
 * contention that would otherwise occur while pending profile hits
 * are flushed (flushing is required for the reported hit counts to
 * be accurate), and which would resurrect the interrupt livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot
 * and hold the number of pending hits to that profile buffer slot on
 * a cpu in an entry. When the hashtable overflows, all pending hits
 * are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable emptied. As numerous pending hits
 * may be accounted to a profile buffer slot in a hashtable entry,
 * this amortizes a number of atomic profile buffer increments likely
 * to be far larger than the number of entries in the hashtable,
 * particularly given that the number of distinct profile buffer
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 * process context).
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable full of finite
 * collision chains, not just pairs of them.
 *
 * -- wli
 */
/*
 * Toggle the executing cpu's active hashtable index between 0 and 1.
 * Run on every cpu through on_each_cpu(); @unused is ignored.
 */
static void __profile_flip_buffers(void *unused)
{
	int this_cpu = smp_processor_id();
	int active = per_cpu(cpu_profile_flip, this_cpu);

	per_cpu(cpu_profile_flip, this_cpu) = !active;
}

/*
 * Flip every cpu to its other hashtable, then add the pending hits from
 * the tables that were active before the flip into prof_buffer and
 * clear them. Runs under profile_flip_mutex.
 *
 * The pre-flip index is read from the calling cpu only and applied to
 * every online cpu.
 */
static void profile_flip_buffers(void)
{
	int slot, old, cpu;

	mutex_lock(&profile_flip_mutex);
	old = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *table = per_cpu(cpu_profile_hits, cpu)[old];

		for (slot = 0; slot < NR_PROFILE_HIT; ++slot) {
			struct profile_hit *entry = &table[slot];

			if (entry->hits) {
				atomic_add(entry->hits, &prof_buffer[entry->pc]);
				entry->hits = entry->pc = 0;
			} else if (entry->pc) {
				/* stale index without hits: only write when needed */
				entry->pc = 0;
			}
		}
	}
	mutex_unlock(&profile_flip_mutex);
}

/*
 * Flip every cpu to its other hashtable and zero the tables that were
 * active before the flip, discarding their pending hits. Runs under
 * profile_flip_mutex.
 */
static void profile_discard_flip_buffers(void)
{
	int old, cpu;

	mutex_lock(&profile_flip_mutex);
	old = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *table = per_cpu(cpu_profile_hits, cpu)[old];

		memset(table, 0, NR_PROFILE_HIT * sizeof(*table));
	}
	mutex_unlock(&profile_flip_mutex);
}

/*
 * Record @nr_hits profile hits of kind @type at text address @__pc
 * (SMP version).
 *
 * Hits are buffered in the calling cpu's active hashtable. Only when the
 * probe sequence finds no usable entry are the new hits and the whole
 * table added to prof_buffer with atomic_add() and the table emptied.
 */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	/* ignore hits of a type that is not enabled, or when no buffer exists */
	if (prof_on != type || !prof_buffer)
		return;
	/* turn the address into a prof_buffer index, capped at the last slot */
	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	/* first entry of the starting probe group */
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	/* stride, in entries, between successive probe groups */
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	/* this cpu has no active hashtable: drop the hit */
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				/* entry already tracks this index: coalesce */
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				/* entry holds no pending hits: claim it */
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		/* step to the next probe group, wrapping at the table size */
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}

/*
 * CPU hotplug notifier for the per-cpu hit hashtables.
 *
 * CPU_UP_PREPARE:            reset the flip index and allocate whichever
 *                            of the two tables is missing; NOTIFY_BAD if
 *                            an allocation fails.
 * CPU_ONLINE:                add the cpu to prof_cpu_mask.
 * CPU_UP_CANCELED, CPU_DEAD: remove the cpu from prof_cpu_mask and free
 *                            both tables.
 * The *_FROZEN variants are handled the same way. Returns NOTIFY_OK
 * unless noted otherwise.
 */
static int __devinit profile_cpu_callback(struct notifier_block *info,
					unsigned long action, void *__cpu)
{
	int cpu = (unsigned long)__cpu;
	int node, idx;
	struct page *page;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		node = cpu_to_node(cpu);
		per_cpu(cpu_profile_flip, cpu) = 0;
		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
			if (!page)
				return NOTIFY_BAD;
			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
		}
		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
			if (!page) {
				/* second table failed: release table 1 again */
				page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
				per_cpu(cpu_profile_hits, cpu)[1] = NULL;
				__free_page(page);
				return NOTIFY_BAD;
			}
			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
		}
		break;
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		cpu_set(cpu, prof_cpu_mask);
		break;
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		cpu_clear(cpu, prof_cpu_mask);
		for (idx = 0; idx < 2; idx++) {
			if (!per_cpu(cpu_profile_hits, cpu)[idx])
				continue;
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[idx]);
			per_cpu(cpu_profile_hits, cpu)[idx] = NULL;
			__free_page(page);
		}
		break;
	}
	return NOTIFY_OK;
}
#else /* !CONFIG_SMP */
/* Without CONFIG_SMP the flip helpers are no-ops and there is no cpu callback. */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)
#define profile_cpu_callback		NULL

/*
 * Record @nr_hits profile hits of kind @type at text address @__pc
 * (uniprocessor version): the hits are added straight to prof_buffer.
 */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long slot;

	if (prof_on != type || !prof_buffer)
		return;

	slot = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	/* indexes past the end are accounted to the last slot */
	if (slot > prof_len - 1)
		slot = prof_len - 1;
	atomic_add(nr_hits, &prof_buffer[slot]);
}
#endif /* !CONFIG_SMP */

/* GPL-only export of profile_hits() (both the SMP and the UP definition) */
EXPORT_SYMBOL_GPL(profile_hits);

/*
 * Per-tick profiling entry point.
 *
 * For CPU_PROFILING, calls the registered timer hook if there is one.
 * Then, if the interrupted context was not user mode and the current
 * cpu is in prof_cpu_mask, records a hit at the interrupted pc.
 */
void profile_tick(int type)
{
	struct pt_regs *regs = get_irq_regs();

	if (type == CPU_PROFILING && timer_hook)
		timer_hook(regs);

	if (user_mode(regs))
		return;
	if (!cpu_isset(smp_processor_id(), prof_cpu_mask))
		return;
	profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>

/*
 * /proc read handler: format the cpumask that @data points to into
 * @page, followed by a newline.
 *
 * Returns the number of bytes written, or -EINVAL when fewer than two
 * bytes of @count remain after the mask.
 */
static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	cpumask_t *mask = (cpumask_t *)data;
	int written = cpumask_scnprintf(page, count, *mask);

	if (count - written < 2)
		return -EINVAL;
	return written + sprintf(page + written, "\n");
}

/*
 * /proc write handler: parse a cpumask from the user buffer and store it
 * in the cpumask that @data points to.
 *
 * Returns @count on success, or the error from cpumask_parse_user(), in
 * which case the stored mask is left untouched.
 */
static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
					unsigned long count, void *data)
{
	cpumask_t *mask = (cpumask_t *)data;
	cpumask_t new_value;
	int err;	/* signed: holds the parser's error code */

	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		return err;

	*mask = new_value;
	return count;
}

/*
 * Create the "prof_cpu_mask" proc entry (mode 0600) under @root_irq_dir
 * and attach the read/write handlers for prof_cpu_mask. Silently does
 * nothing if the entry cannot be created.
 */
void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
	struct proc_dir_entry *entry;

	/* create /proc/irq/prof_cpu_mask */
	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
	if (!entry)
		return;

	entry->data = (void *)&prof_cpu_mask;
	entry->read_proc = prof_cpu_mask_read_proc;
	entry->write_proc = prof_cpu_mask_write_proc;
}

/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pos = *ppos;
	unsigned long total;
	ssize_t copied = 0;
	unsigned int sample_step = 1 << prof_shift;
	char *step_bytes = (char *)&sample_step;
	char *src;

	/* fold pending per-cpu hits into prof_buffer before reporting */
	profile_flip_buffers();

	/* the file is one sample_step word followed by prof_len counters */
	total = (prof_len+1)*sizeof(unsigned int);
	if (pos >= total)
		return 0;
	if (count > total - pos)
		count = total - pos;

	/* leading word: the sampling step, copied byte by byte */
	for (; pos < sizeof(unsigned int) && count > 0; pos++) {
		if (put_user(step_bytes[pos], buf))
			return -EFAULT;
		buf++;
		count--;
		copied++;
	}

	/* remainder comes from prof_buffer, offset by the leading word */
	src = (char *)prof_buffer + pos - sizeof(atomic_t);
	if (copy_to_user(buf, (void *)src, count))
		return -EFAULT;
	copied += count;
	*ppos += copied;
	return copied;
}

/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	extern int setup_profiling_timer (unsigned int multiplier);

	/* a write of exactly one int is taken as a profiling multiplier */
	if (count == sizeof(int)) {
		unsigned int mult;

		if (copy_from_user(&mult, buf, sizeof(mult)))
			return -EFAULT;
		if (setup_profiling_timer(mult))
			return -EINVAL;
	}
#endif
	/* any write resets the counters: drop pending hits, zero the buffer */
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}

/* File operations installed on the "profile" proc entry by create_proc_profile() */
static const struct file_operations proc_profile_operations = {
	.read		= read_profile,
	.write		= write_profile,
};

#ifdef CONFIG_SMP
/*
 * Empty callback: create_hash_tables() runs it on every cpu after
 * clearing prof_on and before freeing the hashtables.
 */
static void __init profile_nop(void *unused)
{
	/* intentionally empty */
}

/*
 * Allocate both pending-hit hashtables (one zeroed page each, on the
 * cpu's own node) for every online cpu.
 *
 * Returns 0 on success. On allocation failure, disables profiling,
 * frees every table allocated so far and returns -1.
 */
static int __init create_hash_tables(void)
{
	int cpu, slot;

	for_each_online_cpu(cpu) {
		int node = cpu_to_node(cpu);

		/* table 1 first, then table 0 */
		for (slot = 1; slot >= 0; slot--) {
			struct page *page;

			page = alloc_pages_node(node,
					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
					0);
			if (!page)
				goto out_cleanup;
			per_cpu(cpu_profile_hits, cpu)[slot]
					= (struct profile_hit *)page_address(page);
		}
	}
	return 0;

out_cleanup:
	prof_on = 0;
	smp_mb();
	/* run a no-op on every cpu and wait for it before freeing */
	on_each_cpu(profile_nop, NULL, 0, 1);
	for_each_online_cpu(cpu) {
		for (slot = 0; slot < 2; slot++) {
			struct profile_hit *table = per_cpu(cpu_profile_hits, cpu)[slot];

			if (!table)
				continue;
			per_cpu(cpu_profile_hits, cpu)[slot] = NULL;
			__free_page(virt_to_page(table));
		}
	}
	return -1;
}
#else
/* Without CONFIG_SMP there are no per-cpu hashtables: always evaluates to 0 */
#define create_hash_tables()			({ 0; })
#endif

/*
 * Initcall: when profiling is enabled, set up the per-cpu hashtables,
 * create the "profile" proc entry and register the cpu hotplug callback.
 *
 * Returns -1 if the hashtables cannot be created, otherwise 0 (also when
 * profiling is off or the proc entry cannot be created).
 */
static int __init create_proc_profile(void)
{
	struct proc_dir_entry *entry;

	if (!prof_on)
		return 0;
	if (create_hash_tables())
		return -1;

	entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
	if (!entry)
		return 0;

	entry->proc_fops = &proc_profile_operations;
	/* one leading sample-step word plus prof_len counters */
	entry->size = (1+prof_len) * sizeof(atomic_t);
	hotcpu_notifier(profile_cpu_callback, 0);
	return 0;
}
module_init(create_proc_profile);
#endif /* CONFIG_PROC_FS */
Commit	Line	Data
1da177e4 LT	1	/*
	2	* linux/kernel/profile.c
	3	* Simple profiling. Manages a direct-mapped profile hit count buffer,
	4	* with configurable resolution, support for restricting the cpus on
	5	* which profiling is done, and switching between cpu time and
	6	* schedule() calls via kernel command line parameters passed at boot.
	7	*
	8	* Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
	9	* Red Hat, July 2004
	10	* Consolidation of architecture support code for profiling,
	11	* William Irwin, Oracle, July 2004
	12	* Amortized hit count accounting via per-cpu open-addressed hashtables
	13	* to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
	14	*/
	15
1da177e4 LT	16	#include <linux/module.h>
	17	#include <linux/profile.h>
	18	#include <linux/bootmem.h>
	19	#include <linux/notifier.h>
	20	#include <linux/mm.h>
	21	#include <linux/cpumask.h>
	22	#include <linux/cpu.h>
	23	#include <linux/profile.h>
	24	#include <linux/highmem.h>
97d1f15b	25	#include <linux/mutex.h>
1da177e4 LT	26	#include <asm/sections.h>
1da177e4 LT	27	#include <asm/semaphore.h>
7d12e780	28	#include <asm/irq_regs.h>
e8edc6e0	29	#include <asm/ptrace.h>
1da177e4 LT	30
	31	struct profile_hit {
	32	u32 pc, hits;
	33	};
	34	#define PROFILE_GRPSHIFT 3
	35	#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
	36	#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
	37	#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
	38
	39	/* Oprofile timer tick hook */
b012d346	40	static int (timer_hook)(struct pt_regs ) __read_mostly;
1da177e4 LT	41
	42	static atomic_t *prof_buffer;
	43	static unsigned long prof_len, prof_shift;
07031e14	44
ece8a684	45	int prof_on __read_mostly;
07031e14 IM	46	EXPORT_SYMBOL_GPL(prof_on);
07031e14 IM	47
1da177e4 LT	48	static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
	49	#ifdef CONFIG_SMP
	50	static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
	51	static DEFINE_PER_CPU(int, cpu_profile_flip);
97d1f15b	52	static DEFINE_MUTEX(profile_flip_mutex);
1da177e4 LT	53	#endif /* CONFIG_SMP */
	54
	55	static int __init profile_setup(char * str)
	56	{
dfaa9c94	57	static char __initdata schedstr[] = "schedule";
ece8a684	58	static char __initdata sleepstr[] = "sleep";
07031e14	59	static char __initdata kvmstr[] = "kvm";
1da177e4 LT	60	int par;
1da177e4 LT	61
ece8a684	62	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
b3da2a73	63	#ifdef CONFIG_SCHEDSTATS
ece8a684 IM	64	prof_on = SLEEP_PROFILING;
	65	if (str[strlen(sleepstr)] == ',')
	66	str += strlen(sleepstr) + 1;
	67	if (get_option(&str, &par))
	68	prof_shift = par;
	69	printk(KERN_INFO
	70	"kernel sleep profiling enabled (shift: %ld)\n",
	71	prof_shift);
b3da2a73 MG	72	#else
	73	printk(KERN_WARNING
	74	"kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
	75	#endif /* CONFIG_SCHEDSTATS */
a75acf85	76	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
1da177e4	77	prof_on = SCHED_PROFILING;
dfaa9c94 WLII	78	if (str[strlen(schedstr)] == ',')
	79	str += strlen(schedstr) + 1;
	80	if (get_option(&str, &par))
	81	prof_shift = par;
	82	printk(KERN_INFO
	83	"kernel schedule profiling enabled (shift: %ld)\n",
	84	prof_shift);
07031e14 IM	85	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
	86	prof_on = KVM_PROFILING;
	87	if (str[strlen(kvmstr)] == ',')
	88	str += strlen(kvmstr) + 1;
	89	if (get_option(&str, &par))
	90	prof_shift = par;
	91	printk(KERN_INFO
	92	"kernel KVM profiling enabled (shift: %ld)\n",
	93	prof_shift);
dfaa9c94	94	} else if (get_option(&str, &par)) {
1da177e4 LT	95	prof_shift = par;
	96	prof_on = CPU_PROFILING;
	97	printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
	98	prof_shift);
	99	}
	100	return 1;
	101	}
	102	__setup("profile=", profile_setup);
	103
	104
	105	void __init profile_init(void)
	106	{
	107	if (!prof_on)
	108	return;
	109
	110	/* only text is profiled */
	111	prof_len = (_etext - _stext) >> prof_shift;
	112	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
	113	}
	114
	115	/* Profile event notifications */
	116
	117	#ifdef CONFIG_PROFILING
	118
e041c683 AS	119	static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
	120	static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
	121	static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
1da177e4 LT	122
	123	void profile_task_exit(struct task_struct * task)
	124	{
e041c683	125	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
1da177e4 LT	126	}
	127
	128	int profile_handoff_task(struct task_struct * task)
	129	{
	130	int ret;
e041c683	131	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
1da177e4 LT	132	return (ret == NOTIFY_OK) ? 1 : 0;
	133	}
	134
	135	void profile_munmap(unsigned long addr)
	136	{
e041c683	137	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
1da177e4 LT	138	}
	139
	140	int task_handoff_register(struct notifier_block * n)
	141	{
e041c683	142	return atomic_notifier_chain_register(&task_free_notifier, n);
1da177e4 LT	143	}
	144
	145	int task_handoff_unregister(struct notifier_block * n)
	146	{
e041c683	147	return atomic_notifier_chain_unregister(&task_free_notifier, n);
1da177e4 LT	148	}
	149
	150	int profile_event_register(enum profile_type type, struct notifier_block * n)
	151	{
	152	int err = -EINVAL;
	153
1da177e4 LT	154	switch (type) {
1da177e4 LT	155	case PROFILE_TASK_EXIT:
e041c683 AS	156	err = blocking_notifier_chain_register(
e041c683 AS	157	&task_exit_notifier, n);
1da177e4 LT	158	break;
1da177e4 LT	159	case PROFILE_MUNMAP:
e041c683 AS	160	err = blocking_notifier_chain_register(
e041c683 AS	161	&munmap_notifier, n);
1da177e4 LT	162	break;
	163	}
	164
1da177e4 LT	165	return err;
	166	}
	167
	168
	169	int profile_event_unregister(enum profile_type type, struct notifier_block * n)
	170	{
	171	int err = -EINVAL;
	172
1da177e4 LT	173	switch (type) {
1da177e4 LT	174	case PROFILE_TASK_EXIT:
e041c683 AS	175	err = blocking_notifier_chain_unregister(
e041c683 AS	176	&task_exit_notifier, n);
1da177e4 LT	177	break;
1da177e4 LT	178	case PROFILE_MUNMAP:
e041c683 AS	179	err = blocking_notifier_chain_unregister(
e041c683 AS	180	&munmap_notifier, n);
1da177e4 LT	181	break;
	182	}
	183
1da177e4 LT	184	return err;
	185	}
	186
	187	int register_timer_hook(int (hook)(struct pt_regs ))
	188	{
	189	if (timer_hook)
	190	return -EBUSY;
	191	timer_hook = hook;
	192	return 0;
	193	}
	194
	195	void unregister_timer_hook(int (hook)(struct pt_regs ))
	196	{
	197	WARN_ON(hook != timer_hook);
	198	timer_hook = NULL;
	199	/* make sure all CPUs see the NULL hook */
fbd568a3	200	synchronize_sched(); /* Allow ongoing interrupts to complete. */
1da177e4 LT	201	}
	202
	203	EXPORT_SYMBOL_GPL(register_timer_hook);
	204	EXPORT_SYMBOL_GPL(unregister_timer_hook);
	205	EXPORT_SYMBOL_GPL(task_handoff_register);
	206	EXPORT_SYMBOL_GPL(task_handoff_unregister);
cd5bfea2 PC	207	EXPORT_SYMBOL_GPL(profile_event_register);
cd5bfea2 PC	208	EXPORT_SYMBOL_GPL(profile_event_unregister);
1da177e4 LT	209
	210	#endif /* CONFIG_PROFILING */
	211
1da177e4 LT	212
	213	#ifdef CONFIG_SMP
	214	/*
	215	* Each cpu has a pair of open-addressed hashtables for pending
	216	* profile hits. read_profile() IPI's all cpus to request them
	217	* to flip buffers and flushes their contents to prof_buffer itself.
	218	* Flip requests are serialized by the profile_flip_mutex. The sole
	219	* use of having a second hashtable is for avoiding cacheline
	220	* contention that would otherwise happen during flushes of pending
	221	* profile hits required for the accuracy of reported profile hits
	222	* and so resurrect the interrupt livelock issue.
	223	*
	224	* The open-addressed hashtables are indexed by profile buffer slot
	225	* and hold the number of pending hits to that profile buffer slot on
	226	* a cpu in an entry. When the hashtable overflows, all pending hits
	227	* are accounted to their corresponding profile buffer slots with
	228	* atomic_add() and the hashtable emptied. As numerous pending hits
	229	* may be accounted to a profile buffer slot in a hashtable entry,
	230	* this amortizes a number of atomic profile buffer increments likely
	231	* to be far larger than the number of entries in the hashtable,
	232	* particularly given that the number of distinct profile buffer
	233	* positions to which hits are accounted during short intervals (e.g.
	234	* several seconds) is usually very small. Exclusion from buffer
	235	* flipping is provided by interrupt disablement (note that for
ece8a684 IM	236	* SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
ece8a684 IM	237	* process context).
1da177e4 LT	238	* The hash function is meant to be lightweight as opposed to strong,
	239	* and was vaguely inspired by ppc64 firmware-supported inverted
	240	* pagetable hash functions, but uses a full hashtable full of finite
	241	* collision chains, not just pairs of them.
	242	*
	243	* -- wli
	244	*/
	245	static void __profile_flip_buffers(void *unused)
	246	{
	247	int cpu = smp_processor_id();
	248
	249	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
	250	}
	251
	252	static void profile_flip_buffers(void)
	253	{
	254	int i, j, cpu;
	255
97d1f15b	256	mutex_lock(&profile_flip_mutex);
1da177e4 LT	257	j = per_cpu(cpu_profile_flip, get_cpu());
	258	put_cpu();
	259	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	260	for_each_online_cpu(cpu) {
	261	struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
	262	for (i = 0; i < NR_PROFILE_HIT; ++i) {
	263	if (!hits[i].hits) {
	264	if (hits[i].pc)
	265	hits[i].pc = 0;
	266	continue;
	267	}
	268	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
	269	hits[i].hits = hits[i].pc = 0;
	270	}
	271	}
97d1f15b	272	mutex_unlock(&profile_flip_mutex);
1da177e4 LT	273	}
	274
	275	static void profile_discard_flip_buffers(void)
	276	{
	277	int i, cpu;
	278
97d1f15b	279	mutex_lock(&profile_flip_mutex);
1da177e4 LT	280	i = per_cpu(cpu_profile_flip, get_cpu());
	281	put_cpu();
	282	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
	283	for_each_online_cpu(cpu) {
	284	struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
	285	memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	286	}
97d1f15b	287	mutex_unlock(&profile_flip_mutex);
1da177e4 LT	288	}
1da177e4 LT	289
ece8a684	290	void profile_hits(int type, void *__pc, unsigned int nr_hits)
1da177e4 LT	291	{
	292	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	293	int i, j, cpu;
	294	struct profile_hit *hits;
	295
	296	if (prof_on != type \|\| !prof_buffer)
	297	return;
	298	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	299	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	300	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	301	cpu = get_cpu();
	302	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	303	if (!hits) {
	304	put_cpu();
	305	return;
	306	}
ece8a684 IM	307	/*
	308	* We buffer the global profiler buffer into a per-CPU
	309	* queue and thus reduce the number of global (and possibly
	310	* NUMA-alien) accesses. The write-queue is self-coalescing:
	311	*/
1da177e4 LT	312	local_irq_save(flags);
	313	do {
	314	for (j = 0; j < PROFILE_GRPSZ; ++j) {
	315	if (hits[i + j].pc == pc) {
ece8a684	316	hits[i + j].hits += nr_hits;
1da177e4 LT	317	goto out;
	318	} else if (!hits[i + j].hits) {
	319	hits[i + j].pc = pc;
ece8a684	320	hits[i + j].hits = nr_hits;
1da177e4 LT	321	goto out;
	322	}
	323	}
	324	i = (i + secondary) & (NR_PROFILE_HIT - 1);
	325	} while (i != primary);
ece8a684 IM	326
	327	/*
	328	* Add the current hit(s) and flush the write-queue out
	329	* to the global buffer:
	330	*/
	331	atomic_add(nr_hits, &prof_buffer[pc]);
1da177e4 LT	332	for (i = 0; i < NR_PROFILE_HIT; ++i) {
	333	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
	334	hits[i].pc = hits[i].hits = 0;
	335	}
	336	out:
	337	local_irq_restore(flags);
	338	put_cpu();
	339	}
	340
9c7b216d	341	static int __devinit profile_cpu_callback(struct notifier_block *info,
1da177e4 LT	342	unsigned long action, void *__cpu)
	343	{
	344	int node, cpu = (unsigned long)__cpu;
	345	struct page *page;
	346
	347	switch (action) {
	348	case CPU_UP_PREPARE:
8bb78442	349	case CPU_UP_PREPARE_FROZEN:
1da177e4 LT	350	node = cpu_to_node(cpu);
	351	per_cpu(cpu_profile_flip, cpu) = 0;
	352	if (!per_cpu(cpu_profile_hits, cpu)[1]) {
fbd98167	353	page = alloc_pages_node(node,
4199cfa0	354	GFP_KERNEL \| __GFP_ZERO,
fbd98167	355	0);
1da177e4 LT	356	if (!page)
	357	return NOTIFY_BAD;
	358	per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
	359	}
	360	if (!per_cpu(cpu_profile_hits, cpu)[0]) {
fbd98167	361	page = alloc_pages_node(node,
4199cfa0	362	GFP_KERNEL \| __GFP_ZERO,
fbd98167	363	0);
1da177e4 LT	364	if (!page)
	365	goto out_free;
	366	per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
	367	}
	368	break;
	369	out_free:
	370	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	371	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	372	__free_page(page);
	373	return NOTIFY_BAD;
	374	case CPU_ONLINE:
8bb78442	375	case CPU_ONLINE_FROZEN:
1da177e4 LT	376	cpu_set(cpu, prof_cpu_mask);
	377	break;
	378	case CPU_UP_CANCELED:
8bb78442	379	case CPU_UP_CANCELED_FROZEN:
1da177e4	380	case CPU_DEAD:
8bb78442	381	case CPU_DEAD_FROZEN:
1da177e4 LT	382	cpu_clear(cpu, prof_cpu_mask);
	383	if (per_cpu(cpu_profile_hits, cpu)[0]) {
	384	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
	385	per_cpu(cpu_profile_hits, cpu)[0] = NULL;
	386	__free_page(page);
	387	}
	388	if (per_cpu(cpu_profile_hits, cpu)[1]) {
	389	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	390	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	391	__free_page(page);
	392	}
	393	break;
	394	}
	395	return NOTIFY_OK;
	396	}
1da177e4 LT	397	#else /* !CONFIG_SMP */
	398	#define profile_flip_buffers() do { } while (0)
	399	#define profile_discard_flip_buffers() do { } while (0)
02316067	400	#define profile_cpu_callback NULL
1da177e4	401
ece8a684	402	void profile_hits(int type, void *__pc, unsigned int nr_hits)
1da177e4 LT	403	{
	404	unsigned long pc;
	405
	406	if (prof_on != type \|\| !prof_buffer)
	407	return;
	408	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
ece8a684	409	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
1da177e4 LT	410	}
	411	#endif /* !CONFIG_SMP */
	412
bbe1a59b AM	413	EXPORT_SYMBOL_GPL(profile_hits);
bbe1a59b AM	414
7d12e780	415	void profile_tick(int type)
1da177e4	416	{
7d12e780 DH	417	struct pt_regs *regs = get_irq_regs();
7d12e780 DH	418
1da177e4 LT	419	if (type == CPU_PROFILING && timer_hook)
	420	timer_hook(regs);
	421	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
	422	profile_hit(type, (void *)profile_pc(regs));
	423	}
	424
	425	#ifdef CONFIG_PROC_FS
	426	#include <linux/proc_fs.h>
	427	#include <asm/uaccess.h>
	428	#include <asm/ptrace.h>
	429
	430	static int prof_cpu_mask_read_proc (char page, char *start, off_t off,
	431	int count, int eof, void data)
	432	{
	433	int len = cpumask_scnprintf(page, count, (cpumask_t )data);
	434	if (count - len < 2)
	435	return -EINVAL;
	436	len += sprintf(page + len, "\n");
	437	return len;
	438	}
	439
	440	static int prof_cpu_mask_write_proc (struct file file, const char __user buffer,
	441	unsigned long count, void *data)
	442	{
	443	cpumask_t mask = (cpumask_t )data;
	444	unsigned long full_count = count, err;
	445	cpumask_t new_value;
	446
01a3ee2b	447	err = cpumask_parse_user(buffer, count, new_value);
1da177e4 LT	448	if (err)
	449	return err;
	450
	451	*mask = new_value;
	452	return full_count;
	453	}
	454
	455	void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
	456	{
	457	struct proc_dir_entry *entry;
	458
	459	/* create /proc/irq/prof_cpu_mask */
	460	if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
	461	return;
1da177e4 LT	462	entry->data = (void *)&prof_cpu_mask;
	463	entry->read_proc = prof_cpu_mask_read_proc;
	464	entry->write_proc = prof_cpu_mask_write_proc;
	465	}
	466
	467	/*
	468	* This function accesses profiling information. The returned data is
	469	* binary: the sampling step and the actual contents of the profile
	470	* buffer. Use of the program readprofile is recommended in order to
	471	* get meaningful info out of these data.
	472	*/
	473	static ssize_t
	474	read_profile(struct file file, char __user buf, size_t count, loff_t *ppos)
	475	{
	476	unsigned long p = *ppos;
	477	ssize_t read;
	478	char * pnt;
	479	unsigned int sample_step = 1 << prof_shift;
	480
	481	profile_flip_buffers();
	482	if (p >= (prof_len+1)*sizeof(unsigned int))
	483	return 0;
	484	if (count > (prof_len+1)*sizeof(unsigned int) - p)
	485	count = (prof_len+1)*sizeof(unsigned int) - p;
	486	read = 0;
	487
	488	while (p < sizeof(unsigned int) && count > 0) {
064b022c HC	489	if (put_user(((char )(&sample_step)+p),buf))
064b022c HC	490	return -EFAULT;
1da177e4 LT	491	buf++; p++; count--; read++;
	492	}
	493	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	494	if (copy_to_user(buf,(void *)pnt,count))
	495	return -EFAULT;
	496	read += count;
	497	*ppos += read;
	498	return read;
	499	}
	500
	501	/*
	502	* Writing to /proc/profile resets the counters
	503	*
	504	* Writing a 'profiling multiplier' value into it also re-sets the profiling
	505	* interrupt frequency, on architectures that support this.
	506	*/
	507	static ssize_t write_profile(struct file file, const char __user buf,
	508	size_t count, loff_t *ppos)
	509	{
	510	#ifdef CONFIG_SMP
	511	extern int setup_profiling_timer (unsigned int multiplier);
	512
	513	if (count == sizeof(int)) {
	514	unsigned int multiplier;
	515
	516	if (copy_from_user(&multiplier, buf, sizeof(int)))
	517	return -EFAULT;
	518
	519	if (setup_profiling_timer(multiplier))
	520	return -EINVAL;
	521	}
	522	#endif
	523	profile_discard_flip_buffers();
	524	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	525	return count;
	526	}
	527
15ad7cdc	528	static const struct file_operations proc_profile_operations = {
1da177e4 LT	529	.read = read_profile,
	530	.write = write_profile,
	531	};
	532
	533	#ifdef CONFIG_SMP
	534	static void __init profile_nop(void *unused)
	535	{
	536	}
	537
	538	static int __init create_hash_tables(void)
	539	{
	540	int cpu;
	541
	542	for_each_online_cpu(cpu) {
	543	int node = cpu_to_node(cpu);
	544	struct page *page;
	545
fbd98167 CL	546	page = alloc_pages_node(node,
	547	GFP_KERNEL \| __GFP_ZERO \| GFP_THISNODE,
	548	0);
1da177e4 LT	549	if (!page)
	550	goto out_cleanup;
	551	per_cpu(cpu_profile_hits, cpu)[1]
	552	= (struct profile_hit *)page_address(page);
fbd98167 CL	553	page = alloc_pages_node(node,
	554	GFP_KERNEL \| __GFP_ZERO \| GFP_THISNODE,
	555	0);
1da177e4 LT	556	if (!page)
	557	goto out_cleanup;
	558	per_cpu(cpu_profile_hits, cpu)[0]
	559	= (struct profile_hit *)page_address(page);
	560	}
	561	return 0;
	562	out_cleanup:
	563	prof_on = 0;
d59dd462	564	smp_mb();
1da177e4 LT	565	on_each_cpu(profile_nop, NULL, 0, 1);
	566	for_each_online_cpu(cpu) {
	567	struct page *page;
	568
	569	if (per_cpu(cpu_profile_hits, cpu)[0]) {
	570	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
	571	per_cpu(cpu_profile_hits, cpu)[0] = NULL;
	572	__free_page(page);
	573	}
	574	if (per_cpu(cpu_profile_hits, cpu)[1]) {
	575	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	576	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	577	__free_page(page);
	578	}
	579	}
	580	return -1;
	581	}
	582	#else
	583	#define create_hash_tables() ({ 0; })
	584	#endif
	585
	586	static int __init create_proc_profile(void)
	587	{
	588	struct proc_dir_entry *entry;
	589
	590	if (!prof_on)
	591	return 0;
	592	if (create_hash_tables())
	593	return -1;
	594	if (!(entry = create_proc_entry("profile", S_IWUSR \| S_IRUGO, NULL)))
	595	return 0;
	596	entry->proc_fops = &proc_profile_operations;
	597	entry->size = (1+prof_len) * sizeof(atomic_t);
	598	hotcpu_notifier(profile_cpu_callback, 0);
	599	return 0;
	600	}
	601	module_init(create_proc_profile);
	602	#endif /* CONFIG_PROC_FS */