/*
 * Lockless get_user_pages_fast for powerpc
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#undef DEBUG

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

/*
 * The performance-critical leaf functions are made noinline, otherwise gcc
 * inlines everything into a single function, which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask, result;
        pte_t *ptep;

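        /*
         * Build the expected pte bits once: the pte must be present and
         * user-accessible (and writable, for a write).  _PAGE_SPECIAL is
         * folded into the mask but not into the expected result, so the
         * single compare below also rejects "special" ptes, which have
         * no struct page to pin.
         */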
        result = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                result |= _PAGE_RW;
        mask = result | _PAGE_SPECIAL;

        ptep = pte_offset_kernel(&pmd, addr);
        do {
                pte_t pte = *ptep;
                struct page *page;

                if ((pte_val(pte) & mask) != result)
                        return 0;
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
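                /*
                 * Grab a reference without holding the pte lock; this
                 * can fail only if the page's refcount is already zero,
                 * i.e. the page is being freed under us.
                 */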
                if (!page_cache_get_speculative(page))
                        return 0;
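                /*
                 * Re-read the pte now that we hold a reference: if it
                 * changed while we were acquiring the reference, the
                 * page may already have been unmapped, so drop the ref
                 * and give up (the caller falls back to the slow path).
                 */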
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_page(page);
                        return 0;
                }
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        return 1;
}

#ifdef CONFIG_HUGETLB_PAGE
static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
                unsigned long *addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long mask;
        unsigned long pte_end;
        struct page *head, *page;
        pte_t pte;
        int refs;

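        /*
         * One call handles at most one huge pte: clamp 'end' to the end
         * of the huge page that contains *addr.
         */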
        pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
        if (pte_end < end)
                end = pte_end;

        pte = *ptep;
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_val(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

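        /*
         * Record each constituent small page in the output array while
         * counting how many references we will need on the head page.
         */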
        refs = 0;
        head = pte_page(pte);
        page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (*addr += PAGE_SIZE, *addr != end);

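        /*
         * Take all the references in one go on the compound head, then
         * re-check the pte as in gup_pte_range(): if it changed under
         * us, undo everything we took and let the slow path sort it out.
         */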
        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }
        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                /* Drop the refs we took on the head page and fail */
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        return 1;
}
#endif /* CONFIG_HUGETLB_PAGE */

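/*
 * The mid-level walkers read each page-table entry once into a local
 * variable and never re-read it; an entry that is (or becomes) none
 * simply fails the walk, and the caller falls back to the slow path.
 */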
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = *pmdp;

                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd))
                        return 0;
                if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = *pudp;

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp;
        int psize, nr = 0;
        unsigned int shift;

        pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages,
                 write ? "write" : "read");

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;

        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        start, len)))
                goto slow_irqon;

        pr_debug(" aligned: %lx .. %lx\n", start, end);

#ifdef CONFIG_HUGETLB_PAGE
        /*
         * When hugetlb is enabled, we bail out on slice boundary
         * crossings so that we never have to deal with two different
         * page-table formats in a single walk (each slice has a single
         * page size).
         */
        if (addr < SLICE_LOW_TOP) {
                if (end > SLICE_LOW_TOP)
                        goto slow_irqon;

                if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
                             GET_LOW_SLICE_INDEX(end - 1)))
                        goto slow_irqon;
        } else {
                if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
                             GET_HIGH_SLICE_INDEX(end - 1)))
                        goto slow_irqon;
        }
#endif /* CONFIG_HUGETLB_PAGE */

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency.
         * Needs some instrumenting to determine the common sizes used by
         * important workloads (eg. DB2), and whether limiting the batch
         * size will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables from being freed on powerpc.
         *
         * So long as we atomically load page table pointers versus teardown,
         * we can follow the address down to the page and take a ref on it.
         */
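        /*
         * (The powerpc code of this era batches freed page-table pages
         * and releases them via RCU, so an interrupts-off section is
         * presumed to act as the read-side critical section here.)
         */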
        local_irq_disable();

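        /*
         * With hugetlb enabled, the slice checks above ensure the whole
         * range uses a single page size; without it, every slice has
         * the base page size.  Either way, one psize/shift lookup is
         * valid for the whole walk (the VM_BUG_ONs below re-check this).
         */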
        psize = get_slice_psize(mm, addr);
        shift = mmu_psize_defs[psize].shift;

#ifdef CONFIG_HUGETLB_PAGE
        if (unlikely(mmu_huge_psizes[psize])) {
                pte_t *ptep;
                unsigned long a = addr;
                unsigned long sz = 1UL << shift;
                struct hstate *hstate = size_to_hstate(sz);

                BUG_ON(!hstate);
                /*
                 * XXX: could be optimized to avoid hstate
                 * lookup entirely (just use shift)
                 */

                do {
                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
                        ptep = huge_pte_offset(mm, a);
                        pr_debug(" %016lx: huge ptep %p\n", a, ptep);
                        if (!ptep || !gup_huge_pte(ptep, hstate, &a, end,
                                                   write, pages, &nr))
                                goto slow;
                } while (a != end);
        } else
#endif /* CONFIG_HUGETLB_PAGE */
        {
                pgdp = pgd_offset(mm, addr);
                do {
                        pgd_t pgd = *pgdp;

                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
                        pr_debug(" %016lx: normal pgd %p\n", addr,
                                 (void *)pgd_val(pgd));
                        next = pgd_addr_end(addr, end);
                        if (pgd_none(pgd))
                                goto slow;
                        if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                                goto slow;
                } while (pgdp++, addr = next, addr != end);
        }
        local_irq_enable();

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

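        /*
         * Slow path: retry the remainder with the regular, mmap_sem-
         * protected get_user_pages().  The bare braces only scope 'ret'
         * so the labels can live after the fast-path return above.
         */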
        {
                int ret;

slow:
                local_irq_enable();
slow_irqon:
                pr_debug(" slow path ! nr = %d\n", nr);

                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /*
                 * Have to be a bit careful with return values: if the
                 * fast path already pinned some pages, report those even
                 * when the slow path fails; otherwise return its error.
                 */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}