[linux-2.6-block.git] / mm / swap_state.c

/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
 * future use of radix_tree tags in the swap cache.
 *
 * swap_aops is the operations table that swapper_space points at through
 * its a_ops member.  Every handler named here is defined outside this file.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.sync_page	= block_sync_page,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.migratepage	= migrate_page,
};

/*
 * Backing-device description referenced by swapper_space.backing_dev_info.
 *
 * NOTE(review): going by the flag names, swap pages are excluded from dirty
 * accounting and from regular writeback; confirm against backing-dev.h.
 */
static struct backing_dev_info swap_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
	.unplug_io_fn	= swap_unplug_io_fn,
};

/*
 * The single address_space that stands in for the swap cache.
 *
 * Pages are stored in page_tree indexed by their swap entry value
 * (entry.val), and the functions in this file take tree_lock for writing
 * around every insertion and removal.
 */
struct address_space swapper_space = {
	/* Gfp mask handed to the radix tree for its own node allocations. */
	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
	.a_ops		= &swap_aops,
	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
	.backing_dev_info = &swap_backing_dev_info,
};

/*
 * Bump one of the swap cache statistics counters below.  This is a plain
 * increment; the macro itself takes no lock.
 */
#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

/*
 * Swap cache statistics, reported by show_swap_cache_info().
 */
static struct {
	unsigned long add_total;	/* pages successfully added to the swap cache */
	unsigned long del_total;	/* pages removed by __delete_from_swap_cache() */
	unsigned long find_success;	/* lookup_swap_cache() calls that found a page */
	unsigned long find_total;	/* all lookup_swap_cache() calls */
	unsigned long noent_race;	/* adds abandoned because swap_duplicate() failed */
	unsigned long exist_race;	/* adds that failed with -EEXIST */
} swap_cache_info;

/*
 * Print the swap cache statistics collected in swap_cache_info, followed by
 * the free and total swap sizes converted from pages to kilobytes.
 */
void show_swap_cache_info(void)
{
	/* Shifting a page count left by this amount yields kilobytes. */
	const int pages_to_kb = PAGE_SHIFT - 10;

	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
		swap_cache_info.add_total,
		swap_cache_info.del_total,
		swap_cache_info.find_success,
		swap_cache_info.find_total,
		swap_cache_info.noent_race,
		swap_cache_info.exist_race);
	printk("Free swap  = %lukB\n", nr_swap_pages << pages_to_kb);
	printk("Total swap = %lukB\n", total_swap_pages << pages_to_kb);
}

/*
 * __add_to_swap_cache is the swap cache counterpart of add_to_page_cache
 * on swapper_space: rather than filling in mapping and index it sets the
 * SwapCache flag and stores the swap entry value in the page's private
 * field.
 *
 * The page must be locked, must not already be in the swap cache and must
 * not have PagePrivate set.  On success an extra reference is taken on the
 * page and the swap cache accounting is updated.
 *
 * Returns 0 on success, otherwise the error reported by
 * radix_tree_preload() or radix_tree_insert().
 */
static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
			       gfp_t gfp_mask)
{
	int ret;

	BUG_ON(!PageLocked(page));
	BUG_ON(PageSwapCache(page));
	BUG_ON(PagePrivate(page));

	ret = radix_tree_preload(gfp_mask);
	if (ret)
		return ret;

	write_lock_irq(&swapper_space.tree_lock);
	ret = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
	if (ret == 0) {
		/* The swap cache now holds its own reference on the page. */
		page_cache_get(page);
		SetPageSwapCache(page);
		set_page_private(page, entry.val);
		total_swapcache_pages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
	}
	write_unlock_irq(&swapper_space.tree_lock);
	radix_tree_preload_end();

	return ret;
}

/*
 * Take a reference on the swap entry, lock the (so far unlocked) page and
 * insert it into the swap cache.
 *
 * Returns 0 with the page left locked on success.  Returns -ENOENT when
 * swap_duplicate() refuses the entry; any other failure comes from
 * __add_to_swap_cache(), in which case the page is unlocked again and the
 * swap reference is dropped.
 *
 * The page is not put on the LRU here; read_swap_cache_async() does that
 * after a successful return.
 */
static int add_to_swap_cache(struct page *page, swp_entry_t entry,
				gfp_t gfp_mask)
{
	int ret;

	BUG_ON(PageLocked(page));

	if (!swap_duplicate(entry)) {
		INC_CACHE_INFO(noent_race);
		return -ENOENT;
	}

	SetPageLocked(page);
	ret = __add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL);
	if (!ret) {
		INC_CACHE_INFO(add_total);
		return 0;
	}

	/* Undo the lock and the swap reference taken above. */
	ClearPageLocked(page);
	swap_free(entry);
	if (ret == -EEXIST) {
		INC_CACHE_INFO(exist_race);
	}
	return ret;
}

/*
 * Remove a page from the swap cache radix tree and undo the swap cache
 * bookkeeping done when it was added.
 *
 * Only call this on a page already verified to be in the swap cache.  The
 * page must be locked, must not be under writeback and must not have
 * PagePrivate set.  No lock is taken here: delete_from_swap_cache() wraps
 * the call in swapper_space.tree_lock.
 */
void __delete_from_swap_cache(struct page *page)
{
	unsigned long swap_key;

	BUG_ON(!PageLocked(page));
	BUG_ON(!PageSwapCache(page));
	BUG_ON(PageWriteback(page));
	BUG_ON(PagePrivate(page));

	/* The swap entry value stored in private is the radix tree index. */
	swap_key = page_private(page);
	radix_tree_delete(&swapper_space.page_tree, swap_key);
	set_page_private(page, 0);
	ClearPageSwapCache(page);

	total_swapcache_pages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 * @gfp_mask: allocation flags for the radix-tree preload
 *
 * Allocate a swap entry for the page and insert the page into the swap
 * cache under that entry.  Caller needs to hold the page lock.
 *
 * Returns 1 when the page was added (it is then marked uptodate and
 * dirty), 0 when no swap entry was available or the insertion failed.
 */
int add_to_swap(struct page * page, gfp_t gfp_mask)
{
	swp_entry_t entry;
	int err;

	BUG_ON(!PageLocked(page));

	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			return 0;

		/*
		 * Radix-tree node allocations from PF_MEMALLOC contexts could
		 * completely exhaust the page allocator. __GFP_NOMEMALLOC
		 * stops emergency reserves from being allocated.
		 *
		 * TODO: this could cause a theoretical memory reclaim
		 * deadlock in the swap out path.
		 */
		err = __add_to_swap_cache(page, entry,
				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);

		if (!err) {
			/* In the swap cache now: mark it uptodate and dirty. */
			SetPageUptodate(page);
			SetPageDirty(page);
			INC_CACHE_INFO(add_total);
			return 1;
		}

		if (err == -EEXIST) {
			/*
			 * Raced with "speculative" read_swap_cache_async:
			 * give the entry back and try again with a new one.
			 */
			INC_CACHE_INFO(exist_race);
			swap_free(entry);
			continue;
		}

		/* -ENOMEM radix-tree allocation failure */
		swap_free(entry);
		return 0;
	}
}

/*
 * Remove a page from the swap cache and release the swap entry and the
 * page reference that the swap cache held for it.
 *
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;

	/* Read the entry first: __delete_from_swap_cache() zeroes private. */
	entry.val = page_private(page);

	write_lock_irq(&swapper_space.tree_lock);
	__delete_from_swap_cache(page);
	write_unlock_irq(&swapper_space.tree_lock);

	swap_free(entry);
	/* Pairs with the page_cache_get() in __add_to_swap_cache(). */
	page_cache_release(page);
}

/*
 * Strange swizzling function only for use by shmem_writepage.
 *
 * Inserts the page into the swap cache under @entry and, if that worked,
 * takes it out of the page cache, drops the page cache's reference, takes
 * an additional reference on the swap entry and marks the page dirty.
 *
 * Returns 0 on success or the error from __add_to_swap_cache().
 */
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);

	if (err) {
		if (err == -EEXIST) {
			INC_CACHE_INFO(exist_race);
		}
		return err;
	}

	remove_from_page_cache(page);
	page_cache_release(page);	/* pagecache ref */
	/* Failing to duplicate the entry here is treated as a fatal bug. */
	if (!swap_duplicate(entry))
		BUG();
	SetPageDirty(page);
	INC_CACHE_INFO(add_total);
	return 0;
}

/*
 * Strange swizzling function for shmem_getpage (and shmem_unuse).
 *
 * Adds the page to @mapping at @index and, if that worked, removes it
 * from the swap cache.
 *
 * Returns 0 on success or the error from add_to_page_cache().
 */
int move_from_swap_cache(struct page *page, unsigned long index,
		struct address_space *mapping)
{
	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);

	if (err)
		return err;

	delete_from_swap_cache(page);

	/*
	 * Clear the dirty flag and set it again through set_page_dirty().
	 * The original note describes this as shifting the page from the
	 * clean to the dirty list; NOTE(review): presumably this lets the
	 * new mapping see the page being dirtied - confirm.
	 */
	ClearPageDirty(page);
	set_page_dirty(page);
	return 0;
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * PageSwapCache is tested here without holding the page lock.  The
 * original note (Marcelo) says that is fine because the check is repeated
 * under the lock further down; the function called once the lock is held
 * is remove_exclusive_swap_page().
 *
 * The lock is only tried, never waited for: if somebody else holds it the
 * page is left alone.
 */
static inline void free_swap_cache(struct page *page)
{
	if (!PageSwapCache(page))
		return;

	/* TestSetPageLocked() returns non-zero when the page was already locked. */
	if (TestSetPageLocked(page))
		return;

	remove_exclusive_swap_page(page);
	unlock_page(page);
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 *
 * free_swap_cache() makes its attempt only when the page is in the swap
 * cache and the page lock can be taken without waiting; afterwards one
 * reference on the page is released in every case.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	page_cache_release(page);
}

/*
 * Passed an array of @nr pages, try to drop each of them from the swap
 * cache and then release them.  They are removed from the LRU and freed if
 * this is their last use.
 *
 * The array is worked through in batches of at most PAGEVEC_SIZE pages.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **batch;
	int chunk;
	int i;

	lru_add_drain();

	for (batch = pages; nr; batch += chunk, nr -= chunk) {
		chunk = min(nr, PAGEVEC_SIZE);

		for (i = 0; i < chunk; i++)
			free_swap_cache(batch[i]);
		release_pages(batch, chunk, 0);
	}
}

/*
 * Look a swap entry up in the swap cache and account the attempt in the
 * find_total / find_success statistics.
 *
 * According to the original note a found page is returned unlocked and
 * with its refcount incremented, relying on the kernel lock to keep page
 * table operations atomic.  NOTE(review): that locking remark looks dated;
 * confirm against current callers.
 *
 * Returns whatever find_get_page() returned for the entry.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
	struct page *page = find_get_page(&swapper_space, entry.val);

	if (page) {
		INC_CACHE_INFO(find_success);
	}
	INC_CACHE_INFO(find_total);

	return page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 *
 * Returns the page already present in the swap cache, or a newly
 * allocated page that has been added to the swap cache (locked) with the
 * read started.  A failure (NULL) return means that either the page
 * allocation failed or that the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	for (;;) {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.  It is kept
		 * across retries, so it is allocated at most once.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Associate the page with swap entry in the swap cache.
		 * May fail (-ENOENT) if swap entry has been freed since
		 * our caller observed it.  May fail (-EEXIST) if there
		 * is already a page associated with this entry in the
		 * swap cache: added by a racing read_swap_cache_async,
		 * or by try_to_swap_out (or shmem_writepage) re-using
		 * the just freed swap entry for an existing page.
		 * May fail (-ENOMEM) if radix-tree node allocation failed.
		 */
		err = add_to_swap_cache(new_page, entry, gfp_mask);
		if (!err) {
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_active(new_page);
			swap_readpage(NULL, new_page);
			return new_page;
		}

		/* Only -ENOENT and -ENOMEM are final; anything else retries. */
		if (err == -ENOENT || err == -ENOMEM)
			break;
	}

	/* The page allocated here was not used: give it back. */
	if (new_page)
		page_cache_release(new_page);
	return found_page;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: allocation flags passed on to read_swap_cache_async()
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	unsigned long offset;
	unsigned long end_offset;
	int nr_pages;

	/*
	 * Get starting offset for readaround, and number of pages to read.
	 * Adjust starting address by readbehind (for NUMA interleave case)?
	 * No, it's very unlikely that swap layout would follow vma layout,
	 * more likely that neighbouring swap pages came from the same node:
	 * so use the same "addr" to choose the same node for each swap read.
	 */
	nr_pages = valid_swaphandles(entry, &offset);
	end_offset = offset + nr_pages;

	while (offset < end_offset) {
		/* Ok, do the async read-ahead now */
		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
						gfp_mask, vma, addr);
		if (!page)
			break;
		/* Only the queued read matters here, not the reference. */
		page_cache_release(page);
		offset++;
	}

	lru_add_drain();	/* Push any new pages onto the LRU now */
	return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* linux/mm/swap_state.c
	3	*
	4	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
	5	* Swap reorganised 29.12.95, Stephen Tweedie
	6	*
	7	* Rewritten to use page cache, (C) 1998 Stephen Tweedie
	8	*/
	9	#include <linux/module.h>
	10	#include <linux/mm.h>
	11	#include <linux/kernel_stat.h>
	12	#include <linux/swap.h>
46017e95	13	#include <linux/swapops.h>
1da177e4 LT	14	#include <linux/init.h>
	15	#include <linux/pagemap.h>
	16	#include <linux/buffer_head.h>
	17	#include <linux/backing-dev.h>
c484d410	18	#include <linux/pagevec.h>
b20a3503	19	#include <linux/migrate.h>
1da177e4 LT	20
	21	#include <asm/pgtable.h>
	22
	23	/*
	24	* swapper_space is a fiction, retained to simplify the path through
2706a1b8	25	* vmscan's shrink_page_list, to make sync_page look nicer, and to allow
1da177e4 LT	26	* future use of radix_tree tags in the swap cache.
1da177e4 LT	27	*/
f5e54d6e	28	static const struct address_space_operations swap_aops = {
1da177e4 LT	29	.writepage = swap_writepage,
	30	.sync_page = block_sync_page,
	31	.set_page_dirty = __set_page_dirty_nobuffers,
e965f963	32	.migratepage = migrate_page,
1da177e4 LT	33	};
	34
	35	static struct backing_dev_info swap_backing_dev_info = {
	36	.capabilities = BDI_CAP_NO_ACCT_DIRTY \| BDI_CAP_NO_WRITEBACK,
	37	.unplug_io_fn = swap_unplug_io_fn,
	38	};
	39
	40	struct address_space swapper_space = {
	41	.page_tree = RADIX_TREE_INIT(GFP_ATOMIC\|__GFP_NOWARN),
e4d91918	42	.tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
1da177e4 LT	43	.a_ops = &swap_aops,
	44	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
	45	.backing_dev_info = &swap_backing_dev_info,
	46	};
1da177e4 LT	47
	48	#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
	49
	50	static struct {
	51	unsigned long add_total;
	52	unsigned long del_total;
	53	unsigned long find_success;
	54	unsigned long find_total;
	55	unsigned long noent_race;
	56	unsigned long exist_race;
	57	} swap_cache_info;
	58
	59	void show_swap_cache_info(void)
	60	{
	61	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
	62	swap_cache_info.add_total, swap_cache_info.del_total,
	63	swap_cache_info.find_success, swap_cache_info.find_total,
	64	swap_cache_info.noent_race, swap_cache_info.exist_race);
	65	printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
	66	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
	67	}
	68
	69	/*
	70	* __add_to_swap_cache resembles add_to_page_cache on swapper_space,
	71	* but sets SwapCache flag and private instead of mapping and index.
	72	*/
9de75d11	73	static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
dd0fc66f	74	gfp_t gfp_mask)
1da177e4 LT	75	{
	76	int error;
	77
b55ed816	78	BUG_ON(!PageLocked(page));
1da177e4 LT	79	BUG_ON(PageSwapCache(page));
	80	BUG_ON(PagePrivate(page));
	81	error = radix_tree_preload(gfp_mask);
	82	if (!error) {
	83	write_lock_irq(&swapper_space.tree_lock);
	84	error = radix_tree_insert(&swapper_space.page_tree,
	85	entry.val, page);
	86	if (!error) {
	87	page_cache_get(page);
1da177e4	88	SetPageSwapCache(page);
4c21e2f2	89	set_page_private(page, entry.val);
1da177e4	90	total_swapcache_pages++;
347ce434	91	__inc_zone_page_state(page, NR_FILE_PAGES);
1da177e4 LT	92	}
	93	write_unlock_irq(&swapper_space.tree_lock);
	94	radix_tree_preload_end();
	95	}
	96	return error;
	97	}
	98
02098fea HD	99	static int add_to_swap_cache(struct page *page, swp_entry_t entry,
02098fea HD	100	gfp_t gfp_mask)
1da177e4 LT	101	{
	102	int error;
	103
b55ed816	104	BUG_ON(PageLocked(page));
1da177e4 LT	105	if (!swap_duplicate(entry)) {
	106	INC_CACHE_INFO(noent_race);
	107	return -ENOENT;
	108	}
b55ed816	109	SetPageLocked(page);
02098fea	110	error = __add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL);
1da177e4 LT	111	/*
	112	* Anon pages are already on the LRU, we don't run lru_cache_add here.
	113	*/
	114	if (error) {
b55ed816	115	ClearPageLocked(page);
1da177e4 LT	116	swap_free(entry);
	117	if (error == -EEXIST)
	118	INC_CACHE_INFO(exist_race);
	119	return error;
	120	}
	121	INC_CACHE_INFO(add_total);
	122	return 0;
	123	}
	124
	125	/*
	126	* This must be called only on pages that have
	127	* been verified to be in the swap cache.
	128	*/
	129	void __delete_from_swap_cache(struct page *page)
	130	{
	131	BUG_ON(!PageLocked(page));
	132	BUG_ON(!PageSwapCache(page));
	133	BUG_ON(PageWriteback(page));
3279ffd9	134	BUG_ON(PagePrivate(page));
1da177e4	135
4c21e2f2 HD	136	radix_tree_delete(&swapper_space.page_tree, page_private(page));
4c21e2f2 HD	137	set_page_private(page, 0);
1da177e4 LT	138	ClearPageSwapCache(page);
1da177e4 LT	139	total_swapcache_pages--;
347ce434	140	__dec_zone_page_state(page, NR_FILE_PAGES);
1da177e4 LT	141	INC_CACHE_INFO(del_total);
	142	}
	143
	144	/**
	145	* add_to_swap - allocate swap space for a page
	146	* @page: page we want to move to swap
	147	*
	148	* Allocate swap space for the page and add the page to the
	149	* swap cache. Caller needs to hold the page lock.
	150	*/
1480a540	151	int add_to_swap(struct page * page, gfp_t gfp_mask)
1da177e4 LT	152	{
1da177e4 LT	153	swp_entry_t entry;
1da177e4 LT	154	int err;
1da177e4 LT	155
e74ca2b4	156	BUG_ON(!PageLocked(page));
1da177e4 LT	157
	158	for (;;) {
	159	entry = get_swap_page();
	160	if (!entry.val)
	161	return 0;
	162
bd53b714 NP	163	/*
	164	* Radix-tree node allocations from PF_MEMALLOC contexts could
	165	* completely exhaust the page allocator. __GFP_NOMEMALLOC
	166	* stops emergency reserves from being allocated.
1da177e4	167	*
bd53b714 NP	168	* TODO: this could cause a theoretical memory reclaim
bd53b714 NP	169	* deadlock in the swap out path.
1da177e4	170	*/
1da177e4 LT	171	/*
	172	* Add it to the swap cache and mark it dirty
	173	*/
bd53b714	174	err = __add_to_swap_cache(page, entry,
1480a540	175	gfp_mask\|__GFP_NOMEMALLOC\|__GFP_NOWARN);
1da177e4 LT	176
	177	switch (err) {
	178	case 0: /* Success */
	179	SetPageUptodate(page);
	180	SetPageDirty(page);
	181	INC_CACHE_INFO(add_total);
	182	return 1;
	183	case -EEXIST:
	184	/* Raced with "speculative" read_swap_cache_async */
	185	INC_CACHE_INFO(exist_race);
	186	swap_free(entry);
	187	continue;
	188	default:
	189	/* -ENOMEM radix-tree allocation failure */
	190	swap_free(entry);
	191	return 0;
	192	}
	193	}
	194	}
	195
	196	/*
	197	* This must be called only on pages that have
	198	* been verified to be in the swap cache and locked.
	199	* It will never put the page into the free list,
	200	* the caller has a reference on the page.
	201	*/
	202	void delete_from_swap_cache(struct page *page)
	203	{
	204	swp_entry_t entry;
	205
4c21e2f2	206	entry.val = page_private(page);
1da177e4 LT	207
	208	write_lock_irq(&swapper_space.tree_lock);
	209	__delete_from_swap_cache(page);
	210	write_unlock_irq(&swapper_space.tree_lock);
	211
	212	swap_free(entry);
	213	page_cache_release(page);
	214	}
	215
	216	/*
	217	* Strange swizzling function only for use by shmem_writepage
	218	*/
	219	int move_to_swap_cache(struct page *page, swp_entry_t entry)
	220	{
	221	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
	222	if (!err) {
	223	remove_from_page_cache(page);
	224	page_cache_release(page); /* pagecache ref */
	225	if (!swap_duplicate(entry))
	226	BUG();
	227	SetPageDirty(page);
	228	INC_CACHE_INFO(add_total);
	229	} else if (err == -EEXIST)
	230	INC_CACHE_INFO(exist_race);
	231	return err;
	232	}
	233
	234	/*
	235	* Strange swizzling function for shmem_getpage (and shmem_unuse)
	236	*/
	237	int move_from_swap_cache(struct page *page, unsigned long index,
	238	struct address_space *mapping)
	239	{
	240	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
	241	if (!err) {
	242	delete_from_swap_cache(page);
	243	/* shift page from clean_pages to dirty_pages list */
	244	ClearPageDirty(page);
	245	set_page_dirty(page);
	246	}
	247	return err;
	248	}
	249
	250	/*
	251	* If we are the only user, then try to free up the swap cache.
	252	*
	253	* Its ok to check for PageSwapCache without the page lock
	254	* here because we are going to recheck again inside
	255	* exclusive_swap_page() _with_ the lock.
	256	* - Marcelo
	257	*/
	258	static inline void free_swap_cache(struct page *page)
	259	{
	260	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
	261	remove_exclusive_swap_page(page);
	262	unlock_page(page);
	263	}
	264	}
	265
	266	/*
	267	* Perform a free_page(), also freeing any swap cache associated with
b8072f09	268	* this page if it is the last user of the page.
1da177e4 LT	269	*/
	270	void free_page_and_swap_cache(struct page *page)
	271	{
	272	free_swap_cache(page);
	273	page_cache_release(page);
	274	}
	275
	276	/*
	277	* Passed an array of pages, drop them all from swapcache and then release
	278	* them. They are removed from the LRU and freed if this is their last use.
	279	*/
	280	void free_pages_and_swap_cache(struct page **pages, int nr)
	281	{
1da177e4 LT	282	struct page **pagep = pages;
	283
	284	lru_add_drain();
	285	while (nr) {
c484d410	286	int todo = min(nr, PAGEVEC_SIZE);
1da177e4 LT	287	int i;
	288
	289	for (i = 0; i < todo; i++)
	290	free_swap_cache(pagep[i]);
	291	release_pages(pagep, todo, 0);
	292	pagep += todo;
	293	nr -= todo;
	294	}
	295	}
	296
	297	/*
	298	* Lookup a swap entry in the swap cache. A found page will be returned
	299	* unlocked and with its refcount incremented - we rely on the kernel
	300	* lock getting page table operations atomic even if we drop the page
	301	* lock before returning.
	302	*/
	303	struct page * lookup_swap_cache(swp_entry_t entry)
	304	{
	305	struct page *page;
	306
	307	page = find_get_page(&swapper_space, entry.val);
	308
	309	if (page)
	310	INC_CACHE_INFO(find_success);
	311
	312	INC_CACHE_INFO(find_total);
	313	return page;
	314	}
	315
	316	/*
	317	* Locate a page of swap in physical memory, reserving swap cache space
	318	* and reading the disk if it is not already cached.
	319	* A failure return means that either the page allocation failed or that
	320	* the swap entry is no longer in use.
	321	*/
02098fea	322	struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
1da177e4 LT	323	struct vm_area_struct *vma, unsigned long addr)
	324	{
	325	struct page *found_page, *new_page = NULL;
	326	int err;
	327
	328	do {
	329	/*
	330	* First check the swap cache. Since this is normally
	331	* called after lookup_swap_cache() failed, re-calling
	332	* that would confuse statistics.
	333	*/
	334	found_page = find_get_page(&swapper_space, entry.val);
	335	if (found_page)
	336	break;
	337
	338	/*
	339	* Get a new page to read into from swap.
	340	*/
	341	if (!new_page) {
02098fea	342	new_page = alloc_page_vma(gfp_mask, vma, addr);
1da177e4 LT	343	if (!new_page)
	344	break; /* Out of memory */
	345	}
	346
	347	/*
	348	* Associate the page with swap entry in the swap cache.
	349	* May fail (-ENOENT) if swap entry has been freed since
	350	* our caller observed it. May fail (-EEXIST) if there
	351	* is already a page associated with this entry in the
	352	* swap cache: added by a racing read_swap_cache_async,
	353	* or by try_to_swap_out (or shmem_writepage) re-using
	354	* the just freed swap entry for an existing page.
	355	* May fail (-ENOMEM) if radix-tree node allocation failed.
	356	*/
02098fea	357	err = add_to_swap_cache(new_page, entry, gfp_mask);
1da177e4 LT	358	if (!err) {
	359	/*
	360	* Initiate read into locked page and return.
	361	*/
	362	lru_cache_add_active(new_page);
	363	swap_readpage(NULL, new_page);
	364	return new_page;
	365	}
	366	} while (err != -ENOENT && err != -ENOMEM);
	367
	368	if (new_page)
	369	page_cache_release(new_page);
	370	return found_page;
	371	}
46017e95 HD	372
	373	/**
	374	* swapin_readahead - swap in pages in hope we need them soon
	375	* @entry: swap entry of this memory
	376	* @vma: user vma this address belongs to
	377	* @addr: target address for mempolicy
	378	*
	379	* Returns the struct page for entry and addr, after queueing swapin.
	380	*
	381	* Primitive swap readahead code. We simply read an aligned block of
	382	* (1 << page_cluster) entries in the swap area. This method is chosen
	383	* because it doesn't cost us any seek time. We also make sure to queue
	384	* the 'original' request together with the readahead ones...
	385	*
	386	* This has been extended to use the NUMA policies from the mm triggering
	387	* the readahead.
	388	*
	389	* Caller must hold down_read on the vma->vm_mm if vma is not NULL.
	390	*/
02098fea	391	struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
46017e95 HD	392	struct vm_area_struct *vma, unsigned long addr)
	393	{
	394	int nr_pages;
	395	struct page *page;
	396	unsigned long offset;
	397	unsigned long end_offset;
	398
	399	/*
	400	* Get starting offset for readaround, and number of pages to read.
	401	* Adjust starting address by readbehind (for NUMA interleave case)?
	402	* No, it's very unlikely that swap layout would follow vma layout,
	403	* more likely that neighbouring swap pages came from the same node:
	404	* so use the same "addr" to choose the same node for each swap read.
	405	*/
	406	nr_pages = valid_swaphandles(entry, &offset);
	407	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
	408	/* Ok, do the async read-ahead now */
	409	page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
02098fea	410	gfp_mask, vma, addr);
46017e95 HD	411	if (!page)
	412	break;
	413	page_cache_release(page);
	414	}
	415	lru_add_drain(); /* Push any new pages onto the LRU now */
02098fea	416	return read_swap_cache_async(entry, gfp_mask, vma, addr);
46017e95	417	}