[linux-block.git] / mm / compaction.c

/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include "internal.h"

/*
 * State for one compaction run over a single zone.
 *
 * Two scanners walk the zone towards each other. The migrate scanner
 * (migrate_pfn) begins at the start of the zone and collects pages to be
 * moved; the free scanner (free_pfn) begins at the end of the zone and
 * collects free pages to move them into. The run is complete once
 * free_pfn <= migrate_pfn.
 */
struct compact_control {
	/* Isolated free pages available as migration targets */
	struct list_head freepages;
	/* Pages taken off the LRU that are waiting to be migrated */
	struct list_head migratepages;
	/* Number of pages currently on freepages */
	unsigned long nr_freepages;
	/* Number of pages currently on migratepages */
	unsigned long nr_migratepages;
	/* Where isolate_freepages() resumes its search */
	unsigned long free_pfn;
	/* Where isolate_migratepages() resumes its search */
	unsigned long migrate_pfn;

	/* Isolated anon and file page counts, filled in by acct_isolated() */
	unsigned long nr_anon;
	unsigned long nr_file;

	/* Zone being compacted */
	struct zone *zone;
};

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long count = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		list_del(&page->lru);
		__free_page(page);
		count++;
	}

	return count;
}

/*
 * Isolate the free pages of one pageblock onto a private freelist.
 * Must hold zone->lock.
 *
 * The scan covers [blockpfn, blockpfn + pageblock_nr_pages), clipped to the
 * end of @zone. Every buddy page found is split into order-0 pages which are
 * added to @freelist. Returns the number of order-0 pages isolated.
 */
static unsigned long isolate_freepages_block(struct zone *zone,
				unsigned long blockpfn,
				struct list_head *freelist)
{
	unsigned long zone_limit, scan_end;
	int nr_taken = 0;
	struct page *cursor;

	/* Never scan past the end of the block or the end of the zone */
	zone_limit = zone->zone_start_pfn + zone->spanned_pages;
	scan_end = min(blockpfn + pageblock_nr_pages, zone_limit);

	/* Skip leading invalid PFNs so the page cursor starts on a usable one */
	while (blockpfn < scan_end && !pfn_valid_within(blockpfn))
		blockpfn++;
	cursor = pfn_to_page(blockpfn);

	/* Walk the PFN and the page cursor through the block in lockstep */
	for (; blockpfn < scan_end; blockpfn++, cursor++) {
		struct page *piece = cursor;
		int nr_split, idx;

		/* Only valid, free (buddy) pages are of interest */
		if (!pfn_valid_within(blockpfn) || !PageBuddy(piece))
			continue;

		/* Break the free page up and queue each order-0 piece */
		nr_split = split_free_page(piece);
		nr_taken += nr_split;
		for (idx = 0; idx < nr_split; idx++, piece++)
			list_add(&piece->lru, freelist);

		/*
		 * Step over the pages just taken; the loop increment
		 * accounts for the final one.
		 */
		if (nr_split) {
			blockpfn += nr_split - 1;
			cursor += nr_split - 1;
		}
	}

	return nr_taken;
}

/*
 * Decide whether the pageblock containing @page may be used as a source of
 * free pages to migrate into.
 *
 * MIGRATE_ISOLATE and MIGRATE_RESERVE blocks are never used, so that memory
 * hot-remove and the min_free_kbytes blocks are left alone. Otherwise the
 * block qualifies if @page is a free page of at least pageblock order, or
 * if the block is MIGRATE_MOVABLE.
 */
static bool suitable_migration_target(struct page *page)
{
	int type = get_pageblock_migratetype(page);

	if (type == MIGRATE_ISOLATE || type == MIGRATE_RESERVE)
		return false;

	return (PageBuddy(page) && page_order(page) >= pageblock_order) ||
		type == MIGRATE_MOVABLE;
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 *
 * The search walks backwards one pageblock at a time from cc->free_pfn and
 * stops one pageblock above cc->migrate_pfn, or as soon as at least
 * cc->nr_migratepages free pages are held. On return cc->freepages holds
 * the isolated pages, cc->nr_freepages is updated, and cc->free_pfn is set
 * to the highest block that yielded pages (or to the lower search bound if
 * none did).
 */
static void isolate_freepages(struct zone *zone,
				struct compact_control *cc)
{
	struct page *page;
	unsigned long high_pfn, low_pfn, pfn;
	unsigned long flags;
	/*
	 * Keep the running total in the same type as cc->nr_freepages and
	 * cc->nr_migratepages so it is neither narrowed on load nor
	 * compared across signedness in the loop condition below.
	 */
	unsigned long nr_freepages = cc->nr_freepages;
	struct list_head *freelist = &cc->freepages;

	pfn = cc->free_pfn;
	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
	high_pfn = low_pfn;

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
					pfn -= pageblock_nr_pages) {
		unsigned long isolated;

		if (!pfn_valid(pfn))
			continue;

		/*
		 * Check for overlapping nodes/zones. It's possible on some
		 * configurations to have a setup like
		 * node0 node1 node0
		 * i.e. it's possible that all pages within a zones range of
		 * pages do not belong to a single zone.
		 */
		page = pfn_to_page(pfn);
		if (page_zone(page) != zone)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(page))
			continue;

		/* Found a block suitable for isolating free pages from */
		isolated = isolate_freepages_block(zone, pfn, freelist);
		nr_freepages += isolated;

		/*
		 * Record the highest PFN we isolated pages from. When next
		 * looking for free pages, the search will restart here as
		 * page migration may have returned some pages to the allocator
		 */
		if (isolated)
			high_pfn = max(high_pfn, pfn);
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	/* split_free_page does not map the pages */
	list_for_each_entry(page, freelist, lru) {
		arch_alloc_page(page, 0);
		kernel_map_pages(page, 1, 1);
	}

	cc->free_pfn = high_pfn;
	cc->nr_freepages = nr_freepages;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
	struct page *page;
	unsigned int count[NR_LRU_LISTS] = { 0, };

	list_for_each_entry(page, &cc->migratepages, lru) {
		int lru = page_lru_base_type(page);
		count[lru]++;
	}

	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{

	unsigned long inactive, isolated;

	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
					zone_page_state(zone, NR_INACTIVE_ANON);
	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
					zone_page_state(zone, NR_ISOLATED_ANON);

	return isolated > inactive;
}

/*
 * Isolate all pages that can be migrated from the block pointed to by
 * the migrate scanner within compact_control.
 *
 * Scanning starts at cc->migrate_pfn (clamped to the start of the zone) and
 * stops at the end of the pageblock containing that PFN, or earlier once
 * cc->nr_migratepages reaches COMPACT_CLUSTER_MAX. cc->migrate_pfn is then
 * advanced to the first PFN that has not been examined.
 *
 * Returns cc->nr_migratepages. 0 is also returned, without scanning, when
 * the block would cross the free scanner, when the starting PFN is not
 * valid, or when a fatal signal is pending while waiting for the number of
 * isolated pages to drop.
 */
static unsigned long isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long low_pfn, end_pfn;
	struct list_head *migratelist = &cc->migratepages;

	/* Do not scan outside zone boundaries */
	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);

	/*
	 * Only scan within a pageblock boundary. low_pfn is not aligned when
	 * a previous call stopped early at COMPACT_CLUSTER_MAX, so round up
	 * from low_pfn + 1 to land on the end of the current block rather
	 * than the end of the following one.
	 */
	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

	/* Do not cross the free scanner or scan within a memory hole */
	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
		cc->migrate_pfn = end_pfn;
		return 0;
	}

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	/* Time to isolate some pages for migration */
	spin_lock_irq(&zone->lru_lock);
	for (; low_pfn < end_pfn; low_pfn++) {
		struct page *page;
		if (!pfn_valid_within(low_pfn))
			continue;

		/* Get the page and skip if free */
		page = pfn_to_page(low_pfn);
		if (PageBuddy(page))
			continue;

		/* Try isolate the page */
		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
			continue;

		/* Successfully isolated */
		del_page_from_lru_list(zone, page, page_lru(page));
		list_add(&page->lru, migratelist);
		mem_cgroup_del_lru(page);
		cc->nr_migratepages++;

		/*
		 * Avoid isolating too much. Step past the page just taken
		 * so the next call does not examine it again.
		 */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}
	}

	acct_isolated(zone, cc);

	spin_unlock_irq(&zone->lru_lock);
	cc->migrate_pfn = low_pfn;

	return cc->nr_migratepages;
}

/*
 * Migration callback that hands out a destination page from the pool of
 * isolated free pages in the compact_control passed via @data.
 *
 * The pool is refilled with isolate_freepages() when it is empty. Returns
 * NULL if no free page could be isolated. @migratepage and @result are not
 * used.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *target;

	/* Top up the pool when it has run dry */
	if (list_empty(&cc->freepages))
		isolate_freepages(cc->zone, cc);

	/* Nothing could be isolated, there is no page to hand out */
	if (list_empty(&cc->freepages))
		return NULL;

	/* Take the page at the head of the pool */
	target = list_entry(cc->freepages.next, struct page, lru);
	list_del(&target->lru);
	cc->nr_freepages--;

	return target;
}

/*
 * We cannot control nr_migratepages and nr_freepages fully when migration is
 * running as migrate_pages() has no knowledge of compact_control. When
 * migration is complete, we count the number of pages on the lists by hand.
 */
static void update_nr_listpages(struct compact_control *cc)
{
	int nr_migratepages = 0;
	int nr_freepages = 0;
	struct page *page;

	list_for_each_entry(page, &cc->migratepages, lru)
		nr_migratepages++;
	list_for_each_entry(page, &cc->freepages, lru)
		nr_freepages++;

	cc->nr_migratepages = nr_migratepages;
	cc->nr_freepages = nr_freepages;
}

/*
 * Report whether compaction of the zone should go on.
 *
 * Returns COMPACT_PARTIAL if the current task has a fatal signal pending,
 * COMPACT_COMPLETE once the free scanner has reached the migrate scanner,
 * and COMPACT_CONTINUE otherwise. @zone is not used.
 */
static int compact_finished(struct zone *zone,
						struct compact_control *cc)
{
	if (fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* The scanners have not met yet, there is more to do */
	if (cc->migrate_pfn < cc->free_pfn)
		return COMPACT_CONTINUE;

	return COMPACT_COMPLETE;
}

/*
 * Compact a single zone using the state in @cc.
 *
 * The migrate scanner is placed at the start of the zone and the free
 * scanner at the pageblock-aligned end. Batches of pages are then isolated
 * and migrated until compact_finished() says to stop. Any isolated free
 * pages left over are released before returning.
 *
 * Returns the final compact_finished() result.
 */
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;

	/* Setup to move all movable pages to the end of the zone */
	cc->migrate_pfn = zone->zone_start_pfn;
	cc->free_pfn = (cc->migrate_pfn + zone->spanned_pages) &
					~(pageblock_nr_pages - 1);

	migrate_prep_local();

	for (;;) {
		unsigned long nr_isolated, nr_failed;

		ret = compact_finished(zone, cc);
		if (ret != COMPACT_CONTINUE)
			break;

		/* Nothing to migrate from this block, try the next one */
		if (isolate_migratepages(zone, cc) == 0)
			continue;

		nr_isolated = cc->nr_migratepages;
		migrate_pages(&cc->migratepages, compaction_alloc,
						(unsigned long)cc, 0);

		/* Whatever is still on the list was not migrated */
		update_nr_listpages(cc);
		nr_failed = cc->nr_migratepages;

		count_vm_event(COMPACTBLOCKS);
		count_vm_events(COMPACTPAGES, nr_isolated - nr_failed);
		if (nr_failed)
			count_vm_events(COMPACTPAGEFAILED, nr_failed);

		if (list_empty(&cc->migratepages))
			continue;

		/* Return the pages that could not be migrated to the LRU */
		putback_lru_pages(&cc->migratepages);
		cc->nr_migratepages = 0;
	}

	/* Release free pages and check accounting */
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);

	return ret;
}

/* Compact all zones within a node */
static int compact_node(int nid)
{
	int zoneid;
	pg_data_t *pgdat;
	struct zone *zone;

	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
		return -EINVAL;
	pgdat = NODE_DATA(nid);

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct compact_control cc = {
			.nr_freepages = 0,
			.nr_migratepages = 0,
		};

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		compact_zone(zone, &cc);

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}

	return 0;
}

/* Compact all nodes in the system */
static int compact_nodes(void)
{
	int nid;

	for_each_online_node(nid)
		compact_node(nid);

	return COMPACT_COMPLETE;
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/*
 * This is the entry point for compacting all nodes via /proc/sys/vm.
 *
 * Any write triggers compaction of every online node; reads do nothing.
 * The buffer, length and position arguments are not used.
 *
 * Always returns 0. The status reported by compact_nodes() is a COMPACT_*
 * code rather than an errno, so it must not be handed back to the caller
 * as the result of the write.
 */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
/*
 * Store handler for the per-node "compact" attribute: compacts the node
 * identified by dev->id. The written data is ignored and the result of
 * compact_node() is not checked; @count is always returned.
 */
ssize_t sysfs_compact_node(struct sys_device *dev,
			struct sysdev_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	compact_node(nid);

	return count;
}
/* "compact" attribute: owner-writable only, no show method */
static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

/*
 * Create the "compact" attribute file for @node. Returns whatever
 * sysdev_create_file() returns.
 */
int compaction_register_node(struct node *node)
{
	int err;

	err = sysdev_create_file(&node->sysdev, &attr_compact);

	return err;
}

/* Remove the "compact" attribute file from @node. */
void compaction_unregister_node(struct node *node)
{
	/* No value to pass on: this function returns void */
	sysdev_remove_file(&node->sysdev, &attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
Commit	Line	Data
748446bb MG	1	/*
	2	* linux/mm/compaction.c
	3	*
	4	* Memory compaction for the reduction of external fragmentation. Note that
	5	* this heavily depends upon page migration to do all the real heavy
	6	* lifting
	7	*
	8	* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
	9	*/
	10	#include <linux/swap.h>
	11	#include <linux/migrate.h>
	12	#include <linux/compaction.h>
	13	#include <linux/mm_inline.h>
	14	#include <linux/backing-dev.h>
76ab0f53	15	#include <linux/sysctl.h>
ed4a6d7f	16	#include <linux/sysfs.h>
748446bb MG	17	#include "internal.h"
	18
	19	/*
	20	* compact_control is used to track pages being migrated and the free pages
	21	* they are being migrated to during memory compaction. The free_pfn starts
	22	* at the end of a zone and migrate_pfn begins at the start. Movable pages
	23	* are moved to the end of a zone during a compaction run and the run
	24	* completes when free_pfn <= migrate_pfn
	25	*/
	26	struct compact_control {
	27	struct list_head freepages; /* List of free pages to migrate to */
	28	struct list_head migratepages; /* List of pages being migrated */
	29	unsigned long nr_freepages; /* Number of isolated free pages */
	30	unsigned long nr_migratepages; /* Number of pages to migrate */
	31	unsigned long free_pfn; /* isolate_freepages search base */
	32	unsigned long migrate_pfn; /* isolate_migratepages search base */
	33
	34	/* Account for isolated anon and file pages */
	35	unsigned long nr_anon;
	36	unsigned long nr_file;
	37
	38	struct zone *zone;
	39	};
	40
	41	static unsigned long release_freepages(struct list_head *freelist)
	42	{
	43	struct page *page, *next;
	44	unsigned long count = 0;
	45
	46	list_for_each_entry_safe(page, next, freelist, lru) {
	47	list_del(&page->lru);
	48	__free_page(page);
	49	count++;
	50	}
	51
	52	return count;
	53	}
	54
	55	/* Isolate free pages onto a private freelist. Must hold zone->lock */
	56	static unsigned long isolate_freepages_block(struct zone *zone,
	57	unsigned long blockpfn,
	58	struct list_head *freelist)
	59	{
	60	unsigned long zone_end_pfn, end_pfn;
	61	int total_isolated = 0;
	62	struct page *cursor;
	63
	64	/* Get the last PFN we should scan for free pages at */
	65	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	66	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
	67
	68	/* Find the first usable PFN in the block to initialse page cursor */
	69	for (; blockpfn < end_pfn; blockpfn++) {
	70	if (pfn_valid_within(blockpfn))
	71	break;
	72	}
	73	cursor = pfn_to_page(blockpfn);
	74
	75	/* Isolate free pages. This assumes the block is valid */
	76	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
	77	int isolated, i;
	78	struct page *page = cursor;
	79
	80	if (!pfn_valid_within(blockpfn))
81	continue;
82
83	if (!PageBuddy(page))
84	continue;
85
86	/* Found a free page, break it into order-0 pages */
87	isolated = split_free_page(page);
88	total_isolated += isolated;
89	for (i = 0; i < isolated; i++) {
90	list_add(&page->lru, freelist);
91	page++;
92	}
93
94	/* If a page was split, advance to the end of it */
95	if (isolated) {
96	blockpfn += isolated - 1;
97	cursor += isolated - 1;
98	}
99	}
100
101	return total_isolated;
102	}
103
104	/* Returns true if the page is within a block suitable for migration to */
105	static bool suitable_migration_target(struct page *page)
106	{
107
108	int migratetype = get_pageblock_migratetype(page);
109
110	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
	111	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
112	return false;
113
114	/* If the page is a large free page, then allow migration */
115	if (PageBuddy(page) && page_order(page) >= pageblock_order)
116	return true;
117
118	/* If the block is MIGRATE_MOVABLE, allow migration */
119	if (migratetype == MIGRATE_MOVABLE)
120	return true;
121
122	/* Otherwise skip the block */
123	return false;
124	}
125
126	/*
127	* Based on information in the current compact_control, find blocks
128	* suitable for isolating free pages from and then isolate them.
129	*/
130	static void isolate_freepages(struct zone *zone,
131	struct compact_control *cc)
132	{
133	struct page *page;
134	unsigned long high_pfn, low_pfn, pfn;
135	unsigned long flags;
136	int nr_freepages = cc->nr_freepages;
137	struct list_head *freelist = &cc->freepages;
138
139	pfn = cc->free_pfn;
140	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
141	high_pfn = low_pfn;
142
143	/*
144	* Isolate free pages until enough are available to migrate the
145	* pages on cc->migratepages. We stop searching if the migrate
146	* and free page scanners meet or enough free pages are isolated.
147	*/
148	spin_lock_irqsave(&zone->lock, flags);
149	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
150	pfn -= pageblock_nr_pages) {
151	unsigned long isolated;
152
153	if (!pfn_valid(pfn))
154	continue;
155
156	/*
157	* Check for overlapping nodes/zones. It's possible on some
158	* configurations to have a setup like
159	* node0 node1 node0
160	* i.e. it's possible that all pages within a zones range of
161	* pages do not belong to a single zone.
162	*/
163	page = pfn_to_page(pfn);
164	if (page_zone(page) != zone)
165	continue;
166
167	/* Check the block is suitable for migration */
168	if (!suitable_migration_target(page))
169	continue;
170
171	/* Found a block suitable for isolating free pages from */
172	isolated = isolate_freepages_block(zone, pfn, freelist);
173	nr_freepages += isolated;
174
175	/*
176	* Record the highest PFN we isolated pages from. When next
177	* looking for free pages, the search will restart here as
178	* page migration may have returned some pages to the allocator
179	*/
180	if (isolated)
181	high_pfn = max(high_pfn, pfn);
182	}
183	spin_unlock_irqrestore(&zone->lock, flags);
184
185	/* split_free_page does not map the pages */
186	list_for_each_entry(page, freelist, lru) {
187	arch_alloc_page(page, 0);
188	kernel_map_pages(page, 1, 1);
189	}
190
191	cc->free_pfn = high_pfn;
192	cc->nr_freepages = nr_freepages;
193	}
194
195	/* Update the number of anon and file isolated pages in the zone */
	196	static void acct_isolated(struct zone *zone, struct compact_control *cc)
197	{
198	struct page *page;
199	unsigned int count[NR_LRU_LISTS] = { 0, };
200
201	list_for_each_entry(page, &cc->migratepages, lru) {
202	int lru = page_lru_base_type(page);
203	count[lru]++;
204	}
205
206	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
207	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
208	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
209	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
210	}
211
212	/* Similar to reclaim, but different enough that they don't share logic */
213	static bool too_many_isolated(struct zone *zone)
214	{
215
216	unsigned long inactive, isolated;
217
218	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
219	zone_page_state(zone, NR_INACTIVE_ANON);
220	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
221	zone_page_state(zone, NR_ISOLATED_ANON);
222
223	return isolated > inactive;
224	}
225
226	/*
227	* Isolate all pages that can be migrated from the block pointed to by
228	* the migrate scanner within compact_control.
229	*/
230	static unsigned long isolate_migratepages(struct zone *zone,
231	struct compact_control *cc)
232	{
233	unsigned long low_pfn, end_pfn;
234	struct list_head *migratelist = &cc->migratepages;
235
236	/* Do not scan outside zone boundaries */
237	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
238
239	/* Only scan within a pageblock boundary */
240	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
241
242	/* Do not cross the free scanner or scan within a memory hole */
	243	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
244	cc->migrate_pfn = end_pfn;
245	return 0;
246	}
247
248	/*
249	* Ensure that there are not too many pages isolated from the LRU
250	* list by either parallel reclaimers or compaction. If there are,
251	* delay for some time until fewer pages are isolated
252	*/
253	while (unlikely(too_many_isolated(zone))) {
254	congestion_wait(BLK_RW_ASYNC, HZ/10);
255
256	if (fatal_signal_pending(current))
257	return 0;
258	}
259
260	/* Time to isolate some pages for migration */
261	spin_lock_irq(&zone->lru_lock);
262	for (; low_pfn < end_pfn; low_pfn++) {
263	struct page *page;
264	if (!pfn_valid_within(low_pfn))
265	continue;
266
267	/* Get the page and skip if free */
268	page = pfn_to_page(low_pfn);
269	if (PageBuddy(page))
270	continue;
271
272	/* Try isolate the page */
273	if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
274	continue;
275
276	/* Successfully isolated */
277	del_page_from_lru_list(zone, page, page_lru(page));
278	list_add(&page->lru, migratelist);
279	mem_cgroup_del_lru(page);
280	cc->nr_migratepages++;
281
282	/* Avoid isolating too much */
283	if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
284	break;
285	}
286
287	acct_isolated(zone, cc);
288
289	spin_unlock_irq(&zone->lru_lock);
290	cc->migrate_pfn = low_pfn;
291
292	return cc->nr_migratepages;
293	}
294
295	/*
296	* This is a migrate-callback that "allocates" freepages by taking pages
297	* from the isolated freelists in the block we are migrating to.
298	*/
	299	static struct page *compaction_alloc(struct page *migratepage,
300	unsigned long data,
301	int **result)
302	{
	303	struct compact_control *cc = (struct compact_control *)data;
304	struct page *freepage;
305
306	/* Isolate free pages if necessary */
307	if (list_empty(&cc->freepages)) {
308	isolate_freepages(cc->zone, cc);
309
310	if (list_empty(&cc->freepages))
311	return NULL;
312	}
313
314	freepage = list_entry(cc->freepages.next, struct page, lru);
315	list_del(&freepage->lru);
316	cc->nr_freepages--;
317
318	return freepage;
319	}
320
321	/*
322	* We cannot control nr_migratepages and nr_freepages fully when migration is
323	* running as migrate_pages() has no knowledge of compact_control. When
324	* migration is complete, we count the number of pages on the lists by hand.
325	*/
326	static void update_nr_listpages(struct compact_control *cc)
327	{
328	int nr_migratepages = 0;
329	int nr_freepages = 0;
330	struct page *page;
331
332	list_for_each_entry(page, &cc->migratepages, lru)
333	nr_migratepages++;
334	list_for_each_entry(page, &cc->freepages, lru)
335	nr_freepages++;
336
337	cc->nr_migratepages = nr_migratepages;
338	cc->nr_freepages = nr_freepages;
339	}
340
341	static int compact_finished(struct zone *zone,
342	struct compact_control *cc)
343	{
344	if (fatal_signal_pending(current))
345	return COMPACT_PARTIAL;
346
347	/* Compaction run completes if the migrate and free scanner meet */
348	if (cc->free_pfn <= cc->migrate_pfn)
349	return COMPACT_COMPLETE;
350
351	return COMPACT_CONTINUE;
352	}
353
	354	static int compact_zone(struct zone *zone, struct compact_control *cc)
355	{
356	int ret;
357
358	/* Setup to move all movable pages to the end of the zone */
359	cc->migrate_pfn = zone->zone_start_pfn;
360	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
361	cc->free_pfn &= ~(pageblock_nr_pages-1);
362
363	migrate_prep_local();
364
365	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
366	unsigned long nr_migrate, nr_remaining;
367
368	if (!isolate_migratepages(zone, cc))
369	continue;
370
371	nr_migrate = cc->nr_migratepages;
372	migrate_pages(&cc->migratepages, compaction_alloc,
373	(unsigned long)cc, 0);
374	update_nr_listpages(cc);
375	nr_remaining = cc->nr_migratepages;
376
377	count_vm_event(COMPACTBLOCKS);
378	count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
379	if (nr_remaining)
380	count_vm_events(COMPACTPAGEFAILED, nr_remaining);
381
382	/* Release LRU pages not migrated */
383	if (!list_empty(&cc->migratepages)) {
384	putback_lru_pages(&cc->migratepages);
385	cc->nr_migratepages = 0;
386	}
387
388	}
389
390	/* Release free pages and check accounting */
391	cc->nr_freepages -= release_freepages(&cc->freepages);
392	VM_BUG_ON(cc->nr_freepages != 0);
393
394	return ret;
395	}
76ab0f53 MG	396
	397	/* Compact all zones within a node */
	398	static int compact_node(int nid)
	399	{
	400	int zoneid;
	401	pg_data_t *pgdat;
	402	struct zone *zone;
	403
	404	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
	405	return -EINVAL;
	406	pgdat = NODE_DATA(nid);
	407
	408	/* Flush pending updates to the LRU lists */
	409	lru_add_drain_all();
	410
	411	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
	412	struct compact_control cc = {
	413	.nr_freepages = 0,
	414	.nr_migratepages = 0,
	415	};
	416
	417	zone = &pgdat->node_zones[zoneid];
	418	if (!populated_zone(zone))
	419	continue;
	420
	421	cc.zone = zone;
	422	INIT_LIST_HEAD(&cc.freepages);
	423	INIT_LIST_HEAD(&cc.migratepages);
	424
	425	compact_zone(zone, &cc);
	426
	427	VM_BUG_ON(!list_empty(&cc.freepages));
	428	VM_BUG_ON(!list_empty(&cc.migratepages));
	429	}
	430
	431	return 0;
	432	}
	433
	434	/* Compact all nodes in the system */
	435	static int compact_nodes(void)
	436	{
	437	int nid;
	438
	439	for_each_online_node(nid)
	440	compact_node(nid);
	441
	442	return COMPACT_COMPLETE;
	443	}
	444
	445	/* The written value is actually unused, all memory is compacted */
	446	int sysctl_compact_memory;
	447
	448	/* This is the entry point for compacting all nodes via /proc/sys/vm */
	449	int sysctl_compaction_handler(struct ctl_table *table, int write,
	450	void __user *buffer, size_t *length, loff_t *ppos)
	451	{
	452	if (write)
	453	return compact_nodes();
	454
	455	return 0;
	456	}
ed4a6d7f MG	457
	458	#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
	459	ssize_t sysfs_compact_node(struct sys_device *dev,
	460	struct sysdev_attribute *attr,
	461	const char *buf, size_t count)
	462	{
	463	compact_node(dev->id);
	464
	465	return count;
	466	}
	467	static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
	468
	469	int compaction_register_node(struct node *node)
	470	{
	471	return sysdev_create_file(&node->sysdev, &attr_compact);
	472	}
	473
	474	void compaction_unregister_node(struct node *node)
	475	{
	476	return sysdev_remove_file(&node->sysdev, &attr_compact);
	477	}
	478	#endif /* CONFIG_SYSFS && CONFIG_NUMA */