[linux-block.git] / mm / mmu_gather.c

#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Make the batch after tlb->active the new active batch.
 *
 * If a batch is already chained behind the active one it simply becomes
 * active. Otherwise a new page is allocated with GFP_NOWAIT | __GFP_NOWARN,
 * initialised as an empty batch of MAX_GATHER_BATCH entries, linked behind
 * the active batch and made active.
 *
 * Returns true when tlb->active was advanced, false when
 * MAX_GATHER_BATCH_COUNT batches have already been allocated or the page
 * allocation failed (tlb->active is left untouched in both cases).
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *cur = tlb->active;
	struct mmu_gather_batch *fresh;

	/* A batch is already chained behind the active one: switch to it. */
	if (cur->next != NULL) {
		tlb->active = cur->next;
		return true;
	}

	/* Refuse to grow the chain past the configured limit. */
	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	fresh = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (fresh == NULL)
		return false;

	fresh->max  = MAX_GATHER_BATCH;
	fresh->nr   = 0;
	fresh->next = NULL;
	tlb->batch_count++;

	/* Append the new batch behind the current one and activate it. */
	cur->next = fresh;
	tlb->active = fresh;

	return true;
}

/*
 * Release the pages gathered in the batch chain.
 *
 * Walks the chain starting at the embedded tlb->local batch and stops at the
 * end of the chain or at the first batch that holds no pages. Each visited
 * batch is passed to free_pages_and_swap_cache() and emptied. Afterwards
 * tlb->local is the active batch again; the chained batches themselves are
 * not freed here.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *cur = &tlb->local;

	while (cur != NULL && cur->nr != 0) {
		free_pages_and_swap_cache(cur->pages, cur->nr);
		cur->nr = 0;
		cur = cur->next;
	}

	tlb->active = &tlb->local;
}

/*
 * Free every batch chained behind the embedded tlb->local batch and
 * terminate the chain at tlb->local. The embedded batch itself is not freed.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *cur = tlb->local.next;

	while (cur != NULL) {
		/* Fetch the successor before the page backing @cur goes away. */
		struct mmu_gather_batch *following = cur->next;

		free_pages((unsigned long)cur, 0);
		cur = following;
	}

	tlb->local.next = NULL;
}

/*
 * Queue @page in the active gather batch.
 *
 * @tlb:       the mmu_gather to add to; tlb->end must be non-zero
 * @page:      the page to record
 * @page_size: only checked against tlb->page_size when
 *             CONFIG_MMU_GATHER_PAGE_SIZE is set
 *
 * Returns true when the page filled the active batch and no further batch
 * could be obtained from tlb_next_batch(), i.e. the caller has to force a
 * flush. Returns false otherwise.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
	struct mmu_gather_batch *cur;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
#endif

	/* Record the page first, then find out whether that filled the batch. */
	cur = tlb->active;
	cur->pages[cur->nr] = page;
	cur->nr++;

	/* Batch is full and there is no next batch to move on to. */
	if (cur->nr == cur->max && !tlb_next_batch(tlb))
		return true;

	/* tlb_next_batch() may have switched the active batch. */
	cur = tlb->active;
	VM_BUG_ON_PAGE(cur->nr > cur->max, page);

	return false;
}

#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Hand every table recorded in @batch to __tlb_remove_table(), in the order
 * they were added, and then free the page that holds the batch itself.
 */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
	int idx = 0;

	while (idx < batch->nr) {
		__tlb_remove_table(batch->tables[idx]);
		idx++;
	}

	free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/*
 * Callback handed to smp_call_function() by tlb_remove_table_sync_one().
 * The body is intentionally empty and @arg is unused.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function().
 */
static void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 *
	 * NOTE(review): the last argument (1) is presumably the "wait" flag,
	 * so the call only returns once the callback has run on the other
	 * CPUs - confirm against the smp_call_function() documentation.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/*
 * RCU callback registered by tlb_remove_table_free(): recover the table
 * batch that embeds @head and release it with __tlb_remove_table_free().
 */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *owner;

	owner = container_of(head, struct mmu_table_batch, rcu);
	__tlb_remove_table_free(owner);
}

/*
 * Defer the release of @batch: queue its embedded rcu_head with call_rcu()
 * so that tlb_remove_table_rcu() frees the batch later.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	struct rcu_head *head = &batch->rcu;

	call_rcu(head, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Variant built when CONFIG_MMU_GATHER_RCU_TABLE_FREE is not set: an empty
 * stub, so tlb_remove_table_one() can call it unconditionally.
 */
static void tlb_remove_table_sync_one(void) { }

/*
 * Variant built when CONFIG_MMU_GATHER_RCU_TABLE_FREE is not set: @batch is
 * released right away through __tlb_remove_table_free() instead of being
 * deferred with call_rcu().
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Used when tlb_remove_table() should imply TLB invalidates: if
 * tlb_needs_table_invalidate() says so, flush the TLB (without freeing
 * anything) through tlb_flush_mmu_tlbonly(); otherwise do nothing.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (!tlb_needs_table_invalidate())
		return;

	/*
	 * Invalidate page-table caches used by hardware walkers. Software
	 * walkers can still be in-flight afterwards, so freeing the pages
	 * still needs the RCU-sched wait.
	 */
	tlb_flush_mmu_tlbonly(tlb);
}

/*
 * Release a single page-table page without batching it.
 *
 * Called by tlb_remove_table() when no batch page could be allocated.
 * tlb_remove_table_sync_one() is issued first, and only afterwards is @table
 * handed to __tlb_remove_table().
 */
static void tlb_remove_table_one(void *table)
{
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}

/*
 * Release the pending table batch of @tlb, if there is one.
 *
 * The batch is passed to tlb_remove_table_free() after
 * tlb_table_invalidate() has run, and tlb->batch is cleared so that the next
 * tlb_remove_table() allocates a new batch. Does nothing when no batch is
 * pending.
 */
static void tlb_table_flush(struct mmu_gather *tlb)
{
	if (tlb->batch == NULL)
		return;

	tlb_table_invalidate(tlb);
	tlb_remove_table_free(tlb->batch);
	tlb->batch = NULL;
}

/*
 * Queue the page-table page @table for freeing.
 *
 * The table is appended to tlb->batch, which is allocated on first use with
 * GFP_NOWAIT | __GFP_NOWARN. When that allocation fails the table is released
 * immediately through tlb_table_invalidate() + tlb_remove_table_one(). A
 * batch that reaches MAX_TABLE_BATCH entries is flushed with
 * tlb_table_flush().
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch *cur = tlb->batch;

	if (cur == NULL) {
		cur = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		tlb->batch = cur;
		if (cur == NULL) {
			/* No storage for batching: fall back to freeing one table. */
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		cur->nr = 0;
	}

	cur->tables[cur->nr] = table;
	cur->nr++;

	if (cur->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

/*
 * Start with no table batch; tlb_remove_table() allocates one on first use.
 */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* Empty stub used when CONFIG_MMU_GATHER_TABLE_FREE is not set. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
/* Empty stub used when CONFIG_MMU_GATHER_TABLE_FREE is not set. */
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Release what has been gathered in @tlb so far.
 *
 * tlb_table_flush() runs first (an empty stub when
 * CONFIG_MMU_GATHER_TABLE_FREE is not set); the gathered pages are then
 * released with tlb_batch_pages_flush() unless CONFIG_MMU_GATHER_NO_GATHER
 * is set.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}

/*
 * Flush @tlb: call tlb_flush_mmu_tlbonly() first and only then release the
 * gathered tables and pages with tlb_flush_mmu_free().
 *
 * NOTE(review): tlb_flush_mmu_tlbonly() is defined outside this file;
 * presumably it performs the actual TLB invalidation - confirm in
 * <asm/tlb.h>.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/*
 * Common initialisation behind tlb_gather_mmu() and tlb_gather_mmu_fullmm().
 *
 * @tlb:    the mmu_gather structure to initialise
 * @mm:     the address space it will operate on
 * @fullmm: stored in tlb->fullmm
 *
 * Resets the gather state (embedded batch, table batch, page size, range)
 * and finally calls inc_tlb_flush_pending() on @mm; tlb_finish_mmu() issues
 * the matching dec_tlb_flush_pending().
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			     bool fullmm)
{
	tlb->fullmm = fullmm;
	tlb->mm = mm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	/* Only the embedded batch exists at first: empty, unchained, active. */
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->local.nr   = 0;
	tlb->local.next = NULL;
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;
	tlb->need_flush_all = 0;
#endif

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	tlb->page_size = 0;
#endif
	tlb_table_init(tlb);

	/* tlb->fullmm has been set above, before the range is reset. */
	__tlb_reset_range(tlb);
	inc_tlb_flush_pending(mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 *
 * Forwards to __tlb_gather_mmu() with fullmm set to false.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 *
 * Forwards to __tlb_gather_mmu() with fullmm set to true.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 *
 * Performs a final tlb_flush_mmu(), frees the batches chained behind the
 * embedded one (unless CONFIG_MMU_GATHER_NO_GATHER is set) and calls
 * dec_tlb_flush_pending() on tlb->mm.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * If there are parallel threads doing PTE changes on the same range
	 * under a non-exclusive lock (e.g., mmap_lock read-side) but deferring
	 * the TLB flush by batching, one thread may end up seeing inconsistent
	 * PTEs, resulting in stale TLB entries.  So flush the TLB forcefully
	 * if we detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables, which
	 * requires force-flushing everything in the given range. Otherwise
	 * stale TLB entries may remain on architectures, e.g. aarch64, that
	 * can specify which TLB level to flush.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * aarch64 yields better performance with fullmm by
		 * avoiding multiple CPUs spamming TLBI messages at the
		 * same time.
		 *
		 * On x86 non-fullmm doesn't yield a significant difference
		 * against fullmm.
		 *
		 * NOTE(review): freed_tables is assigned after
		 * __tlb_reset_range(); presumably the reset would otherwise
		 * clear it - confirm in <asm/tlb.h> before reordering.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	/* The pages were released by the flush above; now drop the batches. */
	tlb_batch_list_free(tlb);
#endif
	dec_tlb_flush_pending(tlb->mm);
}
Commit	Line	Data
196d9d8b PZ	1	#include <linux/gfp.h>
	2	#include <linux/highmem.h>
	3	#include <linux/kernel.h>
	4	#include <linux/mmdebug.h>
	5	#include <linux/mm_types.h>
36090def	6	#include <linux/mm_inline.h>
196d9d8b PZ	7	#include <linux/pagemap.h>
	8	#include <linux/rcupdate.h>
	9	#include <linux/smp.h>
	10	#include <linux/swap.h>
	11
	12	#include <asm/pgalloc.h>
	13	#include <asm/tlb.h>
	14
580a586c	15	#ifndef CONFIG_MMU_GATHER_NO_GATHER
952a31c9	16
196d9d8b PZ	17	static bool tlb_next_batch(struct mmu_gather *tlb)
	18	{
	19	struct mmu_gather_batch *batch;
	20
	21	batch = tlb->active;
	22	if (batch->next) {
	23	tlb->active = batch->next;
	24	return true;
	25	}
	26
	27	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
	28	return false;
	29
	30	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	31	if (!batch)
	32	return false;
	33
	34	tlb->batch_count++;
	35	batch->next = NULL;
	36	batch->nr = 0;
	37	batch->max = MAX_GATHER_BATCH;
	38
	39	tlb->active->next = batch;
	40	tlb->active = batch;
	41
	42	return true;
	43	}
	44
952a31c9 MS	45	static void tlb_batch_pages_flush(struct mmu_gather *tlb)
	46	{
	47	struct mmu_gather_batch *batch;
	48
	49	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
	50	free_pages_and_swap_cache(batch->pages, batch->nr);
	51	batch->nr = 0;
	52	}
	53	tlb->active = &tlb->local;
	54	}
	55
	56	static void tlb_batch_list_free(struct mmu_gather *tlb)
	57	{
	58	struct mmu_gather_batch *batch, *next;
	59
	60	for (batch = tlb->local.next; batch; batch = next) {
	61	next = batch->next;
	62	free_pages((unsigned long)batch, 0);
	63	}
	64	tlb->local.next = NULL;
	65	}
	66
	67	bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
	68	{
	69	struct mmu_gather_batch *batch;
	70
	71	VM_BUG_ON(!tlb->end);
	72
3af4bd03	73	#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
952a31c9 MS	74	VM_WARN_ON(tlb->page_size != page_size);
	75	#endif
	76
	77	batch = tlb->active;
	78	/*
	79	* Add the page and check if we are full. If so
	80	* force a flush.
	81	*/
	82	batch->pages[batch->nr++] = page;
	83	if (batch->nr == batch->max) {
	84	if (!tlb_next_batch(tlb))
	85	return true;
	86	batch = tlb->active;
	87	}
	88	VM_BUG_ON_PAGE(batch->nr > batch->max, page);
	89
	90	return false;
	91	}
	92
580a586c	93	#endif /* MMU_GATHER_NO_GATHER */
952a31c9	94
0d6e24d4	95	#ifdef CONFIG_MMU_GATHER_TABLE_FREE
196d9d8b	96
0d6e24d4 PZ	97	static void __tlb_remove_table_free(struct mmu_table_batch *batch)
	98	{
	99	int i;
	100
	101	for (i = 0; i < batch->nr; i++)
	102	__tlb_remove_table(batch->tables[i]);
	103
	104	free_page((unsigned long)batch);
	105	}
	106
	107	#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
196d9d8b PZ	108
196d9d8b PZ	109	/*
0d6e24d4 PZ	110	* Semi RCU freeing of the page directories.
	111	*
	112	* This is needed by some architectures to implement software pagetable walkers.
	113	*
	114	* gup_fast() and other software pagetable walkers do a lockless page-table
	115	* walk and therefore needs some synchronization with the freeing of the page
	116	* directories. The chosen means to accomplish that is by disabling IRQs over
	117	* the walk.
	118	*
	119	* Architectures that use IPIs to flush TLBs will then automagically DTRT,
	120	* since we unlink the page, flush TLBs, free the page. Since the disabling of
	121	* IRQs delays the completion of the TLB flush we can never observe an already
	122	* freed page.
	123	*
	124	* Architectures that do not have this (PPC) need to delay the freeing by some
	125	* other means, this is that means.
	126	*
	127	* What we do is batch the freed directory pages (tables) and RCU free them.
	128	* We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
	129	* holds off grace periods.
	130	*
	131	* However, in order to batch these pages we need to allocate storage, this
	132	* allocation is deep inside the MM code and can thus easily fail on memory
	133	* pressure. To guarantee progress we fall back to single table freeing, see
	134	* the implementation of tlb_remove_table_one().
	135	*
196d9d8b	136	*/
196d9d8b PZ	137
	138	static void tlb_remove_table_smp_sync(void *arg)
	139	{
	140	/* Simply deliver the interrupt */
	141	}
	142
0d6e24d4	143	static void tlb_remove_table_sync_one(void)
196d9d8b PZ	144	{
	145	/*
	146	* This isn't an RCU grace period and hence the page-tables cannot be
	147	* assumed to be actually RCU-freed.
	148	*
	149	* It is however sufficient for software page-table walkers that rely on
0d6e24d4	150	* IRQ disabling.
196d9d8b PZ	151	*/
196d9d8b PZ	152	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
196d9d8b PZ	153	}
	154
	155	static void tlb_remove_table_rcu(struct rcu_head *head)
	156	{
0d6e24d4 PZ	157	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
0d6e24d4 PZ	158	}
196d9d8b	159
0d6e24d4 PZ	160	static void tlb_remove_table_free(struct mmu_table_batch *batch)
	161	{
	162	call_rcu(&batch->rcu, tlb_remove_table_rcu);
	163	}
196d9d8b	164
0d6e24d4	165	#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
196d9d8b	166
0d6e24d4 PZ	167	static void tlb_remove_table_sync_one(void) { }
	168
	169	static void tlb_remove_table_free(struct mmu_table_batch *batch)
	170	{
	171	__tlb_remove_table_free(batch);
	172	}
	173
	174	#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
	175
	176	/*
	177	* If we want tlb_remove_table() to imply TLB invalidates.
	178	*/
	179	static inline void tlb_table_invalidate(struct mmu_gather *tlb)
	180	{
	181	if (tlb_needs_table_invalidate()) {
	182	/*
	183	* Invalidate page-table caches used by hardware walkers. Then
	184	* we still need to RCU-sched wait while freeing the pages
	185	* because software walkers can still be in-flight.
	186	*/
	187	tlb_flush_mmu_tlbonly(tlb);
	188	}
	189	}
	190
	191	static void tlb_remove_table_one(void *table)
	192	{
	193	tlb_remove_table_sync_one();
	194	__tlb_remove_table(table);
196d9d8b PZ	195	}
196d9d8b PZ	196
0a8caf21	197	static void tlb_table_flush(struct mmu_gather *tlb)
196d9d8b PZ	198	{
	199	struct mmu_table_batch **batch = &tlb->batch;
	200
	201	if (*batch) {
	202	tlb_table_invalidate(tlb);
0d6e24d4	203	tlb_remove_table_free(*batch);
196d9d8b PZ	204	*batch = NULL;
	205	}
	206	}
	207
	208	void tlb_remove_table(struct mmu_gather *tlb, void *table)
	209	{
	210	struct mmu_table_batch **batch = &tlb->batch;
	211
	212	if (*batch == NULL) {
	213	*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
	214	if (*batch == NULL) {
	215	tlb_table_invalidate(tlb);
	216	tlb_remove_table_one(table);
	217	return;
	218	}
	219	(*batch)->nr = 0;
	220	}
	221
	222	(*batch)->tables[(*batch)->nr++] = table;
	223	if ((*batch)->nr == MAX_TABLE_BATCH)
	224	tlb_table_flush(tlb);
	225	}
	226
0d6e24d4 PZ	227	static inline void tlb_table_init(struct mmu_gather *tlb)
	228	{
	229	tlb->batch = NULL;
	230	}
	231
	232	#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
	233
	234	static inline void tlb_table_flush(struct mmu_gather *tlb) { }
	235	static inline void tlb_table_init(struct mmu_gather *tlb) { }
	236
	237	#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
196d9d8b	238
0a8caf21 PZ	239	static void tlb_flush_mmu_free(struct mmu_gather *tlb)
0a8caf21 PZ	240	{
0a8caf21	241	tlb_table_flush(tlb);
580a586c	242	#ifndef CONFIG_MMU_GATHER_NO_GATHER
0a8caf21 PZ	243	tlb_batch_pages_flush(tlb);
	244	#endif
	245	}
	246
	247	void tlb_flush_mmu(struct mmu_gather *tlb)
	248	{
	249	tlb_flush_mmu_tlbonly(tlb);
	250	tlb_flush_mmu_free(tlb);
	251	}
	252
d8b45053	253	static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
a72afd87	254	bool fullmm)
196d9d8b	255	{
1808d65b	256	tlb->mm = mm;
a72afd87	257	tlb->fullmm = fullmm;
1808d65b	258
580a586c	259	#ifndef CONFIG_MMU_GATHER_NO_GATHER
1808d65b PZ	260	tlb->need_flush_all = 0;
	261	tlb->local.next = NULL;
	262	tlb->local.nr = 0;
	263	tlb->local.max = ARRAY_SIZE(tlb->__pages);
	264	tlb->active = &tlb->local;
	265	tlb->batch_count = 0;
	266	#endif
	267
0d6e24d4	268	tlb_table_init(tlb);
3af4bd03	269	#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
1808d65b PZ	270	tlb->page_size = 0;
	271	#endif
	272
	273	__tlb_reset_range(tlb);
196d9d8b PZ	274	inc_tlb_flush_pending(tlb->mm);
	275	}
	276
845be1cd RD	277	/**
	278	* tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
	279	* @tlb: the mmu_gather structure to initialize
	280	* @mm: the mm_struct of the target address space
	281	*
	282	* Called to initialize an (on-stack) mmu_gather structure for page-table
	283	* tear-down from @mm.
	284	*/
a72afd87	285	void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
d8b45053	286	{
a72afd87	287	__tlb_gather_mmu(tlb, mm, false);
d8b45053 WD	288	}
d8b45053 WD	289
845be1cd RD	290	/**
	291	* tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
	292	* @tlb: the mmu_gather structure to initialize
	293	* @mm: the mm_struct of the target address space
	294	*
	295	* In this case, @mm is without users and we're going to destroy the
	296	* full address space (exit/execve).
	297	*
	298	* Called to initialize an (on-stack) mmu_gather structure for page-table
	299	* tear-down from @mm.
	300	*/
d8b45053 WD	301	void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
d8b45053 WD	302	{
a72afd87	303	__tlb_gather_mmu(tlb, mm, true);
d8b45053 WD	304	}
d8b45053 WD	305
1808d65b PZ	306	/**
	307	* tlb_finish_mmu - finish an mmu_gather structure
	308	* @tlb: the mmu_gather structure to finish
1808d65b PZ	309	*
	310	* Called at the end of the shootdown operation to free up any resources that
	311	* were required.
	312	*/
ae8eba8b	313	void tlb_finish_mmu(struct mmu_gather *tlb)
196d9d8b PZ	314	{
	315	/*
	316	* If there are parallel threads are doing PTE changes on same range
c1e8d7c6	317	* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
7a30df49 YS	318	* flush by batching, one thread may end up seeing inconsistent PTEs
	319	* and result in having stale TLB entries. So flush TLB forcefully
	320	* if we detect parallel PTE batching threads.
	321	*
	322	* However, some syscalls, e.g. munmap(), may free page tables, this
	323	* needs force flush everything in the given range. Otherwise this
	324	* may result in having stale TLB entries for some architectures,
	325	* e.g. aarch64, that could specify flush what level TLB.
196d9d8b	326	*/
1808d65b	327	if (mm_tlb_flush_nested(tlb->mm)) {
7a30df49 YS	328	/*
	329	* The aarch64 yields better performance with fullmm by
	330	* avoiding multiple CPUs spamming TLBI messages at the
	331	* same time.
	332	*
	333	* On x86 non-fullmm doesn't yield significant difference
	334	* against fullmm.
	335	*/
	336	tlb->fullmm = 1;
1808d65b	337	__tlb_reset_range(tlb);
7a30df49	338	tlb->freed_tables = 1;
1808d65b	339	}
196d9d8b	340
1808d65b PZ	341	tlb_flush_mmu(tlb);
1808d65b PZ	342
580a586c	343	#ifndef CONFIG_MMU_GATHER_NO_GATHER
1808d65b PZ	344	tlb_batch_list_free(tlb);
1808d65b PZ	345	#endif
196d9d8b PZ	346	dec_tlb_flush_pending(tlb->mm);
196d9d8b PZ	347	}