/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
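
/*
 * Note on the two defines above (a reading of how this file uses them, not
 * authoritative): ALLOC_ORDER is the alloc_pages() order used for crst
 * (region/segment) tables - 2 pages (8KB) on 31-bit, 4 pages (16KB) on
 * 64-bit.  FRAG_MASK tracks the page table fragments that fit into one
 * 4KB page in page_table_alloc(): four 1KB fragments (0x0f) on 31-bit,
 * two 2KB fragments (0x03) on 64-bit.
 */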


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm)
		update_mm(mm, current);
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}
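
/*
 * A short sketch (not from the original source) of what the upgrade above
 * does, judging from the constants used: the address space limit in
 * mm->context.asce_limit steps through 2GB (1UL << 31, segment table as top
 * level), 4TB (1UL << 42, region-3 table) and 8PB (1UL << 53, region-2
 * table).  Each round of the repeat loop puts one more crst table on top of
 * the current mm->pgd and widens the limit by one level;
 * crst_table_downgrade() below walks the same levels in the opposite
 * direction.
 */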

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm)
		__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		update_mm(mm, current);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
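
/*
 * Rough usage sketch for the gmap interface exported from this file (an
 * illustration only, not taken from any in-tree caller; the expected user
 * is the KVM host code):
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm);
 *	if (!gmap)
 *		return -ENOMEM;
 *	// back guest real addresses [0, len) with the host mapping at 'from'
 *	if (gmap_map_segment(gmap, from, 0, len))
 *		goto out_free;
 *	gmap_enable(gmap);		// before entering SIE on this cpu
 *	...
 *	host_addr = gmap_fault(guest_addr, gmap);
 *	...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */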

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INVALID)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;


	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
	__releases(&gmap->mm->page_table_lock)
	__acquires(&gmap->mm->page_table_lock)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INVALID;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > TASK_MAX_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = (from + off) | (_SEGMENT_ENTRY_INVALID |
					 _SEGMENT_ENTRY_PROTECT);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}
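
/*
 * A note on the table walk above (an explanatory aside, derived only from
 * the shifts used in this file): each of the four levels is indexed by 11
 * address bits (2048 entries per table, hence the "& 0x7ff" masks), and the
 * walk descends region-1 -> region-2 -> region-3 -> segment table:
 *
 *	bits 63..53  region-1 index   (address >> 53)
 *	bits 52..42  region-2 index   (address >> 42)
 *	bits 41..31  region-3 index   (address >> 31)
 *	bits 30..20  segment index    (address >> 20)
 *
 * so one segment table entry covers a 1MB (PMD_SIZE) chunk of guest memory.
 */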

/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}

static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
					     _SEGMENT_ENTRY_PROTECT);
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_PROTECT))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/**
 * The mm->mmap_sem lock must be held
 */
static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
{
	unsigned long ptev, pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep, pte;

	ptep = get_locked_pte(mm, address, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
		pte_clear(mm, address, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(*ptep, ptl);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, *segment_ptr;
	unsigned long segment, pgstev, ptev;
	struct gmap_pgtable *mp;
	struct page *page;

	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return;
	segment = *segment_ptr;
	if (segment & _SEGMENT_ENTRY_INVALID)
		return;
	page = pfn_to_page(segment >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	address = mp->vmaddr | (address & ~PMD_MASK);
	/* Page table is present */
	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
	table = table + ((address >> 12) & 0xff);
	pgstev = table[PTRS_PER_PTE];
	ptev = table[0];
	/* quick check, checked again with locks held */
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
		gmap_zap_unused(gmap->mm, address);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{

	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
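
/*
 * Rough usage sketch for the ipte notifier interface (an illustration only;
 * struct gmap_notifier itself is defined in the architecture headers, the
 * field names below are inferred from how this file uses them):
 *
 *	static void my_pte_notifier(struct gmap *gmap, unsigned long address)
 *	{
 *		// called from gmap_do_ipte_notify() when a pte that was
 *		// marked with gmap_ipte_notify() gets invalidated
 *	}
 *
 *	static struct gmap_notifier my_nb = {
 *		.notifier_call = my_pte_notifier,
 *	};
 *
 *	gmap_register_ipte_notifier(&my_nb);
 *	gmap_ipte_notify(gmap, guest_addr, PAGE_SIZE);
 *	...
 *	gmap_unregister_ipte_notifier(&my_nb);
 */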

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	if (!pgtable_page_ctor(page)) {
		kfree(mp);
		__free_page(page);
		return NULL;
	}
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
		    PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
			pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;
	pgste_t pgste;

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	do {
		pgste = pgste_get_lock(pte);
		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
		pgste_set_unlock(pte, pgste);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(start_pte, ptl);

	return addr;
}

static inline unsigned long page_table_reset_pmd(struct mm_struct *mm,
			pud_t *pud, unsigned long addr, unsigned long end)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		next = page_table_reset_pte(mm, pmd, addr, next);
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long page_table_reset_pud(struct mm_struct *mm,
			pgd_t *pgd, unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_reset_pmd(mm, pud, addr, next);
	} while (pud++, addr = next, addr != end);

	return addr;
}

void page_table_reset_pgste(struct mm_struct *mm,
			    unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pgd_t *pgd;

	addr = start;
	down_read(&mm->mmap_sem);
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_reset_pud(mm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
	up_read(&mm->mmap_sem);
}
EXPORT_SYMBOL(page_table_reset_pgste);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_HC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(*ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}
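
/*
 * A note on the fragment bookkeeping above and in page_table_free*() below
 * (an explanatory aside based on this file only): page->_mapcount of a page
 * table page is used as a small bit field.  The low nibble has one bit per
 * 1K/2K fragment that is currently allocated (FRAG_MASK covers exactly those
 * bits), and the next nibble (bit << 4) marks fragments that were handed to
 * page_table_free_rcu() but whose grace period has not expired yet; the
 * "mask | (mask >> 4)" above folds the pending bits in so such fragments are
 * not reused early.  A page goes back to the page allocator only once both
 * nibbles are clear, with _mapcount reset to -1 so it looks like a normal
 * unmapped page again.
 */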

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}
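
/*
 * Note (an aside, not from the original source): the pointers queued via
 * tlb_remove_table() carry extra state in their low byte, which works because
 * all of these tables are at least 1KB aligned.  page_table_free_rcu() ORs in
 * the "pending free" fragment bit (bit << 4) and tags full pgste page tables
 * with FRAG_MASK; judging from the else branch above, crst tables arrive
 * untagged.  __tlb_remove_table() strips and inspects those bits to decide
 * between __page_table_free_rcu() and free_pages().
 */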

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm, addr);
		if (!new)
			return -ENOMEM;

		spin_lock(&mm->page_table_lock);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table);
			new = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for the current userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
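
/*
 * Usage note (an aside, not from the original source): s390_enable_sie() is
 * meant to be called once for a process before it starts running guests
 * through the SIE instruction; the KVM host code is the expected caller.
 * The conversion walks the whole address space and replaces every normal
 * page table with a pgste-backed 4K one; calling it again is cheap because
 * the mm_has_pgste() check above returns immediately.
 */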

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}
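
/*
 * Note (an aside based on the two helpers here): pgtable_trans_huge_deposit()
 * parks a preallocated pte page table under the huge pmd, keeping the tables
 * in a list headed by pmd_huge_pte(); pgtable_trans_huge_withdraw() below
 * hands one back when the huge mapping is split or removed, reinitializing
 * its first two entries to _PAGE_INVALID before returning it.
 */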

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */