powerpc/book3s64/vmemmap: switch radix to use a different vmemmap handling function
[linux-2.6-block.git] arch/powerpc/mm/book3s64/radix_pgtable.c
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
2bfd65e4
AK
2/*
3 * Page table handling routines for radix page table.
4 *
5 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
2bfd65e4 6 */
bd350f71
ME
7
8#define pr_fmt(fmt) "radix-mmu: " fmt
9
d38153f9 10#include <linux/io.h>
bd350f71 11#include <linux/kernel.h>
589ee628 12#include <linux/sched/mm.h>
2bfd65e4 13#include <linux/memblock.h>
13a9a5d1 14#include <linux/of.h>
2bfd65e4 15#include <linux/of_fdt.h>
7614ff32 16#include <linux/mm.h>
997cdcb0 17#include <linux/hugetlb.h>
6deb6b47 18#include <linux/string_helpers.h>
af9d00e9 19#include <linux/memory.h>
2bfd65e4 20
2bfd65e4 21#include <asm/pgalloc.h>
eeb715c3 22#include <asm/mmu_context.h>
2bfd65e4
AK
23#include <asm/dma.h>
24#include <asm/machdep.h>
25#include <asm/mmu.h>
26#include <asm/firmware.h>
1d0761d2 27#include <asm/powernv.h>
9abcc981 28#include <asm/sections.h>
993cfecc 29#include <asm/smp.h>
0428491c 30#include <asm/trace.h>
890274c2 31#include <asm/uaccess.h>
52231340 32#include <asm/ultravisor.h>
5e8b2c4d 33#include <asm/set_memory.h>
2bfd65e4 34
bde3eb62
AK
35#include <trace/events/thp.h>
36
a5edf981
NM
37#include <mm/mmu_decl.h>
38
a25bd72b 39unsigned int mmu_base_pid;
950805f4 40unsigned long radix_mem_block_size __ro_after_init;
a25bd72b 41
2ad452ff
NP
42static __ref void *early_alloc_pgtable(unsigned long size, int nid,
43 unsigned long region_start, unsigned long region_end)
2bfd65e4 44{
f806714f
MR
45 phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
46 phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
8a7f97b9 47 void *ptr;
2bfd65e4 48
f806714f
MR
49 if (region_start)
50 min_addr = region_start;
51 if (region_end)
52 max_addr = region_end;
2ad452ff 53
8a7f97b9
MR
54 ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
55
56 if (!ptr)
57 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
58 __func__, size, size, nid, &min_addr, &max_addr);
59
60 return ptr;
2bfd65e4
AK
61}
62
645d5ce2
AK
63/*
64 * When allocating pud or pmd pointers, we allocate a complete page
65 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
66 * is to ensure that the page obtained from the memblock allocator
67 * can be completely used as page table page and can be freed
68 * correctly when the page table entries are removed.
69 */
0633dafc 70static int early_map_kernel_page(unsigned long ea, unsigned long pa,
2bfd65e4 71 pgprot_t flags,
2ad452ff
NP
72 unsigned int map_page_size,
73 int nid,
74 unsigned long region_start, unsigned long region_end)
2bfd65e4 75{
2ad452ff 76 unsigned long pfn = pa >> PAGE_SHIFT;
0633dafc 77 pgd_t *pgdp;
2fb47060 78 p4d_t *p4dp;
0633dafc
NP
79 pud_t *pudp;
80 pmd_t *pmdp;
81 pte_t *ptep;
82
83 pgdp = pgd_offset_k(ea);
2fb47060
MR
84 p4dp = p4d_offset(pgdp, ea);
85 if (p4d_none(*p4dp)) {
645d5ce2
AK
86 pudp = early_alloc_pgtable(PAGE_SIZE, nid,
87 region_start, region_end);
2fb47060 88 p4d_populate(&init_mm, p4dp, pudp);
0633dafc 89 }
2fb47060 90 pudp = pud_offset(p4dp, ea);
0633dafc
NP
91 if (map_page_size == PUD_SIZE) {
92 ptep = (pte_t *)pudp;
93 goto set_the_pte;
94 }
95 if (pud_none(*pudp)) {
645d5ce2
AK
96 pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
97 region_end);
0633dafc
NP
98 pud_populate(&init_mm, pudp, pmdp);
99 }
100 pmdp = pmd_offset(pudp, ea);
101 if (map_page_size == PMD_SIZE) {
102 ptep = pmdp_ptep(pmdp);
103 goto set_the_pte;
104 }
105 if (!pmd_present(*pmdp)) {
2ad452ff
NP
106 ptep = early_alloc_pgtable(PAGE_SIZE, nid,
107 region_start, region_end);
0633dafc
NP
108 pmd_populate_kernel(&init_mm, pmdp, ptep);
109 }
110 ptep = pte_offset_kernel(pmdp, ea);
111
112set_the_pte:
2ad452ff 113 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
b8b2f37c 114 asm volatile("ptesync": : :"memory");
0633dafc
NP
115 return 0;
116}
117
2ad452ff
NP
118/*
119 * nid, region_start, and region_end are hints to try to place the page
120 * table memory in the same node or region.
121 */
122static int __map_kernel_page(unsigned long ea, unsigned long pa,
2bfd65e4 123 pgprot_t flags,
2ad452ff
NP
124 unsigned int map_page_size,
125 int nid,
126 unsigned long region_start, unsigned long region_end)
2bfd65e4 127{
2ad452ff 128 unsigned long pfn = pa >> PAGE_SHIFT;
2bfd65e4 129 pgd_t *pgdp;
2fb47060 130 p4d_t *p4dp;
2bfd65e4
AK
131 pud_t *pudp;
132 pmd_t *pmdp;
133 pte_t *ptep;
134 /*
135 * Make sure task size is correct as per the max addr
136 */
137 BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
0633dafc 138
0034d395
AK
139#ifdef CONFIG_PPC_64K_PAGES
140 BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
141#endif
142
2ad452ff
NP
143 if (unlikely(!slab_is_available()))
144 return early_map_kernel_page(ea, pa, flags, map_page_size,
145 nid, region_start, region_end);
0633dafc 146
2ad452ff
NP
147 /*
148 * Should make page table allocation functions be able to take a
149 * node, so we can place kernel page tables on the right nodes after
150 * boot.
151 */
0633dafc 152 pgdp = pgd_offset_k(ea);
2fb47060
MR
153 p4dp = p4d_offset(pgdp, ea);
154 pudp = pud_alloc(&init_mm, p4dp, ea);
0633dafc
NP
155 if (!pudp)
156 return -ENOMEM;
157 if (map_page_size == PUD_SIZE) {
158 ptep = (pte_t *)pudp;
159 goto set_the_pte;
2bfd65e4 160 }
0633dafc
NP
161 pmdp = pmd_alloc(&init_mm, pudp, ea);
162 if (!pmdp)
163 return -ENOMEM;
164 if (map_page_size == PMD_SIZE) {
165 ptep = pmdp_ptep(pmdp);
166 goto set_the_pte;
2bfd65e4 167 }
0633dafc
NP
168 ptep = pte_alloc_kernel(pmdp, ea);
169 if (!ptep)
170 return -ENOMEM;
2bfd65e4
AK
171
172set_the_pte:
2ad452ff 173 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
b8b2f37c 174 asm volatile("ptesync": : :"memory");
2bfd65e4
AK
175 return 0;
176}
177
2ad452ff
NP
178int radix__map_kernel_page(unsigned long ea, unsigned long pa,
179 pgprot_t flags,
180 unsigned int map_page_size)
181{
182 return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
183}
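
radix__map_kernel_page() and its early variant above walk pgd -> p4d -> pud -> pmd -> pte and install a leaf entry at the level matching map_page_size. As a rough illustration of the index arithmetic behind that walk, here is a standalone userspace sketch (not part of this file), assuming the 64K-page radix geometry this file later installs (RADIX_PTE/PMD/PUD/PGD_INDEX_SIZE of 5/9/9/13); the macro names are local to the sketch.

/*
 * Standalone sketch (not part of radix_pgtable.c): splitting an effective
 * address into radix page-table indices under the assumed 64K-page geometry
 * (index widths 5/9/9/13 for PTE/PMD/PUD/PGD).
 */
#include <stdio.h>
#include <stdint.h>

#define R_PAGE_SHIFT    16      /* assumed 64K base page */
#define R_PTE_BITS      5
#define R_PMD_BITS      9
#define R_PUD_BITS      9
#define R_PGD_BITS      13

int main(void)
{
    /* keep only the low 52 bits that the radix tree actually translates */
    uint64_t ea = 0xc000000001234567ULL & ((1ULL << 52) - 1);

    unsigned int pte_i = (ea >> R_PAGE_SHIFT) & ((1U << R_PTE_BITS) - 1);
    unsigned int pmd_i = (ea >> (R_PAGE_SHIFT + R_PTE_BITS)) & ((1U << R_PMD_BITS) - 1);
    unsigned int pud_i = (ea >> (R_PAGE_SHIFT + R_PTE_BITS + R_PMD_BITS)) & ((1U << R_PUD_BITS) - 1);
    unsigned int pgd_i = (ea >> (R_PAGE_SHIFT + R_PTE_BITS + R_PMD_BITS + R_PUD_BITS)) & ((1U << R_PGD_BITS) - 1);

    printf("pgd=%u pud=%u pmd=%u pte=%u\n", pgd_i, pud_i, pmd_i, pte_i);
    return 0;
}
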
184
7614ff32 185#ifdef CONFIG_STRICT_KERNEL_RWX
7098f8f0
ME
186static void radix__change_memory_range(unsigned long start, unsigned long end,
187 unsigned long clear)
7614ff32 188{
7614ff32
BS
189 unsigned long idx;
190 pgd_t *pgdp;
2fb47060 191 p4d_t *p4dp;
7614ff32
BS
192 pud_t *pudp;
193 pmd_t *pmdp;
194 pte_t *ptep;
195
196 start = ALIGN_DOWN(start, PAGE_SIZE);
197 end = PAGE_ALIGN(end); // aligns up
198
b134bd90
ME
199 pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
200 start, end, clear);
7614ff32
BS
201
202 for (idx = start; idx < end; idx += PAGE_SIZE) {
203 pgdp = pgd_offset_k(idx);
2fb47060
MR
204 p4dp = p4d_offset(pgdp, idx);
205 pudp = pud_alloc(&init_mm, p4dp, idx);
7614ff32
BS
206 if (!pudp)
207 continue;
d6eacedd 208 if (pud_is_leaf(*pudp)) {
7614ff32
BS
209 ptep = (pte_t *)pudp;
210 goto update_the_pte;
211 }
212 pmdp = pmd_alloc(&init_mm, pudp, idx);
213 if (!pmdp)
214 continue;
d6eacedd 215 if (pmd_is_leaf(*pmdp)) {
7614ff32
BS
216 ptep = pmdp_ptep(pmdp);
217 goto update_the_pte;
218 }
219 ptep = pte_alloc_kernel(pmdp, idx);
220 if (!ptep)
221 continue;
222update_the_pte:
b134bd90 223 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
7614ff32
BS
224 }
225
226 radix__flush_tlb_kernel_range(start, end);
227}
b134bd90
ME
228
229void radix__mark_rodata_ro(void)
230{
231 unsigned long start, end;
232
233 start = (unsigned long)_stext;
7082f8e7 234 end = (unsigned long)__end_rodata;
b134bd90
ME
235
236 radix__change_memory_range(start, end, _PAGE_WRITE);
111bcb37
ME
237
238 for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
239 end = start + PAGE_SIZE;
240 if (overlaps_interrupt_vector_text(start, end))
241 radix__change_memory_range(start, end, _PAGE_WRITE);
242 else
243 break;
244 }
b134bd90 245}
029d9252
ME
246
247void radix__mark_initmem_nx(void)
248{
249 unsigned long start = (unsigned long)__init_begin;
250 unsigned long end = (unsigned long)__init_end;
251
252 radix__change_memory_range(start, end, _PAGE_EXEC);
253}
7614ff32
BS
254#endif /* CONFIG_STRICT_KERNEL_RWX */
255
afb6d064
ME
256static inline void __meminit
257print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
b5200ec9 258{
6deb6b47
ME
259 char buf[10];
260
b5200ec9
RA
261 if (end <= start)
262 return;
263
6deb6b47
ME
264 string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
265
afb6d064
ME
266 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
267 exec ? " (exec)" : "");
b5200ec9
RA
268}
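
print_mapping() uses string_get_size() to render the mapping's page size in binary units. A rough standalone approximation of that formatting (not the kernel helper itself; the exact output format is an assumption):

/*
 * Standalone sketch: approximate binary-unit formatting, similar in spirit
 * to the string_get_size(..., STRING_UNITS_2, ...) call above.
 */
#include <stdio.h>

static void fmt_size(unsigned long long size, char *buf, int len)
{
    static const char *units[] = { "B", "KiB", "MiB", "GiB", "TiB" };
    int i = 0;

    while (size >= 1024 && i < 4) {
        size /= 1024;
        i++;
    }
    snprintf(buf, len, "%llu %s", size, units[i]);
}

int main(void)
{
    char buf[16];

    fmt_size(2ULL << 20, buf, sizeof(buf));     /* a PMD-sized mapping */
    printf("Mapped ... with %s pages\n", buf);
    return 0;
}
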
269
232aa407
ME
270static unsigned long next_boundary(unsigned long addr, unsigned long end)
271{
272#ifdef CONFIG_STRICT_KERNEL_RWX
98d0219e
ME
273 unsigned long stext_phys;
274
275 stext_phys = __pa_symbol(_stext);
276
277 // Relocatable kernel running at non-zero real address
278 if (stext_phys != 0) {
111bcb37
ME
279 // The end of interrupts code at zero is a rodata boundary
280 unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
281 if (addr < end_intr)
282 return end_intr;
283
98d0219e
ME
284 // Start of relocated kernel text is a rodata boundary
285 if (addr < stext_phys)
286 return stext_phys;
287 }
288
b150a4d1
ME
289 if (addr < __pa_symbol(__srwx_boundary))
290 return __pa_symbol(__srwx_boundary);
232aa407
ME
291#endif
292 return end;
293}
294
b5200ec9 295static int __meminit create_physical_mapping(unsigned long start,
2ad452ff 296 unsigned long end,
4e00c5af 297 int nid, pgprot_t _prot)
b5200ec9 298{
9abcc981 299 unsigned long vaddr, addr, mapping_size = 0;
afb6d064 300 bool prev_exec, exec = false;
9abcc981 301 pgprot_t prot;
a2dc009a 302 int psize;
5e8b2c4d
NM
303 unsigned long max_mapping_size = radix_mem_block_size;
304
a5edf981 305 if (debug_pagealloc_enabled_or_kfence())
5e8b2c4d 306 max_mapping_size = PAGE_SIZE;
b5200ec9 307
b7115316 308 start = ALIGN(start, PAGE_SIZE);
79b123cd 309 end = ALIGN_DOWN(end, PAGE_SIZE);
b5200ec9
RA
310 for (addr = start; addr < end; addr += mapping_size) {
311 unsigned long gap, previous_size;
312 int rc;
313
232aa407 314 gap = next_boundary(addr, end) - addr;
af9d00e9
AK
315 if (gap > max_mapping_size)
316 gap = max_mapping_size;
b5200ec9 317 previous_size = mapping_size;
afb6d064 318 prev_exec = exec;
b5200ec9
RA
319
320 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
57306c66 321 mmu_psize_defs[MMU_PAGE_1G].shift) {
b5200ec9 322 mapping_size = PUD_SIZE;
a2dc009a
AK
323 psize = MMU_PAGE_1G;
324 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
325 mmu_psize_defs[MMU_PAGE_2M].shift) {
b5200ec9 326 mapping_size = PMD_SIZE;
a2dc009a
AK
327 psize = MMU_PAGE_2M;
328 } else {
b5200ec9 329 mapping_size = PAGE_SIZE;
a2dc009a
AK
330 psize = mmu_virtual_psize;
331 }
7614ff32 332
9abcc981
ME
333 vaddr = (unsigned long)__va(addr);
334
7f6d498e 335 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
afb6d064 336 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
9abcc981 337 prot = PAGE_KERNEL_X;
afb6d064
ME
338 exec = true;
339 } else {
4e00c5af 340 prot = _prot;
afb6d064
ME
341 exec = false;
342 }
343
344 if (mapping_size != previous_size || exec != prev_exec) {
345 print_mapping(start, addr, previous_size, prev_exec);
346 start = addr;
347 }
9abcc981 348
2ad452ff 349 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
b5200ec9
RA
350 if (rc)
351 return rc;
a2dc009a
AK
352
353 update_page_count(psize, 1);
b5200ec9
RA
354 }
355
afb6d064 356 print_mapping(start, addr, mapping_size, exec);
b5200ec9
RA
357 return 0;
358}
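
create_physical_mapping() walks the physical range and greedily picks the largest page size whose alignment and remaining gap both fit. A compact standalone sketch of that selection loop, with 1G/2M/64K hardcoded instead of consulting mmu_psize_defs[], and with next_boundary(), max_mapping_size and the exec/rodata protection handling left out; purely illustrative.

/*
 * Standalone sketch: the greedy mapping-size selection used above, with
 * fixed 1G/2M/64K sizes (the base page size here is assumed to be 64K).
 */
#include <stdio.h>

#define SZ_64K  (64UL << 10)
#define SZ_2M   (2UL << 20)
#define SZ_1G   (1UL << 30)

static unsigned long pick_size(unsigned long addr, unsigned long gap)
{
    if (!(addr & (SZ_1G - 1)) && gap >= SZ_1G)
        return SZ_1G;
    if (!(addr & (SZ_2M - 1)) && gap >= SZ_2M)
        return SZ_2M;
    return SZ_64K;
}

int main(void)
{
    unsigned long start = 0x3ff00000, end = 0x80000000, addr;  /* example range */

    for (addr = start; addr < end; ) {
        unsigned long size = pick_size(addr, end - addr);

        printf("map 0x%lx + 0x%lx\n", addr, size);
        addr += size;
    }
    return 0;
}
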
359
d667edc0 360static void __init radix_init_pgtable(void)
2bfd65e4 361{
2bfd65e4 362 unsigned long rts_field;
b10d6bca
MR
363 phys_addr_t start, end;
364 u64 i;
2bfd65e4
AK
365
366 /* We don't support slb for radix */
387e220a 367 slb_set_size(0);
af9d00e9 368
2bfd65e4 369 /*
af9d00e9 370 * Create the linear mapping
2bfd65e4 371 */
b10d6bca 372 for_each_mem_range(i, &start, &end) {
2ad452ff
NP
373 /*
374 * The memblock allocator is up at this point, so the
375 * page tables will be allocated within the range. No
376 * need or a node (which we don't have yet).
377 */
e0909392 378
b10d6bca 379 if (end >= RADIX_VMALLOC_START) {
f341d897 380 pr_warn("Outside the supported range\n");
e0909392
AK
381 continue;
382 }
383
b10d6bca 384 WARN_ON(create_physical_mapping(start, end,
4e00c5af 385 -1, PAGE_KERNEL));
2ad452ff 386 }
a25bd72b 387
2e1ae9cd
NP
388 if (!cpu_has_feature(CPU_FTR_HVMODE) &&
389 cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
a25bd72b 390 /*
1fd02f66 391 * Older versions of KVM on these machines prefer that the
2e1ae9cd 392 * guest only use the low 19 PID bits.
a25bd72b 393 */
5402e239 394 mmu_pid_bits = 19;
a25bd72b 395 }
2e1ae9cd 396 mmu_base_pid = 1;
a25bd72b 397
2bfd65e4
AK
398 /*
399 * Allocate Partition table and process table for the
400 * host.
401 */
a25bd72b 402 BUG_ON(PRTB_SIZE_SHIFT > 36);
2ad452ff 403 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
2bfd65e4
AK
404 /*
405 * Fill in the process table.
2bfd65e4 406 */
b23d9c5b 407 rts_field = radix__get_tree_size();
2bfd65e4 408 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
ed6546bd 409
eeb715c3
NP
410 /*
411 * The init_mm context is given the first available (non-zero) PID,
412 * which is the "guard PID" and contains no page table. PIDR should
413 * never be set to zero because that duplicates the kernel address
414 * space at the 0x0... offset (quadrant 0)!
415 *
416 * An arbitrary PID that may later be allocated by the PID allocator
417 * for userspace processes must not be used either, because that
418 * would cause stale user mappings for that PID on CPUs outside of
419 * the TLB invalidation scheme (because it won't be in mm_cpumask).
420 *
421 * So permanently carve out one PID for the purpose of a guard PID.
422 */
423 init_mm.context.id = mmu_base_pid;
424 mmu_base_pid++;
2bfd65e4
AK
425}
426
427static void __init radix_init_partition_table(void)
428{
ed6546bd 429 unsigned long rts_field, dw0, dw1;
b23d9c5b 430
9d661958 431 mmu_partition_table_init();
b23d9c5b 432 rts_field = radix__get_tree_size();
9d661958 433 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
ed6546bd 434 dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
7d805acc 435 mmu_partition_table_set_entry(0, dw0, dw1, false);
2bfd65e4 436
56547411 437 pr_info("Initializing Radix MMU\n");
2bfd65e4
AK
438}
439
2bfd65e4
AK
440static int __init get_idx_from_shift(unsigned int shift)
441{
442 int idx = -1;
443
444 switch (shift) {
445 case 0xc:
446 idx = MMU_PAGE_4K;
447 break;
448 case 0x10:
449 idx = MMU_PAGE_64K;
450 break;
451 case 0x15:
452 idx = MMU_PAGE_2M;
453 break;
454 case 0x1e:
455 idx = MMU_PAGE_1G;
456 break;
457 }
458 return idx;
459}
460
461static int __init radix_dt_scan_page_sizes(unsigned long node,
462 const char *uname, int depth,
463 void *data)
464{
465 int size = 0;
466 int shift, idx;
467 unsigned int ap;
468 const __be32 *prop;
469 const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
470
471 /* We are scanning "cpu" nodes only */
472 if (type == NULL || strcmp(type, "cpu") != 0)
473 return 0;
474
a25bd72b 475 /* Grab page size encodings */
2bfd65e4
AK
476 prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
477 if (!prop)
478 return 0;
479
480 pr_info("Page sizes from device-tree:\n");
481 for (; size >= 4; size -= 4, ++prop) {
482
483 struct mmu_psize_def *def;
484
485 /* top 3 bit is AP encoding */
486 shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
487 ap = be32_to_cpu(prop[0]) >> 29;
ac8d3818 488 pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
2bfd65e4
AK
489
490 idx = get_idx_from_shift(shift);
491 if (idx < 0)
492 continue;
493
494 def = &mmu_psize_defs[idx];
495 def->shift = shift;
496 def->ap = ap;
d6265cb3 497 def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
2bfd65e4
AK
498 }
499
500 /* needed ? */
501 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
502 return 1;
503}
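
radix_dt_scan_page_sizes() above pulls the page shift out of the low 29 bits of each ibm,processor-radix-AP-encodings cell and the AP encoding out of the top 3 bits. A standalone sketch of that decode, using an assumed example cell built from the same shift=16 / AP=0x5 pair this file uses as its 64K default:

/*
 * Standalone sketch: decoding one ibm,processor-radix-AP-encodings cell,
 * matching the extraction in radix_dt_scan_page_sizes() above.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* assumed example cell: a 64K page (shift 16) with AP encoding 5 */
    uint32_t cell = (5u << 29) | 16;

    unsigned int shift = cell & ~(0xeu << 28);  /* low 29 bits */
    unsigned int ap = cell >> 29;               /* top 3 bits  */

    printf("cell=0x%08x shift=%u ap=0x%x\n", cell, shift, ap);
    return 0;
}
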
504
af9d00e9
AK
505#ifdef CONFIG_MEMORY_HOTPLUG
506static int __init probe_memory_block_size(unsigned long node, const char *uname, int
507 depth, void *data)
508{
509 unsigned long *mem_block_size = (unsigned long *)data;
fbf2f134 510 const __be32 *prop;
af9d00e9
AK
511 int len;
512
513 if (depth != 1)
514 return 0;
515
516 if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
517 return 0;
518
519 prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
fbf2f134
AK
520
521 if (!prop || len < dt_root_size_cells * sizeof(__be32))
af9d00e9
AK
522 /*
523 * Nothing in the device tree
524 */
525 *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
526 else
fbf2f134 527 *mem_block_size = of_read_number(prop, dt_root_size_cells);
af9d00e9
AK
528 return 1;
529}
530
c13f2b2b 531static unsigned long __init radix_memory_block_size(void)
af9d00e9
AK
532{
533 unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
534
535 /*
536 * The OPAL firmware feature is set by now, so it is
537 * safe to test for it.
538 */
539 if (firmware_has_feature(FW_FEATURE_OPAL))
540 mem_block_size = 1UL * 1024 * 1024 * 1024;
541 else
542 of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
543
544 return mem_block_size;
545}
546
547#else /* CONFIG_MEMORY_HOTPLUG */
548
c13f2b2b 549static unsigned long __init radix_memory_block_size(void)
af9d00e9
AK
550{
551 return 1UL * 1024 * 1024 * 1024;
552}
553
554#endif /* CONFIG_MEMORY_HOTPLUG */
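
probe_memory_block_size() reads ibm,lmb-size with of_read_number(), which folds dt_root_size_cells big-endian 32-bit cells into one value. A standalone sketch of that folding, with an invented two-cell 256MB property (not real firmware data):

/*
 * Standalone sketch: combining big-endian device-tree cells the way
 * of_read_number() does, for an assumed 2-cell ibm,lmb-size of 256MB.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>  /* ntohl()/htonl() */

static uint64_t read_cells(const uint32_t *prop, int cells)
{
    uint64_t val = 0;

    while (cells--)
        val = (val << 32) | ntohl(*prop++);
    return val;
}

int main(void)
{
    /* 0x0000000010000000 = 256MB, stored as two big-endian cells */
    uint32_t prop[2] = { htonl(0x0), htonl(0x10000000) };

    printf("lmb-size = 0x%llx\n", (unsigned long long)read_cells(prop, 2));
    return 0;
}
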
555
556
2537b09c 557void __init radix__early_init_devtree(void)
2bfd65e4
AK
558{
559 int rc;
560
561 /*
562 * Try to find the available page sizes in the device-tree
563 */
564 rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
af9d00e9
AK
565 if (!rc) {
566 /*
567 * No page size details found in device tree.
568 * Let's assume we have 4k and 64k page support
569 */
570 mmu_psize_defs[MMU_PAGE_4K].shift = 12;
571 mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
d6265cb3
BR
572 mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
573 psize_to_rpti_pgsize(MMU_PAGE_4K);
af9d00e9
AK
574
575 mmu_psize_defs[MMU_PAGE_64K].shift = 16;
576 mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
d6265cb3
BR
577 mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
578 psize_to_rpti_pgsize(MMU_PAGE_64K);
af9d00e9
AK
579 }
580
2bfd65e4 581 /*
af9d00e9
AK
582 * Max mapping size used when mapping pages. We don't use
583 * ppc_md.memory_block_size() here because this gets called
584 * early, before the machine is probed. Also, the pseries
585 * implementation only checks for ibm,lmb-size. All
586 * hypervisors supporting radix do expose that device
587 * tree node.
2bfd65e4 588 */
af9d00e9 589 radix_mem_block_size = radix_memory_block_size();
2bfd65e4
AK
590 return;
591}
592
593void __init radix__early_init_mmu(void)
594{
595 unsigned long lpcr;
2bfd65e4 596
387e220a 597#ifdef CONFIG_PPC_64S_HASH_MMU
2bfd65e4
AK
598#ifdef CONFIG_PPC_64K_PAGES
599 /* PAGE_SIZE mappings */
600 mmu_virtual_psize = MMU_PAGE_64K;
601#else
602 mmu_virtual_psize = MMU_PAGE_4K;
603#endif
604
605#ifdef CONFIG_SPARSEMEM_VMEMMAP
606 /* vmemmap mapping */
89a3496e
AK
607 if (mmu_psize_defs[MMU_PAGE_2M].shift) {
608 /*
609 * map vmemmap using 2M if available
610 */
611 mmu_vmemmap_psize = MMU_PAGE_2M;
612 } else
613 mmu_vmemmap_psize = mmu_virtual_psize;
387e220a 614#endif
2bfd65e4
AK
615#endif
616 /*
617 * initialize page table size
618 */
619 __pte_index_size = RADIX_PTE_INDEX_SIZE;
620 __pmd_index_size = RADIX_PMD_INDEX_SIZE;
621 __pud_index_size = RADIX_PUD_INDEX_SIZE;
622 __pgd_index_size = RADIX_PGD_INDEX_SIZE;
fae22116 623 __pud_cache_index = RADIX_PUD_INDEX_SIZE;
2bfd65e4
AK
624 __pte_table_size = RADIX_PTE_TABLE_SIZE;
625 __pmd_table_size = RADIX_PMD_TABLE_SIZE;
626 __pud_table_size = RADIX_PUD_TABLE_SIZE;
627 __pgd_table_size = RADIX_PGD_TABLE_SIZE;
628
a2f41eb9
AK
629 __pmd_val_bits = RADIX_PMD_VAL_BITS;
630 __pud_val_bits = RADIX_PUD_VAL_BITS;
631 __pgd_val_bits = RADIX_PGD_VAL_BITS;
2bfd65e4 632
d6a9996e 633 __kernel_virt_start = RADIX_KERN_VIRT_START;
d6a9996e
AK
634 __vmalloc_start = RADIX_VMALLOC_START;
635 __vmalloc_end = RADIX_VMALLOC_END;
63ee9b2f 636 __kernel_io_start = RADIX_KERN_IO_START;
a35a3c6f 637 __kernel_io_end = RADIX_KERN_IO_END;
0034d395 638 vmemmap = (struct page *)RADIX_VMEMMAP_START;
d6a9996e 639 ioremap_bot = IOREMAP_BASE;
bfa37087
DS
640
641#ifdef CONFIG_PCI
642 pci_io_base = ISA_IO_BASE;
643#endif
fb4e5dbd
AK
644 __pte_frag_nr = RADIX_PTE_FRAG_NR;
645 __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
8a6c697b
AK
646 __pmd_frag_nr = RADIX_PMD_FRAG_NR;
647 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
d6a9996e 648
ed6546bd
NP
649 radix_init_pgtable();
650
d6c88600
AK
651 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
652 lpcr = mfspr(SPRN_LPCR);
bf16cdf4 653 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2bfd65e4 654 radix_init_partition_table();
cc3d2940
PM
655 } else {
656 radix_init_pseries();
d6c88600 657 }
2bfd65e4 658
9d661958
PM
659 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
660
eeb715c3
NP
661 /* Switch to the guard PID before turning on MMU */
662 radix__switch_mmu_context(NULL, &init_mm);
7e71c428 663 tlbiel_all();
2bfd65e4
AK
664}
665
666void radix__early_init_mmu_secondary(void)
667{
668 unsigned long lpcr;
669 /*
d6c88600 670 * update partition table control register and UPRT
2bfd65e4 671 */
d6c88600
AK
672 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
673 lpcr = mfspr(SPRN_LPCR);
bf16cdf4 674 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
d6c88600 675
52231340
CC
676 set_ptcr_when_no_uv(__pa(partition_tb) |
677 (PATB_SIZE_SHIFT - 12));
d6c88600 678 }
d4748276 679
eeb715c3 680 radix__switch_mmu_context(NULL, &init_mm);
7e71c428 681 tlbiel_all();
39df17bc
AK
682
683 /* Make sure userspace can't change the AMR */
684 mtspr(SPRN_UAMOR, 0);
2bfd65e4
AK
685}
686
8119cefd
HB
687/* Called during kexec sequence with MMU off */
688notrace void radix__mmu_cleanup_all(void)
fe036a06
BH
689{
690 unsigned long lpcr;
691
692 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
693 lpcr = mfspr(SPRN_LPCR);
694 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
52231340 695 set_ptcr_when_no_uv(0);
1d0761d2 696 powernv_set_nmmu_ptcr(0);
fe036a06
BH
697 radix__flush_tlb_all();
698 }
699}
700
6cc27341 701#ifdef CONFIG_MEMORY_HOTPLUG
4b5d62ca
RA
702static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
703{
704 pte_t *pte;
705 int i;
706
707 for (i = 0; i < PTRS_PER_PTE; i++) {
708 pte = pte_start + i;
709 if (!pte_none(*pte))
710 return;
711 }
712
713 pte_free_kernel(&init_mm, pte_start);
714 pmd_clear(pmd);
715}
716
717static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
718{
719 pmd_t *pmd;
720 int i;
721
722 for (i = 0; i < PTRS_PER_PMD; i++) {
723 pmd = pmd_start + i;
724 if (!pmd_none(*pmd))
725 return;
726 }
727
728 pmd_free(&init_mm, pmd_start);
729 pud_clear(pud);
730}
731
9ce8853b
BR
732static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
733{
734 pud_t *pud;
735 int i;
736
737 for (i = 0; i < PTRS_PER_PUD; i++) {
738 pud = pud_start + i;
739 if (!pud_none(*pud))
740 return;
741 }
742
743 pud_free(&init_mm, pud_start);
744 p4d_clear(p4d);
745}
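
free_pte_table(), free_pmd_table() and free_pud_table() above all share one pattern: scan every slot of the lower-level table, give up if anything is still present, otherwise free the table page and clear the upper-level entry that pointed at it. A generic standalone sketch of that pattern (a heap array stands in for a page-table page; the names are local to the sketch):

/*
 * Standalone sketch: "free the lower-level table only when every slot is
 * empty". A zero slot means "none".
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#define SLOTS 512       /* e.g. PTRS_PER_PMD on radix; illustrative only */

static bool table_is_empty(const uintptr_t *table)
{
    for (int i = 0; i < SLOTS; i++)
        if (table[i])
            return false;
    return true;
}

int main(void)
{
    uintptr_t *table = calloc(SLOTS, sizeof(*table));
    uintptr_t upper_entry;

    if (!table)
        return 1;
    upper_entry = (uintptr_t)table;     /* "pud" pointing at a "pmd" page */

    if (table_is_empty(table)) {
        free(table);                    /* pmd_free() analogue */
        upper_entry = 0;                /* pud_clear() analogue */
    }
    printf("upper entry now: %#lx\n", (unsigned long)upper_entry);
    return 0;
}
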
746
368a0590
AK
747#ifdef CONFIG_SPARSEMEM_VMEMMAP
748static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
749{
750 unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
751
752 return !vmemmap_populated(start, PMD_SIZE);
753}
754
755static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
756{
757 unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
758
759 return !vmemmap_populated(start, PAGE_SIZE);
760
761}
762#endif
763
764static void __meminit free_vmemmap_pages(struct page *page,
765 struct vmem_altmap *altmap,
766 int order)
767{
768 unsigned int nr_pages = 1 << order;
769
770 if (altmap) {
771 unsigned long alt_start, alt_end;
772 unsigned long base_pfn = page_to_pfn(page);
773
774 /*
775 * With 2M vmemmap mapping we can have things set up
776 * such that even though an altmap is specified we never
777 * use the altmap.
778 */
779 alt_start = altmap->base_pfn;
780 alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
781
782 if (base_pfn >= alt_start && base_pfn < alt_end) {
783 vmem_altmap_free(altmap, nr_pages);
784 return;
785 }
786 }
787
788 if (PageReserved(page)) {
789 /* allocated from memblock */
790 while (nr_pages--)
791 free_reserved_page(page++);
792 } else
793 free_pages((unsigned long)page_address(page), order);
794}
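
free_vmemmap_pages() has to distinguish three backing sources: pages carved out of the device altmap (handed back with vmem_altmap_free()), memblock-reserved pages from early boot (free_reserved_page()), and ordinary page allocations (free_pages()). The altmap case reduces to a pfn range check; a standalone sketch of just that check, with invented numbers:

/*
 * Standalone sketch: the pfn-range test free_vmemmap_pages() uses to decide
 * whether a vmemmap page came from the device altmap.
 */
#include <stdio.h>
#include <stdbool.h>

struct sketch_altmap {
    unsigned long base_pfn;
    unsigned long reserve;
    unsigned long free;
};

static bool pfn_in_altmap(unsigned long pfn, const struct sketch_altmap *a)
{
    unsigned long alt_start = a->base_pfn;
    unsigned long alt_end = a->base_pfn + a->reserve + a->free;

    return pfn >= alt_start && pfn < alt_end;
}

int main(void)
{
    struct sketch_altmap a = { .base_pfn = 0x100000, .reserve = 128, .free = 4096 };

    printf("%d %d\n", pfn_in_altmap(0x100010, &a), pfn_in_altmap(0x200000, &a));
    return 0;
}
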
795
796static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
797 unsigned long end, bool direct,
798 struct vmem_altmap *altmap)
4b5d62ca 799{
0da90af4 800 unsigned long next, pages = 0;
4b5d62ca
RA
801 pte_t *pte;
802
803 pte = pte_start + pte_index(addr);
804 for (; addr < end; addr = next, pte++) {
805 next = (addr + PAGE_SIZE) & PAGE_MASK;
806 if (next > end)
807 next = end;
808
809 if (!pte_present(*pte))
810 continue;
811
368a0590
AK
812 if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
813 if (!direct)
814 free_vmemmap_pages(pte_page(*pte), altmap, 0);
815 pte_clear(&init_mm, addr, pte);
816 pages++;
0d0a4bc2 817 }
368a0590
AK
818#ifdef CONFIG_SPARSEMEM_VMEMMAP
819 else if (!direct && vmemmap_page_is_unused(addr, next)) {
820 free_vmemmap_pages(pte_page(*pte), altmap, 0);
821 pte_clear(&init_mm, addr, pte);
822 }
823#endif
4b5d62ca 824 }
0da90af4
AK
825 if (direct)
826 update_page_count(mmu_virtual_psize, -pages);
4b5d62ca
RA
827}
828
aff77951 829static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
368a0590
AK
830 unsigned long end, bool direct,
831 struct vmem_altmap *altmap)
4b5d62ca 832{
0da90af4 833 unsigned long next, pages = 0;
4b5d62ca
RA
834 pte_t *pte_base;
835 pmd_t *pmd;
836
837 pmd = pmd_start + pmd_index(addr);
838 for (; addr < end; addr = next, pmd++) {
839 next = pmd_addr_end(addr, end);
840
841 if (!pmd_present(*pmd))
842 continue;
843
d6eacedd 844 if (pmd_is_leaf(*pmd)) {
368a0590
AK
845 if (IS_ALIGNED(addr, PMD_SIZE) &&
846 IS_ALIGNED(next, PMD_SIZE)) {
847 if (!direct)
848 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
849 pte_clear(&init_mm, addr, (pte_t *)pmd);
850 pages++;
d6d6ebfc 851 }
368a0590
AK
852#ifdef CONFIG_SPARSEMEM_VMEMMAP
853 else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
854 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
855 pte_clear(&init_mm, addr, (pte_t *)pmd);
856 }
857#endif
4b5d62ca
RA
858 continue;
859 }
860
861 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
368a0590 862 remove_pte_table(pte_base, addr, next, direct, altmap);
4b5d62ca
RA
863 free_pte_table(pte_base, pmd);
864 }
0da90af4
AK
865 if (direct)
866 update_page_count(MMU_PAGE_2M, -pages);
4b5d62ca
RA
867}
868
aff77951 869static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
368a0590
AK
870 unsigned long end, bool direct,
871 struct vmem_altmap *altmap)
4b5d62ca 872{
0da90af4 873 unsigned long next, pages = 0;
4b5d62ca
RA
874 pmd_t *pmd_base;
875 pud_t *pud;
876
877 pud = pud_start + pud_index(addr);
878 for (; addr < end; addr = next, pud++) {
879 next = pud_addr_end(addr, end);
880
881 if (!pud_present(*pud))
882 continue;
883
d6eacedd 884 if (pud_is_leaf(*pud)) {
d6d6ebfc
BR
885 if (!IS_ALIGNED(addr, PUD_SIZE) ||
886 !IS_ALIGNED(next, PUD_SIZE)) {
887 WARN_ONCE(1, "%s: unaligned range\n", __func__);
888 continue;
889 }
890 pte_clear(&init_mm, addr, (pte_t *)pud);
0da90af4 891 pages++;
4b5d62ca
RA
892 continue;
893 }
894
9cf6fa24 895 pmd_base = pud_pgtable(*pud);
368a0590 896 remove_pmd_table(pmd_base, addr, next, direct, altmap);
4b5d62ca
RA
897 free_pmd_table(pmd_base, pud);
898 }
0da90af4
AK
899 if (direct)
900 update_page_count(MMU_PAGE_1G, -pages);
4b5d62ca
RA
901}
902
368a0590
AK
903static void __meminit
904remove_pagetable(unsigned long start, unsigned long end, bool direct,
905 struct vmem_altmap *altmap)
4b5d62ca
RA
906{
907 unsigned long addr, next;
908 pud_t *pud_base;
909 pgd_t *pgd;
2fb47060 910 p4d_t *p4d;
4b5d62ca
RA
911
912 spin_lock(&init_mm.page_table_lock);
913
914 for (addr = start; addr < end; addr = next) {
915 next = pgd_addr_end(addr, end);
916
917 pgd = pgd_offset_k(addr);
2fb47060
MR
918 p4d = p4d_offset(pgd, addr);
919 if (!p4d_present(*p4d))
4b5d62ca
RA
920 continue;
921
2fb47060 922 if (p4d_is_leaf(*p4d)) {
d6d6ebfc
BR
923 if (!IS_ALIGNED(addr, P4D_SIZE) ||
924 !IS_ALIGNED(next, P4D_SIZE)) {
925 WARN_ONCE(1, "%s: unaligned range\n", __func__);
926 continue;
927 }
928
929 pte_clear(&init_mm, addr, (pte_t *)pgd);
4b5d62ca
RA
930 continue;
931 }
932
dc4875f0 933 pud_base = p4d_pgtable(*p4d);
368a0590 934 remove_pud_table(pud_base, addr, next, direct, altmap);
9ce8853b 935 free_pud_table(pud_base, p4d);
4b5d62ca
RA
936 }
937
938 spin_unlock(&init_mm.page_table_lock);
939 radix__flush_tlb_kernel_range(start, end);
940}
941
4e00c5af
LG
942int __meminit radix__create_section_mapping(unsigned long start,
943 unsigned long end, int nid,
944 pgprot_t prot)
6cc27341 945{
e0909392 946 if (end >= RADIX_VMALLOC_START) {
f341d897 947 pr_warn("Outside the supported range\n");
e0909392
AK
948 return -1;
949 }
950
af9d00e9 951 return create_physical_mapping(__pa(start), __pa(end),
5e8b2c4d 952 nid, prot);
6cc27341 953}
4b5d62ca 954
bde709a7 955int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
4b5d62ca 956{
368a0590 957 remove_pagetable(start, end, true, NULL);
4b5d62ca
RA
958 return 0;
959}
6cc27341
RA
960#endif /* CONFIG_MEMORY_HOTPLUG */
961
d9225ad9 962#ifdef CONFIG_SPARSEMEM_VMEMMAP
29ab6c47
NP
963static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
964 pgprot_t flags, unsigned int map_page_size,
965 int nid)
966{
967 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
968}
969
d9225ad9
AK
970int __meminit radix__vmemmap_create_mapping(unsigned long start,
971 unsigned long page_size,
972 unsigned long phys)
973{
974 /* Create a PTE encoding */
2ad452ff
NP
975 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
976 int ret;
977
e0909392 978 if ((start + page_size) >= RADIX_VMEMMAP_END) {
f341d897 979 pr_warn("Outside the supported range\n");
e0909392
AK
980 return -1;
981 }
982
d933557b 983 ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
2ad452ff 984 BUG_ON(ret);
d9225ad9 985
d9225ad9
AK
986 return 0;
987}
988
368a0590
AK
989int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
990 unsigned long addr, unsigned long next)
991{
992 int large = pmd_large(*pmdp);
993
994 if (large)
995 vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
996
997 return large;
998}
999
1000void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
1001 unsigned long addr, unsigned long next)
1002{
1003 pte_t entry;
1004 pte_t *ptep = pmdp_ptep(pmdp);
1005
1006 VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
1007 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1008 set_pte_at(&init_mm, addr, ptep, entry);
1009 asm volatile("ptesync": : :"memory");
1010
1011 vmemmap_verify(ptep, node, addr, next);
1012}
1013
1014static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
1015 int node,
1016 struct vmem_altmap *altmap,
1017 struct page *reuse)
1018{
1019 pte_t *pte = pte_offset_kernel(pmdp, addr);
1020
1021 if (pte_none(*pte)) {
1022 pte_t entry;
1023 void *p;
1024
1025 if (!reuse) {
1026 /*
1027 * make sure we don't create altmap mappings
1028 * covering things outside the device.
1029 */
1030 if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
1031 altmap = NULL;
1032
1033 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
1034 if (!p && altmap)
1035 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
1036 if (!p)
1037 return NULL;
1038 } else {
1039 /*
1040 * When a PTE/PMD entry is freed from the init_mm
1041 * there's a free_pages() call to this page allocated
1042 * above. Thus this get_page() is paired with the
1043 * put_page_testzero() on the freeing path.
1044 * This can only be called from certain ZONE_DEVICE paths,
1045 * and through vmemmap_populate_compound_pages() when
1046 * slab is available.
1047 */
1048 get_page(reuse);
1049 p = page_to_virt(reuse);
1050 }
1051
1052 VM_BUG_ON(!PAGE_ALIGNED(addr));
1053 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1054 set_pte_at(&init_mm, addr, pte, entry);
1055 asm volatile("ptesync": : :"memory");
1056 }
1057 return pte;
1058}
1059
1060static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
1061 unsigned long address)
1062{
1063 pud_t *pud;
1064
1065 /* To keep things simple, all early vmemmap mappings are done at PAGE_SIZE */
1066 if (unlikely(p4d_none(*p4dp))) {
1067 if (unlikely(!slab_is_available())) {
1068 pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1069 p4d_populate(&init_mm, p4dp, pud);
1070 /* go to the pud_offset */
1071 } else
1072 return pud_alloc(&init_mm, p4dp, address);
1073 }
1074 return pud_offset(p4dp, address);
1075}
1076
1077static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
1078 unsigned long address)
1079{
1080 pmd_t *pmd;
1081
1082 /* To keep things simple, all early vmemmap mappings are done at PAGE_SIZE */
1083 if (unlikely(pud_none(*pudp))) {
1084 if (unlikely(!slab_is_available())) {
1085 pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1086 pud_populate(&init_mm, pudp, pmd);
1087 } else
1088 return pmd_alloc(&init_mm, pudp, address);
1089 }
1090 return pmd_offset(pudp, address);
1091}
1092
1093static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
1094 unsigned long address)
1095{
1096 pte_t *pte;
1097
1098 /* To keep things simple, all early vmemmap mappings are done at PAGE_SIZE */
1099 if (unlikely(pmd_none(*pmdp))) {
1100 if (unlikely(!slab_is_available())) {
1101 pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1102 pmd_populate(&init_mm, pmdp, pte);
1103 } else
1104 return pte_alloc_kernel(pmdp, address);
1105 }
1106 return pte_offset_kernel(pmdp, address);
1107}
1108
1109
1110
1111int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
1112 struct vmem_altmap *altmap)
1113{
1114 unsigned long addr;
1115 unsigned long next;
1116 pgd_t *pgd;
1117 p4d_t *p4d;
1118 pud_t *pud;
1119 pmd_t *pmd;
1120 pte_t *pte;
1121
1122 for (addr = start; addr < end; addr = next) {
1123 next = pmd_addr_end(addr, end);
1124
1125 pgd = pgd_offset_k(addr);
1126 p4d = p4d_offset(pgd, addr);
1127 pud = vmemmap_pud_alloc(p4d, node, addr);
1128 if (!pud)
1129 return -ENOMEM;
1130 pmd = vmemmap_pmd_alloc(pud, node, addr);
1131 if (!pmd)
1132 return -ENOMEM;
1133
1134 if (pmd_none(READ_ONCE(*pmd))) {
1135 void *p;
1136
1137 /*
1138 * keep it simple by checking addr PMD_SIZE alignment
1139 * and verifying the device boundary condition.
1140 * For us to use a pmd mapping, both addr and pfn should
1141 * be aligned. We skip when addr is not aligned; for the
1142 * pfn we rely on extra area in the altmap to help find
1143 * an aligned block. This can result in altmap block
1144 * allocation failures, in which case we fall back to
1145 * RAM for the vmemmap allocation.
1146 */
1147 if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
1148 altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
1149 /*
1150 * make sure we don't create altmap mappings
1151 * covering things outside the device.
1152 */
1153 goto base_mapping;
1154 }
1155
1156 p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1157 if (p) {
1158 vmemmap_set_pmd(pmd, p, node, addr, next);
1159 continue;
1160 } else if (altmap) {
1161 /*
1162 * A vmemmap block allocation can fail due to
1163 * alignment requirements, when we try to align things
1164 * aggressively and thereby run out of space. Try a
1165 * base mapping on failure.
1166 */
1167 goto base_mapping;
1168 }
1169 } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1170 /*
1171 * If a huge mapping exist due to early call to
1172 * vmemmap_populate, let's try to use that.
1173 */
1174 continue;
1175 }
1176base_mapping:
1177 /*
1178 * Not able to allocate higher order memory to back the
1179 * memmap, or we found a pointer to a pte page. Allocate
1180 * base page size vmemmap.
1181 */
1182 pte = vmemmap_pte_alloc(pmd, node, addr);
1183 if (!pte)
1184 return -ENOMEM;
1185
1186 pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1187 if (!pte)
1188 return -ENOMEM;
1189
1190 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1191 next = addr + PAGE_SIZE;
1192 }
1193 return 0;
1194}
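
radix__vmemmap_populate() above prefers a PMD_SIZE (2M) backing block when the address is suitably aligned and an altmap allocation would not cross the device boundary, and otherwise drops to base pages. A simplified standalone sketch of that decision (boundary check stubbed out, sizes hardcoded to the 2M/64K values used elsewhere in this file):

/*
 * Standalone sketch: the "2M block or base pages?" decision, reduced to the
 * alignment and altmap-boundary conditions described in the comments above.
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SZ (64ULL << 10)
#define PMD_SZ  (2ULL << 20)

/* stand-in for altmap_cross_boundary(); always "safe" in this sketch */
static bool altmap_crosses(unsigned long long addr, unsigned long long size)
{
    (void)addr; (void)size;
    return false;
}

static unsigned long long pick_vmemmap_size(unsigned long long addr, bool have_altmap)
{
    if (!(addr & (PMD_SZ - 1)) &&
        (!have_altmap || !altmap_crosses(addr, PMD_SZ)))
        return PMD_SZ;  /* try a 2M block first */
    return PAGE_SZ;     /* otherwise map base pages */
}

int main(void)
{
    printf("%llu\n", pick_vmemmap_size(0xc00000000ULL, true));  /* 2M aligned */
    printf("%llu\n", pick_vmemmap_size(0xc00010000ULL, true));  /* not aligned */
    return 0;
}
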
1195
d9225ad9 1196#ifdef CONFIG_MEMORY_HOTPLUG
bde709a7 1197void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
d9225ad9 1198{
368a0590
AK
1199 remove_pagetable(start, start + page_size, true, NULL);
1200}
1201
1202void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
1203 struct vmem_altmap *altmap)
1204{
1205 remove_pagetable(start, end, false, altmap);
d9225ad9
AK
1206}
1207#endif
1208#endif
bde3eb62 1209
a5edf981 1210#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
4f703e7f
JS
1211void radix__kernel_map_pages(struct page *page, int numpages, int enable)
1212{
5e8b2c4d
NM
1213 unsigned long addr;
1214
1215 addr = (unsigned long)page_address(page);
1216
1217 if (enable)
1218 set_memory_p(addr, numpages);
1219 else
1220 set_memory_np(addr, numpages);
4f703e7f
JS
1221}
1222#endif
1223
bde3eb62
AK
1224#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1225
1226unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
1227 pmd_t *pmdp, unsigned long clr,
1228 unsigned long set)
1229{
1230 unsigned long old;
1231
1232#ifdef CONFIG_DEBUG_VM
ebd31197 1233 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
af60a4cf 1234 assert_spin_locked(pmd_lockptr(mm, pmdp));
bde3eb62
AK
1235#endif
1236
040ec620 1237 old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
104c49d5 1238 trace_hugepage_update_pmd(addr, old, clr, set);
bde3eb62
AK
1239
1240 return old;
1241}
1242
27af67f3
AK
1243unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
1244 pud_t *pudp, unsigned long clr,
1245 unsigned long set)
1246{
1247 unsigned long old;
1248
1249#ifdef CONFIG_DEBUG_VM
1250 WARN_ON(!pud_devmap(*pudp));
1251 assert_spin_locked(pud_lockptr(mm, pudp));
1252#endif
1253
1254 old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
1255 trace_hugepage_update_pud(addr, old, clr, set);
1256
1257 return old;
1258}
1259
bde3eb62
AK
1260pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
1261 pmd_t *pmdp)
1262
1263{
1264 pmd_t pmd;
1265
1266 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1267 VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
ebd31197 1268 VM_BUG_ON(pmd_devmap(*pmdp));
bde3eb62
AK
1269 /*
1270 * khugepaged calls this for normal pmd
1271 */
1272 pmd = *pmdp;
1273 pmd_clear(pmdp);
424de9c6 1274
424de9c6
BH
1275 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
1276
bde3eb62
AK
1277 return pmd;
1278}
1279
1280/*
1281 * For us pgtable_t is pte_t *. In order to save the deposited
1282 * page table, we consider the allocated page table as a list
1283 * head. On withdraw we need to make sure we zero out the used
1284 * list_head memory area.
1285 */
1286void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1287 pgtable_t pgtable)
1288{
47d99948 1289 struct list_head *lh = (struct list_head *) pgtable;
bde3eb62 1290
47d99948 1291 assert_spin_locked(pmd_lockptr(mm, pmdp));
bde3eb62 1292
47d99948
CL
1293 /* FIFO */
1294 if (!pmd_huge_pte(mm, pmdp))
1295 INIT_LIST_HEAD(lh);
1296 else
1297 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1298 pmd_huge_pte(mm, pmdp) = pgtable;
bde3eb62
AK
1299}
1300
1301pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1302{
47d99948
CL
1303 pte_t *ptep;
1304 pgtable_t pgtable;
1305 struct list_head *lh;
bde3eb62 1306
47d99948
CL
1307 assert_spin_locked(pmd_lockptr(mm, pmdp));
1308
1309 /* FIFO */
1310 pgtable = pmd_huge_pte(mm, pmdp);
1311 lh = (struct list_head *) pgtable;
1312 if (list_empty(lh))
1313 pmd_huge_pte(mm, pmdp) = NULL;
1314 else {
1315 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1316 list_del(lh);
1317 }
1318 ptep = (pte_t *) pgtable;
1319 *ptep = __pte(0);
1320 ptep++;
1321 *ptep = __pte(0);
1322 return pgtable;
1323}
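
The deposit/withdraw pair above reuses the start of the deposited PTE page as a struct list_head, which is why the withdraw path zeroes exactly two PTE slots before handing the page back. A standalone sketch making that size relationship explicit (types are local to the sketch, not kernel headers; assumes a 64-bit build, matching the kernel here):

/*
 * Standalone sketch: why radix__pgtable_trans_huge_withdraw() clears two PTE
 * slots. The deposited page's first bytes double as a list_head, i.e. two
 * pointers, which is the same storage as two 64-bit PTEs on a 64-bit build.
 */
#include <stdio.h>
#include <stdint.h>

struct sketch_list_head {
    struct sketch_list_head *next, *prev;
};

typedef uint64_t sketch_pte_t;  /* a radix PTE is one 64-bit word */

int main(void)
{
    sketch_pte_t slots[4] = { 0 };
    struct sketch_list_head *lh = (struct sketch_list_head *)slots;

    /* "deposit": link the page into a list by writing into its first slots */
    lh->next = lh;
    lh->prev = lh;

    /* "withdraw": scrub the slots that were used as list linkage */
    slots[0] = 0;
    slots[1] = 0;

    printf("list_head uses %zu bytes = %zu PTE slots\n",
           sizeof(*lh), sizeof(*lh) / sizeof(sketch_pte_t));
    return 0;
}
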
bde3eb62
AK
1324
1325pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
47d99948 1326 unsigned long addr, pmd_t *pmdp)
bde3eb62
AK
1327{
1328 pmd_t old_pmd;
1329 unsigned long old;
1330
1331 old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1332 old_pmd = __pmd(old);
bde3eb62
AK
1333 return old_pmd;
1334}
1335
27af67f3
AK
1336pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
1337 unsigned long addr, pud_t *pudp)
1338{
1339 pud_t old_pud;
1340 unsigned long old;
1341
1342 old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
1343 old_pud = __pud(old);
1344 return old_pud;
1345}
1346
bde3eb62 1347#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
044003b5 1348
e4c1112c
AK
1349void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1350 pte_t entry, unsigned long address, int psize)
044003b5 1351{
e4c1112c 1352 struct mm_struct *mm = vma->vm_mm;
66b2ca08
ME
1353 unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
1354 _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
f08d08f3
AK
1355
1356 unsigned long change = pte_val(entry) ^ pte_val(*ptep);
bd5050e3 1357 /*
2a8a0f42
NP
1358 * On POWER9, the NMMU is not able to relax PTE access permissions
1359 * for a translation with a TLB. The PTE must be invalidated, TLB
1360 * flushed before the new PTE is installed.
1361 *
1362 * This only needs to be done for radix, because hash translation does
1363 * flush when updating the linux pte (and we don't support NMMU
1364 * accelerators on HPT on POWER9 anyway XXX: do we?).
1365 *
1366 * POWER10 (and P9P) NMMU does behave as per ISA.
bd5050e3 1367 */
2a8a0f42
NP
1368 if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1369 atomic_read(&mm->context.copros) > 0) {
044003b5
AK
1370 unsigned long old_pte, new_pte;
1371
f08d08f3 1372 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
044003b5 1373 new_pte = old_pte | set;
bd5050e3 1374 radix__flush_tlb_page_psize(mm, address, psize);
f08d08f3 1375 __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
bd5050e3 1376 } else {
044003b5 1377 __radix_pte_update(ptep, 0, set);
e5f7cb58
NP
1378 /*
1379 * Book3S does not require a TLB flush when relaxing access
2a8a0f42
NP
1380 * restrictions when the address space (modulo the POWER9 nest
1381 * MMU issue above) because the MMU will reload the PTE after
1382 * taking an access fault, as defined by the architecture. See
1383 * "Setting a Reference or Change Bit or Upgrading Access
1384 * Authority (PTE Subject to Atomic Hardware Updates)" in
1385 * Power ISA Version 3.1B.
e5f7cb58 1386 */
bd5050e3 1387 }
f1cb8f9b 1388 /* See ptesync comment in radix__set_pte_at */
044003b5 1389}
5b323367
AK
1390
1391void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1392 unsigned long addr, pte_t *ptep,
1393 pte_t old_pte, pte_t pte)
1394{
1395 struct mm_struct *mm = vma->vm_mm;
1396
1397 /*
2a8a0f42
NP
1398 * POWER9 NMMU must flush the TLB after clearing the PTE before
1399 * installing a PTE with more relaxed access permissions, see
1400 * radix__ptep_set_access_flags.
5b323367 1401 */
2a8a0f42
NP
1402 if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1403 is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
5b323367
AK
1404 (atomic_read(&mm->context.copros) > 0))
1405 radix__flush_tlb_page(vma, addr);
1406
1407 set_pte_at(mm, addr, ptep, pte);
1408}
d38153f9 1409
d909f910
NP
1410int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1411{
1412 pte_t *ptep = (pte_t *)pud;
1413 pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1414
1415 if (!radix_enabled())
1416 return 0;
1417
1418 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1419
1420 return 1;
1421}
1422
1423int pud_clear_huge(pud_t *pud)
1424{
467ba14e 1425 if (pud_is_leaf(*pud)) {
d909f910
NP
1426 pud_clear(pud);
1427 return 1;
1428 }
1429
1430 return 0;
1431}
1432
1433int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1434{
1435 pmd_t *pmd;
1436 int i;
1437
9cf6fa24 1438 pmd = pud_pgtable(*pud);
d909f910
NP
1439 pud_clear(pud);
1440
1441 flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1442
1443 for (i = 0; i < PTRS_PER_PMD; i++) {
1444 if (!pmd_none(pmd[i])) {
1445 pte_t *pte;
1446 pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1447
1448 pte_free_kernel(&init_mm, pte);
1449 }
1450 }
1451
1452 pmd_free(&init_mm, pmd);
1453
1454 return 1;
1455}
1456
1457int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1458{
1459 pte_t *ptep = (pte_t *)pmd;
1460 pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1461
1462 if (!radix_enabled())
1463 return 0;
1464
1465 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1466
1467 return 1;
1468}
1469
1470int pmd_clear_huge(pmd_t *pmd)
1471{
467ba14e 1472 if (pmd_is_leaf(*pmd)) {
d909f910
NP
1473 pmd_clear(pmd);
1474 return 1;
1475 }
1476
1477 return 0;
1478}
1479
1480int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1481{
1482 pte_t *pte;
1483
1484 pte = (pte_t *)pmd_page_vaddr(*pmd);
1485 pmd_clear(pmd);
1486
1487 flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1488
1489 pte_free_kernel(&init_mm, pte);
1490
1491 return 1;
1492}