powerpc/mm: Pass node id into create_section_mapping
arch/powerpc/mm/pgtable-radix.c
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/string_helpers.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>

#include <trace/events/thp.h>

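/*
 * mmu_pid_bits is the number of implemented PID bits (taken from the
 * "ibm,mmu-pid-bits" device-tree property when present, otherwise a default
 * chosen below); mmu_base_pid is the first PID handed out, set up in
 * radix_init_pgtable().
 */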
unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

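/*
 * Bare-metal implementation of register_process_table(): point partition
 * table entry 0 at the process table, keeping the existing patb0 and
 * encoding the base, size and PATB_GR bit in patb1.
 */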
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
					  unsigned long table_size)
{
	unsigned long patb0, patb1;

	patb0 = be64_to_cpu(partition_tb[0].patb0);
	patb1 = base | table_size | PATB_GR;

	mmu_partition_table_set_entry(0, patb0, patb1);

	return 0;
}

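/*
 * Boot-time page table allocator: return zeroed memory from memblock,
 * honouring an optional physical region hint first, then a NUMA node hint,
 * and finally falling back to any available memory.
 */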
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	unsigned long pa = 0;
	void *pt;

	if (region_start || region_end) /* has region hint */
		pa = memblock_alloc_range(size, size, region_start, region_end,
						MEMBLOCK_NONE);
	else if (nid != -1) /* has node hint */
		pa = memblock_alloc_base_nid(size, size,
						MEMBLOCK_ALLOC_ANYWHERE,
						nid, MEMBLOCK_NONE);

	if (!pa)
		pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);

	BUG_ON(!pa);

	pt = __va(pa);
	memset(pt, 0, size);

	return pt;
}

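/*
 * Map one kernel page before the slab allocator is up, creating any missing
 * intermediate page-table levels from memblock via early_alloc_pgtable().
 */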
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
						region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
						region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
						region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

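/*
 * radix__change_memory_range() walks the kernel mapping a page at a time,
 * clears the requested PTE bits (e.g. _PAGE_WRITE or _PAGE_EXEC) in place,
 * including on huge PUD/PMD entries, then flushes the TLB for the range.
 */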
#ifdef CONFIG_STRICT_KERNEL_RWX
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_huge(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_huge(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	/*
	 * mark_rodata_ro() will mark itself as !writable at some point.
	 * Due to DD1 workaround in radix__pte_update(), we'll end up with
	 * an invalid pte and the system will crash quite severely.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
		pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
		return;
	}

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit print_mapping(unsigned long start,
					   unsigned long end,
					   unsigned long size)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
}

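/*
 * Create the linear mapping for [start, end), preferring 1G, then 2M, then
 * base pages depending on alignment and the supported MMU page sizes. With
 * STRICT_KERNEL_RWX the mapping is split around the kernel text so it can
 * later be marked read-only; text and interrupt-vector ranges get
 * PAGE_KERNEL_X. The nid is only a placement hint for the page tables.
 */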
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid)
{
	unsigned long vaddr, addr, mapping_size = 0;
	pgprot_t prot;
	unsigned long max_mapping_size;
#ifdef CONFIG_STRICT_KERNEL_RWX
	int split_text_mapping = 1;
#else
	int split_text_mapping = 0;
#endif

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = end - addr;
		previous_size = mapping_size;
		max_mapping_size = PUD_SIZE;

retry:
		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift &&
		    PUD_SIZE <= max_mapping_size)
			mapping_size = PUD_SIZE;
		else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			 mmu_psize_defs[MMU_PAGE_2M].shift)
			mapping_size = PMD_SIZE;
		else
			mapping_size = PAGE_SIZE;

		if (split_text_mapping && (mapping_size == PUD_SIZE) &&
			(addr <= __pa_symbol(__init_begin)) &&
			(addr + mapping_size) >= __pa_symbol(_stext)) {
			max_mapping_size = PMD_SIZE;
			goto retry;
		}

		if (split_text_mapping && (mapping_size == PMD_SIZE) &&
			(addr <= __pa_symbol(__init_begin)) &&
			(addr + mapping_size) >= __pa_symbol(_stext))
			mapping_size = PAGE_SIZE;

		if (mapping_size != previous_size) {
			print_mapping(start, addr, previous_size);
			start = addr;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
			prot = PAGE_KERNEL_X;
		else
			prot = PAGE_KERNEL;

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;
	}

	print_mapping(start, addr, mapping_size);
	return 0;
}

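/*
 * Boot-time construction of the kernel radix page tables: map all memblock
 * memory, work out the PID allocation range, then allocate and publish the
 * process table.
 */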
void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */
		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1));
	}

	/* Find out how many PID bits are supported */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here, but our linear mapping also
	 * enables us to use the physical address.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
}

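/*
 * Bare metal only: allocate the partition table and point entry 0 at the
 * kernel's radix tree (PATB_HR selects host radix translation).
 */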
static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}

void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

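/*
 * Flat device-tree scan for the "cpu" nodes: pick up the PID width and the
 * radix AP page-size encodings, and record each supported page size in
 * mmu_psize_defs[].
 */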
static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}

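/*
 * POWER9 DD1 only: invalidate both partition- and process-scoped TLB
 * entries, then set HID0_POWER9_RADIX and wait for the mode switch to take
 * effect.
 */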
static void update_hid_for_radix(void)
{
	unsigned long hid0;
	unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */

	asm volatile("ptesync": : :"memory");
	/* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
	/* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
	asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
	trace_tlbie(0, 0, rb, 0, 2, 0, 1);
	trace_tlbie(0, 0, rb, 0, 2, 1, 1);

	/*
	 * now switch the HID
	 */
	hid0 = mfspr(SPRN_HID0);
	hid0 |= HID0_POWER9_RADIX;
	mtspr(SPRN_HID0, hid0);
	asm volatile("isync": : :"memory");

	/* Wait for it to happen */
	while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
		cpu_relax();
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

static void radix_init_iamr(void)
{
	unsigned long iamr;

	/*
	 * The IAMR should be set to 0 on DD1.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
		iamr = 0;
	else
		iamr = (1ul << 62);

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, iamr);
}

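/*
 * Boot CPU MMU setup: choose page sizes, plumb the radix geometry into the
 * generic page-table constants, then either initialise the partition table
 * directly (bare metal) or register with the hypervisor (pseries), and
 * finally build the kernel page tables.
 */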
void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pmd_cache_index = RADIX_PMD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif

	/*
	 * For now radix also uses the same frag size
	 */
	__pte_frag_nr = H_PTE_FRAG_NR;
	__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_native();
		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_iamr();
	radix_init_pgtable();

	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {

		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();

		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
		radix_init_amor();
	}
	radix_init_iamr();

	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		mtspr(SPRN_PTCR, 0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				       phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors.
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

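/*
 * Memory hot-unplug support: the remove_*_table() helpers below clear the
 * linear mapping for a removed range and free page-table pages that become
 * empty, all under init_mm.page_table_lock.
 */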
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_huge(*pmd)) {
			if (!IS_ALIGNED(addr, PMD_SIZE) ||
			    !IS_ALIGNED(next, PMD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_huge(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

static void remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_huge(*pgd)) {
			if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
			    !IS_ALIGNED(next, PGDIR_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

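/*
 * Arch hooks called from memory hotplug: the nid lets the linear-mapping
 * page tables for a newly added section be placed on the node being added.
 */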
int __ref radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
	return create_physical_mapping(start, end, nid);
}

int radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

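/*
 * vmemmap handling: radix__vmemmap_create_mapping() uses the node of the
 * backing physical memory as a placement hint for the page tables.
 */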
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

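/*
 * Update a huge-page PMD under the page-table lock and emit a hugepage
 * trace event; used by the pmdp helpers below.
 */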
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)

{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*FIXME!! Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}


pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
	 * that of pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */