/* arch/powerpc/mm/pgtable-hash64.c */
/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include "mmu_decl.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
int __meminit hash__vmemmap_create_mapping(unsigned long start,
					   unsigned long page_size,
					   unsigned long phys)
{
	int rc = htab_bolt_mapping(start, start + page_size, phys,
				   pgprot_val(PAGE_KERNEL),
				   mmu_vmemmap_psize, mmu_kernel_ssize);
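	/*
	 * If bolting the mapping failed, remove any HPTEs that were
	 * created for this range before returning the error.
	 */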
	if (rc < 0) {
		int rc2 = htab_remove_mapping(start, start + page_size,
					      mmu_vmemmap_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
				  unsigned long page_size)
{
	int rc = htab_remove_mapping(start, start + page_size,
				     mmu_vmemmap_psize,
				     mmu_kernel_ssize);
	BUG_ON((rc < 0) && (rc != -ENOENT));
	WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * map_kernel_page is currently only called by __ioremap.  It adds an
 * entry for the mapping to the ioremap page table and adds an entry
 * to the HPT, possibly bolting it.
 */
int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

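	/* The 64-bit user address space must fit within the hash page table range. */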
	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
						       __pgprot(flags)));
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

	smp_wmb();
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					pmd_t *pmdp, unsigned long clr,
					unsigned long set)
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

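	/*
	 * Atomically update the PMD with a ldarx/stdcx. loop: spin while
	 * H_PAGE_BUSY is set, then clear the bits in 'clr', set the bits
	 * in 'set', and return the old PMD value.
	 */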
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);

	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}

pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to a hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with a local ptep pointer value. Such a hash_page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This makes sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault that takes mmap_sem and
	 * hence waits for the collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * We want to put the pgtable in the pmd and use the pgtable for tracking
 * the base page size hptes.
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				      pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;
	assert_spin_locked(&mm->page_table_lock);
	/*
	 * We store the pgtable in the second half of the PMD.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * Expose the deposited pgtable to other cpus before we set
	 * the hugepage PTE at the pmd level; the hash fault code looks
	 * at the deposited pgtable to store hash index values.
	 */
	smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * Zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
				   unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to keep the pmd marked TRANS HUGE
	 * while we split, but at the same time we want the rest of the
	 * ppc64 code not to insert a hash pte for it, because we will be
	 * modifying the deposited pgtable in the caller of this function.
	 * Hence clear _PAGE_USER so that the fault handling moves to a
	 * higher level function that serializes against the ptl.
	 * We need to flush existing hash pte entries here, even though
	 * the translation is still valid, because we will withdraw the
	 * pgtable_t after this.
	 */
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;
	const struct cpumask *tmp;

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

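	/*
	 * If this mm has only ever run on the current CPU, the HPTE
	 * invalidation can be done locally instead of being broadcast.
	 */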
	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Zero out the old valid and hash index details that the
	 * hash fault code looks at.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	/*
	 * Serialize against find_linux_pte_or_hugepte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_linux_pte_or_hugepte to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}

int hash__has_transparent_hugepage(void)
{
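	/* THP on hash requires the MMU to support 16MB pages. */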
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support a 16MB hugepage in a segment
	 * with a base page size of 64K or 4K. We only enable THP with a
	 * PAGE_SIZE of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok, we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */