/*
 * TLB flush routines for radix kernels.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>

#include <asm/ppc-opcode.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/trace.h>
#include <asm/cputhreads.h>
#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
#define RIC_FLUSH_ALL 2
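
/*
 * For reference: RIC selects what a tlbie/tlbiel invalidates.
 * RIC=0 drops TLB entries only, RIC=1 drops Page Walk Cache entries
 * only, and RIC=2 drops everything cached for the PID/LPID (TLB,
 * PWC, and process/partition table caches). So, for example,
 * _tlbie_pid(pid, RIC_FLUSH_PWC) below clears cached page-walk
 * levels while leaving the TLB itself intact.
 */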
/*
 * tlbiel instruction for radix, set invalidation
 * i.e., r=1 and is=01 or is=10 or is=11
 */
static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
					unsigned int pid,
					unsigned int ric, unsigned int prs)
{
	unsigned long rb;
	unsigned long rs;
	unsigned int r = 1; /* radix format */

	rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
	rs = ((unsigned long)pid << PPC_BITLSHIFT(31));

	asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
		     : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
		     : "memory");
}
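
/*
 * Worked example of the encoding above: PPC_BITLSHIFT(x) is 63 - x
 * (IBM bit numbering, MSB = 0), so for set = 2, is = 3 this builds
 *	rb = (2 << 12) | (3 << 10) = 0x2c00
 * i.e. the set index ends at bit 51 and the two-bit IS field sits
 * in bits 52:53, while rs = pid << 32 places the PID in RS[0:31].
 */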
static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
{
	unsigned int set;

	asm volatile("ptesync": : :"memory");

	/*
	 * Flush the first set of the TLB, and the entire Page Walk Cache
	 * and partition table entries. Then flush the remaining sets of the
	 * TLB.
	 */
	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
	for (set = 1; set < num_sets; set++)
		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);

	/* Do the same for process scoped entries. */
	tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
	for (set = 1; set < num_sets; set++)
		tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);

	asm volatile("ptesync": : :"memory");
}
void radix__tlbiel_all(unsigned int action)
{
	unsigned int is;

	switch (action) {
	case TLB_INVAL_SCOPE_GLOBAL:
		is = 3;
		break;
	case TLB_INVAL_SCOPE_LPID:
		is = 2;
		break;
	default:
		BUG();
	}

	if (early_cpu_has_feature(CPU_FTR_ARCH_300))
		tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
	else
		WARN(1, "%s called on pre-POWER9 CPU\n", __func__);

	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
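
/*
 * Callers pass one of the TLB_INVAL_SCOPE_* values; e.g. early boot
 * or kexec code can issue
 *	radix__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL);
 * to wipe this thread's TLB and PWC (IS = 3) in case a previous
 * kernel or partition left stale translations behind.
 */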
static inline void __tlbiel_pid(unsigned long pid, int set,
				unsigned long ric)
{
	unsigned long rb, rs, prs, r;

	rb = PPC_BIT(53); /* IS = 1 */
	rb |= set << PPC_BITLSHIFT(51);
	rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
	prs = 1; /* process scoped */
	r = 1;   /* radix format */

	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
	trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
{
	unsigned long rb, rs, prs, r;

	rb = PPC_BIT(53); /* IS = 1 */
	rs = pid << PPC_BITLSHIFT(31);
	prs = 1; /* process scoped */
	r = 1;   /* radix format */

	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
	trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
/*
 * We use 128 sets in radix mode and 256 sets in hpt mode.
 */
static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
{
	int set;

	asm volatile("ptesync": : :"memory");

	/*
	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
	 * also flush the entire Page Walk Cache.
	 */
	__tlbiel_pid(pid, 0, ric);

	/* For PWC, only one flush is needed */
	if (ric == RIC_FLUSH_PWC) {
		asm volatile("ptesync": : :"memory");
		return;
	}

	/* For the remaining sets, just flush the TLB */
	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
		__tlbiel_pid(pid, set, RIC_FLUSH_TLB);

	asm volatile("ptesync": : :"memory");
	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
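
/*
 * Worked example: _tlbiel_pid(pid, RIC_FLUSH_ALL) issues one RIC=2
 * tlbiel against set 0 (which also covers the PWC) followed by
 * POWER9_TLB_SETS_RADIX - 1 = 127 RIC=0 tlbiels for the remaining
 * TLB sets, i.e. 128 instructions bracketed by ptesync.
 */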
static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
{
	asm volatile("ptesync": : :"memory");

	/*
	 * Workaround the fact that the "ric" argument to __tlbie_pid
	 * must be a compile-time constant to match the "i" constraint
	 * in the asm statement.
	 */
	switch (ric) {
	case RIC_FLUSH_TLB:
		__tlbie_pid(pid, RIC_FLUSH_TLB);
		break;
	case RIC_FLUSH_PWC:
		__tlbie_pid(pid, RIC_FLUSH_PWC);
		break;
	case RIC_FLUSH_ALL:
	default:
		__tlbie_pid(pid, RIC_FLUSH_ALL);
	}
	asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
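
/*
 * The switch above looks redundant, but each arm passes a literal,
 * so the compiler can satisfy the "i" constraint: for instance
 * _tlbie_pid(pid, RIC_FLUSH_PWC) becomes a single tlbie with RIC=1
 * encoded in the instruction image rather than read from a register.
 */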
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
			       unsigned long ap, unsigned long ric)
{
	unsigned long rb, rs, prs, r;

	rb = va & ~(PPC_BITMASK(52, 63));
	rb |= ap << PPC_BITLSHIFT(58);
	rs = pid << PPC_BITLSHIFT(31);
	prs = 1; /* process scoped */
	r = 1;   /* radix format */

	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
	trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
				     unsigned long pid, unsigned long page_size,
				     unsigned long psize)
{
	unsigned long addr;
	unsigned long ap = mmu_get_ap(psize);

	for (addr = start; addr < end; addr += page_size)
		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
}
static inline void _tlbiel_va(unsigned long va, unsigned long pid,
			      unsigned long psize, unsigned long ric)
{
	unsigned long ap = mmu_get_ap(psize);

	asm volatile("ptesync": : :"memory");
	__tlbiel_va(va, pid, ap, ric);
	asm volatile("ptesync": : :"memory");
}
static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
				    unsigned long pid, unsigned long page_size,
				    unsigned long psize, bool also_pwc)
{
	asm volatile("ptesync": : :"memory");
	if (also_pwc)
		__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
	__tlbiel_va_range(start, end, pid, page_size, psize);
	asm volatile("ptesync": : :"memory");
}
static inline void __tlbie_va(unsigned long va, unsigned long pid,
			      unsigned long ap, unsigned long ric)
{
	unsigned long rb, rs, prs, r;

	rb = va & ~(PPC_BITMASK(52, 63));
	rb |= ap << PPC_BITLSHIFT(58);
	rs = pid << PPC_BITLSHIFT(31);
	prs = 1; /* process scoped */
	r = 1;   /* radix format */

	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
	trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static inline void __tlbie_va_range(unsigned long start, unsigned long end,
				    unsigned long pid, unsigned long page_size,
				    unsigned long psize)
{
	unsigned long addr;
	unsigned long ap = mmu_get_ap(psize);

	for (addr = start; addr < end; addr += page_size)
		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
static inline void _tlbie_va(unsigned long va, unsigned long pid,
			     unsigned long psize, unsigned long ric)
{
	unsigned long ap = mmu_get_ap(psize);

	asm volatile("ptesync": : :"memory");
	__tlbie_va(va, pid, ap, ric);
	asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
static inline void _tlbie_va_range(unsigned long start, unsigned long end,
				   unsigned long pid, unsigned long page_size,
				   unsigned long psize, bool also_pwc)
{
	asm volatile("ptesync": : :"memory");
	if (also_pwc)
		__tlbie_pid(pid, RIC_FLUSH_PWC);
	__tlbie_va_range(start, end, pid, page_size, psize);
	asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
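
/*
 * Note the differing completion sequences: the local _tlbiel_*
 * primitives only need a trailing ptesync, while the global
 * _tlbie_* ones use "eieio; tlbsync; ptesync" to order the
 * broadcast invalidation on the fabric before returning, so other
 * CPUs are guaranteed to stop using the stale translations.
 */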
/*
 * Base TLB flushing operations:
 *
 *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
 *  - flush_tlb_page(vma, vmaddr) flushes one page
 *  - flush_tlb_range(vma, start, end) flushes a range of pages
 *  - flush_tlb_kernel_range(start, end) flushes kernel pages
 *
 *  - local_* variants of page and mm only apply to the current
 *    processor
 */
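
/*
 * For instance, generic code unmapping a single page ends up here
 * via the flush_tlb_page() hook as
 *	radix__flush_tlb_page(vma, vmaddr);
 * whereas the local_* variants only invalidate the current CPU's
 * TLB and are appropriate when the mm is known to be thread-local.
 */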
void radix__local_flush_tlb_mm(struct mm_struct *mm)
{
	unsigned long pid;

	preempt_disable();
	pid = mm->context.id;
	if (pid != MMU_NO_CONTEXT)
		_tlbiel_pid(pid, RIC_FLUSH_TLB);
	preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_tlb_mm);
#ifndef CONFIG_SMP
void radix__local_flush_all_mm(struct mm_struct *mm)
{
	unsigned long pid;

	preempt_disable();
	pid = mm->context.id;
	if (pid != MMU_NO_CONTEXT)
		_tlbiel_pid(pid, RIC_FLUSH_ALL);
	preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_all_mm);
#endif /* CONFIG_SMP */
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
				       int psize)
{
	unsigned long pid;

	preempt_disable();
	pid = mm->context.id;
	if (pid != MMU_NO_CONTEXT)
		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
	preempt_enable();
}
void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
#ifdef CONFIG_HUGETLB_PAGE
	/* need the return fix for nohash.c */
	if (is_vm_hugetlb_page(vma))
		return radix__local_flush_hugetlb_page(vma, vmaddr);
#endif
	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
}
EXPORT_SYMBOL(radix__local_flush_tlb_page);
#ifdef CONFIG_SMP
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
	/*
	 * P9 nest MMU has issues with the page walk cache
	 * caching PTEs and not flushing them properly when
	 * RIC = 0 for a PID/LPID invalidate
	 */
	return atomic_read(&mm->context.copros) != 0;
}
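
/*
 * Example: an mm with a coprocessor context currently attached
 * (mm->context.copros != 0) gets its plain PID flushes escalated to
 * RIC=2 below, so the nest MMU's page walk cache is invalidated as
 * well.
 */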
void radix__flush_tlb_mm(struct mm_struct *mm)
{
	unsigned long pid;

	pid = mm->context.id;
	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	preempt_disable();
	if (!mm_is_thread_local(mm)) {
		if (mm_needs_flush_escalation(mm))
			_tlbie_pid(pid, RIC_FLUSH_ALL);
		else
			_tlbie_pid(pid, RIC_FLUSH_TLB);
	} else
		_tlbiel_pid(pid, RIC_FLUSH_TLB);
	preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
void radix__flush_all_mm(struct mm_struct *mm)
{
	unsigned long pid;

	pid = mm->context.id;
	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	preempt_disable();
	if (!mm_is_thread_local(mm))
		_tlbie_pid(pid, RIC_FLUSH_ALL);
	else
		_tlbiel_pid(pid, RIC_FLUSH_ALL);
	preempt_enable();
}
EXPORT_SYMBOL(radix__flush_all_mm);
void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
{
	tlb->need_flush_all = 1;
}
EXPORT_SYMBOL(radix__flush_tlb_pwc);
void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
				 int psize)
{
	unsigned long pid;

	pid = mm->context.id;
	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	preempt_disable();
	if (!mm_is_thread_local(mm))
		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
	else
		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
	preempt_enable();
}
void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
#ifdef CONFIG_HUGETLB_PAGE
	if (is_vm_hugetlb_page(vma))
		return radix__flush_hugetlb_page(vma, vmaddr);
#endif
	radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
}
EXPORT_SYMBOL(radix__flush_tlb_page);
#else /* CONFIG_SMP */
#define radix__flush_all_mm radix__local_flush_all_mm
#endif /* CONFIG_SMP */
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	_tlbie_pid(0, RIC_FLUSH_ALL);
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
#define TLB_FLUSH_ALL -1UL
/*
 * Number of pages above which we invalidate the entire PID rather than
 * flush individual pages, for local and global flushes respectively.
 *
 * tlbie goes out to the interconnect and individual ops are more costly.
 * It also does not iterate over sets like the local tlbiel variant when
 * invalidating a full PID, so it has a far lower threshold to change from
 * individual page flushes to full-pid flushes.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
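
/*
 * Worked example with 64K base pages: flushing a 3MB range covers
 * 48 pages. Locally, 48 <= 256 (POWER9_TLB_SETS_RADIX * 2), so each
 * page gets its own tlbiel; globally, 48 > 33, so the whole PID is
 * invalidated with a PID-wide tlbie instead.
 */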
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)

{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long pid;
	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
	unsigned long page_size = 1UL << page_shift;
	unsigned long nr_pages = (end - start) >> page_shift;
	bool local, full;

#ifdef CONFIG_HUGETLB_PAGE
	if (is_vm_hugetlb_page(vma))
		return radix__flush_hugetlb_tlb_range(vma, start, end);
#endif

	pid = mm->context.id;
	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	preempt_disable();
	if (mm_is_thread_local(mm)) {
		local = true;
		full = (end == TLB_FLUSH_ALL ||
				nr_pages > tlb_local_single_page_flush_ceiling);
	} else {
		local = false;
		full = (end == TLB_FLUSH_ALL ||
				nr_pages > tlb_single_page_flush_ceiling);
	}

	if (full) {
		if (local) {
			_tlbiel_pid(pid, RIC_FLUSH_TLB);
		} else {
			if (mm_needs_flush_escalation(mm))
				_tlbie_pid(pid, RIC_FLUSH_ALL);
			else
				_tlbie_pid(pid, RIC_FLUSH_TLB);
		}
	} else {
		bool hflush = false;
		unsigned long hstart, hend;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
		hend = end >> HPAGE_PMD_SHIFT;
		if (hstart < hend) {
			hstart <<= HPAGE_PMD_SHIFT;
			hend <<= HPAGE_PMD_SHIFT;
			hflush = true;
		}
#endif

		asm volatile("ptesync": : :"memory");
		if (local) {
			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
			if (hflush)
				__tlbiel_va_range(hstart, hend, pid,
						HPAGE_PMD_SIZE, MMU_PAGE_2M);
			asm volatile("ptesync": : :"memory");
		} else {
			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
			if (hflush)
				__tlbie_va_range(hstart, hend, pid,
						HPAGE_PMD_SIZE, MMU_PAGE_2M);
			asm volatile("eieio; tlbsync; ptesync": : :"memory");
		}
	}
	preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_range);
static int radix_get_mmu_psize(int page_size)
{
	int psize;

	if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
		psize = mmu_virtual_psize;
	else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
		psize = MMU_PAGE_2M;
	else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
		psize = MMU_PAGE_1G;
	else
		return -1;
	return psize;
}
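
/*
 * Only the three radix page sizes are recognised; e.g.
 * radix_get_mmu_psize(1UL << 21) returns MMU_PAGE_2M, while an
 * unknown size yields -1, which radix__tlb_flush() below treats as
 * "do a full mm flush".
 */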
static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
					     unsigned long end, int psize);
void radix__tlb_flush(struct mmu_gather *tlb)
{
	int psize = 0;
	struct mm_struct *mm = tlb->mm;
	int page_size = tlb->page_size;

	/*
	 * if page size is not something we understand, do a full mm flush
	 *
	 * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
	 * that flushes the process table entry cache upon process teardown.
	 * See the comment for radix in arch_exit_mmap().
	 */
	if (tlb->fullmm) {
		radix__flush_all_mm(mm);
	} else if ((psize = radix_get_mmu_psize(page_size)) == -1) {
		if (!tlb->need_flush_all)
			radix__flush_tlb_mm(mm);
		else
			radix__flush_all_mm(mm);
	} else {
		unsigned long start = tlb->start;
		unsigned long end = tlb->end;

		if (!tlb->need_flush_all)
			radix__flush_tlb_range_psize(mm, start, end, psize);
		else
			radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
	}
	tlb->need_flush_all = 0;
}
static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
				unsigned long start, unsigned long end,
				int psize, bool also_pwc)
{
	unsigned long pid;
	unsigned int page_shift = mmu_psize_defs[psize].shift;
	unsigned long page_size = 1UL << page_shift;
	unsigned long nr_pages = (end - start) >> page_shift;
	bool local, full;

	pid = mm->context.id;
	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	preempt_disable();
	if (mm_is_thread_local(mm)) {
		local = true;
		full = (end == TLB_FLUSH_ALL ||
				nr_pages > tlb_local_single_page_flush_ceiling);
	} else {
		local = false;
		full = (end == TLB_FLUSH_ALL ||
				nr_pages > tlb_single_page_flush_ceiling);
	}

	if (full) {
		if (!local && mm_needs_flush_escalation(mm))
			also_pwc = true;

		if (local)
			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
		else
			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
	} else {
		if (local)
			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
		else
			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
	}
	preempt_enable();
}
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
				  unsigned long end, int psize)
{
	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
}

static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
					     unsigned long end, int psize)
{
	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
	unsigned long pid, end;

	pid = mm->context.id;
	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	/* 4k page size, just blow the world */
	if (PAGE_SIZE == 0x1000) {
		radix__flush_all_mm(mm);
		return;
	}

	end = addr + HPAGE_PMD_SIZE;

	/* Otherwise first do the PWC, then iterate the pages. */
	preempt_disable();

	if (mm_is_thread_local(mm)) {
		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
	} else {
		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
	}

	preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
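
/*
 * Worked example for radix__flush_tlb_collapsed_pmd() above: with a
 * 64K base page size, the collapsed range spans HPAGE_PMD_SIZE /
 * PAGE_SIZE = 2M / 64K = 32 page-sized invalidations plus the PWC
 * flush requested via also_pwc = true.
 */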
void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
}
EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
void radix__flush_tlb_all(void)
{
	unsigned long rb, prs, r, rs;
	unsigned long ric = RIC_FLUSH_ALL;

	rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
	prs = 0; /* partition scoped */
	r = 1;   /* radix format */
	rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */

	asm volatile("ptesync": : :"memory");
	/*
	 * now flush guest entries by passing PRS = 1 and LPID != 0
	 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
	/*
	 * now flush host entries by passing PRS = 0 and LPID == 0
	 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
	asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
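
/*
 * In radix__flush_tlb_all() above, rb = 0x3 << PPC_BITLSHIFT(53)
 * = 0x3 << 10 = 0xc00, i.e. IS = 0b11 ("all entries"), so each
 * tlbie invalidates every cached translation for its scope rather
 * than a single PID or page.
 */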
void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
				 unsigned long address)
{
	/*
	 * We track page size in pte only for DD1, so we can
	 * call this only on DD1.
	 */
	if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) {
		VM_WARN_ON(1);
		return;
	}

	if (old_pte & R_PAGE_LARGE)
		radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
	else
		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
{
	unsigned int pid = mm->context.id;

	if (unlikely(pid == MMU_NO_CONTEXT))
		return;

	/*
	 * If this context hasn't run on that CPU before and KVM is
	 * around, there's a slim chance that the guest on another
	 * CPU just brought an obsolete translation into the TLB of
	 * this CPU due to a bad prefetch using the guest PID on
	 * the way into the hypervisor.
	 *
	 * We work around this here. If KVM is possible, we check if
	 * any sibling thread is in KVM. If it is, the window may exist
	 * and thus we flush that PID from the core.
	 *
	 * A potential future improvement would be to mark which PIDs
	 * have never been used on the system and avoid it if the PID
	 * is new and the process has no other cpumask bit set.
	 */
	if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
		int cpu = smp_processor_id();
		int sib = cpu_first_thread_sibling(cpu);
		bool flush = false;

		for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
			if (sib == cpu)
				continue;
			if (paca[sib].kvm_hstate.kvm_vcpu)
				flush = true;
		}
		if (flush)
			_tlbiel_pid(pid, RIC_FLUSH_ALL);
	}
}
EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */