// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 * Copyright (C) 2012 Regents of the University of California
 */


#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/uaccess.h>
#include <linux/kprobes.h>
#include <linux/kfence.h>
#include <linux/entry-common.h>

#include <asm/ptrace.h>
#include <asm/tlbflush.h>

#include "../kernel/head.h"

static void die_kernel_fault(const char *msg, unsigned long addr,
		struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", msg,
		 addr);

	bust_spinlocks(0);
	die(regs, "Oops");
	make_task_dead(SIGKILL);
}

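/*
 * fixup_exception() below consults the kernel exception table: uaccess
 * helpers record their faulting instruction addresses there, so a bad
 * user pointer redirects execution to a fixup stub (typically making
 * the helper return -EFAULT) instead of killing the kernel.
 */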
static inline void no_context(struct pt_regs *regs, unsigned long addr)
{
	const char *msg;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	if (addr < PAGE_SIZE)
		msg = "NULL pointer dereference";
	else {
		if (kfence_handle_page_fault(addr, regs->cause == EXC_STORE_PAGE_FAULT, regs))
			return;

		msg = "paging request";
	}

	die_kernel_fault(msg, addr, regs);
}

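/*
 * mm_fault_error() sorts the VM_FAULT_ERROR bits returned by
 * handle_mm_fault(): OOM defers to the OOM killer for user tasks,
 * SIGBUS/HWPOISON becomes a signal, and anything else is a kernel bug.
 */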
static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault)
{
	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we
		 * got oom-killed).
		 */
		if (!user_mode(regs)) {
			no_context(regs, addr);
			return;
		}
		pagefault_out_of_memory();
		return;
	} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
		/* Kernel mode? Handle exceptions or die */
		if (!user_mode(regs)) {
			no_context(regs, addr);
			return;
		}
		do_trap(regs, SIGBUS, BUS_ADRERR, addr);
		return;
	}
	BUG();
}

static inline void
bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr)
{
	/*
	 * Something tried to access memory that isn't in our memory map.
	 * Fix it, but check if it's kernel or user first.
	 */
	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs)) {
		do_trap(regs, SIGSEGV, code, addr);
		return;
	}

	no_context(regs, addr);
}

static inline void
bad_area(struct pt_regs *regs, struct mm_struct *mm, int code,
	 unsigned long addr)
{
	mmap_read_unlock(mm);

	bad_area_nosemaphore(regs, code, addr);
}

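/*
 * vmalloc can install new upper-level entries in init_mm.pgd after a
 * task's page table was created; the first kernel access through such
 * an entry faults, and vmalloc_fault() copies the missing top-level
 * entry from the reference table. 64-bit kernels pre-allocate these
 * entries, hence the IS_ENABLED() guard in handle_page_fault() below.
 */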
static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr)
{
	pgd_t *pgd, *pgd_k;
	pud_t *pud_k;
	p4d_t *p4d_k;
	pmd_t *pmd_k;
	pte_t *pte_k;
	int index;
	unsigned long pfn;

	/* User mode accesses just cause a SIGSEGV */
	if (user_mode(regs))
		return do_trap(regs, SIGSEGV, code, addr);

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "tsk->active_mm->pgd" here.
	 * We might be inside an interrupt in the middle
	 * of a task switch.
	 */
	index = pgd_index(addr);
	pfn = csr_read(CSR_SATP) & SATP_PPN;
	pgd = (pgd_t *)pfn_to_virt(pfn) + index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k)) {
		no_context(regs, addr);
		return;
	}
	set_pgd(pgd, *pgd_k);

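	/*
	 * From here on the walk stays in init_mm's tree (the *_k
	 * pointers): once the top-level entry is copied, all lower
	 * levels are shared with the reference page table.
	 */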
	p4d_k = p4d_offset(pgd_k, addr);
	if (!p4d_present(*p4d_k)) {
		no_context(regs, addr);
		return;
	}

	pud_k = pud_offset(p4d_k, addr);
	if (!pud_present(*pud_k)) {
		no_context(regs, addr);
		return;
	}
	if (pud_leaf(*pud_k))
		goto flush_tlb;

	/*
	 * Since the vmalloc area is global, it is unnecessary
	 * to copy individual PTEs
	 */
	pmd_k = pmd_offset(pud_k, addr);
	if (!pmd_present(*pmd_k)) {
		no_context(regs, addr);
		return;
	}
	if (pmd_leaf(*pmd_k))
		goto flush_tlb;

	/*
	 * Make sure the actual PTE exists as well to
	 * catch kernel vmalloc-area accesses to non-mapped
	 * addresses. If we don't do this, this will just
	 * silently loop forever.
	 */
	pte_k = pte_offset_kernel(pmd_k, addr);
	if (!pte_present(*pte_k)) {
		no_context(regs, addr);
		return;
	}

	/*
	 * The kernel assumes that TLBs don't cache invalid
	 * entries, but in RISC-V, SFENCE.VMA specifies an
	 * ordering constraint, not a cache flush; it is
	 * necessary even after writing invalid entries.
	 */
flush_tlb:
	local_flush_tlb_page(addr);
}

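/*
 * cause is the scause value saved at trap time; in the RISC-V
 * privileged spec encoding, EXC_INST_PAGE_FAULT, EXC_LOAD_PAGE_FAULT
 * and EXC_STORE_PAGE_FAULT are exception codes 12, 13 and 15.
 */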
static inline bool access_error(unsigned long cause, struct vm_area_struct *vma)
{
	switch (cause) {
	case EXC_INST_PAGE_FAULT:
		if (!(vma->vm_flags & VM_EXEC)) {
			return true;
		}
		break;
	case EXC_LOAD_PAGE_FAULT:
		/* Write implies read */
		if (!(vma->vm_flags & (VM_READ | VM_WRITE))) {
			return true;
		}
		break;
	case EXC_STORE_PAGE_FAULT:
		if (!(vma->vm_flags & VM_WRITE)) {
			return true;
		}
		break;
	default:
		panic("%s: unhandled cause %lu", __func__, cause);
	}
	return false;
}

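/*
 * In current kernels handle_page_fault() is reached from the
 * generic-entry trap path: do_page_fault() in the arch trap code calls
 * it between irqentry_enter() and irqentry_exit(), so entry accounting
 * has already been taken care of when we get here.
 */
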
/*
 * This routine handles page faults.  It determines the address and the
 * problem, and then passes it off to one of the appropriate routines.
 */
void handle_page_fault(struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	unsigned long addr, cause;
	unsigned int flags = FAULT_FLAG_DEFAULT;
	int code = SEGV_MAPERR;
	vm_fault_t fault;

	cause = regs->cause;
	addr = regs->badaddr;

	tsk = current;
	mm = tsk->mm;

	if (kprobe_page_fault(regs, cause))
		return;

	/*
	 * Fault-in kernel-space virtual memory on-demand.
	 * The 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if ((!IS_ENABLED(CONFIG_MMU) || !IS_ENABLED(CONFIG_64BIT)) &&
	    unlikely(addr >= VMALLOC_START && addr < VMALLOC_END)) {
		vmalloc_fault(regs, code, addr);
		return;
	}

	/* Enable interrupts if they were enabled in the parent context. */
	if (!regs_irqs_disabled(regs))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context, or are running
	 * in an atomic region, then we must not take the fault.
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		tsk->thread.bad_cause = cause;
		no_context(regs, addr);
		return;
	}

	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

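	/*
	 * Faults from kernel mode on a user address are only legitimate
	 * inside uaccess regions, which set the sstatus.SUM bit
	 * ("permit Supervisor User Memory access"). With SUM clear this
	 * is a stray user-pointer dereference in kernel code.
	 */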
	if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) {
		if (fixup_exception(regs))
			return;

		die_kernel_fault("access to user memory without uaccess routines", addr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	if (cause == EXC_STORE_PAGE_FAULT)
		flags |= FAULT_FLAG_WRITE;
	else if (cause == EXC_INST_PAGE_FAULT)
		flags |= FAULT_FLAG_INSTRUCTION;
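
	/*
	 * Fast path: try the fault under the per-VMA lock first, so
	 * user faults avoid mmap_lock contention. Without
	 * CONFIG_PER_VMA_LOCK, lock_vma_under_rcu() is a stub that
	 * returns NULL and we fall through to the mmap_lock path.
	 */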
	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	if (unlikely(access_error(cause, vma))) {
		vma_end_read(vma);
		goto lock_mmap;
	}

	fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);

	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			no_context(regs, addr);
		return;
	}
lock_mmap:

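	/*
	 * Slow path: take mmap_lock and look up the VMA.
	 * lock_mm_and_find_vma() also expands the stack for an access
	 * just below a grows-down mapping.
	 */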
retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		tsk->thread.bad_cause = cause;
		bad_area_nosemaphore(regs, code, addr);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it.
	 */
	code = SEGV_ACCERR;

	if (unlikely(access_error(cause, vma))) {
		tsk->thread.bad_cause = cause;
		bad_area(regs, mm, code, addr);
		return;
	}

	/*
	 * If for any reason at all we could not handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, addr, flags, regs);

	/*
	 * If we need to retry but a fatal signal is pending, handle the
	 * signal first. We do not need to release the mmap_lock because it
	 * would already be released in __lock_page_or_retry in mm/filemap.c.
	 */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			no_context(regs, addr);
		return;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return;

	if (unlikely(fault & VM_FAULT_RETRY)) {
		flags |= FAULT_FLAG_TRIED;

		/*
		 * No need to mmap_read_unlock(mm) as we would
		 * have already released it in __lock_page_or_retry
		 * in mm/filemap.c.
		 */
		goto retry;
	}

	mmap_read_unlock(mm);

done:
	if (unlikely(fault & VM_FAULT_ERROR)) {
		tsk->thread.bad_cause = cause;
		mm_fault_error(regs, addr, fault);
		return;
	}
	return;
}