Commit | Line | Data |
---|---|---|
0d17de03 HM |
1 | // SPDX-License-Identifier: GPL-2.0+ |
2 | /* | |
3 | * VAS Fault handling. | |
4 | * Copyright 2019, IBM Corporation | |
5 | */ | |
6 | ||
7 | #define pr_fmt(fmt) "vas: " fmt | |
8 | ||
9 | #include <linux/kernel.h> | |
10 | #include <linux/types.h> | |
11 | #include <linux/slab.h> | |
12 | #include <linux/uaccess.h> | |
13 | #include <linux/kthread.h> | |
c96c4436 | 14 | #include <linux/sched/signal.h> |
9774628a | 15 | #include <linux/mmu_context.h> |
0d17de03 HM |
16 | #include <asm/icswx.h> |
17 | ||
18 | #include "vas.h" | |
19 | ||
20 | /* | |
21 | * The maximum FIFO size for a fault window is 8MB | |
22 | * (VAS_RX_FIFO_SIZE_MAX). A 4MB FIFO is used here since each VAS | |
23 | * instance has its own fault window. | |
24 | * An 8MB FIFO can be used if more faults are expected for each VAS | |
25 | * instance. | |
26 | */ | |
27 | #define VAS_FAULT_WIN_FIFO_SIZE (4 << 20) | |
28 | ||
cf33e1e9 HM |
29 | static void dump_crb(struct coprocessor_request_block *crb) |
30 | { | |
31 | struct data_descriptor_entry *dde; | |
32 | struct nx_fault_stamp *nx; | |
33 | ||
34 | dde = &crb->source; | |
35 | pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", | |
36 | be64_to_cpu(dde->address), be32_to_cpu(dde->length), | |
37 | dde->count, dde->index, dde->flags); | |
38 | ||
39 | dde = &crb->target; | |
40 | pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", | |
41 | be64_to_cpu(dde->address), be32_to_cpu(dde->length), | |
42 | dde->count, dde->index, dde->flags); | |
43 | ||
44 | nx = &crb->stamp.nx; | |
45 | pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n", | |
46 | be32_to_cpu(nx->pswid), | |
47 | be64_to_cpu(crb->stamp.nx.fault_storage_addr), | |
48 | nx->flags, nx->fault_status); | |
49 | } | |
50 | ||
c96c4436 HM |
51 | /* |
52 | * Update the CSB to indicate a translation error. | |
53 | * | |
54 | * User space will be polling on CSB after the request is issued. | |
55 | * If NX can handle the request without any issues, it updates CSB. | |
56 | * Whereas if NX encounters page fault, the kernel will handle the | |
57 | * fault and update CSB with translation error. | |
58 | * | |
59 | * If we are unable to update the CSB means copy_to_user failed due to | |
60 | * invalid csb_addr, send a signal to the process. | |
61 | */ | |
62 | static void update_csb(struct vas_window *window, | |
63 | struct coprocessor_request_block *crb) | |
64 | { | |
65 | struct coprocessor_status_block csb; | |
66 | struct kernel_siginfo info; | |
67 | struct task_struct *tsk; | |
68 | void __user *csb_addr; | |
69 | struct pid *pid; | |
70 | int rc; | |
71 | ||
72 | /* | |
73 | * NX user space windows can not be opened for task->mm=NULL | |
74 | * and faults will not be generated for kernel requests. | |
75 | */ | |
76 | if (WARN_ON_ONCE(!window->mm || !window->user_win)) | |
77 | return; | |
78 | ||
79 | csb_addr = (void __user *)be64_to_cpu(crb->csb_addr); | |
80 | ||
81 | memset(&csb, 0, sizeof(csb)); | |
6068e1a4 | 82 | csb.cc = CSB_CC_FAULT_ADDRESS; |
c96c4436 HM |
83 | csb.ce = CSB_CE_TERMINATION; |
84 | csb.cs = 0; | |
85 | csb.count = 0; | |
86 | ||
87 | /* | |
88 | * NX operates and returns in BE format as defined CRB struct. | |
89 | * So saves fault_storage_addr in BE as NX pastes in FIFO and | |
90 | * expects user space to convert to CPU format. | |
91 | */ | |
92 | csb.address = crb->stamp.nx.fault_storage_addr; | |
93 | csb.flags = 0; | |
94 | ||
95 | pid = window->pid; | |
96 | tsk = get_pid_task(pid, PIDTYPE_PID); | |
97 | /* | |
98 | * Process closes send window after all pending NX requests are | |
99 | * completed. In multi-thread applications, a child thread can | |
100 | * open a window and can exit without closing it. May be some | |
101 | * requests are pending or this window can be used by other | |
102 | * threads later. We should handle faults if NX encounters | |
103 | * pages faults on these requests. Update CSB with translation | |
104 | * error and fault address. If csb_addr passed by user space is | |
105 | * invalid, send SEGV signal to pid saved in window. If the | |
106 | * child thread is not running, send the signal to tgid. | |
107 | * Parent thread (tgid) will close this window upon its exit. | |
108 | * | |
109 | * pid and mm references are taken when window is opened by | |
110 | * process (pid). So tgid is used only when child thread opens | |
111 | * a window and exits without closing it. | |
112 | */ | |
113 | if (!tsk) { | |
114 | pid = window->tgid; | |
115 | tsk = get_pid_task(pid, PIDTYPE_PID); | |
116 | /* | |
117 | * Parent thread (tgid) will be closing window when it | |
118 | * exits. So should not get here. | |
119 | */ | |
120 | if (WARN_ON_ONCE(!tsk)) | |
121 | return; | |
122 | } | |
123 | ||
124 | /* Return if the task is exiting. */ | |
125 | if (tsk->flags & PF_EXITING) { | |
126 | put_task_struct(tsk); | |
127 | return; | |
128 | } | |
129 | ||
f5678e7f | 130 | kthread_use_mm(window->mm); |
c96c4436 HM |
131 | rc = copy_to_user(csb_addr, &csb, sizeof(csb)); |
132 | /* | |
133 | * User space polls on csb.flags (first byte). So add barrier | |
134 | * then copy first byte with csb flags update. | |
135 | */ | |
136 | if (!rc) { | |
137 | csb.flags = CSB_V; | |
138 | /* Make sure update to csb.flags is visible now */ | |
139 | smp_mb(); | |
140 | rc = copy_to_user(csb_addr, &csb, sizeof(u8)); | |
141 | } | |
f5678e7f | 142 | kthread_unuse_mm(window->mm); |
c96c4436 HM |
143 | put_task_struct(tsk); |
144 | ||
145 | /* Success */ | |
146 | if (!rc) | |
147 | return; | |
148 | ||
149 | pr_debug("Invalid CSB address 0x%p signalling pid(%d)\n", | |
150 | csb_addr, pid_vnr(pid)); | |
151 | ||
152 | clear_siginfo(&info); | |
153 | info.si_signo = SIGSEGV; | |
154 | info.si_errno = EFAULT; | |
155 | info.si_code = SEGV_MAPERR; | |
156 | info.si_addr = csb_addr; | |
157 | ||
158 | /* | |
159 | * process will be polling on csb.flags after request is sent to | |
160 | * NX. So generally CSB update should not fail except when an | |
161 | * application passes invalid csb_addr. So an error message will | |
162 | * be displayed and leave it to user space whether to ignore or | |
163 | * handle this signal. | |
164 | */ | |
165 | rcu_read_lock(); | |
166 | rc = kill_pid_info(SIGSEGV, &info, pid); | |
167 | rcu_read_unlock(); | |
168 | ||
169 | pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__, | |
170 | pid_vnr(pid), rc); | |
171 | } | |
172 | ||
cf33e1e9 HM |
173 | static void dump_fifo(struct vas_instance *vinst, void *entry) |
174 | { | |
175 | unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size; | |
176 | unsigned long *fifo = entry; | |
177 | int i; | |
178 | ||
179 | pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size, | |
180 | vinst->fault_fifo_size / CRB_SIZE); | |
181 | ||
182 | /* Dump 10 CRB entries or until end of FIFO */ | |
183 | pr_err("Fault FIFO Dump:\n"); | |
184 | for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) { | |
185 | pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n", | |
186 | i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3)); | |
187 | } | |
188 | } | |
189 | ||
9774628a HM |
190 | /* |
191 | * Process valid CRBs in fault FIFO. | |
192 | * NX process user space requests, return credit and update the status | |
193 | * in CRB. If it encounters transalation error when accessing CRB or | |
194 | * request buffers, raises interrupt on the CPU to handle the fault. | |
195 | * It takes credit on fault window, updates nx_fault_stamp in CRB with | |
196 | * the following information and pastes CRB in fault FIFO. | |
197 | * | |
198 | * pswid - window ID of the window on which the request is sent. | |
199 | * fault_storage_addr - fault address | |
200 | * | |
201 | * It can raise a single interrupt for multiple faults. Expects OS to | |
202 | * process all valid faults and return credit for each fault on user | |
203 | * space and fault windows. This fault FIFO control will be done with | |
204 | * credit mechanism. NX can continuously paste CRBs until credits are not | |
205 | * available on fault window. Otherwise, returns with RMA_reject. | |
206 | * | |
207 | * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128) | |
208 | * | |
209 | */ | |
210 | irqreturn_t vas_fault_thread_fn(int irq, void *data) | |
211 | { | |
212 | struct vas_instance *vinst = data; | |
213 | struct coprocessor_request_block *crb, *entry; | |
214 | struct coprocessor_request_block buf; | |
215 | struct vas_window *window; | |
216 | unsigned long flags; | |
217 | void *fifo; | |
218 | ||
219 | crb = &buf; | |
220 | ||
221 | /* | |
222 | * VAS can interrupt with multiple page faults. So process all | |
223 | * valid CRBs within fault FIFO until reaches invalid CRB. | |
224 | * We use CCW[0] and pswid to validate validate CRBs: | |
225 | * | |
226 | * CCW[0] Reserved bit. When NX pastes CRB, CCW[0]=0 | |
227 | * OS sets this bit to 1 after reading CRB. | |
228 | * pswid NX assigns window ID. Set pswid to -1 after | |
229 | * reading CRB from fault FIFO. | |
230 | * | |
231 | * We exit this function if no valid CRBs are available to process. | |
232 | * So acquire fault_lock and reset fifo_in_progress to 0 before | |
233 | * exit. | |
234 | * In case kernel receives another interrupt with different page | |
235 | * fault, interrupt handler returns with IRQ_HANDLED if | |
236 | * fifo_in_progress is set. Means these new faults will be | |
237 | * handled by the current thread. Otherwise set fifo_in_progress | |
238 | * and return IRQ_WAKE_THREAD to wake up thread. | |
239 | */ | |
240 | while (true) { | |
241 | spin_lock_irqsave(&vinst->fault_lock, flags); | |
242 | /* | |
243 | * Advance the fault fifo pointer to next CRB. | |
244 | * Use CRB_SIZE rather than sizeof(*crb) since the latter is | |
245 | * aligned to CRB_ALIGN (256) but the CRB written to by VAS is | |
246 | * only CRB_SIZE in len. | |
247 | */ | |
248 | fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE); | |
249 | entry = fifo; | |
250 | ||
251 | if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY)) | |
252 | || (entry->ccw & cpu_to_be32(CCW0_INVALID))) { | |
253 | vinst->fifo_in_progress = 0; | |
254 | spin_unlock_irqrestore(&vinst->fault_lock, flags); | |
255 | return IRQ_HANDLED; | |
256 | } | |
257 | ||
258 | spin_unlock_irqrestore(&vinst->fault_lock, flags); | |
259 | vinst->fault_crbs++; | |
260 | if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE)) | |
261 | vinst->fault_crbs = 0; | |
262 | ||
263 | memcpy(crb, fifo, CRB_SIZE); | |
264 | entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY); | |
265 | entry->ccw |= cpu_to_be32(CCW0_INVALID); | |
461862ef HM |
266 | /* |
267 | * Return credit for the fault window. | |
268 | */ | |
269 | vas_return_credit(vinst->fault_win, false); | |
9774628a HM |
270 | |
271 | pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n", | |
272 | vinst->vas_id, vinst->fault_fifo, fifo, | |
273 | vinst->fault_crbs); | |
274 | ||
cf33e1e9 | 275 | dump_crb(crb); |
9774628a HM |
276 | window = vas_pswid_to_window(vinst, |
277 | be32_to_cpu(crb->stamp.nx.pswid)); | |
278 | ||
279 | if (IS_ERR(window)) { | |
280 | /* | |
281 | * We got an interrupt about a specific send | |
282 | * window but we can't find that window and we can't | |
283 | * even clean it up (return credit on user space | |
284 | * window). | |
285 | * But we should not get here. | |
286 | * TODO: Disable IRQ. | |
287 | */ | |
cf33e1e9 | 288 | dump_fifo(vinst, (void *)entry); |
9774628a HM |
289 | pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n", |
290 | vinst->vas_id, vinst->fault_fifo, fifo, | |
291 | be32_to_cpu(crb->stamp.nx.pswid), | |
292 | vinst->fault_crbs); | |
293 | ||
294 | WARN_ON_ONCE(1); | |
c96c4436 HM |
295 | } else { |
296 | update_csb(window, crb); | |
461862ef HM |
297 | /* |
298 | * Return credit for send window after processing | |
299 | * fault CRB. | |
300 | */ | |
301 | vas_return_credit(window, true); | |
9774628a | 302 | } |
9774628a HM |
303 | } |
304 | } | |
305 | ||
306 | irqreturn_t vas_fault_handler(int irq, void *dev_id) | |
307 | { | |
308 | struct vas_instance *vinst = dev_id; | |
309 | irqreturn_t ret = IRQ_WAKE_THREAD; | |
310 | unsigned long flags; | |
311 | ||
312 | /* | |
313 | * NX can generate an interrupt for multiple faults. So the | |
314 | * fault handler thread process all CRBs until finds invalid | |
315 | * entry. In case if NX sees continuous faults, it is possible | |
316 | * that the thread function entered with the first interrupt | |
317 | * can execute and process all valid CRBs. | |
318 | * So wake up thread only if the fault thread is not in progress. | |
319 | */ | |
320 | spin_lock_irqsave(&vinst->fault_lock, flags); | |
321 | ||
322 | if (vinst->fifo_in_progress) | |
323 | ret = IRQ_HANDLED; | |
324 | else | |
325 | vinst->fifo_in_progress = 1; | |
326 | ||
327 | spin_unlock_irqrestore(&vinst->fault_lock, flags); | |
328 | ||
329 | return ret; | |
330 | } | |
331 | ||
0d17de03 HM |
332 | /* |
333 | * Fault window is opened per VAS instance. NX pastes fault CRB in fault | |
334 | * FIFO upon page faults. | |
335 | */ | |
336 | int vas_setup_fault_window(struct vas_instance *vinst) | |
337 | { | |
338 | struct vas_rx_win_attr attr; | |
339 | ||
340 | vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE; | |
341 | vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL); | |
342 | if (!vinst->fault_fifo) { | |
343 | pr_err("Unable to alloc %d bytes for fault_fifo\n", | |
344 | vinst->fault_fifo_size); | |
345 | return -ENOMEM; | |
346 | } | |
347 | ||
348 | /* | |
349 | * Invalidate all CRB entries. NX pastes valid entry for each fault. | |
350 | */ | |
351 | memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size); | |
352 | vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT); | |
353 | ||
354 | attr.rx_fifo_size = vinst->fault_fifo_size; | |
355 | attr.rx_fifo = vinst->fault_fifo; | |
356 | ||
357 | /* | |
358 | * Max creds is based on number of CRBs can fit in the FIFO. | |
359 | * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds | |
360 | * will be 0xffff since the receive creds field is 16bits wide. | |
361 | */ | |
362 | attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE; | |
363 | attr.lnotify_lpid = 0; | |
364 | attr.lnotify_pid = mfspr(SPRN_PID); | |
365 | attr.lnotify_tid = mfspr(SPRN_PID); | |
366 | ||
367 | vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, | |
368 | &attr); | |
369 | ||
370 | if (IS_ERR(vinst->fault_win)) { | |
371 | pr_err("VAS: Error %ld opening FaultWin\n", | |
372 | PTR_ERR(vinst->fault_win)); | |
373 | kfree(vinst->fault_fifo); | |
374 | return PTR_ERR(vinst->fault_win); | |
375 | } | |
376 | ||
377 | pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n", | |
378 | vinst->fault_win->winid, attr.lnotify_lpid, | |
379 | attr.lnotify_pid, attr.lnotify_tid); | |
380 | ||
381 | return 0; | |
382 | } |