x86, mpx: On-demand kernel allocation of bounds tables
[linux-2.6-block.git] / arch / x86 / mm / mpx.c
CommitLineData
57319d80
QR
1/*
2 * mpx.c - Memory Protection eXtensions
3 *
4 * Copyright (c) 2014, Intel Corporation.
5 * Qiaowei Ren <qiaowei.ren@intel.com>
6 * Dave Hansen <dave.hansen@intel.com>
7 */
8#include <linux/kernel.h>
fcc7ffd6 9#include <linux/slab.h>
57319d80
QR
10#include <linux/syscalls.h>
11#include <linux/sched/sysctl.h>
12
fe3d197f
DH
13#include <asm/i387.h>
14#include <asm/insn.h>
57319d80
QR
15#include <asm/mman.h>
16#include <asm/mpx.h>
fe3d197f
DH
17#include <asm/processor.h>
18#include <asm/fpu-internal.h>
57319d80
QR
19
/* Name reported for MPX VMAs (e.g. in /proc/<pid>/maps). */
static const char *mpx_mapping_name(struct vm_area_struct *vma)
{
	static const char name[] = "[mpx]";

	return name;
}
24
25static struct vm_operations_struct mpx_vma_ops = {
26 .name = mpx_mapping_name,
27};
28
/*
 * This is really a simplified "vm_mmap". it only handles MPX
 * bounds tables (the bounds directory is user-allocated).
 *
 * Later on, we use the vma->vm_ops to uniquely identify these
 * VMAs.
 *
 * Returns the userspace address of the new mapping on success, or a
 * negative errno encoded in the unsigned long (callers test it with
 * IS_ERR_VALUE() / IS_ERR()).
 */
static unsigned long mpx_mmap(unsigned long len)
{
	unsigned long ret;
	unsigned long addr, pgoff;
	struct mm_struct *mm = current->mm;
	vm_flags_t vm_flags;
	struct vm_area_struct *vma;

	/*
	 * Only bounds table and bounds directory can be allocated here.
	 * NOTE(review): the header comment above says the bounds
	 * directory is user-allocated, yet MPX_BD_SIZE_BYTES is accepted
	 * here too -- confirm which is intended.
	 */
	if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES)
		return -EINVAL;

	down_write(&mm->mmap_sem);

	/* Too many mappings? */
	if (mm->map_count > sysctl_max_map_count) {
		ret = -ENOMEM;
		goto out;
	}

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 * A non-page-aligned return value is an errno from
	 * get_unmapped_area(), passed straight back to the caller.
	 */
	addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
	if (addr & ~PAGE_MASK) {
		ret = addr;
		goto out;
	}

	vm_flags = VM_READ | VM_WRITE | VM_MPX |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	/* Set pgoff according to addr for anon_vma */
	pgoff = addr >> PAGE_SHIFT;

	ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
	if (IS_ERR_VALUE(ret))
		goto out;

	/*
	 * Look the new VMA back up and tag it with mpx_vma_ops so it can
	 * later be recognized as an MPX bounds-table mapping.
	 */
	vma = find_vma(mm, ret);
	if (!vma) {
		ret = -ENOMEM;
		goto out;
	}
	vma->vm_ops = &mpx_vma_ops;

	if (vm_flags & VM_LOCKED) {
		/* mm_populate() faults the pages in; mmap_sem dropped first */
		up_write(&mm->mmap_sem);
		mm_populate(ret, len);
		return ret;
	}

out:
	up_write(&mm->mmap_sem);
	return ret;
}
fcc7ffd6
DH
92
/*
 * Selects which operand field of a decoded instruction the register
 * number is extracted from (see get_reg_offset()).
 */
enum reg_type {
	REG_TYPE_RM = 0,	/* modrm.rm field */
	REG_TYPE_INDEX,		/* sib.index field */
	REG_TYPE_BASE,		/* sib.base field */
};
98
99static unsigned long get_reg_offset(struct insn *insn, struct pt_regs *regs,
100 enum reg_type type)
101{
102 int regno = 0;
103
104 static const int regoff[] = {
105 offsetof(struct pt_regs, ax),
106 offsetof(struct pt_regs, cx),
107 offsetof(struct pt_regs, dx),
108 offsetof(struct pt_regs, bx),
109 offsetof(struct pt_regs, sp),
110 offsetof(struct pt_regs, bp),
111 offsetof(struct pt_regs, si),
112 offsetof(struct pt_regs, di),
113#ifdef CONFIG_X86_64
114 offsetof(struct pt_regs, r8),
115 offsetof(struct pt_regs, r9),
116 offsetof(struct pt_regs, r10),
117 offsetof(struct pt_regs, r11),
118 offsetof(struct pt_regs, r12),
119 offsetof(struct pt_regs, r13),
120 offsetof(struct pt_regs, r14),
121 offsetof(struct pt_regs, r15),
122#endif
123 };
124 int nr_registers = ARRAY_SIZE(regoff);
125 /*
126 * Don't possibly decode a 32-bit instructions as
127 * reading a 64-bit-only register.
128 */
129 if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
130 nr_registers -= 8;
131
132 switch (type) {
133 case REG_TYPE_RM:
134 regno = X86_MODRM_RM(insn->modrm.value);
135 if (X86_REX_B(insn->rex_prefix.value) == 1)
136 regno += 8;
137 break;
138
139 case REG_TYPE_INDEX:
140 regno = X86_SIB_INDEX(insn->sib.value);
141 if (X86_REX_X(insn->rex_prefix.value) == 1)
142 regno += 8;
143 break;
144
145 case REG_TYPE_BASE:
146 regno = X86_SIB_BASE(insn->sib.value);
147 if (X86_REX_B(insn->rex_prefix.value) == 1)
148 regno += 8;
149 break;
150
151 default:
152 pr_err("invalid register type");
153 BUG();
154 break;
155 }
156
157 if (regno > nr_registers) {
158 WARN_ONCE(1, "decoded an instruction with an invalid register");
159 return -EINVAL;
160 }
161 return regoff[regno];
162}
163
164/*
165 * return the address being referenced be instruction
166 * for rm=3 returning the content of the rm reg
167 * for rm!=3 calculates the address using SIB and Disp
168 */
169static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
170{
171 unsigned long addr, addr_offset;
172 unsigned long base, base_offset;
173 unsigned long indx, indx_offset;
174 insn_byte_t sib;
175
176 insn_get_modrm(insn);
177 insn_get_sib(insn);
178 sib = insn->sib.value;
179
180 if (X86_MODRM_MOD(insn->modrm.value) == 3) {
181 addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
182 if (addr_offset < 0)
183 goto out_err;
184 addr = regs_get_register(regs, addr_offset);
185 } else {
186 if (insn->sib.nbytes) {
187 base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
188 if (base_offset < 0)
189 goto out_err;
190
191 indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
192 if (indx_offset < 0)
193 goto out_err;
194
195 base = regs_get_register(regs, base_offset);
196 indx = regs_get_register(regs, indx_offset);
197 addr = base + indx * (1 << X86_SIB_SCALE(sib));
198 } else {
199 addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
200 if (addr_offset < 0)
201 goto out_err;
202 addr = regs_get_register(regs, addr_offset);
203 }
204 addr += insn->displacement.value;
205 }
206 return (void __user *)addr;
207out_err:
208 return (void __user *)-1;
209}
210
/*
 * Fetch the instruction at the faulting regs->ip from userspace and
 * decode it.  Returns 0 only if a complete instruction was copied and
 * its opcode is the two-byte 0x0f 0x1a / 0x0f 0x1b family (the MPX
 * bounds-check instructions); -EFAULT on copy problems, -EINVAL for
 * any other opcode.
 */
static int mpx_insn_decode(struct insn *insn,
			   struct pt_regs *regs)
{
	unsigned char buf[MAX_INSN_SIZE];
	/* decode in 64-bit mode unless this is a 32-bit (IA32) task */
	int x86_64 = !test_thread_flag(TIF_IA32);
	int not_copied;
	int nr_copied;

	not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
	nr_copied = sizeof(buf) - not_copied;
	/*
	 * The decoder _should_ fail nicely if we pass it a short buffer.
	 * But, let's not depend on that implementation detail. If we
	 * did not get anything, just error out now.
	 */
	if (!nr_copied)
		return -EFAULT;
	insn_init(insn, buf, nr_copied, x86_64);
	insn_get_length(insn);
	/*
	 * copy_from_user() tries to get as many bytes as we could see in
	 * the largest possible instruction. If the instruction we are
	 * after is shorter than that _and_ we attempt to copy from
	 * something unreadable, we might get a short read. This is OK
	 * as long as the read did not stop in the middle of the
	 * instruction. Check to see if we got a partial instruction.
	 */
	if (nr_copied < insn->length)
		return -EFAULT;

	insn_get_opcode(insn);
	/*
	 * We only _really_ need to decode bndcl/bndcn/bndcu
	 * Error out on anything else.
	 */
	if (insn->opcode.bytes[0] != 0x0f)
		goto bad_opcode;
	if ((insn->opcode.bytes[1] != 0x1a) &&
	    (insn->opcode.bytes[1] != 0x1b))
		goto bad_opcode;

	return 0;
bad_opcode:
	return -EINVAL;
}
256
/*
 * If a bounds overflow occurs then a #BR is generated. This
 * function decodes MPX instructions to get violation address
 * and set this address into extended struct siginfo.
 *
 * Note that this is not a super precise way of doing this.
 * Userspace could have, by the time we get here, written
 * anything it wants in to the instructions. We can not
 * trust anything about it. They might not be valid
 * instructions or might encode invalid registers, etc...
 *
 * The caller is expected to kfree() the returned siginfo_t.
 *
 * @regs:      faulting task's registers (regs->ip locates the insn)
 * @xsave_buf: xsave area holding the task's bounds registers
 *
 * Returns the newly allocated siginfo on success, or ERR_PTR(-errno)
 * on any decode/allocation failure.
 */
siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
				struct xsave_struct *xsave_buf)
{
	struct bndreg *bndregs, *bndreg;
	siginfo_t *info = NULL;
	struct insn insn;
	uint8_t bndregno;
	int err;

	err = mpx_insn_decode(&insn, regs);
	if (err)
		goto err_out;

	/*
	 * We know at this point that we are only dealing with
	 * MPX instructions.  The bnd register number lives in the
	 * modrm.reg field; only bnd0-bnd3 exist.
	 */
	insn_get_modrm(&insn);
	bndregno = X86_MODRM_REG(insn.modrm.value);
	if (bndregno > 3) {
		err = -EINVAL;
		goto err_out;
	}
	/* get the bndregs _area_ of the xsave structure */
	bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS);
	if (!bndregs) {
		err = -EINVAL;
		goto err_out;
	}
	/* now go select the individual register in the set of 4 */
	bndreg = &bndregs[bndregno];

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info) {
		err = -ENOMEM;
		goto err_out;
	}
	/*
	 * The registers are always 64-bit, but the upper 32
	 * bits are ignored in 32-bit mode. Also, note that the
	 * upper bounds are architecturally represented in 1's
	 * complement form.
	 *
	 * The 'unsigned long' cast is because the compiler
	 * complains when casting from integers to different-size
	 * pointers.
	 */
	info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
	info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
	info->si_addr_lsb = 0;
	info->si_signo = SIGSEGV;
	info->si_errno = 0;
	info->si_code = SEGV_BNDERR;
	info->si_addr = mpx_get_addr_ref(&insn, regs);
	/*
	 * We were not able to extract an address from the instruction,
	 * probably because there was something invalid in it.
	 */
	if (info->si_addr == (void *)-1) {
		err = -EINVAL;
		goto err_out;
	}
	return info;
err_out:
	/* info might be NULL, but kfree() handles that */
	kfree(info);
	return ERR_PTR(err);
}
fe3d197f
DH
338
/*
 * Fetch the userspace base address of the task's bounds directory
 * from its BNDCFGU register (accessible only via the xsave area).
 *
 * Returns MPX_INVALID_BOUNDS_DIR if the CPU lacks MPX, the xsave
 * area carries no BNDCSR state, or userspace has not enabled MPX.
 */
static __user void *task_get_bounds_dir(struct task_struct *tsk)
{
	struct bndcsr *bndcsr;

	if (!cpu_feature_enabled(X86_FEATURE_MPX))
		return MPX_INVALID_BOUNDS_DIR;

	/*
	 * The bounds directory pointer is stored in a register
	 * only accessible if we first do an xsave.
	 */
	fpu_save_init(&tsk->thread.fpu);
	bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
	if (!bndcsr)
		return MPX_INVALID_BOUNDS_DIR;

	/*
	 * Make sure the register looks valid by checking the
	 * enable bit.
	 */
	if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
		return MPX_INVALID_BOUNDS_DIR;

	/*
	 * Lastly, mask off the low bits used for configuration
	 * flags, and return the address of the bounds table.
	 */
	return (void __user *)(unsigned long)
		(bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
}
369
/*
 * Turn on kernel management of the task's MPX bounds tables by
 * caching the bounds-directory base address in mm->bd_addr.
 *
 * Returns 0 on success, -ENXIO if no valid bounds directory could be
 * read from the task's xsave area (MPX absent or not enabled by
 * userspace).  Note that mm->bd_addr is written under mmap_sem even
 * on the failure path.
 */
int mpx_enable_management(struct task_struct *tsk)
{
	void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
	struct mm_struct *mm = tsk->mm;
	int ret = 0;

	/*
	 * runtime in the userspace will be responsible for allocation of
	 * the bounds directory. Then, it will save the base of the bounds
	 * directory into XSAVE/XRSTOR Save Area and enable MPX through
	 * XRSTOR instruction.
	 *
	 * fpu_xsave() is expected to be very expensive. Storing the bounds
	 * directory here means that we do not have to do xsave in the unmap
	 * path; we can just use mm->bd_addr instead.
	 */
	bd_base = task_get_bounds_dir(tsk);
	down_write(&mm->mmap_sem);
	mm->bd_addr = bd_base;
	if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
		ret = -ENXIO;

	up_write(&mm->mmap_sem);
	return ret;
}
395
/*
 * Turn off kernel management of MPX bounds tables by invalidating the
 * cached bounds-directory address.  Returns 0, or -ENXIO if the CPU
 * has no MPX support.
 *
 * NOTE(review): the 'tsk' parameter is unused; this operates on
 * current->mm while mpx_enable_management() uses tsk->mm -- confirm
 * callers always pass current.
 */
int mpx_disable_management(struct task_struct *tsk)
{
	struct mm_struct *mm = current->mm;

	if (!cpu_feature_enabled(X86_FEATURE_MPX))
		return -ENXIO;

	down_write(&mm->mmap_sem);
	mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
	up_write(&mm->mmap_sem);
	return 0;
}
408
/*
 * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each
 * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB,
 * and the size of each bounds table is 4MB.
 *
 * Allocate a new bounds table and atomically publish its address into
 * the userspace bounds-directory entry at 'bd_entry'.  Returns 0 on
 * success (including losing a benign race to another thread), or a
 * negative errno; on any failure the freshly mapped table is unmapped.
 */
static int allocate_bt(long __user *bd_entry)
{
	unsigned long expected_old_val = 0;
	unsigned long actual_old_val = 0;
	unsigned long bt_addr;
	int ret = 0;

	/*
	 * Carve the virtual space out of userspace for the new
	 * bounds table:
	 */
	bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES);
	if (IS_ERR((void *)bt_addr))
		return PTR_ERR((void *)bt_addr);
	/*
	 * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
	 */
	bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG;

	/*
	 * Go poke the address of the new bounds table in to the
	 * bounds directory entry out in userspace memory. Note:
	 * we may race with another CPU instantiating the same table.
	 * In that case the cmpxchg will see an unexpected
	 * 'actual_old_val'.
	 *
	 * This can fault, but that's OK because we do not hold
	 * mmap_sem at this point, unlike some of the other part
	 * of the MPX code that have to pagefault_disable().
	 */
	ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
					   expected_old_val, bt_addr);
	if (ret)
		goto out_unmap;

	/*
	 * The user_atomic_cmpxchg_inatomic() will only return nonzero
	 * for faults, *not* if the cmpxchg itself fails. Now we must
	 * verify that the cmpxchg itself completed successfully.
	 */
	/*
	 * We expected an empty 'expected_old_val', but instead found
	 * an apparently valid entry. Assume we raced with another
	 * thread to instantiate this table and declare success.
	 */
	if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
		ret = 0;
		goto out_unmap;
	}
	/*
	 * We found a non-empty bd_entry but it did not have the
	 * VALID_FLAG set. Return an error which will result in
	 * a SEGV since this probably means that somebody scribbled
	 * some invalid data in to a bounds table.
	 */
	if (expected_old_val != actual_old_val) {
		ret = -EINVAL;
		goto out_unmap;
	}
	return 0;
out_unmap:
	vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES);
	return ret;
}
478
/*
 * When a BNDSTX instruction attempts to save bounds to a bounds
 * table, it will first attempt to look up the table in the
 * first-level bounds directory. If it does not find a table in
 * the directory, a #BR is generated and we get here in order to
 * allocate a new table.
 *
 * With 32-bit mode, the size of BD is 4MB, and the size of each
 * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
 * and the size of each bound table is 4MB.
 *
 * Returns 0 if a table was (or already had been) installed at the
 * faulting directory entry, negative errno otherwise.
 */
static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
{
	unsigned long bd_entry, bd_base;
	struct bndcsr *bndcsr;

	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
	if (!bndcsr)
		return -EINVAL;
	/*
	 * Mask off the preserve and enable bits
	 */
	bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
	/*
	 * The hardware provides the address of the missing or invalid
	 * entry via BNDSTATUS, so we don't have to go look it up.
	 */
	bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
	/*
	 * Make sure the directory entry is within where we think
	 * the directory is.
	 */
	if ((bd_entry < bd_base) ||
	    (bd_entry >= bd_base + MPX_BD_SIZE_BYTES))
		return -EINVAL;

	return allocate_bt((long __user *)bd_entry);
}
517
518int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
519{
520 /*
521 * Userspace never asked us to manage the bounds tables,
522 * so refuse to help.
523 */
524 if (!kernel_managing_mpx_tables(current->mm))
525 return -EINVAL;
526
527 if (do_mpx_bt_fault(xsave_buf)) {
528 force_sig(SIGSEGV, current);
529 /*
530 * The force_sig() is essentially "handling" this
531 * exception, so we do not pass up the error
532 * from do_mpx_bt_fault().
533 */
534 }
535 return 0;
536}