Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * linux/kernel/seccomp.c | |
3 | * | |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | |
5 | * | |
e2cfabdf WD |
6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | |
8 | * | |
9 | * This defines a simple but solid secure-computing facility. | |
10 | * | |
11 | * Mode 1 uses a fixed list of allowed system calls. | |
12 | * Mode 2 allows user-defined system call filters in the form | |
13 | * of Berkeley Packet Filters/Linux Socket Filters. | |
1da177e4 LT |
14 | */ |
15 | ||
e2cfabdf | 16 | #include <linux/atomic.h> |
85e7bac3 | 17 | #include <linux/audit.h> |
5b101740 | 18 | #include <linux/compat.h> |
e2cfabdf WD |
19 | #include <linux/sched.h> |
20 | #include <linux/seccomp.h> | |
1da177e4 LT |
21 | |
22 | /* #define SECCOMP_DEBUG 1 */ | |
e2cfabdf WD |
23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | |
25 | #include <asm/syscall.h> | |
26 | #include <linux/filter.h> | |
27 | #include <linux/security.h> | |
28 | #include <linux/slab.h> | |
29 | #include <linux/tracehook.h> | |
30 | #include <linux/uaccess.h> | |
31 | ||
32 | /** | |
33 | * struct seccomp_filter - container for seccomp BPF programs | |
34 | * | |
35 | * @usage: reference count to manage the object lifetime. | |
36 | * get/put helpers should be used when accessing an instance | |
37 | * outside of a lifetime-guarded section. In general, this | |
38 | * is only needed for handling filters shared across tasks. | |
39 | * @prev: points to a previously installed, or inherited, filter | |
40 | * @len: the number of instructions in the program | |
41 | * @insns: the BPF program instructions to evaluate | |
42 | * | |
43 | * seccomp_filter objects are organized in a tree linked via the @prev | |
44 | * pointer. For any task, it appears to be a singly-linked list starting | |
45 | * with current->seccomp.filter, the most recently attached or inherited filter. | |
46 | * However, multiple filters may share a @prev node, by way of fork(), which | |
47 | * results in a unidirectional tree existing in memory. This is similar to | |
48 | * how namespaces work. | |
49 | * | |
50 | * seccomp_filter objects should never be modified after being attached | |
51 | * to a task_struct (other than @usage). | |
52 | */ | |
53 | struct seccomp_filter { | |
54 | atomic_t usage; | |
55 | struct seccomp_filter *prev; | |
56 | unsigned short len; /* Instruction count */ | |
57 | struct sock_filter insns[]; | |
58 | }; | |
59 | ||
/*
 * Upper bound on the instruction count along any single root-to-leaf path
 * of the filter tree: as many instructions as fit in 256KB.
 */
#define MAX_INSNS_PER_PATH ((256 * 1024) / sizeof(struct sock_filter))
62 | ||
e2cfabdf WD |
63 | /** |
64 | * get_u32 - returns a u32 offset into data | |
65 | * @data: a unsigned 64 bit value | |
66 | * @index: 0 or 1 to return the first or second 32-bits | |
67 | * | |
68 | * This inline exists to hide the length of unsigned long. If a 32-bit | |
69 | * unsigned long is passed in, it will be extended and the top 32-bits will be | |
70 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | |
71 | * properly returned. | |
72 | * | |
73 | * Endianness is explicitly ignored and left for BPF program authors to manage | |
74 | * as per the specific architecture. | |
75 | */ | |
76 | static inline u32 get_u32(u64 data, int index) | |
77 | { | |
78 | return ((u32 *)&data)[index]; | |
79 | } | |
80 | ||
81 | /* Helper for bpf_load below. */ | |
82 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | |
83 | /** | |
84 | * bpf_load: checks and returns a pointer to the requested offset | |
85 | * @off: offset into struct seccomp_data to load from | |
86 | * | |
87 | * Returns the requested 32-bits of data. | |
88 | * seccomp_check_filter() should assure that @off is 32-bit aligned | |
89 | * and not out of bounds. Failure to do so is a BUG. | |
90 | */ | |
91 | u32 seccomp_bpf_load(int off) | |
92 | { | |
93 | struct pt_regs *regs = task_pt_regs(current); | |
94 | if (off == BPF_DATA(nr)) | |
95 | return syscall_get_nr(current, regs); | |
96 | if (off == BPF_DATA(arch)) | |
97 | return syscall_get_arch(current, regs); | |
98 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | |
99 | unsigned long value; | |
100 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | |
101 | int index = !!(off % sizeof(u64)); | |
102 | syscall_get_arguments(current, regs, arg, 1, &value); | |
103 | return get_u32(value, index); | |
104 | } | |
105 | if (off == BPF_DATA(instruction_pointer)) | |
106 | return get_u32(KSTK_EIP(current), 0); | |
107 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | |
108 | return get_u32(KSTK_EIP(current), 1); | |
109 | /* seccomp_check_filter should make this impossible. */ | |
110 | BUG(); | |
111 | } | |
112 | ||
113 | /** | |
114 | * seccomp_check_filter - verify seccomp filter code | |
115 | * @filter: filter to verify | |
116 | * @flen: length of filter | |
117 | * | |
118 | * Takes a previously checked filter (by sk_chk_filter) and | |
119 | * redirects all filter code that loads struct sk_buff data | |
120 | * and related data through seccomp_bpf_load. It also | |
121 | * enforces length and alignment checking of those loads. | |
122 | * | |
123 | * Returns 0 if the rule set is legal or -EINVAL if not. | |
124 | */ | |
125 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |
126 | { | |
127 | int pc; | |
128 | for (pc = 0; pc < flen; pc++) { | |
129 | struct sock_filter *ftest = &filter[pc]; | |
130 | u16 code = ftest->code; | |
131 | u32 k = ftest->k; | |
132 | ||
133 | switch (code) { | |
134 | case BPF_S_LD_W_ABS: | |
135 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | |
136 | /* 32-bit aligned and not out of bounds. */ | |
137 | if (k >= sizeof(struct seccomp_data) || k & 3) | |
138 | return -EINVAL; | |
139 | continue; | |
140 | case BPF_S_LD_W_LEN: | |
141 | ftest->code = BPF_S_LD_IMM; | |
142 | ftest->k = sizeof(struct seccomp_data); | |
143 | continue; | |
144 | case BPF_S_LDX_W_LEN: | |
145 | ftest->code = BPF_S_LDX_IMM; | |
146 | ftest->k = sizeof(struct seccomp_data); | |
147 | continue; | |
148 | /* Explicitly include allowed calls. */ | |
149 | case BPF_S_RET_K: | |
150 | case BPF_S_RET_A: | |
151 | case BPF_S_ALU_ADD_K: | |
152 | case BPF_S_ALU_ADD_X: | |
153 | case BPF_S_ALU_SUB_K: | |
154 | case BPF_S_ALU_SUB_X: | |
155 | case BPF_S_ALU_MUL_K: | |
156 | case BPF_S_ALU_MUL_X: | |
157 | case BPF_S_ALU_DIV_X: | |
158 | case BPF_S_ALU_AND_K: | |
159 | case BPF_S_ALU_AND_X: | |
160 | case BPF_S_ALU_OR_K: | |
161 | case BPF_S_ALU_OR_X: | |
162 | case BPF_S_ALU_LSH_K: | |
163 | case BPF_S_ALU_LSH_X: | |
164 | case BPF_S_ALU_RSH_K: | |
165 | case BPF_S_ALU_RSH_X: | |
166 | case BPF_S_ALU_NEG: | |
167 | case BPF_S_LD_IMM: | |
168 | case BPF_S_LDX_IMM: | |
169 | case BPF_S_MISC_TAX: | |
170 | case BPF_S_MISC_TXA: | |
171 | case BPF_S_ALU_DIV_K: | |
172 | case BPF_S_LD_MEM: | |
173 | case BPF_S_LDX_MEM: | |
174 | case BPF_S_ST: | |
175 | case BPF_S_STX: | |
176 | case BPF_S_JMP_JA: | |
177 | case BPF_S_JMP_JEQ_K: | |
178 | case BPF_S_JMP_JEQ_X: | |
179 | case BPF_S_JMP_JGE_K: | |
180 | case BPF_S_JMP_JGE_X: | |
181 | case BPF_S_JMP_JGT_K: | |
182 | case BPF_S_JMP_JGT_X: | |
183 | case BPF_S_JMP_JSET_K: | |
184 | case BPF_S_JMP_JSET_X: | |
185 | continue; | |
186 | default: | |
187 | return -EINVAL; | |
188 | } | |
189 | } | |
190 | return 0; | |
191 | } | |
192 | ||
193 | /** | |
194 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | |
195 | * @syscall: number of the current system call | |
196 | * | |
197 | * Returns valid seccomp BPF response codes. | |
198 | */ | |
199 | static u32 seccomp_run_filters(int syscall) | |
200 | { | |
201 | struct seccomp_filter *f; | |
202 | u32 ret = SECCOMP_RET_KILL; | |
203 | /* | |
204 | * All filters in the list are evaluated and the lowest BPF return | |
205 | * value always takes priority. | |
206 | */ | |
207 | for (f = current->seccomp.filter; f; f = f->prev) { | |
208 | ret = sk_run_filter(NULL, f->insns); | |
209 | if (ret != SECCOMP_RET_ALLOW) | |
210 | break; | |
211 | } | |
212 | return ret; | |
213 | } | |
214 | ||
215 | /** | |
216 | * seccomp_attach_filter: Attaches a seccomp filter to current. | |
217 | * @fprog: BPF program to install | |
218 | * | |
219 | * Returns 0 on success or an errno on failure. | |
220 | */ | |
221 | static long seccomp_attach_filter(struct sock_fprog *fprog) | |
222 | { | |
223 | struct seccomp_filter *filter; | |
224 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | |
225 | unsigned long total_insns = fprog->len; | |
226 | long ret; | |
227 | ||
228 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | |
229 | return -EINVAL; | |
230 | ||
231 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | |
232 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | |
233 | if (total_insns > MAX_INSNS_PER_PATH) | |
234 | return -ENOMEM; | |
235 | ||
236 | /* | |
237 | * Installing a seccomp filter requires that the task have | |
238 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | |
239 | * This avoids scenarios where unprivileged tasks can affect the | |
240 | * behavior of privileged children. | |
241 | */ | |
242 | if (!current->no_new_privs && | |
243 | security_capable_noaudit(current_cred(), current_user_ns(), | |
244 | CAP_SYS_ADMIN) != 0) | |
245 | return -EACCES; | |
246 | ||
247 | /* Allocate a new seccomp_filter */ | |
248 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | |
249 | GFP_KERNEL|__GFP_NOWARN); | |
250 | if (!filter) | |
251 | return -ENOMEM; | |
252 | atomic_set(&filter->usage, 1); | |
253 | filter->len = fprog->len; | |
254 | ||
255 | /* Copy the instructions from fprog. */ | |
256 | ret = -EFAULT; | |
257 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | |
258 | goto fail; | |
259 | ||
260 | /* Check and rewrite the fprog via the skb checker */ | |
261 | ret = sk_chk_filter(filter->insns, filter->len); | |
262 | if (ret) | |
263 | goto fail; | |
264 | ||
265 | /* Check and rewrite the fprog for seccomp use */ | |
266 | ret = seccomp_check_filter(filter->insns, filter->len); | |
267 | if (ret) | |
268 | goto fail; | |
269 | ||
270 | /* | |
271 | * If there is an existing filter, make it the prev and don't drop its | |
272 | * task reference. | |
273 | */ | |
274 | filter->prev = current->seccomp.filter; | |
275 | current->seccomp.filter = filter; | |
276 | return 0; | |
277 | fail: | |
278 | kfree(filter); | |
279 | return ret; | |
280 | } | |
281 | ||
282 | /** | |
283 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | |
284 | * @user_filter: pointer to the user data containing a sock_fprog. | |
285 | * | |
286 | * Returns 0 on success and non-zero otherwise. | |
287 | */ | |
288 | long seccomp_attach_user_filter(char __user *user_filter) | |
289 | { | |
290 | struct sock_fprog fprog; | |
291 | long ret = -EFAULT; | |
292 | ||
293 | #ifdef CONFIG_COMPAT | |
294 | if (is_compat_task()) { | |
295 | struct compat_sock_fprog fprog32; | |
296 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | |
297 | goto out; | |
298 | fprog.len = fprog32.len; | |
299 | fprog.filter = compat_ptr(fprog32.filter); | |
300 | } else /* falls through to the if below. */ | |
301 | #endif | |
302 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | |
303 | goto out; | |
304 | ret = seccomp_attach_filter(&fprog); | |
305 | out: | |
306 | return ret; | |
307 | } | |
308 | ||
309 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | |
310 | void get_seccomp_filter(struct task_struct *tsk) | |
311 | { | |
312 | struct seccomp_filter *orig = tsk->seccomp.filter; | |
313 | if (!orig) | |
314 | return; | |
315 | /* Reference count is bounded by the number of total processes. */ | |
316 | atomic_inc(&orig->usage); | |
317 | } | |
318 | ||
319 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | |
320 | void put_seccomp_filter(struct task_struct *tsk) | |
321 | { | |
322 | struct seccomp_filter *orig = tsk->seccomp.filter; | |
323 | /* Clean up single-reference branches iteratively. */ | |
324 | while (orig && atomic_dec_and_test(&orig->usage)) { | |
325 | struct seccomp_filter *freeme = orig; | |
326 | orig = orig->prev; | |
327 | kfree(freeme); | |
328 | } | |
329 | } | |
330 | #endif /* CONFIG_SECCOMP_FILTER */ | |
1da177e4 LT |
331 | |
332 | /* | |
333 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | |
334 | * To be fully secure this must be combined with rlimit | |
335 | * to limit the stack allocations too. | |
336 | */ | |
337 | static int mode1_syscalls[] = { | |
338 | __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, | |
339 | 0, /* null terminated */ | |
340 | }; | |
341 | ||
#ifdef CONFIG_COMPAT
/* Mode 1 allow-list used for compat tasks; same shape as mode1_syscalls. */
static int mode1_syscalls_32[] = {
	__NR_seccomp_read_32,
	__NR_seccomp_write_32,
	__NR_seccomp_exit_32,
	__NR_seccomp_sigreturn_32,
	0,	/* null terminated */
};
#endif
348 | ||
349 | void __secure_computing(int this_syscall) | |
350 | { | |
351 | int mode = current->seccomp.mode; | |
e2cfabdf WD |
352 | int exit_sig = 0; |
353 | int *syscall; | |
1da177e4 LT |
354 | |
355 | switch (mode) { | |
e2cfabdf | 356 | case SECCOMP_MODE_STRICT: |
1da177e4 | 357 | syscall = mode1_syscalls; |
5b101740 RM |
358 | #ifdef CONFIG_COMPAT |
359 | if (is_compat_task()) | |
1da177e4 LT |
360 | syscall = mode1_syscalls_32; |
361 | #endif | |
362 | do { | |
363 | if (*syscall == this_syscall) | |
364 | return; | |
365 | } while (*++syscall); | |
e2cfabdf | 366 | exit_sig = SIGKILL; |
1da177e4 | 367 | break; |
e2cfabdf WD |
368 | #ifdef CONFIG_SECCOMP_FILTER |
369 | case SECCOMP_MODE_FILTER: | |
370 | if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW) | |
371 | return; | |
e2cfabdf WD |
372 | exit_sig = SIGSYS; |
373 | break; | |
374 | #endif | |
1da177e4 LT |
375 | default: |
376 | BUG(); | |
377 | } | |
378 | ||
379 | #ifdef SECCOMP_DEBUG | |
380 | dump_stack(); | |
381 | #endif | |
3dc1c1b2 | 382 | audit_seccomp(this_syscall, exit_code, SECCOMP_RET_KILL); |
e2cfabdf | 383 | do_exit(exit_sig); |
1da177e4 | 384 | } |
1d9d02fe AA |
385 | |
386 | long prctl_get_seccomp(void) | |
387 | { | |
388 | return current->seccomp.mode; | |
389 | } | |
390 | ||
e2cfabdf WD |
391 | /** |
392 | * prctl_set_seccomp: configures current->seccomp.mode | |
393 | * @seccomp_mode: requested mode to use | |
394 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | |
395 | * | |
396 | * This function may be called repeatedly with a @seccomp_mode of | |
397 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | |
398 | * successfully installed will be evaluated (in reverse order) for each system | |
399 | * call the task makes. | |
400 | * | |
401 | * Once current->seccomp.mode is non-zero, it may not be changed. | |
402 | * | |
403 | * Returns 0 on success or -EINVAL on failure. | |
404 | */ | |
405 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | |
1d9d02fe | 406 | { |
e2cfabdf | 407 | long ret = -EINVAL; |
1d9d02fe | 408 | |
e2cfabdf WD |
409 | if (current->seccomp.mode && |
410 | current->seccomp.mode != seccomp_mode) | |
1d9d02fe AA |
411 | goto out; |
412 | ||
e2cfabdf WD |
413 | switch (seccomp_mode) { |
414 | case SECCOMP_MODE_STRICT: | |
415 | ret = 0; | |
cf99abac AA |
416 | #ifdef TIF_NOTSC |
417 | disable_TSC(); | |
418 | #endif | |
e2cfabdf WD |
419 | break; |
420 | #ifdef CONFIG_SECCOMP_FILTER | |
421 | case SECCOMP_MODE_FILTER: | |
422 | ret = seccomp_attach_user_filter(filter); | |
423 | if (ret) | |
424 | goto out; | |
425 | break; | |
426 | #endif | |
427 | default: | |
428 | goto out; | |
1d9d02fe AA |
429 | } |
430 | ||
e2cfabdf WD |
431 | current->seccomp.mode = seccomp_mode; |
432 | set_thread_flag(TIF_SECCOMP); | |
433 | out: | |
1d9d02fe AA |
434 | return ret; |
435 | } |