seccomp/cache: Report cache data through /proc/pid/seccomp_cache
author YiFei Zhu <yifeifz2@illinois.edu>
Wed, 11 Nov 2020 13:33:54 +0000 (07:33 -0600)
committer Kees Cook <keescook@chromium.org>
Fri, 20 Nov 2020 19:16:35 +0000 (11:16 -0800)
Currently the kernel does not provide an infrastructure to translate
architecture numbers to a human-readable name. Translating syscall
numbers to syscall names is possible through FTRACE_SYSCALL
infrastructure but it does not provide support for compat syscalls.

This will create a file for each PID as /proc/pid/seccomp_cache.
The file will be empty when no seccomp filters are loaded, or be
in the format of:
<arch name> <decimal syscall number> <ALLOW | FILTER>
where ALLOW means the cache is guaranteed to allow the syscall,
and FILTER means the cache will pass the syscall to the BPF filter.

For the docker default profile on x86_64 it looks like:
x86_64 0 ALLOW
x86_64 1 ALLOW
x86_64 2 ALLOW
x86_64 3 ALLOW
[...]
x86_64 132 ALLOW
x86_64 133 ALLOW
x86_64 134 FILTER
x86_64 135 FILTER
x86_64 136 FILTER
x86_64 137 ALLOW
x86_64 138 ALLOW
x86_64 139 FILTER
x86_64 140 ALLOW
x86_64 141 ALLOW
[...]

This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default
of N because I think certain users of seccomp might not want the
application to know which syscalls are definitely usable. For
the same reason, it is also guarded by CAP_SYS_ADMIN.

Suggested-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/
Signed-off-by: YiFei Zhu <yifeifz2@illinois.edu>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/94e663fa53136f5a11f432c661794d1ee7060779.1605101222.git.yifeifz2@illinois.edu
arch/Kconfig
fs/proc/base.c
include/linux/seccomp.h
kernel/seccomp.c

index 56b6ccc0e32d396af496541a8fc31809020d6691..35c9463b7d10e2dc343ab4672910b963ebcada45 100644 (file)
@@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER
          - secure_computing return value is checked and a return value of -1
            results in the system call being skipped immediately.
          - seccomp syscall wired up
+         - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE,
+           SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If
+           COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too.
 
 config SECCOMP
        prompt "Enable seccomp to safely execute untrusted bytecode"
@@ -514,6 +517,20 @@ config SECCOMP_FILTER
 
          See Documentation/userspace-api/seccomp_filter.rst for details.
 
+config SECCOMP_CACHE_DEBUG
+       bool "Show seccomp filter cache status in /proc/pid/seccomp_cache"
+       depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR
+       depends on PROC_FS
+       help
+         This enables the /proc/pid/seccomp_cache interface to monitor
+         seccomp cache data. The file format is subject to change. Reading
+         the file requires CAP_SYS_ADMIN.
+
+         This option is for debugging only. Enabling presents the risk that
+         an adversary may be able to infer the seccomp filter logic.
+
+         If unsure, say N.
+
 config HAVE_ARCH_STACKLEAK
        bool
        help
index b362523a9829ace0f2df2a93a2bd2e8e47a7bdd5..8a7d682ba881a15958af712ff59dd49fe9ce0429 100644 (file)
@@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+       ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
 #endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+       ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
index 02aef2844c38ebb736b47ca689c5637a265e8e78..76963ec4641a76182c72db5ff59afc31498a8d56 100644 (file)
@@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task,
        return -EINVAL;
 }
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+struct seq_file;
+
+/*
+ * Show handler for the /proc/pid/seccomp_cache entry. Only declared when
+ * CONFIG_SECCOMP_CACHE_DEBUG is enabled.
+ */
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+                          struct pid *pid, struct task_struct *task);
+#endif
 #endif /* _LINUX_SECCOMP_H */
index d8cf468dbe1e5bf605d86ef6adcc3724fb5676fa..76f524e320b1849f9730ed0e32f30ebc9c771dc9 100644 (file)
@@ -553,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
 {
        struct seccomp_filter *orig = tsk->seccomp.filter;
 
+       /* We are effectively holding the siglock by not having any sighand. */
+       WARN_ON(tsk->sighand != NULL);
+
        /* Detach task from its filter tree. */
        tsk->seccomp.filter = NULL;
        __seccomp_filter_release(orig);
@@ -2335,3 +2338,59 @@ static int __init seccomp_sysctl_init(void)
 device_initcall(seccomp_sysctl_init)
 
 #endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+/*
+ * Print one "<arch name> <syscall nr> <ALLOW | FILTER>" line for every
+ * syscall number in [0, bitmap_size) of a single architecture.
+ *
+ * @m:           seq_file receiving the output
+ * @name:        architecture name printed at the start of each line
+ * @bitmap:      cache bitmap; a set bit is reported as ALLOW, a clear
+ *               bit as FILTER
+ * @bitmap_size: number of bits (syscall numbers) to report
+ *
+ * Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE.
+ */
+static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
+                                       const void *bitmap, size_t bitmap_size)
+{
+       /* Same type as bitmap_size, so the loop bound is not a mixed-sign compare. */
+       size_t nr;
+
+       for (nr = 0; nr < bitmap_size; nr++) {
+               bool cached = test_bit(nr, bitmap);
+               /* Points at string literals, hence const. */
+               const char *status = cached ? "ALLOW" : "FILTER";
+
+               seq_printf(m, "%s %zu %s\n", name, nr, status);
+       }
+}
+
+int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
+                          struct pid *pid, struct task_struct *task)
+{
+       struct seccomp_filter *f;
+       unsigned long flags;
+
+       /*
+        * We don't want some sandboxed process to know what their seccomp
+        * filters consist of.
+        */
+       if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+               return -EACCES;
+
+       if (!lock_task_sighand(task, &flags))
+               return -ESRCH;
+
+       f = READ_ONCE(task->seccomp.filter);
+       if (!f) {
+               unlock_task_sighand(task, &flags);
+               return 0;
+       }
+
+       /* prevent filter from being freed while we are printing it */
+       __get_seccomp_filter(f);
+       unlock_task_sighand(task, &flags);
+
+       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
+                                   f->cache.allow_native,
+                                   SECCOMP_ARCH_NATIVE_NR);
+
+#ifdef SECCOMP_ARCH_COMPAT
+       proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
+                                   f->cache.allow_compat,
+                                   SECCOMP_ARCH_COMPAT_NR);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+       __put_seccomp_filter(f);
+       return 0;
+}
+#endif /* CONFIG_SECCOMP_CACHE_DEBUG */