From: Ingo Molnar Date: Wed, 3 Jun 2015 16:41:06 +0000 (+0200) Subject: x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/ X-Git-Tag: v4.2-rc1~166^2^2~8^2~14 X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=00398a0018d1334fedabfeaabd0fa563121de612;p=linux-2.6-block.git x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/ The vsyscall code is entry code too, so move it to arch/x86/entry/vsyscall/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9df72c8a0ac2..b93cce1c6bb2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,10 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o thunk_$(BITS).o +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + obj-y += vdso/ +obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c new file mode 100644 index 000000000000..3777189c4a19 --- /dev/null +++ b/arch/x86/entry/syscall_32.c @@ -0,0 +1,33 @@ +/* System call table for i386. 
*/ + +#include +#include +#include +#include + +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c new file mode 100644 index 000000000000..4ac730b37f0b --- /dev/null +++ b/arch/x86/entry/syscall_64.c @@ -0,0 +1,32 @@ +/* System call table for x86-64. */ + +#include +#include +#include +#include +#include + +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, + +extern void sys_ni_syscall(void); + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... 
__NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000000..a9f4856f622a --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c new file mode 100644 index 000000000000..2dcc6ff6fdcc --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2012-2014 Andy Lutomirski + * + * Based on the original implementation which is: + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: + * + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. + * + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. + * + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. 
+ */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = &current->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_error; + long ret; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". 
+ */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + regs->orig_ax = -1; + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + NULL); + break; + } + + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. 
*/ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. 
+ */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S new file mode 100644 index 000000000000..c9596a9af159 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include + +#include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c new file mode 100644 index 000000000000..51e330416995 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. 
+ * + */ + +#include +#include +#include + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h new file mode 100644 index 000000000000..9dd7359a38a8 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || 
defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ +#define TRACE_INCLUDE_FILE vsyscall_trace +#include diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3ee054453d..01663ee5f1b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c deleted file mode 100644 index 3777189c4a19..000000000000 --- a/arch/x86/kernel/syscall_32.c +++ /dev/null @@ -1,33 +0,0 @@ -/* System call table for i386. */ - -#include -#include -#include -#include - -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; -#include -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), - -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); - -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... 
__NR_ia32_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c deleted file mode 100644 index 4ac730b37f0b..000000000000 --- a/arch/x86/kernel/syscall_64.c +++ /dev/null @@ -1,32 +0,0 @@ -/* System call table for x86-64. */ - -#include -#include -#include -#include -#include - -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; -#include -#undef __SYSCALL_64 - -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, - -extern void sys_ni_syscall(void); - -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c deleted file mode 100644 index 2dcc6ff6fdcc..000000000000 --- a/arch/x86/kernel/vsyscall_64.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2012-2014 Andy Lutomirski - * - * Based on the original implementation which is: - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Parts of the original code have been moved to arch/x86/vdso/vma.c - * - * This file implements vsyscall emulation. vsyscalls are a legacy ABI: - * Userspace can request certain kernel services by calling fixed - * addresses. This concept is problematic: - * - * - It interferes with ASLR. - * - It's awkward to write code that lives in kernel addresses but is - * callable by userspace at fixed addresses. - * - The whole concept is impossible for 32-bit compat userspace. - * - UML cannot easily virtualize a vsyscall. 
- * - * As of mid-2014, I believe that there is no new userspace code that - * will use a vsyscall if the vDSO is present. I hope that there will - * soon be no new userspace code that will ever use a vsyscall. - * - * The code in this file emulates vsyscalls when notified of a page - * fault to a vsyscall address. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "vsyscall_trace.h" - -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; - -static int __init vsyscall_setup(char *str) -{ - if (str) { - if (!strcmp("emulate", str)) - vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; - else if (!strcmp("none", str)) - vsyscall_mode = NONE; - else - return -EINVAL; - - return 0; - } - - return -EINVAL; -} -early_param("vsyscall", vsyscall_setup); - -static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, - const char *message) -{ - if (!show_unhandled_signals) - return; - - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); -} - -static int addr_to_vsyscall_nr(unsigned long addr) -{ - int nr; - - if ((addr & ~0xC00UL) != VSYSCALL_ADDR) - return -EINVAL; - - nr = (addr & 0xC00UL) >> 10; - if (nr >= 3) - return -EINVAL; - - return nr; -} - -static bool write_ok_or_segv(unsigned long ptr, size_t size) -{ - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. 
- */ - - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; - struct thread_struct *thread = &current->thread; - - thread->error_code = 6; /* user fault, no page, write */ - thread->cr2 = ptr; - thread->trap_nr = X86_TRAP_PF; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); - return false; - } else { - return true; - } -} - -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) -{ - struct task_struct *tsk; - unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; - long ret; - - /* - * No point in checking CS -- the only way to get here is a user mode - * trap to a high address, which means that we're in 64-bit user code. - */ - - WARN_ON_ONCE(address != regs->ip); - - if (vsyscall_mode == NONE) { - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall attempted with vsyscall=none"); - return false; - } - - vsyscall_nr = addr_to_vsyscall_nr(address); - - trace_emulate_vsyscall(vsyscall_nr); - - if (vsyscall_nr < 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); - goto sigsegv; - } - - if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "vsyscall with bad stack (exploit attempt?)"); - goto sigsegv; - } - - tsk = current; - - /* - * Check for access_ok violations and find the syscall nr. - * - * NULL is a valid user pointer (in the access_ok sense) on 32-bit and - * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, NULL means "don't write anything" not "write it at - * address 0". 
- */ - switch (vsyscall_nr) { - case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - regs->orig_ax = -1; - if (tmp) - goto do_ret; /* skip requested */ - - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; - - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); - break; - - case 1: - ret = sys_time((time_t __user *)regs->di); - break; - - case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); - break; - } - - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - -check_fault: - if (ret == -EFAULT) { - /* Bad news -- userspace fed a bad pointer to a vsyscall. 
*/ - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ - } - - regs->ax = ret; - -do_ret: - /* Emulate a ret instruction. */ - regs->ip = caller; - regs->sp += 8; - return true; - -sigsegv: - force_sig(SIGSEGV, current); - return true; -} - -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - if (vsyscall_mode == NONE) - return NULL; - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. 
- */ -int in_gate_area_no_mm(unsigned long addr) -{ - return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - -void __init map_vsyscall(void) -{ - extern char __vsyscall_page; - unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - - if (vsyscall_mode != NONE) - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != - (unsigned long)VSYSCALL_ADDR); -} diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S deleted file mode 100644 index c9596a9af159..000000000000 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * vsyscall_emu_64.S: Vsyscall emulation page - * - * Copyright (c) 2011 Andy Lutomirski - * - * Subject to the GNU General Public License, version 2 - */ - -#include - -#include -#include -#include - -__PAGE_ALIGNED_DATA - .globl __vsyscall_page - .balign PAGE_SIZE, 0xcc - .type __vsyscall_page, @object -__vsyscall_page: - - mov $__NR_gettimeofday, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_time, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_getcpu, %rax - syscall - ret - - .balign 4096, 0xcc - - .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c deleted file mode 100644 index 51e330416995..000000000000 --- a/arch/x86/kernel/vsyscall_gtod.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Modified for x86 32 bit architecture by - * Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. 
- * - */ - -#include -#include -#include - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); - - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; - } - - gtod_write_end(vdata); -} diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h deleted file mode 100644 index a8b2edec54fe..000000000000 --- a/arch/x86/kernel/vsyscall_trace.h +++ /dev/null @@ -1,29 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM vsyscall - -#if !defined(__VSYSCALL_TRACE_H) || 
defined(TRACE_HEADER_MULTI_READ) -#define __VSYSCALL_TRACE_H - -#include - -TRACE_EVENT(emulate_vsyscall, - - TP_PROTO(int nr), - - TP_ARGS(nr), - - TP_STRUCT__entry(__field(int, nr)), - - TP_fast_assign( - __entry->nr = nr; - ), - - TP_printk("nr = %d", __entry->nr) -); - -#endif - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel -#define TRACE_INCLUDE_FILE vsyscall_trace -#include