Merge branch 'tracing/hw-breakpoints' into perf/core

author Ingo Molnar <mingo@elte.hu>

Sat, 21 Nov 2009 13:07:23 +0000 (14:07 +0100)

committer Ingo Molnar <mingo@elte.hu>

Sat, 21 Nov 2009 13:07:23 +0000 (14:07 +0100)
author Ingo Molnar <mingo@elte.hu>
Sat, 21 Nov 2009 13:07:23 +0000 (14:07 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sat, 21 Nov 2009 13:07:23 +0000 (14:07 +0100)
diff --combined arch/x86/Kconfig

index 72ace9515a07a44525778899e1ea04b32b3accbc,1b7c74350a04d02a65443909b4e6dd9679a0d827..178084b4377ccbebb2fb089ea1c17742f9addbfe
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -49,6 -49,7 +49,7 @@@ config X8
         select HAVE_KERNEL_GZIP
         select HAVE_KERNEL_BZIP2
         select HAVE_KERNEL_LZMA
+       select HAVE_HW_BREAKPOINT
         select HAVE_ARCH_KMEMCHECK
   
   config OUTPUT_FORMAT
@@@ -1443,8 -1444,12 +1444,8 @@@ config SECCOM
   
           If unsure, say Y. Only embedded should say N here.
   
- -config CC_STACKPROTECTOR_ALL
- -      bool
- -
   config CC_STACKPROTECTOR
         bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- -      select CC_STACKPROTECTOR_ALL
         ---help---
           This option turns on the -fstack-protector GCC feature. This
           feature puts, at the beginning of functions, a canary value on
diff --combined arch/x86/include/asm/processor.h

index c9786480f0fe4d074e9575557316d7f4b358be4b,820f3000f7367687f3119dd1f5c5cc0f2a2f0377..6f8ec1c37e0a8c999f9344082465fd2c9b4570f1
--- 1/arch/x86/include/asm/processor.h
--- 2/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -30,6 -30,7 +30,7 @@@ struct mm_struct
   #include <linux/math64.h>
   #include <linux/init.h>
   
+ #define HBP_NUM 4
   /*
    * Default implementation of macro that returns current
    * instruction pointer ("program counter").
@@@ -422,6 -423,8 +423,8 @@@ extern unsigned int xstate_size
   extern void free_thread_xstate(struct task_struct *);
   extern struct kmem_cache *task_xstate_cachep;
   
+ struct perf_event;
+ 
   struct thread_struct {
         /* Cached TLS descriptors: */
         struct desc_struct      tls_array[GDT_ENTRY_TLS_ENTRIES];
@@@ -443,13 -446,10 +446,10 @@@
         unsigned long           fs;
   #endif
         unsigned long           gs;
-       /* Hardware debugging registers: */
-       unsigned long           debugreg0;
-       unsigned long           debugreg1;
-       unsigned long           debugreg2;
-       unsigned long           debugreg3;
-       unsigned long           debugreg6;
-       unsigned long           debugreg7;
+       /* Save middle states of ptrace breakpoints */
+       struct perf_event       *ptrace_bps[HBP_NUM];
+       /* Debug status used for traps, single steps, etc... */
+       unsigned long           debugreg6;
         /* Fault info: */
         unsigned long           cr2;
         unsigned long           trap_no;
@@@ -1000,7 -1000,7 +1000,7 @@@ extern unsigned long thread_saved_pc(st
   #define thread_saved_pc(t)    (*(unsigned long *)((t)->thread.sp - 8))
   
   #define task_pt_regs(tsk)     ((struct pt_regs *)(tsk)->thread.sp0 - 1)
- -#define KSTK_ESP(tsk)         -1 /* sorry. doesn't work for syscall. */
+ +extern unsigned long KSTK_ESP(struct task_struct *task);
   #endif /* CONFIG_X86_64 */
   
   extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --combined arch/x86/kernel/kprobes.c

index c5f1f117e0c0577a527e788d4c80220bd49dd84d,b5b1848c5336e505a5a147f0497865056f34b307..3fe86d706a1493ad59cb655478bd9b41cfd11f4d
--- 1/arch/x86/kernel/kprobes.c
--- 2/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@@ -48,14 -48,13 +48,15 @@@
   #include <linux/preempt.h>
   #include <linux/module.h>
   #include <linux/kdebug.h>
+ +#include <linux/kallsyms.h>
   
   #include <asm/cacheflush.h>
   #include <asm/desc.h>
   #include <asm/pgtable.h>
   #include <asm/uaccess.h>
   #include <asm/alternative.h>
+ +#include <asm/insn.h>
+ #include <asm/debugreg.h>
   
   void jprobe_return_end(void);
   
@@@ -108,6 -107,50 +109,6 @@@ static const u32 twobyte_is_boostable[2
         /*      -----------------------------------------------         */
         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
   };
- -static const u32 onebyte_has_modrm[256 / 32] = {
- -      /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
- -      /*      -----------------------------------------------         */
- -      W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
- -      W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
- -      W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
- -      W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
- -      W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
- -      W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- -      W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
- -      W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
- -      W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- -      W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
- -      W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
- -      W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
- -      W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
- -      W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- -      W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
- -      W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
- -      /*      -----------------------------------------------         */
- -      /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
- -};
- -static const u32 twobyte_has_modrm[256 / 32] = {
- -      /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
- -      /*      -----------------------------------------------         */
- -      W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
- -      W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
- -      W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
- -      W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
- -      W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
- -      W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
- -      W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
- -      W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
- -      W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
- -      W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
- -      W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
- -      W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
- -      W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
- -      W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
- -      W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
- -      W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
- -      /*      -----------------------------------------------         */
- -      /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
- -};
   #undef W
   
   struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@@ -202,75 -245,6 +203,75 @@@ retry
         }
   }
   
+ +/* Recover the probed instruction at addr for further analysis. */
+ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+ +{
+ +      struct kprobe *kp;
+ +      kp = get_kprobe((void *)addr);
+ +      if (!kp)
+ +              return -EINVAL;
+ +
+ +      /*
+ +       *  Basically, kp->ainsn.insn holds a copy of the original
+ +       *  instruction. However, a RIP-relative instruction cannot be
+ +       *  single-stepped at a different address, so fix_riprel() tweaks
+ +       *  the displacement of that copy. In that case, we can't recover
+ +       *  the instruction from kp->ainsn.insn.
+ +       *
+ +       *  On the other hand, kp->opcode has a copy of the first byte of
+ +       *  the probed instruction, which is overwritten by int3. Since
+ +       *  the instruction at kp->addr is not modified by kprobes except
+ +       *  for the first byte, we can recover the original instruction
+ +       *  from it and kp->opcode.
+ +       */
+ +      memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ +      buf[0] = kp->opcode;
+ +      return 0;
+ +}
+ +
+ +/* Dummy buffers for kallsyms_lookup */
+ +static char __dummy_buf[KSYM_NAME_LEN];
+ +
+ +/* Check if paddr is at an instruction boundary */
+ +static int __kprobes can_probe(unsigned long paddr)
+ +{
+ +      int ret;
+ +      unsigned long addr, offset = 0;
+ +      struct insn insn;
+ +      kprobe_opcode_t buf[MAX_INSN_SIZE];
+ +
+ +      if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+ +              return 0;
+ +
+ +      /* Decode instructions */
+ +      addr = paddr - offset;
+ +      while (addr < paddr) {
+ +              kernel_insn_init(&insn, (void *)addr);
+ +              insn_get_opcode(&insn);
+ +
+ +              /*
+ +               * Check if the instruction has been modified by another
+ +               * kprobe, in which case we replace the breakpoint by the
+ +               * original instruction in our buffer.
+ +               */
+ +              if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ +                      ret = recover_probed_instruction(buf, addr);
+ +                      if (ret)
+ +                              /*
+ +                               * Another debugging subsystem might insert
+ +                               * this breakpoint. In that case, we can't
+ +                               * recover it.
+ +                               */
+ +                              return 0;
+ +                      kernel_insn_init(&insn, buf);
+ +              }
+ +              insn_get_length(&insn);
+ +              addr += insn.length;
+ +      }
+ +
+ +      return (addr == paddr);
+ +}
+ +
   /*
    * Returns non-zero if opcode modifies the interrupt flag.
    */
@@@ -304,30 -278,68 +305,30 @@@ static int __kprobes is_IF_modifier(kpr
   static void __kprobes fix_riprel(struct kprobe *p)
   {
   #ifdef CONFIG_X86_64
- -      u8 *insn = p->ainsn.insn;
- -      s64 disp;
- -      int need_modrm;
- -
- -      /* Skip legacy instruction prefixes.  */
- -      while (1) {
- -              switch (*insn) {
- -              case 0x66:
- -              case 0x67:
- -              case 0x2e:
- -              case 0x3e:
- -              case 0x26:
- -              case 0x64:
- -              case 0x65:
- -              case 0x36:
- -              case 0xf0:
- -              case 0xf3:
- -              case 0xf2:
- -                      ++insn;
- -                      continue;
- -              }
- -              break;
- -      }
+ +      struct insn insn;
+ +      kernel_insn_init(&insn, p->ainsn.insn);
   
- -      /* Skip REX instruction prefix.  */
- -      if (is_REX_prefix(insn))
- -              ++insn;
- -
- -      if (*insn == 0x0f) {
- -              /* Two-byte opcode.  */
- -              ++insn;
- -              need_modrm = test_bit(*insn,
- -                                    (unsigned long *)twobyte_has_modrm);
- -      } else
- -              /* One-byte opcode.  */
- -              need_modrm = test_bit(*insn,
- -                                    (unsigned long *)onebyte_has_modrm);
- -
- -      if (need_modrm) {
- -              u8 modrm = *++insn;
- -              if ((modrm & 0xc7) == 0x05) {
- -                      /* %rip+disp32 addressing mode */
- -                      /* Displacement follows ModRM byte.  */
- -                      ++insn;
- -                      /*
- -                       * The copied instruction uses the %rip-relative
- -                       * addressing mode.  Adjust the displacement for the
- -                       * difference between the original location of this
- -                       * instruction and the location of the copy that will
- -                       * actually be run.  The tricky bit here is making sure
- -                       * that the sign extension happens correctly in this
- -                       * calculation, since we need a signed 32-bit result to
- -                       * be sign-extended to 64 bits when it's added to the
- -                       * %rip value and yield the same 64-bit result that the
- -                       * sign-extension of the original signed 32-bit
- -                       * displacement would have given.
- -                       */
- -                      disp = (u8 *) p->addr + *((s32 *) insn) -
- -                             (u8 *) p->ainsn.insn;
- -                      BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
- -                      *(s32 *)insn = (s32) disp;
- -              }
+ +      if (insn_rip_relative(&insn)) {
+ +              s64 newdisp;
+ +              u8 *disp;
+ +              insn_get_displacement(&insn);
+ +              /*
+ +               * The copied instruction uses the %rip-relative addressing
+ +               * mode.  Adjust the displacement for the difference between
+ +               * the original location of this instruction and the location
+ +               * of the copy that will actually be run.  The tricky bit here
+ +               * is making sure that the sign extension happens correctly in
+ +               * this calculation, since we need a signed 32-bit result to
+ +               * be sign-extended to 64 bits when it's added to the %rip
+ +               * value and yield the same 64-bit result that the sign-
+ +               * extension of the original signed 32-bit displacement would
+ +               * have given.
+ +               */
+ +              newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
+ +                        (u8 *) p->ainsn.insn;
+ +              BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
+ +              disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+ +              *(s32 *) disp = (s32) newdisp;
         }
   #endif
   }
@@@ -348,8 -360,6 +349,8 @@@ static void __kprobes arch_copy_kprobe(
   
   int __kprobes arch_prepare_kprobe(struct kprobe *p)
   {
+ +      if (!can_probe((unsigned long)p->addr))
+ +              return -EILSEQ;
         /* insn: must be on special executable page on x86. */
         p->ainsn.insn = get_insn_slot();
         if (!p->ainsn.insn)
@@@ -463,6 -473,17 +464,6 @@@ static int __kprobes reenter_kprobe(str
   {
         switch (kcb->kprobe_status) {
         case KPROBE_HIT_SSDONE:
- -#ifdef CONFIG_X86_64
- -              /* TODO: Provide re-entrancy from post_kprobes_handler() and
- -               * avoid exception stack corruption while single-stepping on
- -               * the instruction of the new probe.
- -               */
- -              arch_disarm_kprobe(p);
- -              regs->ip = (unsigned long)p->addr;
- -              reset_current_kprobe();
- -              preempt_enable_no_resched();
- -              break;
- -#endif
         case KPROBE_HIT_ACTIVE:
                 save_previous_kprobe(kcb);
                 set_current_kprobe(p, regs, kcb);
@@@ -471,16 -492,18 +472,16 @@@
                 kcb->kprobe_status = KPROBE_REENTER;
                 break;
         case KPROBE_HIT_SS:
- -              if (p == kprobe_running()) {
- -                      regs->flags &= ~X86_EFLAGS_TF;
- -                      regs->flags |= kcb->kprobe_saved_flags;
- -                      return 0;
- -              } else {
- -                      /* A probe has been hit in the codepath leading up
- -                       * to, or just after, single-stepping of a probed
- -                       * instruction. This entire codepath should strictly
- -                       * reside in .kprobes.text section. Raise a warning
- -                       * to highlight this peculiar case.
- -                       */
- -              }
+ +              /* A probe has been hit in the codepath leading up to, or just
+ +               * after, single-stepping of a probed instruction. This entire
+ +               * codepath should strictly reside in .kprobes.text section.
+ +               * Raise a BUG or we'll continue in an endless reentering loop
+ +               * and eventually a stack overflow.
+ +               */
+ +              printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+ +                     p->addr);
+ +              dump_kprobe(p);
+ +              BUG();
         default:
                 /* impossible cases */
                 WARN_ON(1);
@@@ -945,8 -968,14 +946,14 @@@ int __kprobes kprobe_exceptions_notify(
                         ret = NOTIFY_STOP;
                 break;
         case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
+               if (post_kprobe_handler(args->regs)) {
+                       /*
+                        * Reset the BS bit in dr6 (pointed to by args->err) to
+                        * denote completion of processing
+                        */
+                       (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
                         ret = NOTIFY_STOP;
+               }
                 break;
         case DIE_GPF:
                 /*
diff --combined arch/x86/kernel/process_64.c

index eb62cbcaa490ad553ef5d70b6751a2288d25089e,5bafdec344415387f94b7a344bbba75b3b457950..70cf15873f3d65da38e42fbbb670555d2fc22722
--- 1/arch/x86/kernel/process_64.c
--- 2/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -52,6 -52,7 +52,7 @@@
   #include <asm/idle.h>
   #include <asm/syscalls.h>
   #include <asm/ds.h>
+ #include <asm/debugreg.h>
   
   asmlinkage extern void ret_from_fork(void);
   
@@@ -297,12 -298,16 +298,16 @@@ int copy_thread(unsigned long clone_fla
   
         p->thread.fs = me->thread.fs;
         p->thread.gs = me->thread.gs;
+       p->thread.io_bitmap_ptr = NULL;
   
         savesegment(gs, p->thread.gsindex);
         savesegment(fs, p->thread.fsindex);
         savesegment(es, p->thread.es);
         savesegment(ds, p->thread.ds);
   
+       err = -ENOMEM;
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+ 
         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                 if (!p->thread.io_bitmap_ptr) {
@@@ -341,6 -346,7 +346,7 @@@ out
                 kfree(p->thread.io_bitmap_ptr);
                 p->thread.io_bitmap_max = 0;
         }
+ 
         return err;
   }
   
@@@ -495,6 -501,7 +501,7 @@@ __switch_to(struct task_struct *prev_p
          */
         if (preload_fpu)
                 __math_state_restore();
+ 
         return prev_p;
   }
   
@@@ -664,8 -671,3 +671,8 @@@ long sys_arch_prctl(int code, unsigned 
         return do_arch_prctl(current, code, addr);
   }
   
+ +unsigned long KSTK_ESP(struct task_struct *task)
+ +{
+ +      return (test_tsk_thread_flag(task, TIF_IA32)) ?
+ +                      (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+ +}
diff --combined arch/x86/kernel/ptrace.c

index c4f76d275ee4cda38bed14268a4ac3b6f04d7436,e79610d95971fe333305f469aae83492bf65a6e1..b25f8947ed7aefd4e885d0820daaf764bb19545b
--- 1/arch/x86/kernel/ptrace.c
--- 2/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@@ -22,6 -22,8 +22,8 @@@
   #include <linux/seccomp.h>
   #include <linux/signal.h>
   #include <linux/workqueue.h>
+ #include <linux/perf_event.h>
+ #include <linux/hw_breakpoint.h>
   
   #include <asm/uaccess.h>
   #include <asm/pgtable.h>
@@@ -34,6 -36,7 +36,7 @@@
   #include <asm/prctl.h>
   #include <asm/proto.h>
   #include <asm/ds.h>
+ #include <asm/hw_breakpoint.h>
   
   #include "tls.h"
   
@@@ -49,118 -52,6 +52,118 @@@ enum x86_regset 
         REGSET_IOPERM32,
   };
   
+ +struct pt_regs_offset {
+ +      const char *name;
+ +      int offset;
+ +};
+ +
+ +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+ +#define REG_OFFSET_END {.name = NULL, .offset = 0}
+ +
+ +static const struct pt_regs_offset regoffset_table[] = {
+ +#ifdef CONFIG_X86_64
+ +      REG_OFFSET_NAME(r15),
+ +      REG_OFFSET_NAME(r14),
+ +      REG_OFFSET_NAME(r13),
+ +      REG_OFFSET_NAME(r12),
+ +      REG_OFFSET_NAME(r11),
+ +      REG_OFFSET_NAME(r10),
+ +      REG_OFFSET_NAME(r9),
+ +      REG_OFFSET_NAME(r8),
+ +#endif
+ +      REG_OFFSET_NAME(bx),
+ +      REG_OFFSET_NAME(cx),
+ +      REG_OFFSET_NAME(dx),
+ +      REG_OFFSET_NAME(si),
+ +      REG_OFFSET_NAME(di),
+ +      REG_OFFSET_NAME(bp),
+ +      REG_OFFSET_NAME(ax),
+ +#ifdef CONFIG_X86_32
+ +      REG_OFFSET_NAME(ds),
+ +      REG_OFFSET_NAME(es),
+ +      REG_OFFSET_NAME(fs),
+ +      REG_OFFSET_NAME(gs),
+ +#endif
+ +      REG_OFFSET_NAME(orig_ax),
+ +      REG_OFFSET_NAME(ip),
+ +      REG_OFFSET_NAME(cs),
+ +      REG_OFFSET_NAME(flags),
+ +      REG_OFFSET_NAME(sp),
+ +      REG_OFFSET_NAME(ss),
+ +      REG_OFFSET_END,
+ +};
+ +
+ +/**
+ + * regs_query_register_offset() - query register offset from its name
+ + * @name:     the name of a register
+ + *
+ + * regs_query_register_offset() returns the offset of a register in struct
+ + * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ + */
+ +int regs_query_register_offset(const char *name)
+ +{
+ +      const struct pt_regs_offset *roff;
+ +      for (roff = regoffset_table; roff->name != NULL; roff++)
+ +              if (!strcmp(roff->name, name))
+ +                      return roff->offset;
+ +      return -EINVAL;
+ +}
+ +
+ +/**
+ + * regs_query_register_name() - query register name from its offset
+ + * @offset:   the offset of a register in struct pt_regs.
+ + *
+ + * regs_query_register_name() returns the name of a register from its
+ + * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ + */
+ +const char *regs_query_register_name(unsigned int offset)
+ +{
+ +      const struct pt_regs_offset *roff;
+ +      for (roff = regoffset_table; roff->name != NULL; roff++)
+ +              if (roff->offset == offset)
+ +                      return roff->name;
+ +      return NULL;
+ +}
+ +
+ +static const int arg_offs_table[] = {
+ +#ifdef CONFIG_X86_32
+ +      [0] = offsetof(struct pt_regs, ax),
+ +      [1] = offsetof(struct pt_regs, dx),
+ +      [2] = offsetof(struct pt_regs, cx)
+ +#else /* CONFIG_X86_64 */
+ +      [0] = offsetof(struct pt_regs, di),
+ +      [1] = offsetof(struct pt_regs, si),
+ +      [2] = offsetof(struct pt_regs, dx),
+ +      [3] = offsetof(struct pt_regs, cx),
+ +      [4] = offsetof(struct pt_regs, r8),
+ +      [5] = offsetof(struct pt_regs, r9)
+ +#endif
+ +};
+ +
+ +/**
+ + * regs_get_argument_nth() - get Nth argument at function call
+ + * @regs:     pt_regs which contains registers at function entry.
+ + * @n:                argument number.
+ + *
+ + * regs_get_argument_nth() returns @n th argument of a function call.
+ + * Since usually the kernel stack will be changed right after function entry,
+ + * you must use this at function entry. If the @n th entry is NOT in the
+ + * kernel stack or pt_regs, this returns 0.
+ + */
+ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
+ +{
+ +      if (n < ARRAY_SIZE(arg_offs_table))
+ +              return *(unsigned long *)((char *)regs + arg_offs_table[n]);
+ +      else {
+ +              /*
+ +               * The typical case: arg n is on the stack.
+ +               * (Note: stack[0] = return address, so skip it)
+ +               */
+ +              n -= ARRAY_SIZE(arg_offs_table);
+ +              return regs_get_kernel_stack_nth(regs, 1 + n);
+ +      }
+ +}
+ +
   /*
    * does not yet catch signals sent when the child dies.
    * in exit.c or in signal.c.
@@@ -249,11 -140,6 +252,6 @@@ static int set_segment_reg(struct task_
         return 0;
   }
   
- static unsigned long debugreg_addr_limit(struct task_struct *task)
- {
-       return TASK_SIZE - 3;
- }
- 
   #else  /* CONFIG_X86_64 */
   
   #define FLAG_MASK             (FLAG_MASK_32 | X86_EFLAGS_NT)
@@@ -378,15 -264,6 +376,6 @@@ static int set_segment_reg(struct task_
         return 0;
   }
   
- static unsigned long debugreg_addr_limit(struct task_struct *task)
- {
- #ifdef CONFIG_IA32_EMULATION
-       if (test_tsk_thread_flag(task, TIF_IA32))
-               return IA32_PAGE_OFFSET - 3;
- #endif
-       return TASK_SIZE_MAX - 7;
- }
- 
   #endif        /* CONFIG_X86_32 */
   
   static unsigned long get_flags(struct task_struct *task)
@@@ -566,98 -443,228 +555,228 @@@ static int genregs_set(struct task_stru
         return ret;
   }
   
+ static void ptrace_triggered(struct perf_event *bp, void *data)
+ {
+       int i;
+       struct thread_struct *thread = &(current->thread);
+ 
+       /*
+        * Store in the virtual DR6 register the fact that the breakpoint
+        * was hit so the thread's debugger will see it.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               if (thread->ptrace_bps[i] == bp)
+                       break;
+       }
+ 
+       thread->debugreg6 |= (DR_TRAP0 << i);
+ }
+ 
   /*
-  * This function is trivial and will be inlined by the compiler.
-  * Having it separates the implementation details of debug
-  * registers from the interface details of ptrace.
+  * Walk through all ptrace breakpoints for this thread and
+  * build the dr7 value on top of their attributes.
+  *
    */
- static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
   {
-       switch (n) {
-       case 0:         return child->thread.debugreg0;
-       case 1:         return child->thread.debugreg1;
-       case 2:         return child->thread.debugreg2;
-       case 3:         return child->thread.debugreg3;
-       case 6:         return child->thread.debugreg6;
-       case 7:         return child->thread.debugreg7;
+       int i;
+       int dr7 = 0;
+       struct arch_hw_breakpoint *info;
+ 
+       for (i = 0; i < HBP_NUM; i++) {
+               if (bp[i] && !bp[i]->attr.disabled) {
+                       info = counter_arch_bp(bp[i]);
+                       dr7 |= encode_dr7(i, info->len, info->type);
+               }
         }
-       return 0;
+ 
+       return dr7;
   }
   
- static int ptrace_set_debugreg(struct task_struct *child,
-                              int n, unsigned long data)
+ /*
+  * Handle ptrace writes to debug register 7.
+  */
+ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
   {
-       int i;
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long old_dr7;
+       int i, orig_ret = 0, rc = 0;
+       int enabled, second_pass = 0;
+       unsigned len, type;
+       int gen_len, gen_type;
+       struct perf_event *bp;
+ 
+       data &= ~DR_CONTROL_RESERVED;
+       old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+ restore:
+       /*
+        * Loop through all the hardware breakpoints, making the
+        * appropriate changes to each.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               enabled = decode_dr7(data, i, &len, &type);
+               bp = thread->ptrace_bps[i];
+ 
+               if (!enabled) {
+                       if (bp) {
+                               /*
+                                * Don't unregister the breakpoints right-away,
+                                * unless all register_user_hw_breakpoint()
+                                * requests have succeeded. This prevents
+                                * any window of opportunity for debug
+                                * register grabbing by other users.
+                                */
+                               if (!second_pass)
+                                       continue;
+                               thread->ptrace_bps[i] = NULL;
+                               unregister_hw_breakpoint(bp);
+                       }
+                       continue;
+               }
   
-       if (unlikely(n == 4 || n == 5))
-               return -EIO;
+               /*
+                * We should have at least an inactive breakpoint at this
+                * slot. It means the user is writing dr7 without having
+                * written the address register first
+                */
+               if (!bp) {
+                       rc = -EINVAL;
+                       break;
+               }
   
-       if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-               return -EIO;
+               rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+               if (rc)
+                       break;
   
-       switch (n) {
-       case 0:         child->thread.debugreg0 = data; break;
-       case 1:         child->thread.debugreg1 = data; break;
-       case 2:         child->thread.debugreg2 = data; break;
-       case 3:         child->thread.debugreg3 = data; break;
+               /*
+                * This is a temporary thing as bp is unregistered/registered
+                * to simulate modification
+                */
+               bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
+                                              gen_type, bp->callback,
+                                              tsk, true);
+               thread->ptrace_bps[i] = NULL;
   
-       case 6:
-               if ((data & ~0xffffffffUL) != 0)
-                       return -EIO;
-               child->thread.debugreg6 = data;
-               break;
+               if (!bp) { /* incorrect bp, or we have a bug in bp API */
+                       rc = -EINVAL;
+                       break;
+               }
+               if (IS_ERR(bp)) {
+                       rc = PTR_ERR(bp);
+                       bp = NULL;
+                       break;
+               }
+               thread->ptrace_bps[i] = bp;
+       }
+       /*
+        * Make a second pass to free the remaining unused breakpoints
+        * or to restore the original breakpoints if an error occurred.
+        */
+       if (!second_pass) {
+               second_pass = 1;
+               if (rc < 0) {
+                       orig_ret = rc;
+                       data = old_dr7;
+               }
+               goto restore;
+       }
+       return ((orig_ret < 0) ? orig_ret : rc);
+ }
   
-       case 7:
+ /*
+  * Handle PTRACE_PEEKUSR calls for the debug register area.
+  */
+ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+ {
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long val = 0;
+ 
+       if (n < HBP_NUM) {
+               struct perf_event *bp;
+               bp = thread->ptrace_bps[n];
+               if (!bp)
+                       return 0;
+               val = bp->hw.info.address;
+       } else if (n == 6) {
+               val = thread->debugreg6;
+        } else if (n == 7) {
+               val = ptrace_get_dr7(thread->ptrace_bps);
+       }
+       return val;
+ }
+ 
+ /*
+  * Set the address of the ptrace breakpoint held in slot @nr of @tsk.
+  *
+  * If the slot is empty, a new user breakpoint is registered at @addr with
+  * a stub length (HW_BREAKPOINT_LEN_1) and type (HW_BREAKPOINT_W) and
+  * ptrace_triggered as its callback. Otherwise the existing breakpoint is
+  * passed to modify_user_hw_breakpoint() with the new address while its
+  * current length, type, callback and disabled state are carried over.
+  *
+  * Returns 0 and stores the resulting event in t->ptrace_bps[nr] on
+  * success, -EIO if the helper returned NULL, or the encoded error if it
+  * returned an error pointer.
+  *
+  * NOTE(review): in the modify path the slot is cleared before the call and
+  * is not restored when the call fails, so the slot is left empty on error.
+  * Whether the old event is still alive at that point depends on
+  * modify_user_hw_breakpoint(), which is not visible here -- confirm.
+  * NOTE(review): @nr is not range-checked here; the caller is expected to
+  * pass a valid ptrace_bps index.
+  */
+ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+                                     unsigned long addr)
+ {
+       struct perf_event *bp;
+       struct thread_struct *t = &tsk->thread;
+ 
+       if (!t->ptrace_bps[nr]) {
                 /*
-                * Sanity-check data. Take one half-byte at once with
-                * check = (val >> (16 + 4*i)) & 0xf. It contains the
-                * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-                * 2 and 3 are LENi. Given a list of invalid values,
-                * we do mask |= 1 << invalid_value, so that
-                * (mask >> check) & 1 is a correct test for invalid
-                * values.
-                *
-                * R/Wi contains the type of the breakpoint /
-                * watchpoint, LENi contains the length of the watched
-                * data in the watchpoint case.
-                *
-                * The invalid values are:
-                * - LENi == 0x10 (undefined), so mask |= 0x0f00.       [32-bit]
-                * - R/Wi == 0x10 (break on I/O reads or writes), so
-                *   mask |= 0x4444.
-                * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-                *   0x1110.
-                *
-                * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-                *
-                * See the Intel Manual "System Programming Guide",
-                * 15.2.4
-                *
-                * Note that LENi == 0x10 is defined on x86_64 in long
-                * mode (i.e. even for 32-bit userspace software, but
-                * 64-bit kernel), so the x86_64 mask value is 0x5454.
-                * See the AMD manual no. 24593 (AMD64 System Programming)
+                * Put stub len and type to register (reserve) an inactive but
+                * correct bp
                  */
- #ifdef CONFIG_X86_32
- #define       DR7_MASK        0x5f54
- #else
- #define       DR7_MASK        0x5554
- #endif
-               data &= ~DR_CONTROL_RESERVED;
-               for (i = 0; i < 4; i++)
-                       if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                               return -EIO;
-               child->thread.debugreg7 = data;
-               if (data)
-                       set_tsk_thread_flag(child, TIF_DEBUG);
-               else
-                       clear_tsk_thread_flag(child, TIF_DEBUG);
-               break;
+               bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
+                                                HW_BREAKPOINT_W,
+                                                ptrace_triggered, tsk,
+                                                false);
+       } else {
+               bp = t->ptrace_bps[nr];
+               t->ptrace_bps[nr] = NULL;
+               bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
+                                              bp->attr.bp_type,
+                                              bp->callback,
+                                              tsk,
+                                              bp->attr.disabled);
         }
   
+       if (!bp)
+               return -EIO;
+       /*
+        * CHECKME: the previous code returned -EIO if the addr wasn't a
+        * valid task virtual addr. The new one will return -EINVAL in this
+        * case.
+        * -EINVAL may be what we want for in-kernel breakpoints users, but
+        * -EIO looks better for ptrace, since we refuse a register writing
+        * for the user. And anyway this is the previous behaviour.
+        */
+       if (IS_ERR(bp))
+               return PTR_ERR(bp);
+ 
+       t->ptrace_bps[nr] = bp;
+ 
         return 0;
   }
   
+ /*
+  * Handle PTRACE_POKEUSR calls for the debug register area.
+  *
+  * @tsk: task whose debug register state is written
+  * @n:   index of the debug register being written
+  * @val: value to store
+  *
+  * Dispatch by index:
+  *   4, 5          - rejected with -EIO
+  *   6             - stored directly in thread->debugreg6
+  *   below HBP_NUM - treated as a breakpoint address and handed to
+  *                   ptrace_set_breakpoint_addr()
+  *   7             - handed to ptrace_write_dr7()
+  *
+  * Returns 0 on success or the negative error code of the helper called.
+  *
+  * NOTE(review): indices outside 0-7 are not rejected here. A value above
+  * 7 matches no branch and returns 0, and a negative value satisfies
+  * "n < HBP_NUM". Presumably the caller validates the index -- confirm.
+  */
+ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+ {
+       struct thread_struct *thread = &(tsk->thread);
+       int rc = 0;
+ 
+       /* There are no DR4 or DR5 registers */
+       if (n == 4 || n == 5)
+               return -EIO;
+ 
+       if (n == 6) {
+               thread->debugreg6 = val;
+               goto ret_path;
+       }
+       if (n < HBP_NUM) {
+               rc = ptrace_set_breakpoint_addr(tsk, n, val);
+               if (rc)
+                       return rc;
+       }
+       /* All that's left is DR7 */
+       if (n == 7)
+               rc = ptrace_write_dr7(tsk, val);
+ 
+ ret_path:
+       return rc;
+ }
+ 
   /*
    * These access the current or another (stopped) task's io permission
    * bitmap for debugging or core dump.
diff --combined arch/x86/kvm/x86.c

index ae07d261527cba458ed1682118b19295bc997847,3817220cc86b505b10ce28b7787d07a908df3285..4fc80174191ce4b17549b643fe11dee645286c3f
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -42,6 -42,7 +42,7 @@@
   #define CREATE_TRACE_POINTS
   #include "trace.h"
   
+ #include <asm/debugreg.h>
   #include <asm/uaccess.h>
   #include <asm/msr.h>
   #include <asm/desc.h>
@@@ -1692,7 -1693,7 +1693,7 @@@ static int kvm_vcpu_ioctl_x86_setup_mce
         unsigned bank_num = mcg_cap & 0xff, bank;
   
         r = -EINVAL;
- -      if (!bank_num)
+ +      if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
                 goto out;
         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
                 goto out;
@@@ -3643,14 -3644,15 +3644,15 @@@ static int vcpu_enter_guest(struct kvm_
         trace_kvm_entry(vcpu->vcpu_id);
         kvm_x86_ops->run(vcpu, kvm_run);
   
-       if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-               set_debugreg(current->thread.debugreg0, 0);
-               set_debugreg(current->thread.debugreg1, 1);
-               set_debugreg(current->thread.debugreg2, 2);
-               set_debugreg(current->thread.debugreg3, 3);
-               set_debugreg(current->thread.debugreg6, 6);
-               set_debugreg(current->thread.debugreg7, 7);
-       }
+       /*
+        * If the guest has used debug registers, at least dr7
+        * will be disabled while returning to the host.
+        * If we don't have active breakpoints in the host, we don't
+        * care about the messed up debug address registers. But if
+        * we have some of them active, restore the old state.
+        */
+       if (hw_breakpoint_active())
+               hw_breakpoint_restore();
   
         set_bit(KVM_REQ_KICK, &vcpu->requests);
         local_irq_enable();
@@@ -4051,7 -4053,7 +4053,7 @@@ static int save_guest_segment_descripto
         return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
   }
   
- -static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
+ +static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
                              struct desc_struct *seg_desc)
   {
         u32 base_addr = get_desc_base(seg_desc);
diff --combined include/linux/perf_event.h

index 7f87563c8485a62da6aafe2e646bd5eb6bdb73ab,cead64ea6c1569c5cc70204a774c30625ad6293a..b5cdac0de370ce538241eba6aa50e1547ac43f7a
--- 1/include/linux/perf_event.h
--- 2/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@@ -18,6 -18,10 +18,10 @@@
   #include <linux/ioctl.h>
   #include <asm/byteorder.h>
   
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+ #include <asm/hw_breakpoint.h>
+ #endif
+ 
   /*
    * User-space ABI bits:
    */
@@@ -31,6 -35,7 +35,7 @@@ enum perf_type_id 
         PERF_TYPE_TRACEPOINT                    = 2,
         PERF_TYPE_HW_CACHE                      = 3,
         PERF_TYPE_RAW                           = 4,
+       PERF_TYPE_BREAKPOINT                    = 5,
   
         PERF_TYPE_MAX,                          /* non-ABI */
   };
@@@ -102,8 -107,6 +107,8 @@@ enum perf_sw_ids 
         PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
         PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
         PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
+ +      PERF_COUNT_SW_ALIGNMENT_FAULTS          = 7,
+ +      PERF_COUNT_SW_EMULATION_FAULTS          = 8,
   
         PERF_COUNT_SW_MAX,                      /* non-ABI */
   };
@@@ -209,6 -212,15 +214,15 @@@ struct perf_event_attr 
                 __u32           wakeup_events;    /* wakeup every n events */
                 __u32           wakeup_watermark; /* bytes before wakeup   */
         };
+ 
+       union {
+               struct { /* Hardware breakpoint info */
+                       __u64           bp_addr;
+                       __u32           bp_type;
+                       __u32           bp_len;
+               };
+       };
+ 
         __u32                   __reserved_2;
   
         __u64                   __reserved_3;
@@@ -221,7 -233,7 +235,7 @@@
   #define PERF_EVENT_IOC_DISABLE                _IO ('$', 1)
   #define PERF_EVENT_IOC_REFRESH                _IO ('$', 2)
   #define PERF_EVENT_IOC_RESET          _IO ('$', 3)
- -#define PERF_EVENT_IOC_PERIOD         _IOW('$', 4, u64)
+ +#define PERF_EVENT_IOC_PERIOD         _IOW('$', 4, __u64)
   #define PERF_EVENT_IOC_SET_OUTPUT     _IO ('$', 5)
   #define PERF_EVENT_IOC_SET_FILTER     _IOW('$', 6, char *)
   
@@@ -474,10 -486,15 +488,15 @@@ struct hw_perf_event 
                         unsigned long   event_base;
                         int             idx;
                 };
- -              union { /* software */
- -                      atomic64_t      count;
+ +              struct { /* software */
+ +                      s64             remaining;
                         struct hrtimer  hrtimer;
                 };
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+               union { /* breakpoint */
+                       struct arch_hw_breakpoint       info;
+               };
+ #endif
         };
         atomic64_t                      prev_count;
         u64                             sample_period;
@@@ -546,6 -563,8 +565,8 @@@ struct perf_pending_entry 
         void (*func)(struct perf_pending_entry *);
   };
   
+ typedef void (*perf_callback_t)(struct perf_event *, void *);
+ 
   /**
    * struct perf_event - performance event kernel representation:
    */
@@@ -588,7 -607,7 +609,7 @@@ struct perf_event 
         u64                             tstamp_running;
         u64                             tstamp_stopped;
   
-       struct perf_event_attr  attr;
+       struct perf_event_attr          attr;
         struct hw_perf_event            hw;
   
         struct perf_event_context       *ctx;
@@@ -641,6 -660,10 +662,10 @@@
         struct event_filter             *filter;
   #endif
   
+       perf_callback_t                 callback;
+ 
+       perf_callback_t                 event_callback;
+ 
   #endif /* CONFIG_PERF_EVENTS */
   };
   
@@@ -714,6 -737,7 +739,6 @@@ struct perf_output_handle 
         int                             nmi;
         int                             sample;
         int                             locked;
- -      unsigned long                   flags;
   };
   
   #ifdef CONFIG_PERF_EVENTS
@@@ -745,6 -769,13 +770,13 @@@ extern int hw_perf_group_sched_in(struc
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx, int cpu);
   extern void perf_event_update_userpage(struct perf_event *event);
+ extern int perf_event_release_kernel(struct perf_event *event);
+ extern struct perf_event *
+ perf_event_create_kernel_counter(struct perf_event_attr *attr,
+                               int cpu,
+                               pid_t pid,
+                               perf_callback_t callback);
+ extern u64 perf_event_read_value(struct perf_event *event);
   
   struct perf_sample_data {
         u64                             type;
@@@ -821,6 -852,7 +853,7 @@@ extern int sysctl_perf_event_sample_rat
   extern void perf_event_init(void);
   extern void perf_tp_event(int event_id, u64 addr, u64 count,
                                  void *record, int entry_size);
+ extern void perf_bp_event(struct perf_event *event, void *data);
   
   #ifndef perf_misc_flags
   #define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \
@@@ -855,6 -887,8 +888,8 @@@ static inline int perf_event_task_enabl
   static inline void
   perf_sw_event(u32 event_id, u64 nr, int nmi,
                      struct pt_regs *regs, u64 addr)                    { }
+ static inline void
+ perf_bp_event(struct perf_event *event, void *data)           { }
   
   static inline void perf_event_mmap(struct vm_area_struct *vma)                { }
   static inline void perf_event_comm(struct task_struct *tsk)           { }
diff --combined kernel/exit.c

index f7864ac2ecc1ad54c0af6b06b6f9d2da4a93f1ac,266f8920628a966ac8abb7944124f08b8fae3921..3f45e3cf931d917fc1dca145be32618afda895cd
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -49,6 -49,7 +49,7 @@@
   #include <linux/init_task.h>
   #include <linux/perf_event.h>
   #include <trace/events/sched.h>
+ #include <linux/hw_breakpoint.h>
   
   #include <asm/uaccess.h>
   #include <asm/unistd.h>
@@@ -359,8 -360,10 +360,8 @@@ void __set_special_pids(struct pid *pid
   {
         struct task_struct *curr = current->group_leader;
   
- -      if (task_session(curr) != pid) {
+ +      if (task_session(curr) != pid)
                 change_pid(curr, PIDTYPE_SID, pid);
- -              proc_sid_connector(curr);
- -      }
   
         if (task_pgrp(curr) != pid)
                 change_pid(curr, PIDTYPE_PGID, pid);
@@@ -977,6 -980,10 +978,10 @@@ NORET_TYPE void do_exit(long code
   
         proc_exit_connector(tsk);
   
+       /*
+        * FIXME: do that only when needed, using sched_exit tracepoint
+        */
+       flush_ptrace_hw_breakpoint(tsk);
         /*
          * Flush inherited counters to the parent - before the parent
          * gets woken up by child-exit notifications.
diff --combined kernel/perf_event.c

index 3256e36ad251f1dc745469d8c572fc6e497c1772,98dc56b2ebe4806d0d8492b87c91e9908f1645ba..3852e2656bb0d36db15c879bac5a8af8822e0592
--- 1/kernel/perf_event.c
--- 2/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@@ -29,6 -29,7 +29,7 @@@
   #include <linux/kernel_stat.h>
   #include <linux/perf_event.h>
   #include <linux/ftrace_event.h>
+ #include <linux/hw_breakpoint.h>
   
   #include <asm/irq_regs.h>
   
@@@ -1356,7 -1357,7 +1357,7 @@@ static void perf_ctx_adjust_freq(struc
         u64 interrupts, freq;
   
         spin_lock(&ctx->lock);
- -      list_for_each_entry(event, &ctx->group_list, group_entry) {
+ +      list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                 if (event->state != PERF_EVENT_STATE_ACTIVE)
                         continue;
   
@@@ -1725,6 -1726,26 +1726,26 @@@ static int perf_release(struct inode *i
         return 0;
   }
   
+ /*
+  * Tear down an event on behalf of an in-kernel user.
+  *
+  * The event is removed from its context under ctx->mutex, unlinked from
+  * its owner's list under the owner's perf_event_mutex, the task reference
+  * held on the owner is dropped, and the event is then freed. This undoes
+  * the owner bookkeeping done by perf_event_create_kernel_counter().
+  *
+  * Warns once if the context has a parent_ctx. Always returns 0.
+  * The event must not be used by the caller after this returns.
+  */
+ int perf_event_release_kernel(struct perf_event *event)
+ {
+       struct perf_event_context *ctx = event->ctx;
+ 
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_event_remove_from_context(event);
+       mutex_unlock(&ctx->mutex);
+ 
+       mutex_lock(&event->owner->perf_event_mutex);
+       list_del_init(&event->owner_entry);
+       mutex_unlock(&event->owner->perf_event_mutex);
+       put_task_struct(event->owner);
+ 
+       free_event(event);
+ 
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+ 
   static int perf_event_read_size(struct perf_event *event)
   {
         int entry = sizeof(u64); /* value */
@@@ -1750,7 -1771,7 +1771,7 @@@
         return size;
   }
   
- static u64 perf_event_read_value(struct perf_event *event)
+ u64 perf_event_read_value(struct perf_event *event)
   {
         struct perf_event *child;
         u64 total = 0;
@@@ -1761,6 -1782,7 +1782,7 @@@
   
         return total;
   }
+ EXPORT_SYMBOL_GPL(perf_event_read_value);
   
   static int perf_event_read_entry(struct perf_event *event,
                                    u64 read_format, char __user *buf)
@@@ -2674,21 -2696,20 +2696,21 @@@ static void perf_output_wakeup(struct p
   static void perf_output_lock(struct perf_output_handle *handle)
   {
         struct perf_mmap_data *data = handle->data;
- -      int cpu;
+ +      int cur, cpu = get_cpu();
   
         handle->locked = 0;
   
- -      local_irq_save(handle->flags);
- -      cpu = smp_processor_id();
- -
- -      if (in_nmi() && atomic_read(&data->lock) == cpu)
- -              return;
+ +      for (;;) {
+ +              cur = atomic_cmpxchg(&data->lock, -1, cpu);
+ +              if (cur == -1) {
+ +                      handle->locked = 1;
+ +                      break;
+ +              }
+ +              if (cur == cpu)
+ +                      break;
   
- -      while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
                 cpu_relax();
- -
- -      handle->locked = 1;
+ +      }
   }
   
   static void perf_output_unlock(struct perf_output_handle *handle)
@@@ -2734,7 -2755,7 +2756,7 @@@ again
         if (atomic_xchg(&data->wakeup, 0))
                 perf_output_wakeup(handle);
   out:
- -      local_irq_restore(handle->flags);
+ +      put_cpu();
   }
   
   void perf_output_copy(struct perf_output_handle *handle,
@@@ -3977,9 -3998,8 +3999,9 @@@ static enum hrtimer_restart perf_sweven
                 regs = task_pt_regs(current);
   
         if (regs) {
- -              if (perf_event_overflow(event, 0, &data, regs))
- -                      ret = HRTIMER_NORESTART;
+ +              if (!(event->attr.exclude_idle && current->pid == 0))
+ +                      if (perf_event_overflow(event, 0, &data, regs))
+ +                              ret = HRTIMER_NORESTART;
         }
   
         period = max_t(u64, 10000, event->hw.sample_period);
@@@ -3988,42 -4008,6 +4010,42 @@@
         return ret;
   }
   
+ +/*
+ + * Initialise the event's hrtimer (CLOCK_MONOTONIC, relative mode) with
+ + * perf_swevent_hrtimer as its handler and, if the event has a sample
+ + * period, arm it.
+ + *
+ + * The first expiry is chosen as follows (values are nanoseconds, as they
+ + * are converted with ns_to_ktime()):
+ + *  - if hwc->remaining is non-zero it is used once and then cleared; a
+ + *    negative remainder is replaced by 10000;
+ + *  - otherwise the sample period is used, with a floor of 10000.
+ + *
+ + * hwc->remaining is what perf_swevent_cancel_hrtimer() saved when the
+ + * timer was last cancelled.
+ + */
+ +static void perf_swevent_start_hrtimer(struct perf_event *event)
+ +{
+ +      struct hw_perf_event *hwc = &event->hw;
+ +
+ +      hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ +      hwc->hrtimer.function = perf_swevent_hrtimer;
+ +      if (hwc->sample_period) {
+ +              u64 period;
+ +
+ +              if (hwc->remaining) {
+ +                      if (hwc->remaining < 0)
+ +                              period = 10000;
+ +                      else
+ +                              period = hwc->remaining;
+ +                      hwc->remaining = 0;
+ +              } else {
+ +                      period = max_t(u64, 10000, hwc->sample_period);
+ +              }
+ +              __hrtimer_start_range_ns(&hwc->hrtimer,
+ +                              ns_to_ktime(period), 0,
+ +                              HRTIMER_MODE_REL, 0);
+ +      }
+ +}
+ +
+ +/*
+ + * Stop the event's hrtimer, if the event has a sample period.
+ + *
+ + * The time left until the next expiry is stored in hwc->remaining (in
+ + * nanoseconds) before the timer is cancelled, so that
+ + * perf_swevent_start_hrtimer() can resume with the leftover interval.
+ + * Events without a sample period never armed the timer and are left
+ + * untouched.
+ + */
+ +static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+ +{
+ +      struct hw_perf_event *hwc = &event->hw;
+ +
+ +      if (hwc->sample_period) {
+ +              ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ +              hwc->remaining = ktime_to_ns(remaining);
+ +
+ +              hrtimer_cancel(&hwc->hrtimer);
+ +      }
+ +}
+ +
   /*
    * Software event: cpu wall time clock
    */
@@@ -4046,14 -4030,22 +4068,14 @@@ static int cpu_clock_perf_event_enable(
         int cpu = raw_smp_processor_id();
   
         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
- -      hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- -      hwc->hrtimer.function = perf_swevent_hrtimer;
- -      if (hwc->sample_period) {
- -              u64 period = max_t(u64, 10000, hwc->sample_period);
- -              __hrtimer_start_range_ns(&hwc->hrtimer,
- -                              ns_to_ktime(period), 0,
- -                              HRTIMER_MODE_REL, 0);
- -      }
+ +      perf_swevent_start_hrtimer(event);
   
         return 0;
   }
   
   static void cpu_clock_perf_event_disable(struct perf_event *event)
   {
- -      if (event->hw.sample_period)
- -              hrtimer_cancel(&event->hw.hrtimer);
+ +      perf_swevent_cancel_hrtimer(event);
         cpu_clock_perf_event_update(event);
   }
   
@@@ -4090,15 -4082,22 +4112,15 @@@ static int task_clock_perf_event_enable
         now = event->ctx->time;
   
         atomic64_set(&hwc->prev_count, now);
- -      hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- -      hwc->hrtimer.function = perf_swevent_hrtimer;
- -      if (hwc->sample_period) {
- -              u64 period = max_t(u64, 10000, hwc->sample_period);
- -              __hrtimer_start_range_ns(&hwc->hrtimer,
- -                              ns_to_ktime(period), 0,
- -                              HRTIMER_MODE_REL, 0);
- -      }
+ +
+ +      perf_swevent_start_hrtimer(event);
   
         return 0;
   }
   
   static void task_clock_perf_event_disable(struct perf_event *event)
   {
- -      if (event->hw.sample_period)
- -              hrtimer_cancel(&event->hw.hrtimer);
+ +      perf_swevent_cancel_hrtimer(event);
         task_clock_perf_event_update(event, event->ctx->time);
   
   }
@@@ -4231,6 -4230,51 +4253,51 @@@ static void perf_event_free_filter(stru
   
   #endif /* CONFIG_EVENT_PROFILE */
   
+ #ifdef CONFIG_HAVE_HW_BREAKPOINT
+ /* ->destroy hook for breakpoint events: give the breakpoint slot back. */
+ static void bp_perf_event_destroy(struct perf_event *event)
+ {
+       release_bp_slot(event);
+ }
+ 
+ /*
+  * Set up a PERF_TYPE_BREAKPOINT event.
+  *
+  * Returns &perf_ops_bp on success, or an ERR_PTR() carrying the error
+  * reported by the registration helper. On success the event's ->destroy
+  * hook is set to bp_perf_event_destroy.
+  */
+ static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+ {
+       int err;
+       /*
+        * The breakpoint is already filled if we haven't created the counter
+        * through perf syscall
+        * FIXME: manage to get triggered to NULL if it comes from syscalls
+        *
+        * The presence of a callback is used to tell the two cases apart:
+        * without one register_perf_hw_breakpoint() is called, with one
+        * __register_perf_hw_breakpoint() is called instead.
+        * NOTE(review): presumably the double-underscore variant skips the
+        * step that fills in the breakpoint -- confirm against its definition.
+        */
+       if (!bp->callback)
+               err = register_perf_hw_breakpoint(bp);
+       else
+               err = __register_perf_hw_breakpoint(bp);
+       if (err)
+               return ERR_PTR(err);
+ 
+       bp->destroy = bp_perf_event_destroy;
+ 
+       return &perf_ops_bp;
+ }
+ 
+ /* Breakpoint hit notification; not implemented yet. */
+ void perf_bp_event(struct perf_event *bp, void *regs)
+ {
+       /* TODO */
+ }
+ #else
+ /*
+  * Stubs used when CONFIG_HAVE_HW_BREAKPOINT is not set: destroy and
+  * perf_bp_event() do nothing and init returns NULL.
+  */
+ static void bp_perf_event_destroy(struct perf_event *event)
+ {
+ }
+ 
+ static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+ {
+       return NULL;
+ }
+ 
+ void perf_bp_event(struct perf_event *bp, void *regs)
+ {
+ }
+ #endif
+ 
   atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
   
   static void sw_perf_event_destroy(struct perf_event *event)
@@@ -4275,8 -4319,6 +4342,8 @@@ static const struct pmu *sw_perf_event_
         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
         case PERF_COUNT_SW_CONTEXT_SWITCHES:
         case PERF_COUNT_SW_CPU_MIGRATIONS:
+ +      case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+ +      case PERF_COUNT_SW_EMULATION_FAULTS:
                 if (!event->parent) {
                         atomic_inc(&perf_swevent_enabled[event_id]);
                         event->destroy = sw_perf_event_destroy;
@@@ -4297,6 -4339,7 +4364,7 @@@ perf_event_alloc(struct perf_event_att
                    struct perf_event_context *ctx,
                    struct perf_event *group_leader,
                    struct perf_event *parent_event,
+                  perf_callback_t callback,
                    gfp_t gfpflags)
   {
         const struct pmu *pmu;
@@@ -4339,6 -4382,11 +4407,11 @@@
   
         event->state            = PERF_EVENT_STATE_INACTIVE;
   
+       if (!callback && parent_event)
+               callback = parent_event->callback;
+       
+       event->callback = callback;
+ 
         if (attr->disabled)
                 event->state = PERF_EVENT_STATE_OFF;
   
@@@ -4373,6 -4421,11 +4446,11 @@@
                 pmu = tp_perf_event_init(event);
                 break;
   
+       case PERF_TYPE_BREAKPOINT:
+               pmu = bp_perf_event_init(event);
+               break;
+ 
+ 
         default:
                 break;
         }
@@@ -4615,7 -4668,7 +4693,7 @@@ SYSCALL_DEFINE5(perf_event_open
         }
   
         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-                                    NULL, GFP_KERNEL);
+                                    NULL, NULL, GFP_KERNEL);
         err = PTR_ERR(event);
         if (IS_ERR(event))
                 goto err_put_context;
@@@ -4663,6 -4716,58 +4741,58 @@@ err_put_context
         return err;
   }
   
+ /**
+  * perf_event_create_kernel_counter
+  *
+  * @attr: attributes of the counter to create
+  * @cpu: cpu in which the counter is bound
+  * @pid: task to profile
+  * @callback: callback handed to perf_event_alloc() for the new event
+  *
+  * Creates an event that is not backed by a file (event->filp is NULL),
+  * installs it in the context found for @pid / @cpu, and records the
+  * calling task (current) as its owner, taking a reference on that task
+  * and linking the event into current's perf_event_list.
+  *
+  * Returns the new event, or NULL on failure.
+  *
+  * NOTE(review): both failure paths collapse to NULL, so the error code
+  * from find_get_context() / perf_event_alloc() is not reported to the
+  * caller. Callers that test the result with IS_ERR() will not see these
+  * failures -- confirm what callers expect.
+  * NOTE(review): the "err < 0" test under err_put_context looks always
+  * true, since that label is only reached when IS_ERR(event) holds.
+  */
+ struct perf_event *
+ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+                                pid_t pid, perf_callback_t callback)
+ {
+       struct perf_event *event;
+       struct perf_event_context *ctx;
+       int err;
+ 
+       /*
+        * Get the target context (task or percpu):
+        */
+ 
+       ctx = find_get_context(pid, cpu);
+       if (IS_ERR(ctx))
+               return NULL;
+ 
+       event = perf_event_alloc(attr, cpu, ctx, NULL,
+                                    NULL, callback, GFP_KERNEL);
+       err = PTR_ERR(event);
+       if (IS_ERR(event))
+               goto err_put_context;
+ 
+       event->filp = NULL;
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_install_in_context(ctx, event, cpu);
+       ++ctx->generation;
+       mutex_unlock(&ctx->mutex);
+ 
+       event->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_event_mutex);
+       list_add_tail(&event->owner_entry, &current->perf_event_list);
+       mutex_unlock(&current->perf_event_mutex);
+ 
+       return event;
+ 
+ err_put_context:
+       if (err < 0)
+               put_ctx(ctx);
+ 
+       return NULL;
+ }
+ EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+ 
   /*
    * inherit a event from parent task to child task:
    */
@@@ -4688,7 -4793,7 +4818,7 @@@ inherit_event(struct perf_event *parent
         child_event = perf_event_alloc(&parent_event->attr,
                                            parent_event->cpu, child_ctx,
                                            group_leader, parent_event,
-                                          GFP_KERNEL);
+                                          NULL, GFP_KERNEL);
         if (IS_ERR(child_event))
                 return child_event;
         get_ctx(child_ctx);
diff --combined kernel/trace/Kconfig

index f05671609a897dba01bd32641e301d418e490849,06c3d5be6759521fae84c32f4f8ec5da57dd38cb..d006554888dc68752a8813e5f6063379c2d98f47
--- 1/kernel/trace/Kconfig
--- 2/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@@ -339,6 -339,27 +339,27 @@@ config POWER_TRACE
           power management decisions, specifically the C-state and P-state
           behavior.
   
+ config KSYM_TRACER
+       bool "Trace read and write access on kernel memory locations"
+       depends on HAVE_HW_BREAKPOINT
+       select TRACING
+       help
+         This tracer helps find read and write operations on any given kernel
+         symbol, i.e. any symbol listed in /proc/kallsyms.
+ 
+ config PROFILE_KSYM_TRACER
+       bool "Profile all kernel memory accesses on 'watched' variables"
+       depends on KSYM_TRACER
+       help
+         This tracer profiles kernel accesses on variables watched through the
+         ksym tracer ftrace plugin. Depending upon the hardware, all read
+         and write operations on kernel variables can be monitored for
+         accesses.
+ 
+         The results will be displayed in:
+         /debugfs/tracing/profile_ksym
+ 
+         Say N if unsure.
   
   config STACK_TRACER
         bool "Trace max stack"
@@@ -428,23 -449,6 +449,23 @@@ config BLK_DEV_IO_TRAC
   
           If unsure, say N.
   
+ +config KPROBE_EVENT
+ +      depends on KPROBES
+ +      depends on X86
+ +      bool "Enable kprobes-based dynamic events"
+ +      select TRACING
+ +      default y
+ +      help
+ +        This allows the user to add tracing events (similar to tracepoints) on the fly
+ +        via the ftrace interface. See Documentation/trace/kprobetrace.txt
+ +        for more details.
+ +
+ +        Those events can be inserted wherever kprobes can probe, and record
+ +        various register and memory values.
+ +
+ +        This option is also required by perf-probe subcommand of perf tools. If
+ +        you want to use perf tools, this option is strongly recommended.
+ +
   config DYNAMIC_FTRACE
         bool "enable/disable ftrace tracepoints dynamically"
         depends on FUNCTION_TRACER
diff --combined kernel/trace/Makefile

index edc3a3cca1a16cbd4199a0b8035550f60f7ba593,0f84c52e58fe641db0e5b4afe5beb1eed49368ba..cd9ecd89ec7714d34f16fd541beabd9ce0e504d2
--- 1/kernel/trace/Makefile
--- 2/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@@ -53,7 -53,7 +53,8 @@@ obj-$(CONFIG_EVENT_TRACING) += trace_ex
   obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
   obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
   obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+ +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+ obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
   obj-$(CONFIG_EVENT_TRACING) += power-traces.o
   
   libftrace-y := ftrace.o
diff --combined kernel/trace/trace.h

index b4e4212e66d7d6905d835c425387aad291d6eb30,ee00475742ebbefd4421c9cf2e7b2c185906be99..4da6ede74401e8ea5c528f91ba895f1a075af59a
--- 1/kernel/trace/trace.h
--- 2/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@@ -11,6 -11,7 +11,7 @@@
   #include <linux/ftrace.h>
   #include <trace/boot.h>
   #include <linux/kmemtrace.h>
+ #include <linux/hw_breakpoint.h>
   
   #include <linux/trace_seq.h>
   #include <linux/ftrace_event.h>
@@@ -37,6 -38,7 +38,7 @@@ enum trace_type 
         TRACE_KMEM_ALLOC,
         TRACE_KMEM_FREE,
         TRACE_BLK,
+       TRACE_KSYM,
   
         __TRACE_LAST_TYPE,
   };
@@@ -101,29 -103,6 +103,29 @@@ struct syscall_trace_exit 
         unsigned long           ret;
   };
   
+ +/*
+ + * Trace record with a variable number of trailing unsigned long
+ + * arguments. args[] is a flexible array member, so the record size
+ + * depends on how many arguments are stored; use
+ + * SIZEOF_KPROBE_TRACE_ENTRY(n) to get the size of a record holding n
+ + * arguments.
+ + * NOTE(review): presumably ip is the probed address and nargs the number
+ + * of valid entries in args[] -- confirm against the code filling it in.
+ + */
+ +struct kprobe_trace_entry {
+ +      struct trace_entry      ent;
+ +      unsigned long           ip;
+ +      int                     nargs;
+ +      unsigned long           args[];
+ +};
+ +
+ +#define SIZEOF_KPROBE_TRACE_ENTRY(n)                  \
+ +      (offsetof(struct kprobe_trace_entry, args) +    \
+ +      (sizeof(unsigned long) * (n)))
+ +
+ +/*
+ + * Return-probe counterpart of struct kprobe_trace_entry: same layout
+ + * idea, but with two address fields (func and ret_ip) ahead of the
+ + * argument count and the flexible args[] array.
+ + * SIZEOF_KRETPROBE_TRACE_ENTRY(n) gives the size of a record holding n
+ + * arguments.
+ + * NOTE(review): presumably func is the probed function and ret_ip the
+ + * address it returns to -- confirm against the code filling it in.
+ + */
+ +struct kretprobe_trace_entry {
+ +      struct trace_entry      ent;
+ +      unsigned long           func;
+ +      unsigned long           ret_ip;
+ +      int                     nargs;
+ +      unsigned long           args[];
+ +};
+ +
+ +#define SIZEOF_KRETPROBE_TRACE_ENTRY(n)                       \
+ +      (offsetof(struct kretprobe_trace_entry, args) + \
+ +      (sizeof(unsigned long) * (n)))
+ +      (sizeof(unsigned long) * (n)))
+ +
   /*
    * trace_flag_type is an enumeration that holds different
    * states when a trace occurs. These are:
@@@ -232,6 -211,7 +234,7 @@@ extern void __ftrace_bad_type(void)
                           TRACE_KMEM_ALLOC);    \
                 IF_ASSIGN(var, ent, struct kmemtrace_free_entry,        \
                           TRACE_KMEM_FREE);     \
+               IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
                 __ftrace_bad_type();                                    \
         } while (0)
   
@@@ -387,6 -367,8 +390,8 @@@ int register_tracer(struct tracer *type
   void unregister_tracer(struct tracer *type);
   int is_tracing_stopped(void);
   
+ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
+ 
   extern unsigned long nsecs_to_usecs(unsigned long nsecs);
   
   #ifdef CONFIG_TRACER_MAX_TRACE
@@@ -461,6 -443,8 +466,8 @@@ extern int trace_selftest_startup_branc
                                          struct trace_array *tr);
   extern int trace_selftest_startup_hw_branches(struct tracer *trace,
                                               struct trace_array *tr);
+ extern int trace_selftest_startup_ksym(struct tracer *trace,
+                                        struct trace_array *tr);
   #endif /* CONFIG_FTRACE_STARTUP_TEST */
   
   extern void *head_page(struct trace_array_cpu *data);
author	Ingo Molnar <mingo@elte.hu>
	Sat, 21 Nov 2009 13:07:23 +0000 (14:07 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Sat, 21 Nov 2009 13:07:23 +0000 (14:07 +0100)
		1	2
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/kprobes.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/ptrace.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/perf_event.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/perf_event.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.h	patch \|	diff1 \|	diff2 \|	blob \| history