trivial: Correct print_tainted routine name in comment
[linux-block.git] / kernel / perf_counter.c
index d7cbc579fc8016603f0d0644c339a470224ad0b9..cc768ab81ac84d3fe81f9742e252f5ffffd348ae 100644 (file)
@@ -46,12 +46,18 @@ static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
- *  0 - not paranoid
- *  1 - disallow cpu counters to unpriv
- *  2 - disallow kernel profiling to unpriv
+ *  -1 - not paranoid at all
+ *   0 - disallow raw tracepoint access for unpriv
+ *   1 - disallow cpu counters for unpriv
+ *   2 - disallow kernel profiling for unpriv
  */
 int sysctl_perf_counter_paranoid __read_mostly = 1;
 
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+       return sysctl_perf_counter_paranoid > -1;
+}
+
 static inline bool perf_paranoid_cpu(void)
 {
        return sysctl_perf_counter_paranoid > 0;
@@ -100,16 +106,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
 
 void __weak perf_counter_print_debug(void)     { }
 
-static DEFINE_PER_CPU(int, disable_count);
+static DEFINE_PER_CPU(int, perf_disable_count);
 
 void __perf_disable(void)
 {
-       __get_cpu_var(disable_count)++;
+       __get_cpu_var(perf_disable_count)++;
 }
 
 bool __perf_enable(void)
 {
-       return !--__get_cpu_var(disable_count);
+       return !--__get_cpu_var(perf_disable_count);
 }
 
 void perf_disable(void)
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
        struct perf_counter_context *ctx = counter->ctx;
        u64 run_end;
 
-       if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+       if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+           counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
                return;
 
        counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
         */
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
                update_context_time(ctx);
-               update_counter_times(counter);
+               update_group_times(counter);
                if (counter == counter->group_leader)
                        group_sched_out(counter, cpuctx, ctx);
                else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
         * in, so we can change the state safely.
         */
        if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-               update_counter_times(counter);
+               update_group_times(counter);
                counter->state = PERF_COUNTER_STATE_OFF;
        }
 
@@ -850,6 +857,27 @@ retry:
        spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+                                       struct perf_counter_context *ctx)
+{
+       struct perf_counter *sub;
+
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
+       counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+       list_for_each_entry(sub, &counter->sibling_list, list_entry)
+               if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+                       sub->tstamp_enabled =
+                               ctx->time - sub->total_time_enabled;
+}
+
 /*
  * Cross CPU call to enable a performance counter
  */
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
 
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                goto unlock;
-       counter->state = PERF_COUNTER_STATE_INACTIVE;
-       counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+       __perf_counter_mark_enabled(counter, ctx);
 
        /*
         * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
-       if (counter->state == PERF_COUNTER_STATE_OFF) {
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               counter->tstamp_enabled =
-                       ctx->time - counter->total_time_enabled;
-       }
+       if (counter->state == PERF_COUNTER_STATE_OFF)
+               __perf_counter_mark_enabled(counter, ctx);
+
  out:
        spin_unlock_irq(&ctx->lock);
 }
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
                counter->attr.enable_on_exec = 0;
                if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                        continue;
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               counter->tstamp_enabled =
-                       ctx->time - counter->total_time_enabled;
+               __perf_counter_mark_enabled(counter, ctx);
                enabled = 1;
        }
 
@@ -1675,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
                        atomic_dec(&nr_task_counters);
        }
 
+       if (counter->output) {
+               fput(counter->output->filp);
+               counter->output = NULL;
+       }
+
        if (counter->destroy)
                counter->destroy(counter);
 
@@ -1960,6 +1988,8 @@ unlock:
        return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct perf_counter *counter = file->private_data;
@@ -1983,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case PERF_COUNTER_IOC_PERIOD:
                return perf_counter_period(counter, (u64 __user *)arg);
 
+       case PERF_COUNTER_IOC_SET_OUTPUT:
+               return perf_counter_set_output(counter, arg);
+
        default:
                return -ENOTTY;
        }
@@ -2143,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
        data->nr_pages = nr_pages;
        atomic_set(&data->lock, -1);
 
+       if (counter->attr.watermark) {
+               data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+                                     counter->attr.wakeup_watermark);
+       }
+       if (!data->watermark)
+               data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
        rcu_assign_pointer(counter->data, data);
 
        return 0;
@@ -2253,6 +2293,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->mmap_mutex);
+       if (counter->output) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
        if (atomic_inc_not_zero(&counter->mmap_count)) {
                if (nr_pages != counter->data->nr_pages)
                        ret = -EINVAL;
@@ -2277,7 +2322,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        lock_limit >>= PAGE_SHIFT;
        locked = vma->vm_mm->locked_vm + extra;
 
-       if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+       if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+               !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }
@@ -2466,35 +2512,15 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 /*
  * Output
  */
-
-struct perf_output_handle {
-       struct perf_counter     *counter;
-       struct perf_mmap_data   *data;
-       unsigned long           head;
-       unsigned long           offset;
-       int                     nmi;
-       int                     sample;
-       int                     locked;
-       unsigned long           flags;
-};
-
-static bool perf_output_space(struct perf_mmap_data *data,
-                             unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+                             unsigned long offset, unsigned long head)
 {
-       unsigned long tail;
        unsigned long mask;
 
        if (!data->writable)
                return true;
 
        mask = (data->nr_pages << PAGE_SHIFT) - 1;
-       /*
-        * Userspace could choose to issue a mb() before updating the tail
-        * pointer. So that all reads will be completed before the write is
-        * issued.
-        */
-       tail = ACCESS_ONCE(data->user_page->data_tail);
-       smp_rmb();
 
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;
@@ -2595,8 +2621,8 @@ out:
        local_irq_restore(handle->flags);
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-                            const void *buf, unsigned int len)
+void perf_output_copy(struct perf_output_handle *handle,
+                     const void *buf, unsigned int len)
 {
        unsigned int pages_mask;
        unsigned int offset;
@@ -2631,15 +2657,13 @@ static void perf_output_copy(struct perf_output_handle *handle,
        WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
-#define perf_output_put(handle, x) \
-       perf_output_copy((handle), &(x), sizeof(x))
-
-static int perf_output_begin(struct perf_output_handle *handle,
-                            struct perf_counter *counter, unsigned int size,
-                            int nmi, int sample)
+int perf_output_begin(struct perf_output_handle *handle,
+                     struct perf_counter *counter, unsigned int size,
+                     int nmi, int sample)
 {
+       struct perf_counter *output_counter;
        struct perf_mmap_data *data;
-       unsigned int offset, head;
+       unsigned long tail, offset, head;
        int have_lost;
        struct {
                struct perf_event_header header;
@@ -2647,13 +2671,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
                u64                      lost;
        } lost_event;
 
+       rcu_read_lock();
        /*
         * For inherited counters we send all the output towards the parent.
         */
        if (counter->parent)
                counter = counter->parent;
 
-       rcu_read_lock();
+       output_counter = rcu_dereference(counter->output);
+       if (output_counter)
+               counter = output_counter;
+
        data = rcu_dereference(counter->data);
        if (!data)
                goto out;
@@ -2673,16 +2701,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
        perf_output_lock(handle);
 
        do {
+               /*
+                * Userspace could choose to issue a mb() before updating the
+                * tail pointer, so that all reads will be completed before the
+                * write is issued.
+                */
+               tail = ACCESS_ONCE(data->user_page->data_tail);
+               smp_rmb();
                offset = head = atomic_long_read(&data->head);
                head += size;
-               if (unlikely(!perf_output_space(data, offset, head)))
+               if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
        } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
        handle->offset  = offset;
        handle->head    = head;
 
-       if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+       if (head - tail > data->watermark)
                atomic_set(&data->wakeup, 1);
 
        if (have_lost) {
@@ -2706,7 +2741,7 @@ out:
        return -ENOSPC;
 }
 
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
 {
        struct perf_counter *counter = handle->counter;
        struct perf_mmap_data *data = handle->data;
@@ -2820,156 +2855,176 @@ static void perf_output_read(struct perf_output_handle *handle,
                perf_output_read_one(handle, counter);
 }
 
-void perf_counter_output(struct perf_counter *counter, int nmi,
-                               struct perf_sample_data *data)
+void perf_output_sample(struct perf_output_handle *handle,
+                       struct perf_event_header *header,
+                       struct perf_sample_data *data,
+                       struct perf_counter *counter)
 {
-       int ret;
-       u64 sample_type = counter->attr.sample_type;
-       struct perf_output_handle handle;
-       struct perf_event_header header;
-       u64 ip;
-       struct {
-               u32 pid, tid;
-       } tid_entry;
-       struct perf_callchain_entry *callchain = NULL;
-       int callchain_size = 0;
-       u64 time;
-       struct {
-               u32 cpu, reserved;
-       } cpu_entry;
-
-       header.type = PERF_EVENT_SAMPLE;
-       header.size = sizeof(header);
-
-       header.misc = 0;
-       header.misc |= perf_misc_flags(data->regs);
-
-       if (sample_type & PERF_SAMPLE_IP) {
-               ip = perf_instruction_pointer(data->regs);
-               header.size += sizeof(ip);
-       }
+       u64 sample_type = data->type;
 
-       if (sample_type & PERF_SAMPLE_TID) {
-               /* namespace issues */
-               tid_entry.pid = perf_counter_pid(counter, current);
-               tid_entry.tid = perf_counter_tid(counter, current);
+       perf_output_put(handle, *header);
 
-               header.size += sizeof(tid_entry);
-       }
+       if (sample_type & PERF_SAMPLE_IP)
+               perf_output_put(handle, data->ip);
 
-       if (sample_type & PERF_SAMPLE_TIME) {
-               /*
-                * Maybe do better on x86 and provide cpu_clock_nmi()
-                */
-               time = sched_clock();
+       if (sample_type & PERF_SAMPLE_TID)
+               perf_output_put(handle, data->tid_entry);
 
-               header.size += sizeof(u64);
-       }
+       if (sample_type & PERF_SAMPLE_TIME)
+               perf_output_put(handle, data->time);
 
        if (sample_type & PERF_SAMPLE_ADDR)
-               header.size += sizeof(u64);
+               perf_output_put(handle, data->addr);
 
        if (sample_type & PERF_SAMPLE_ID)
-               header.size += sizeof(u64);
+               perf_output_put(handle, data->id);
 
        if (sample_type & PERF_SAMPLE_STREAM_ID)
-               header.size += sizeof(u64);
+               perf_output_put(handle, data->stream_id);
 
-       if (sample_type & PERF_SAMPLE_CPU) {
-               header.size += sizeof(cpu_entry);
-
-               cpu_entry.cpu = raw_smp_processor_id();
-               cpu_entry.reserved = 0;
-       }
+       if (sample_type & PERF_SAMPLE_CPU)
+               perf_output_put(handle, data->cpu_entry);
 
        if (sample_type & PERF_SAMPLE_PERIOD)
-               header.size += sizeof(u64);
+               perf_output_put(handle, data->period);
 
        if (sample_type & PERF_SAMPLE_READ)
-               header.size += perf_counter_read_size(counter);
+               perf_output_read(handle, counter);
 
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-               callchain = perf_callchain(data->regs);
+               if (data->callchain) {
+                       int size = 1;
 
-               if (callchain) {
-                       callchain_size = (1 + callchain->nr) * sizeof(u64);
-                       header.size += callchain_size;
-               } else
-                       header.size += sizeof(u64);
+                       if (data->callchain)
+                               size += data->callchain->nr;
+
+                       size *= sizeof(u64);
+
+                       perf_output_copy(handle, data->callchain, size);
+               } else {
+                       u64 nr = 0;
+                       perf_output_put(handle, nr);
+               }
        }
 
        if (sample_type & PERF_SAMPLE_RAW) {
-               int size = sizeof(u32);
+               if (data->raw) {
+                       perf_output_put(handle, data->raw->size);
+                       perf_output_copy(handle, data->raw->data,
+                                        data->raw->size);
+               } else {
+                       struct {
+                               u32     size;
+                               u32     data;
+                       } raw = {
+                               .size = sizeof(u32),
+                               .data = 0,
+                       };
+                       perf_output_put(handle, raw);
+               }
+       }
+}
 
-               if (data->raw)
-                       size += data->raw->size;
-               else
-                       size += sizeof(u32);
+void perf_prepare_sample(struct perf_event_header *header,
+                        struct perf_sample_data *data,
+                        struct perf_counter *counter,
+                        struct pt_regs *regs)
+{
+       u64 sample_type = counter->attr.sample_type;
 
-               WARN_ON_ONCE(size & (sizeof(u64)-1));
-               header.size += size;
-       }
+       data->type = sample_type;
 
-       ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
-       if (ret)
-               return;
+       header->type = PERF_EVENT_SAMPLE;
+       header->size = sizeof(*header);
 
-       perf_output_put(&handle, header);
+       header->misc = 0;
+       header->misc |= perf_misc_flags(regs);
 
-       if (sample_type & PERF_SAMPLE_IP)
-               perf_output_put(&handle, ip);
+       if (sample_type & PERF_SAMPLE_IP) {
+               data->ip = perf_instruction_pointer(regs);
 
-       if (sample_type & PERF_SAMPLE_TID)
-               perf_output_put(&handle, tid_entry);
+               header->size += sizeof(data->ip);
+       }
 
-       if (sample_type & PERF_SAMPLE_TIME)
-               perf_output_put(&handle, time);
+       if (sample_type & PERF_SAMPLE_TID) {
+               /* namespace issues */
+               data->tid_entry.pid = perf_counter_pid(counter, current);
+               data->tid_entry.tid = perf_counter_tid(counter, current);
+
+               header->size += sizeof(data->tid_entry);
+       }
+
+       if (sample_type & PERF_SAMPLE_TIME) {
+               data->time = perf_clock();
+
+               header->size += sizeof(data->time);
+       }
 
        if (sample_type & PERF_SAMPLE_ADDR)
-               perf_output_put(&handle, data->addr);
+               header->size += sizeof(data->addr);
 
        if (sample_type & PERF_SAMPLE_ID) {
-               u64 id = primary_counter_id(counter);
+               data->id = primary_counter_id(counter);
 
-               perf_output_put(&handle, id);
+               header->size += sizeof(data->id);
        }
 
-       if (sample_type & PERF_SAMPLE_STREAM_ID)
-               perf_output_put(&handle, counter->id);
+       if (sample_type & PERF_SAMPLE_STREAM_ID) {
+               data->stream_id = counter->id;
 
-       if (sample_type & PERF_SAMPLE_CPU)
-               perf_output_put(&handle, cpu_entry);
+               header->size += sizeof(data->stream_id);
+       }
+
+       if (sample_type & PERF_SAMPLE_CPU) {
+               data->cpu_entry.cpu             = raw_smp_processor_id();
+               data->cpu_entry.reserved        = 0;
+
+               header->size += sizeof(data->cpu_entry);
+       }
 
        if (sample_type & PERF_SAMPLE_PERIOD)
-               perf_output_put(&handle, data->period);
+               header->size += sizeof(data->period);
 
        if (sample_type & PERF_SAMPLE_READ)
-               perf_output_read(&handle, counter);
+               header->size += perf_counter_read_size(counter);
 
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-               if (callchain)
-                       perf_output_copy(&handle, callchain, callchain_size);
-               else {
-                       u64 nr = 0;
-                       perf_output_put(&handle, nr);
-               }
+               int size = 1;
+
+               data->callchain = perf_callchain(regs);
+
+               if (data->callchain)
+                       size += data->callchain->nr;
+
+               header->size += size * sizeof(u64);
        }
 
        if (sample_type & PERF_SAMPLE_RAW) {
-               if (data->raw) {
-                       perf_output_put(&handle, data->raw->size);
-                       perf_output_copy(&handle, data->raw->data, data->raw->size);
-               } else {
-                       struct {
-                               u32     size;
-                               u32     data;
-                       } raw = {
-                               .size = sizeof(u32),
-                               .data = 0,
-                       };
-                       perf_output_put(&handle, raw);
-               }
+               int size = sizeof(u32);
+
+               if (data->raw)
+                       size += data->raw->size;
+               else
+                       size += sizeof(u32);
+
+               WARN_ON_ONCE(size & (sizeof(u64)-1));
+               header->size += size;
        }
+}
+
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
+{
+       struct perf_output_handle handle;
+       struct perf_event_header header;
+
+       perf_prepare_sample(&header, data, counter, regs);
+
+       if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+               return;
+
+       perf_output_sample(&handle, &header, data, counter);
 
        perf_output_end(&handle);
 }
@@ -3028,6 +3083,7 @@ struct perf_task_event {
                u32                             ppid;
                u32                             tid;
                u32                             ptid;
+               u64                             time;
        } event;
 };
 
@@ -3035,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter,
                                     struct perf_task_event *task_event)
 {
        struct perf_output_handle handle;
-       int size = task_event->event.header.size;
+       int size;
        struct task_struct *task = task_event->task;
-       int ret = perf_output_begin(&handle, counter, size, 0, 0);
+       int ret;
+
+       size  = task_event->event.header.size;
+       ret = perf_output_begin(&handle, counter, size, 0, 0);
 
        if (ret)
                return;
@@ -3048,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
        task_event->event.tid = perf_counter_tid(counter, task);
        task_event->event.ptid = perf_counter_tid(counter, current);
 
+       task_event->event.time = perf_clock();
+
        perf_output_put(&handle, task_event->event);
+
        perf_output_end(&handle);
 }
 
@@ -3430,7 +3492,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
-               .time           = sched_clock(),
+               .time           = perf_clock(),
                .id             = primary_counter_id(counter),
                .stream_id      = counter->id,
        };
@@ -3450,14 +3512,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  * Generic counter overflow handling, sampling.
  */
 
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
-                         struct perf_sample_data *data)
+static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
+                                  int throttle, struct perf_sample_data *data,
+                                  struct pt_regs *regs)
 {
        int events = atomic_read(&counter->event_limit);
-       int throttle = counter->pmu->unthrottle != NULL;
        struct hw_perf_counter *hwc = &counter->hw;
        int ret = 0;
 
+       throttle = (throttle && counter->pmu->unthrottle != NULL);
+
        if (!throttle) {
                hwc->interrupts++;
        } else {
@@ -3480,7 +3544,7 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
        }
 
        if (counter->attr.freq) {
-               u64 now = sched_clock();
+               u64 now = perf_clock();
                s64 delta = now - hwc->freq_stamp;
 
                hwc->freq_stamp = now;
@@ -3506,10 +3570,17 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
                        perf_counter_disable(counter);
        }
 
-       perf_counter_output(counter, nmi, data);
+       perf_counter_output(counter, nmi, data, regs);
        return ret;
 }
 
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+                         struct perf_sample_data *data,
+                         struct pt_regs *regs)
+{
+       return __perf_counter_overflow(counter, nmi, 1, data, regs);
+}
+
 /*
  * Generic software counter infrastructure
  */
@@ -3545,9 +3616,11 @@ again:
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-                                   int nmi, struct perf_sample_data *data)
+                                   int nmi, struct perf_sample_data *data,
+                                   struct pt_regs *regs)
 {
        struct hw_perf_counter *hwc = &counter->hw;
+       int throttle = 0;
        u64 overflow;
 
        data->period = counter->hw.last_period;
@@ -3557,13 +3630,15 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
                return;
 
        for (; overflow; overflow--) {
-               if (perf_counter_overflow(counter, nmi, data)) {
+               if (__perf_counter_overflow(counter, nmi, throttle,
+                                           data, regs)) {
                        /*
                         * We inhibit the overflow from happening when
                         * hwc->interrupts == MAX_INTERRUPTS.
                         */
                        break;
                }
+               throttle = 1;
        }
 }
 
@@ -3575,7 +3650,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-                              int nmi, struct perf_sample_data *data)
+                              int nmi, struct perf_sample_data *data,
+                              struct pt_regs *regs)
 {
        struct hw_perf_counter *hwc = &counter->hw;
 
@@ -3584,11 +3660,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
        if (!hwc->sample_period)
                return;
 
-       if (!data->regs)
+       if (!regs)
                return;
 
        if (!atomic64_add_negative(nr, &hwc->period_left))
-               perf_swcounter_overflow(counter, nmi, data);
+               perf_swcounter_overflow(counter, nmi, data, regs);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3647,7 +3723,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
-                                    struct perf_sample_data *data)
+                                    struct perf_sample_data *data,
+                                    struct pt_regs *regs)
 {
        struct perf_counter *counter;
 
@@ -3656,8 +3733,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_swcounter_match(counter, type, event, data->regs))
-                       perf_swcounter_add(counter, nr, nmi, data);
+               if (perf_swcounter_match(counter, type, event, regs))
+                       perf_swcounter_add(counter, nr, nmi, data, regs);
        }
        rcu_read_unlock();
 }
@@ -3678,7 +3755,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 
 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
                                    u64 nr, int nmi,
-                                   struct perf_sample_data *data)
+                                   struct perf_sample_data *data,
+                                   struct pt_regs *regs)
 {
        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
        int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3691,7 +3769,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
        barrier();
 
        perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-                                nr, nmi, data);
+                                nr, nmi, data, regs);
        rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
@@ -3699,7 +3777,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
         */
        ctx = rcu_dereference(current->perf_counter_ctxp);
        if (ctx)
-               perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+               perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
        rcu_read_unlock();
 
        barrier();
@@ -3713,11 +3791,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
                            struct pt_regs *regs, u64 addr)
 {
        struct perf_sample_data data = {
-               .regs = regs,
                .addr = addr,
        };
 
-       do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+       do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+                               &data, regs);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3754,6 +3832,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
+       struct pt_regs *regs;
        struct perf_counter *counter;
        u64 period;
 
@@ -3761,17 +3840,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
        counter->pmu->read(counter);
 
        data.addr = 0;
-       data.regs = get_irq_regs();
+       regs = get_irq_regs();
        /*
         * In case we exclude kernel IPs or are somehow not in interrupt
         * context, provide the next best thing, the user IP.
         */
-       if ((counter->attr.exclude_kernel || !data.regs) &&
+       if ((counter->attr.exclude_kernel || !regs) &&
                        !counter->attr.exclude_user)
-               data.regs = task_pt_regs(current);
+               regs = task_pt_regs(current);
 
-       if (data.regs) {
-               if (perf_counter_overflow(counter, 0, &data))
+       if (regs) {
+               if (perf_counter_overflow(counter, 0, &data, regs))
                        ret = HRTIMER_NORESTART;
        }
 
@@ -3907,15 +3986,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
        };
 
        struct perf_sample_data data = {
-               .regs = get_irq_regs(),
                .addr = addr,
                .raw = &raw,
        };
 
-       if (!data.regs)
-               data.regs = task_pt_regs(current);
+       struct pt_regs *regs = get_irq_regs();
+
+       if (!regs)
+               regs = task_pt_regs(current);
 
-       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+       do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+                               &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
@@ -3934,6 +4015,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
         * have these.
         */
        if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+                       perf_paranoid_tracepoint_raw() &&
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
@@ -4126,8 +4208,8 @@ done:
 static int perf_copy_attr(struct perf_counter_attr __user *uattr,
                          struct perf_counter_attr *attr)
 {
-       int ret;
        u32 size;
+       int ret;
 
        if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
                return -EFAULT;
@@ -4152,25 +4234,26 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
 
        /*
         * If we're handed a bigger struct than we know of,
-        * ensure all the unknown bits are 0.
+        * ensure all the unknown bits are 0 - i.e. new
+        * user-space does not rely on any kernel feature
+        * extensions we don't know about yet.
         */
        if (size > sizeof(*attr)) {
-               unsigned long val;
-               unsigned long __user *addr;
-               unsigned long __user *end;
+               unsigned char __user *addr;
+               unsigned char __user *end;
+               unsigned char val;
 
-               addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
-                               sizeof(unsigned long));
-               end  = PTR_ALIGN((void __user *)uattr + size,
-                               sizeof(unsigned long));
+               addr = (void __user *)uattr + sizeof(*attr);
+               end  = (void __user *)uattr + size;
 
-               for (; addr < end; addr += sizeof(unsigned long)) {
+               for (; addr < end; addr++) {
                        ret = get_user(val, addr);
                        if (ret)
                                return ret;
                        if (val)
                                goto err_size;
                }
+               size = sizeof(*attr);
        }
 
        ret = copy_from_user(attr, uattr, size);
@@ -4202,6 +4285,57 @@ err_size:
        goto out;
 }
 
+/*
+ * perf_counter_set_output - make @counter use another counter as its output
+ * @counter:   counter whose ->output pointer is updated
+ * @output_fd: file descriptor of the counter to use as output target;
+ *             0 skips the lookup and sets ->output to NULL
+ *
+ * Returns 0 on success, -EBADF when @output_fd cannot be looked up, and
+ * -EINVAL when the file behind @output_fd does not use perf_fops, when
+ * the target counter itself has ->output set, or when @counter already
+ * has ->data.
+ *
+ * On success the target file's f_count is incremented, and fput() is
+ * called on the previous output counter's ->filp (if there was one)
+ * after synchronize_rcu().
+ *
+ * NOTE(review): nothing here rejects output_counter == counter, and
+ * counter->data is tested before mmap_mutex is taken - confirm callers
+ * cannot hit either case.
+ */
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+       struct perf_counter *output_counter = NULL;
+       struct file *output_file = NULL;
+       struct perf_counter *old_output;
+       int fput_needed = 0;
+       int ret = -EINVAL;
+
+       /* fd 0: no target to look up, output_counter stays NULL */
+       if (!output_fd)
+               goto set;
+
+       output_file = fget_light(output_fd, &fput_needed);
+       if (!output_file)
+               return -EBADF;
+
+       /* Only files backed by perf_fops are accepted as a target */
+       if (output_file->f_op != &perf_fops)
+               goto out;
+
+       output_counter = output_file->private_data;
+
+       /* Don't chain output fds */
+       if (output_counter->output)
+               goto out;
+
+       /* Don't set an output fd when we already have an output channel */
+       if (counter->data)
+               goto out;
+
+       /*
+        * Extra file reference on top of the fget_light() one, which is
+        * released again at "out". Presumably this is the reference that
+        * a later call drops via fput(old_output->filp) - TODO confirm
+        * that output_counter->filp is this same file.
+        */
+       atomic_long_inc(&output_file->f_count);
+
+set:
+       /* Swap in the new ->output (possibly NULL) under mmap_mutex */
+       mutex_lock(&counter->mmap_mutex);
+       old_output = counter->output;
+       rcu_assign_pointer(counter->output, output_counter);
+       mutex_unlock(&counter->mmap_mutex);
+
+       if (old_output) {
+               /*
+                * we need to make sure no existing perf_output_*()
+                * is still referencing this counter.
+                */
+               synchronize_rcu();
+               fput(old_output->filp);
+       }
+
+       ret = 0;
+out:
+       /*
+        * Also reached via "set" with output_fd == 0, in which case
+        * output_file is NULL and fput_needed is still 0.
+        */
+       fput_light(output_file, fput_needed);
+       return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -4221,15 +4355,15 @@ SYSCALL_DEFINE5(perf_counter_open,
        struct file *group_file = NULL;
        int fput_needed = 0;
        int fput_needed2 = 0;
-       int ret;
+       int err;
 
        /* for future expandability... */
-       if (flags)
+       if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
                return -EINVAL;
 
-       ret = perf_copy_attr(attr_uptr, &attr);
-       if (ret)
-               return ret;
+       err = perf_copy_attr(attr_uptr, &attr);
+       if (err)
+               return err;
 
        if (!attr.exclude_kernel) {
                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4252,8 +4386,8 @@ SYSCALL_DEFINE5(perf_counter_open,
         * Look up the group leader (we will attach this counter to it):
         */
        group_leader = NULL;
-       if (group_fd != -1) {
-               ret = -EINVAL;
+       if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+               err = -EINVAL;
                group_file = fget_light(group_fd, &fput_needed);
                if (!group_file)
                        goto err_put_context;
@@ -4282,18 +4416,24 @@ SYSCALL_DEFINE5(perf_counter_open,
 
        counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
                                     NULL, GFP_KERNEL);
-       ret = PTR_ERR(counter);
+       err = PTR_ERR(counter);
        if (IS_ERR(counter))
                goto err_put_context;
 
-       ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-       if (ret < 0)
+       err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+       if (err < 0)
                goto err_free_put_context;
 
-       counter_file = fget_light(ret, &fput_needed2);
+       counter_file = fget_light(err, &fput_needed2);
        if (!counter_file)
                goto err_free_put_context;
 
+       if (flags & PERF_FLAG_FD_OUTPUT) {
+               err = perf_counter_set_output(counter, group_fd);
+               if (err)
+                       goto err_fput_free_put_context;
+       }
+
        counter->filp = counter_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
@@ -4307,20 +4447,20 @@ SYSCALL_DEFINE5(perf_counter_open,
        list_add_tail(&counter->owner_entry, &current->perf_counter_list);
        mutex_unlock(&current->perf_counter_mutex);
 
+err_fput_free_put_context:
        fput_light(counter_file, fput_needed2);
 
-out_fput:
-       fput_light(group_file, fput_needed);
-
-       return ret;
-
 err_free_put_context:
-       kfree(counter);
+       if (err < 0)
+               kfree(counter);
 
 err_put_context:
-       put_ctx(ctx);
+       if (err < 0)
+               put_ctx(ctx);
+
+       fput_light(group_file, fput_needed);
 
-       goto out_fput;
+       return err;
 }
 
 /*