Merge branch 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[linux-2.6-block.git] / kernel / events / core.c
index f9ff04c8d084955420ed25ea2be616e33bd91606..026a14541a38223669a021585f0539e9d45f7628 100644 (file)
@@ -2553,6 +2553,9 @@ unlock:
        return ret;
 }
 
+static bool exclusive_event_installable(struct perf_event *event,
+                                       struct perf_event_context *ctx);
+
 /*
  * Attach a performance event to a context.
  *
@@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 
        lockdep_assert_held(&ctx->mutex);
 
+       WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
        if (event->cpu != -1)
                event->cpu = cpu;
 
@@ -2952,6 +2957,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
        if (!ctx->nr_active || !(is_active & EVENT_ALL))
                return;
 
+       /*
+        * If we had been multiplexing, no rotations are necessary now that no
+        * events are active.
+        */
+       ctx->rotate_necessary = 0;
+
        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
                list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3319,10 +3330,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
                return 0;
 
        if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-               if (!group_sched_in(event, sid->cpuctx, sid->ctx))
-                       list_add_tail(&event->active_list, &sid->ctx->flexible_active);
-               else
+               int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+               if (ret) {
                        sid->can_add_hw = 0;
+                       sid->ctx->rotate_necessary = 1;
+                       return 0;
+               }
+               list_add_tail(&event->active_list, &sid->ctx->flexible_active);
        }
 
        return 0;
@@ -3690,24 +3704,17 @@ ctx_first_active(struct perf_event_context *ctx)
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event *cpu_event = NULL, *task_event = NULL;
-       bool cpu_rotate = false, task_rotate = false;
-       struct perf_event_context *ctx = NULL;
+       struct perf_event_context *task_ctx = NULL;
+       int cpu_rotate, task_rotate;
 
        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */
 
-       if (cpuctx->ctx.nr_events) {
-               if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-                       cpu_rotate = true;
-       }
-
-       ctx = cpuctx->task_ctx;
-       if (ctx && ctx->nr_events) {
-               if (ctx->nr_events != ctx->nr_active)
-                       task_rotate = true;
-       }
+       cpu_rotate = cpuctx->ctx.rotate_necessary;
+       task_ctx = cpuctx->task_ctx;
+       task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
 
        if (!(cpu_rotate || task_rotate))
                return false;
@@ -3716,7 +3723,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
        perf_pmu_disable(cpuctx->ctx.pmu);
 
        if (task_rotate)
-               task_event = ctx_first_active(ctx);
+               task_event = ctx_first_active(task_ctx);
        if (cpu_rotate)
                cpu_event = ctx_first_active(&cpuctx->ctx);
 
@@ -3724,17 +3731,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
-       if (task_event || (ctx && cpu_event))
-               ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+       if (task_event || (task_ctx && cpu_event))
+               ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
        if (cpu_event)
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
        if (task_event)
-               rotate_ctx(ctx, task_event);
+               rotate_ctx(task_ctx, task_event);
        if (cpu_event)
                rotate_ctx(&cpuctx->ctx, cpu_event);
 
-       perf_event_sched_in(cpuctx, ctx, current);
+       perf_event_sched_in(cpuctx, task_ctx, current);
 
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4358,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event)
 {
        struct pmu *pmu = event->pmu;
 
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+       if (!is_exclusive_pmu(pmu))
                return 0;
 
        /*
@@ -4389,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event)
 {
        struct pmu *pmu = event->pmu;
 
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+       if (!is_exclusive_pmu(pmu))
                return;
 
        /* see comment in exclusive_event_init() */
@@ -4409,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
        return false;
 }
 
-/* Called under the same ctx::mutex as perf_install_in_context() */
 static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
 {
        struct perf_event *iter_event;
        struct pmu *pmu = event->pmu;
 
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+       lockdep_assert_held(&ctx->mutex);
+
+       if (!is_exclusive_pmu(pmu))
                return true;
 
        list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -4463,12 +4471,20 @@ static void _free_event(struct perf_event *event)
        if (event->destroy)
                event->destroy(event);
 
-       if (event->ctx)
-               put_ctx(event->ctx);
-
+       /*
+        * Must be after ->destroy(), due to uprobe_perf_close() using
+        * hw.target.
+        */
        if (event->hw.target)
                put_task_struct(event->hw.target);
 
+       /*
+        * perf_event_free_task() relies on put_ctx() being 'last', in particular
+        * all task references must be cleaned up.
+        */
+       if (event->ctx)
+               put_ctx(event->ctx);
+
        exclusive_event_destroy(event);
        module_put(event->pmu->module);
 
@@ -4648,8 +4664,17 @@ again:
        mutex_unlock(&event->child_mutex);
 
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+               void *var = &child->ctx->refcount;
+
                list_del(&child->child_list);
                free_event(child);
+
+               /*
+                * Wake any perf_event_free_task() waiting for this event to be
+                * freed.
+                */
+               smp_mb(); /* pairs with wait_var_event() */
+               wake_up_var(var);
        }
 
 no_ctx:
@@ -5005,6 +5030,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
        if (perf_event_check_period(event, value))
                return -EINVAL;
 
+       if (!event->attr.freq && (value & (1ULL << 63)))
+               return -EINVAL;
+
        event_function_call(event, __perf_event_period, &value);
 
        return 0;
@@ -5923,7 +5951,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
-       } else if (current->mm) {
+       } else if (!(current->flags & PF_KTHREAD)) {
                perf_get_regs_user(regs_user, regs, regs_user_copy);
        } else {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8532,9 +8560,9 @@ static int perf_tp_event_match(struct perf_event *event,
        if (event->hw.state & PERF_HES_STOPPED)
                return 0;
        /*
-        * All tracepoints are from kernel-space.
+        * If exclude_kernel, only trace user-space tracepoints (uprobes)
         */
-       if (event->attr.exclude_kernel)
+       if (event->attr.exclude_kernel && !user_mode(regs))
                return 0;
 
        if (!perf_tp_filter_match(event, data))
@@ -9874,6 +9902,12 @@ static int pmu_dev_alloc(struct pmu *pmu)
        if (ret)
                goto del_dev;
 
+       if (pmu->attr_update)
+               ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
+
+       if (ret)
+               goto del_dev;
+
 out:
        return ret;
 
@@ -10033,6 +10067,12 @@ void perf_pmu_unregister(struct pmu *pmu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
+static inline bool has_extended_regs(struct perf_event *event)
+{
+       return !!((event->attr.sample_regs_user | event->attr.sample_regs_intr) &
+                 PERF_REG_EXTENDED_MASK);
+}
+
 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 {
        struct perf_event_context *ctx = NULL;
@@ -10064,12 +10104,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
                perf_event_ctx_unlock(event->group_leader, ctx);
 
        if (!ret) {
+               if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+                   has_extended_regs(event))
+                       ret = -EOPNOTSUPP;
+
                if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
-                               event_has_any_exclude_flag(event)) {
-                       if (event->destroy)
-                               event->destroy(event);
+                   event_has_any_exclude_flag(event))
                        ret = -EINVAL;
-               }
+
+               if (ret && event->destroy)
+                       event->destroy(event);
        }
 
        if (ret)
@@ -10680,11 +10724,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
                break;
 
        case CLOCK_BOOTTIME:
-               event->clock = &ktime_get_boot_ns;
+               event->clock = &ktime_get_boottime_ns;
                break;
 
        case CLOCK_TAI:
-               event->clock = &ktime_get_tai_ns;
+               event->clock = &ktime_get_clocktai_ns;
                break;
 
        default:
@@ -10909,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open,
                goto err_alloc;
        }
 
-       if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
-               err = -EBUSY;
-               goto err_context;
-       }
-
        /*
         * Look up the group leader (we will attach this event to it):
         */
@@ -11001,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open,
                                move_group = 0;
                        }
                }
+
+               /*
+                * Failure to create exclusive events returns -EBUSY.
+                */
+               err = -EBUSY;
+               if (!exclusive_event_installable(group_leader, ctx))
+                       goto err_locked;
+
+               for_each_sibling_event(sibling, group_leader) {
+                       if (!exclusive_event_installable(sibling, ctx))
+                               goto err_locked;
+               }
        } else {
                mutex_lock(&ctx->mutex);
        }
@@ -11037,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open,
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
-               /* exclusive and group stuff are assumed mutually exclusive */
-               WARN_ON_ONCE(move_group);
-
                err = -EBUSY;
                goto err_locked;
        }
@@ -11506,11 +11554,11 @@ static void perf_free_event(struct perf_event *event,
 }
 
 /*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of failure.
  *
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
  */
 void perf_event_free_task(struct task_struct *task)
 {
@@ -11540,7 +11588,23 @@ void perf_event_free_task(struct task_struct *task)
                        perf_free_event(event, ctx);
 
                mutex_unlock(&ctx->mutex);
-               put_ctx(ctx);
+
+               /*
+                * perf_event_release_kernel() could've stolen some of our
+                * child events and still have them on its free_list. In that
+                * case we must wait for these events to have been freed (in
+                * particular all their references to this task must've been
+                * dropped).
+                *
+                * Without this, copy_process() will unconditionally free this
+                * task (irrespective of its reference count) and
+                * _free_event()'s put_task_struct(event->hw.target) will be a
+                * use-after-free.
+                *
+                * Wait for all events to drop their context reference.
+                */
+               wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+               put_ctx(ctx); /* must be last */
        }
 }