kernel/events/core.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Performance events core code:
   4  *
   5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
   8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   9  */
  10
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/cpu.h>
  14 #include <linux/smp.h>
  15 #include <linux/idr.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/slab.h>
  19 #include <linux/hash.h>
  20 #include <linux/tick.h>
  21 #include <linux/sysfs.h>
  22 #include <linux/dcache.h>
  23 #include <linux/percpu.h>
  24 #include <linux/ptrace.h>
  25 #include <linux/reboot.h>
  26 #include <linux/vmstat.h>
  27 #include <linux/device.h>
  28 #include <linux/export.h>
  29 #include <linux/vmalloc.h>
  30 #include <linux/hardirq.h>
  31 #include <linux/hugetlb.h>
  32 #include <linux/rculist.h>
  33 #include <linux/uaccess.h>
  34 #include <linux/syscalls.h>
  35 #include <linux/anon_inodes.h>
  36 #include <linux/kernel_stat.h>
  37 #include <linux/cgroup.h>
  38 #include <linux/perf_event.h>
  39 #include <linux/trace_events.h>
  40 #include <linux/hw_breakpoint.h>
  41 #include <linux/mm_types.h>
  42 #include <linux/module.h>
  43 #include <linux/mman.h>
  44 #include <linux/compat.h>
  45 #include <linux/bpf.h>
  46 #include <linux/filter.h>
  47 #include <linux/namei.h>
  48 #include <linux/parser.h>
  49 #include <linux/sched/clock.h>
  50 #include <linux/sched/mm.h>
  51 #include <linux/proc_ns.h>
  52 #include <linux/mount.h>
  53 #include <linux/min_heap.h>
  54 #include <linux/highmem.h>
  55 #include <linux/pgtable.h>
  56 #include <linux/buildid.h>
  57
  58 #include "internal.h"
  59
  60 #include <asm/irq_regs.h>
  61
  62 typedef int (*remote_function_f)(void *);
  63
  64 struct remote_function_call {
  65         struct task_struct      *p;
  66         remote_function_f       func;
  67         void                    *info;
  68         int                     ret;
  69 };
  70
  71 static void remote_function(void *data)
  72 {
  73         struct remote_function_call *tfc = data;
  74         struct task_struct *p = tfc->p;
  75
  76         if (p) {
  77                 /* -EAGAIN */
  78                 if (task_cpu(p) != smp_processor_id())
  79                         return;
  80
  81                 /*
  82                  * Now that we're on right CPU with IRQs disabled, we can test
  83                  * if we hit the right task without races.
  84                  */
  85
  86                 tfc->ret = -ESRCH; /* No such (running) process */
  87                 if (p != current)
  88                         return;
  89         }
  90
  91         tfc->ret = tfc->func(tfc->info);
  92 }
  93
  94 /**
  95  * task_function_call - call a function on the cpu on which a task runs
  96  * @p:          the task to evaluate
  97  * @func:       the function to be called
  98  * @info:       the function call argument
  99  *
 100  * Calls the function @func when the task is currently running. This might
 101  * be on the current CPU, which just calls the function directly.  This will
 102  * retry due to any failures in smp_call_function_single(), such as if the
 103  * task_cpu() goes offline concurrently.
 104  *
 105  * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 106  */
 107 static int
 108 task_function_call(struct task_struct *p, remote_function_f func, void *info)
 109 {
 110         struct remote_function_call data = {
 111                 .p      = p,
 112                 .func   = func,
 113                 .info   = info,
 114                 .ret    = -EAGAIN,
 115         };
 116         int ret;
 117
 118         for (;;) {
 119                 ret = smp_call_function_single(task_cpu(p), remote_function,
 120                                                &data, 1);
 121                 if (!ret)
 122                         ret = data.ret;
 123
 124                 if (ret != -EAGAIN)
 125                         break;
 126
 127                 cond_resched();
 128         }
 129
 130         return ret;
 131 }
 132
 133 /**
 134  * cpu_function_call - call a function on the cpu
 135  * @cpu:        target cpu to queue this function
 136  * @func:       the function to be called
 137  * @info:       the function call argument
 138  *
 139  * Calls the function @func on the remote cpu.
 140  *
 141  * returns: @func return value or -ENXIO when the cpu is offline
 142  */
 143 static int cpu_function_call(int cpu, remote_function_f func, void *info)
 144 {
 145         struct remote_function_call data = {
 146                 .p      = NULL,
 147                 .func   = func,
 148                 .info   = info,
 149                 .ret    = -ENXIO, /* No such CPU */
 150         };
 151
 152         smp_call_function_single(cpu, remote_function, &data, 1);
 153
 154         return data.ret;
 155 }
 156
 157 static inline struct perf_cpu_context *
 158 __get_cpu_context(struct perf_event_context *ctx)
 159 {
 160         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 161 }
 162
 163 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 164                           struct perf_event_context *ctx)
 165 {
 166         raw_spin_lock(&cpuctx->ctx.lock);
 167         if (ctx)
 168                 raw_spin_lock(&ctx->lock);
 169 }
 170
 171 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 172                             struct perf_event_context *ctx)
 173 {
 174         if (ctx)
 175                 raw_spin_unlock(&ctx->lock);
 176         raw_spin_unlock(&cpuctx->ctx.lock);
 177 }
 178
 179 #define TASK_TOMBSTONE ((void *)-1L)
 180
 181 static bool is_kernel_event(struct perf_event *event)
 182 {
 183         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 184 }
 185
 186 /*
 187  * On task ctx scheduling...
 188  *
 189  * When !ctx->nr_events a task context will not be scheduled. This means
 190  * we can disable the scheduler hooks (for performance) without leaving
 191  * pending task ctx state.
 192  *
 193  * This however results in two special cases:
 194  *
 195  *  - removing the last event from a task ctx; this is relatively straight
 196  *    forward and is done in __perf_remove_from_context.
 197  *
 198  *  - adding the first event to a task ctx; this is tricky because we cannot
 199  *    rely on ctx->is_active and therefore cannot use event_function_call().
 200  *    See perf_install_in_context().
 201  *
 202  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 203  */
 204
 205 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
 206                         struct perf_event_context *, void *);
 207
 208 struct event_function_struct {
 209         struct perf_event *event;
 210         event_f func;
 211         void *data;
 212 };
 213
 214 static int event_function(void *info)
 215 {
 216         struct event_function_struct *efs = info;
 217         struct perf_event *event = efs->event;
 218         struct perf_event_context *ctx = event->ctx;
 219         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 220         struct perf_event_context *task_ctx = cpuctx->task_ctx;
 221         int ret = 0;
 222
 223         lockdep_assert_irqs_disabled();
 224
 225         perf_ctx_lock(cpuctx, task_ctx);
 226         /*
 227          * Since we do the IPI call without holding ctx->lock things can have
 228          * changed, double check we hit the task we set out to hit.
 229          */
 230         if (ctx->task) {
 231                 if (ctx->task != current) {
 232                         ret = -ESRCH;
 233                         goto unlock;
 234                 }
 235
 236                 /*
 237                  * We only use event_function_call() on established contexts,
 238                  * and event_function() is only ever called when active (or
 239                  * rather, we'll have bailed in task_function_call() or the
 240                  * above ctx->task != current test), therefore we must have
 241                  * ctx->is_active here.
 242                  */
 243                 WARN_ON_ONCE(!ctx->is_active);
 244                 /*
 245                  * And since we have ctx->is_active, cpuctx->task_ctx must
 246                  * match.
 247                  */
 248                 WARN_ON_ONCE(task_ctx != ctx);
 249         } else {
 250                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 251         }
 252
 253         efs->func(event, cpuctx, ctx, efs->data);
 254 unlock:
 255         perf_ctx_unlock(cpuctx, task_ctx);
 256
 257         return ret;
 258 }
 259
 260 static void event_function_call(struct perf_event *event, event_f func, void *data)
 261 {
 262         struct perf_event_context *ctx = event->ctx;
 263         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 264         struct event_function_struct efs = {
 265                 .event = event,
 266                 .func = func,
 267                 .data = data,
 268         };
 269
 270         if (!event->parent) {
 271                 /*
 272                  * If this is a !child event, we must hold ctx::mutex to
 273                  * stabilize the event->ctx relation. See
 274                  * perf_event_ctx_lock().
 275                  */
 276                 lockdep_assert_held(&ctx->mutex);
 277         }
 278
 279         if (!task) {
 280                 cpu_function_call(event->cpu, event_function, &efs);
 281                 return;
 282         }
 283
 284         if (task == TASK_TOMBSTONE)
 285                 return;
 286
 287 again:
 288         if (!task_function_call(task, event_function, &efs))
 289                 return;
 290
 291         raw_spin_lock_irq(&ctx->lock);
 292         /*
 293          * Reload the task pointer, it might have been changed by
 294          * a concurrent perf_event_context_sched_out().
 295          */
 296         task = ctx->task;
 297         if (task == TASK_TOMBSTONE) {
 298                 raw_spin_unlock_irq(&ctx->lock);
 299                 return;
 300         }
 301         if (ctx->is_active) {
 302                 raw_spin_unlock_irq(&ctx->lock);
 303                 goto again;
 304         }
 305         func(event, NULL, ctx, data);
 306         raw_spin_unlock_irq(&ctx->lock);
 307 }
 308
 309 /*
 310  * Similar to event_function_call() + event_function(), but hard assumes IRQs
 311  * are already disabled and we're on the right CPU.
 312  */
 313 static void event_function_local(struct perf_event *event, event_f func, void *data)
 314 {
 315         struct perf_event_context *ctx = event->ctx;
 316         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 317         struct task_struct *task = READ_ONCE(ctx->task);
 318         struct perf_event_context *task_ctx = NULL;
 319
 320         lockdep_assert_irqs_disabled();
 321
 322         if (task) {
 323                 if (task == TASK_TOMBSTONE)
 324                         return;
 325
 326                 task_ctx = ctx;
 327         }
 328
 329         perf_ctx_lock(cpuctx, task_ctx);
 330
 331         task = ctx->task;
 332         if (task == TASK_TOMBSTONE)
 333                 goto unlock;
 334
 335         if (task) {
 336                 /*
 337                  * We must be either inactive or active and the right task,
 338                  * otherwise we're screwed, since we cannot IPI to somewhere
 339                  * else.
 340                  */
 341                 if (ctx->is_active) {
 342                         if (WARN_ON_ONCE(task != current))
 343                                 goto unlock;
 344
 345                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
 346                                 goto unlock;
 347                 }
 348         } else {
 349                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 350         }
 351
 352         func(event, cpuctx, ctx, data);
 353 unlock:
 354         perf_ctx_unlock(cpuctx, task_ctx);
 355 }
 356
 357 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 358                        PERF_FLAG_FD_OUTPUT  |\
 359                        PERF_FLAG_PID_CGROUP |\
 360                        PERF_FLAG_FD_CLOEXEC)
 361
 362 /*
 363  * branch priv levels that need permission checks
 364  */
 365 #define PERF_SAMPLE_BRANCH_PERM_PLM \
 366         (PERF_SAMPLE_BRANCH_KERNEL |\
 367          PERF_SAMPLE_BRANCH_HV)
 368
 369 enum event_type_t {
 370         EVENT_FLEXIBLE = 0x1,
 371         EVENT_PINNED = 0x2,
 372         EVENT_TIME = 0x4,
 373         /* see ctx_resched() for details */
 374         EVENT_CPU = 0x8,
 375         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 376 };
 377
 378 /*
 379  * perf_sched_events : >0 events exist
 380  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 381  */
 382
 383 static void perf_sched_delayed(struct work_struct *work);
 384 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
 385 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
 386 static DEFINE_MUTEX(perf_sched_mutex);
 387 static atomic_t perf_sched_count;
 388
 389 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 390 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 391 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 392
 393 static atomic_t nr_mmap_events __read_mostly;
 394 static atomic_t nr_comm_events __read_mostly;
 395 static atomic_t nr_namespaces_events __read_mostly;
 396 static atomic_t nr_task_events __read_mostly;
 397 static atomic_t nr_freq_events __read_mostly;
 398 static atomic_t nr_switch_events __read_mostly;
 399 static atomic_t nr_ksymbol_events __read_mostly;
 400 static atomic_t nr_bpf_events __read_mostly;
 401 static atomic_t nr_cgroup_events __read_mostly;
 402 static atomic_t nr_text_poke_events __read_mostly;
 403 static atomic_t nr_build_id_events __read_mostly;
 404
 405 static LIST_HEAD(pmus);
 406 static DEFINE_MUTEX(pmus_lock);
 407 static struct srcu_struct pmus_srcu;
 408 static cpumask_var_t perf_online_mask;
 409 static struct kmem_cache *perf_event_cache;
 410
 411 /*
 412  * perf event paranoia level:
 413  *  -1 - not paranoid at all
 414  *   0 - disallow raw tracepoint access for unpriv
 415  *   1 - disallow cpu events for unpriv
 416  *   2 - disallow kernel profiling for unpriv
 417  */
 418 int sysctl_perf_event_paranoid __read_mostly = 2;
 419
 420 /* Minimum for 512 kiB + 1 user control page */
 421 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 422
 423 /*
 424  * max perf event sample rate
 425  */
 426 #define DEFAULT_MAX_SAMPLE_RATE         100000
 427 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 428 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
 429
 430 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 431
 432 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 433 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 434
 435 static int perf_sample_allowed_ns __read_mostly =
 436         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 437
 438 static void update_perf_cpu_limits(void)
 439 {
 440         u64 tmp = perf_sample_period_ns;
 441
 442         tmp *= sysctl_perf_cpu_time_max_percent;
 443         tmp = div_u64(tmp, 100);
 444         if (!tmp)
 445                 tmp = 1;
 446
 447         WRITE_ONCE(perf_sample_allowed_ns, tmp);
 448 }
 449
 450 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 451
 452 int perf_proc_update_handler(struct ctl_table *table, int write,
 453                 void *buffer, size_t *lenp, loff_t *ppos)
 454 {
 455         int ret;
 456         int perf_cpu = sysctl_perf_cpu_time_max_percent;
 457         /*
 458          * If throttling is disabled don't allow the write:
 459          */
 460         if (write && (perf_cpu == 100 || perf_cpu == 0))
 461                 return -EINVAL;
 462
 463         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 464         if (ret || !write)
 465                 return ret;
 466
 467         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 468         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 469         update_perf_cpu_limits();
 470
 471         return 0;
 472 }
 473
 474 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 475
 476 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 477                 void *buffer, size_t *lenp, loff_t *ppos)
 478 {
 479         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 480
 481         if (ret || !write)
 482                 return ret;
 483
 484         if (sysctl_perf_cpu_time_max_percent == 100 ||
 485             sysctl_perf_cpu_time_max_percent == 0) {
 486                 printk(KERN_WARNING
 487                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 488                 WRITE_ONCE(perf_sample_allowed_ns, 0);
 489         } else {
 490                 update_perf_cpu_limits();
 491         }
 492
 493         return 0;
 494 }
 495
 496 /*
 497  * perf samples are done in some very critical code paths (NMIs).
 498  * If they take too much CPU time, the system can lock up and not
 499  * get any real work done.  This will drop the sample rate when
 500  * we detect that events are taking too long.
 501  */
 502 #define NR_ACCUMULATED_SAMPLES 128
 503 static DEFINE_PER_CPU(u64, running_sample_length);
 504
 505 static u64 __report_avg;
 506 static u64 __report_allowed;
 507
 508 static void perf_duration_warn(struct irq_work *w)
 509 {
 510         printk_ratelimited(KERN_INFO
 511                 "perf: interrupt took too long (%lld > %lld), lowering "
 512                 "kernel.perf_event_max_sample_rate to %d\n",
 513                 __report_avg, __report_allowed,
 514                 sysctl_perf_event_sample_rate);
 515 }
 516
 517 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 518
 519 void perf_sample_event_took(u64 sample_len_ns)
 520 {
 521         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
 522         u64 running_len;
 523         u64 avg_len;
 524         u32 max;
 525
 526         if (max_len == 0)
 527                 return;
 528
 529         /* Decay the counter by 1 average sample. */
 530         running_len = __this_cpu_read(running_sample_length);
 531         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
 532         running_len += sample_len_ns;
 533         __this_cpu_write(running_sample_length, running_len);
 534
 535         /*
 536          * Note: this will be biased artifically low until we have
 537          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 538          * from having to maintain a count.
 539          */
 540         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
 541         if (avg_len <= max_len)
 542                 return;
 543
 544         __report_avg = avg_len;
 545         __report_allowed = max_len;
 546
 547         /*
 548          * Compute a throttle threshold 25% below the current duration.
 549          */
 550         avg_len += avg_len / 4;
 551         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
 552         if (avg_len < max)
 553                 max /= (u32)avg_len;
 554         else
 555                 max = 1;
 556
 557         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
 558         WRITE_ONCE(max_samples_per_tick, max);
 559
 560         sysctl_perf_event_sample_rate = max * HZ;
 561         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 562
 563         if (!irq_work_queue(&perf_duration_work)) {
 564                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
 565                              "kernel.perf_event_max_sample_rate to %d\n",
 566                              __report_avg, __report_allowed,
 567                              sysctl_perf_event_sample_rate);
 568         }
 569 }
 570
 571 static atomic64_t perf_event_id;
 572
 573 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 574                               enum event_type_t event_type);
 575
 576 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 577                              enum event_type_t event_type);
 578
 579 static void update_context_time(struct perf_event_context *ctx);
 580 static u64 perf_event_time(struct perf_event *event);
 581
 582 void __weak perf_event_print_debug(void)        { }
 583
 584 static inline u64 perf_clock(void)
 585 {
 586         return local_clock();
 587 }
 588
 589 static inline u64 perf_event_clock(struct perf_event *event)
 590 {
 591         return event->clock();
 592 }
 593
 594 /*
 595  * State based event timekeeping...
 596  *
 597  * The basic idea is to use event->state to determine which (if any) time
 598  * fields to increment with the current delta. This means we only need to
 599  * update timestamps when we change state or when they are explicitly requested
 600  * (read).
 601  *
 602  * Event groups make things a little more complicated, but not terribly so. The
 603  * rules for a group are that if the group leader is OFF the entire group is
 604  * OFF, irrespecive of what the group member states are. This results in
 605  * __perf_effective_state().
 606  *
 607  * A futher ramification is that when a group leader flips between OFF and
 608  * !OFF, we need to update all group member times.
 609  *
 610  *
 611  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 612  * need to make sure the relevant context time is updated before we try and
 613  * update our timestamps.
 614  */
 615
 616 static __always_inline enum perf_event_state
 617 __perf_effective_state(struct perf_event *event)
 618 {
 619         struct perf_event *leader = event->group_leader;
 620
 621         if (leader->state <= PERF_EVENT_STATE_OFF)
 622                 return leader->state;
 623
 624         return event->state;
 625 }
 626
 627 static __always_inline void
 628 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
 629 {
 630         enum perf_event_state state = __perf_effective_state(event);
 631         u64 delta = now - event->tstamp;
 632
 633         *enabled = event->total_time_enabled;
 634         if (state >= PERF_EVENT_STATE_INACTIVE)
 635                 *enabled += delta;
 636
 637         *running = event->total_time_running;
 638         if (state >= PERF_EVENT_STATE_ACTIVE)
 639                 *running += delta;
 640 }
 641
 642 static void perf_event_update_time(struct perf_event *event)
 643 {
 644         u64 now = perf_event_time(event);
 645
 646         __perf_update_times(event, now, &event->total_time_enabled,
 647                                         &event->total_time_running);
 648         event->tstamp = now;
 649 }
 650
 651 static void perf_event_update_sibling_time(struct perf_event *leader)
 652 {
 653         struct perf_event *sibling;
 654
 655         for_each_sibling_event(sibling, leader)
 656                 perf_event_update_time(sibling);
 657 }
 658
 659 static void
 660 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
 661 {
 662         if (event->state == state)
 663                 return;
 664
 665         perf_event_update_time(event);
 666         /*
 667          * If a group leader gets enabled/disabled all its siblings
 668          * are affected too.
 669          */
 670         if ((event->state < 0) ^ (state < 0))
 671                 perf_event_update_sibling_time(event);
 672
 673         WRITE_ONCE(event->state, state);
 674 }
 675
 676 /*
 677  * UP store-release, load-acquire
 678  */
 679
 680 #define __store_release(ptr, val)                                       \
 681 do {                                                                    \
 682         barrier();                                                      \
 683         WRITE_ONCE(*(ptr), (val));                                      \
 684 } while (0)
 685
 686 #define __load_acquire(ptr)                                             \
 687 ({                                                                      \
 688         __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
 689         barrier();                                                      \
 690         ___p;                                                           \
 691 })
 692
 693 #ifdef CONFIG_CGROUP_PERF
 694
 695 static inline bool
 696 perf_cgroup_match(struct perf_event *event)
 697 {
 698         struct perf_event_context *ctx = event->ctx;
 699         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 700
 701         /* @event doesn't care about cgroup */
 702         if (!event->cgrp)
 703                 return true;
 704
 705         /* wants specific cgroup scope but @cpuctx isn't associated with any */
 706         if (!cpuctx->cgrp)
 707                 return false;
 708
 709         /*
 710          * Cgroup scoping is recursive.  An event enabled for a cgroup is
 711          * also enabled for all its descendant cgroups.  If @cpuctx's
 712          * cgroup is a descendant of @event's (the test covers identity
 713          * case), it's a match.
 714          */
 715         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 716                                     event->cgrp->css.cgroup);
 717 }
 718
 719 static inline void perf_detach_cgroup(struct perf_event *event)
 720 {
 721         css_put(&event->cgrp->css);
 722         event->cgrp = NULL;
 723 }
 724
 725 static inline int is_cgroup_event(struct perf_event *event)
 726 {
 727         return event->cgrp != NULL;
 728 }
 729
 730 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 731 {
 732         struct perf_cgroup_info *t;
 733
 734         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 735         return t->time;
 736 }
 737
 738 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
 739 {
 740         struct perf_cgroup_info *t;
 741
 742         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 743         if (!__load_acquire(&t->active))
 744                 return t->time;
 745         now += READ_ONCE(t->timeoffset);
 746         return now;
 747 }
 748
 749 static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
 750 {
 751         if (adv)
 752                 info->time += now - info->timestamp;
 753         info->timestamp = now;
 754         /*
 755          * see update_context_time()
 756          */
 757         WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
 758 }
 759
 760 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
 761 {
 762         struct perf_cgroup *cgrp = cpuctx->cgrp;
 763         struct cgroup_subsys_state *css;
 764         struct perf_cgroup_info *info;
 765
 766         if (cgrp) {
 767                 u64 now = perf_clock();
 768
 769                 for (css = &cgrp->css; css; css = css->parent) {
 770                         cgrp = container_of(css, struct perf_cgroup, css);
 771                         info = this_cpu_ptr(cgrp->info);
 772
 773                         __update_cgrp_time(info, now, true);
 774                         if (final)
 775                                 __store_release(&info->active, 0);
 776                 }
 777         }
 778 }
 779
 780 static inline void update_cgrp_time_from_event(struct perf_event *event)
 781 {
 782         struct perf_cgroup_info *info;
 783         struct perf_cgroup *cgrp;
 784
 785         /*
 786          * ensure we access cgroup data only when needed and
 787          * when we know the cgroup is pinned (css_get)
 788          */
 789         if (!is_cgroup_event(event))
 790                 return;
 791
 792         cgrp = perf_cgroup_from_task(current, event->ctx);
 793         /*
 794          * Do not update time when cgroup is not active
 795          */
 796         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
 797                 info = this_cpu_ptr(event->cgrp->info);
 798                 __update_cgrp_time(info, perf_clock(), true);
 799         }
 800 }
 801
 802 static inline void
 803 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 804 {
 805         struct perf_event_context *ctx = &cpuctx->ctx;
 806         struct perf_cgroup *cgrp = cpuctx->cgrp;
 807         struct perf_cgroup_info *info;
 808         struct cgroup_subsys_state *css;
 809
 810         /*
 811          * ctx->lock held by caller
 812          * ensure we do not access cgroup data
 813          * unless we have the cgroup pinned (css_get)
 814          */
 815         if (!cgrp)
 816                 return;
 817
 818         WARN_ON_ONCE(!ctx->nr_cgroups);
 819
 820         for (css = &cgrp->css; css; css = css->parent) {
 821                 cgrp = container_of(css, struct perf_cgroup, css);
 822                 info = this_cpu_ptr(cgrp->info);
 823                 __update_cgrp_time(info, ctx->timestamp, false);
 824                 __store_release(&info->active, 1);
 825         }
 826 }
 827
 828 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 829
 830 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 831 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 832
 833 /*
 834  * reschedule events based on the cgroup constraint of task.
 835  *
 836  * mode SWOUT : schedule out everything
 837  * mode SWIN : schedule in based on cgroup for next
 838  */
 839 static void perf_cgroup_switch(struct task_struct *task, int mode)
 840 {
 841         struct perf_cpu_context *cpuctx, *tmp;
 842         struct list_head *list;
 843         unsigned long flags;
 844
 845         /*
 846          * Disable interrupts and preemption to avoid this CPU's
 847          * cgrp_cpuctx_entry to change under us.
 848          */
 849         local_irq_save(flags);
 850
 851         list = this_cpu_ptr(&cgrp_cpuctx_list);
 852         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
 853                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 854
 855                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 856                 perf_pmu_disable(cpuctx->ctx.pmu);
 857
 858                 if (mode & PERF_CGROUP_SWOUT) {
 859                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 860                         /*
 861                          * must not be done before ctxswout due
 862                          * to event_filter_match() in event_sched_out()
 863                          */
 864                         cpuctx->cgrp = NULL;
 865                 }
 866
 867                 if (mode & PERF_CGROUP_SWIN) {
 868                         WARN_ON_ONCE(cpuctx->cgrp);
 869                         /*
 870                          * set cgrp before ctxsw in to allow
 871                          * perf_cgroup_set_timestamp() in ctx_sched_in()
 872                          * to not have to pass task around
 873                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
 874                          * because cgorup events are only per-cpu
 875                          */
 876                         cpuctx->cgrp = perf_cgroup_from_task(task,
 877                                                              &cpuctx->ctx);
 878                         cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 879                 }
 880                 perf_pmu_enable(cpuctx->ctx.pmu);
 881                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 882         }
 883
 884         local_irq_restore(flags);
 885 }
 886
 887 static inline void perf_cgroup_sched_out(struct task_struct *task,
 888                                          struct task_struct *next)
 889 {
 890         struct perf_cgroup *cgrp1;
 891         struct perf_cgroup *cgrp2 = NULL;
 892
 893         rcu_read_lock();
 894         /*
 895          * we come here when we know perf_cgroup_events > 0
 896          * we do not need to pass the ctx here because we know
 897          * we are holding the rcu lock
 898          */
 899         cgrp1 = perf_cgroup_from_task(task, NULL);
 900         cgrp2 = perf_cgroup_from_task(next, NULL);
 901
 902         /*
 903          * only schedule out current cgroup events if we know
 904          * that we are switching to a different cgroup. Otherwise,
 905          * do no touch the cgroup events.
 906          */
 907         if (cgrp1 != cgrp2)
 908                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 909
 910         rcu_read_unlock();
 911 }
 912
 913 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 914                                         struct task_struct *task)
 915 {
 916         struct perf_cgroup *cgrp1;
 917         struct perf_cgroup *cgrp2 = NULL;
 918
 919         rcu_read_lock();
 920         /*
 921          * we come here when we know perf_cgroup_events > 0
 922          * we do not need to pass the ctx here because we know
 923          * we are holding the rcu lock
 924          */
 925         cgrp1 = perf_cgroup_from_task(task, NULL);
 926         cgrp2 = perf_cgroup_from_task(prev, NULL);
 927
 928         /*
 929          * only need to schedule in cgroup events if we are changing
 930          * cgroup during ctxsw. Cgroup events were not scheduled
 931          * out of ctxsw out if that was not the case.
 932          */
 933         if (cgrp1 != cgrp2)
 934                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 935
 936         rcu_read_unlock();
 937 }
 938
 939 static int perf_cgroup_ensure_storage(struct perf_event *event,
 940                                 struct cgroup_subsys_state *css)
 941 {
 942         struct perf_cpu_context *cpuctx;
 943         struct perf_event **storage;
 944         int cpu, heap_size, ret = 0;
 945
 946         /*
 947          * Allow storage to have sufficent space for an iterator for each
 948          * possibly nested cgroup plus an iterator for events with no cgroup.
 949          */
 950         for (heap_size = 1; css; css = css->parent)
 951                 heap_size++;
 952
 953         for_each_possible_cpu(cpu) {
 954                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
 955                 if (heap_size <= cpuctx->heap_size)
 956                         continue;
 957
 958                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
 959                                        GFP_KERNEL, cpu_to_node(cpu));
 960                 if (!storage) {
 961                         ret = -ENOMEM;
 962                         break;
 963                 }
 964
 965                 raw_spin_lock_irq(&cpuctx->ctx.lock);
 966                 if (cpuctx->heap_size < heap_size) {
 967                         swap(cpuctx->heap, storage);
 968                         if (storage == cpuctx->heap_default)
 969                                 storage = NULL;
 970                         cpuctx->heap_size = heap_size;
 971                 }
 972                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
 973
 974                 kfree(storage);
 975         }
 976
 977         return ret;
 978 }
 979
 980 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 981                                       struct perf_event_attr *attr,
 982                                       struct perf_event *group_leader)
 983 {
 984         struct perf_cgroup *cgrp;
 985         struct cgroup_subsys_state *css;
 986         struct fd f = fdget(fd);
 987         int ret = 0;
 988
 989         if (!f.file)
 990                 return -EBADF;
 991
 992         css = css_tryget_online_from_dir(f.file->f_path.dentry,
 993                                          &perf_event_cgrp_subsys);
 994         if (IS_ERR(css)) {
 995                 ret = PTR_ERR(css);
 996                 goto out;
 997         }
 998
 999         ret = perf_cgroup_ensure_storage(event, css);
1000         if (ret)
1001                 goto out;
1002
1003         cgrp = container_of(css, struct perf_cgroup, css);
1004         event->cgrp = cgrp;
1005
1006         /*
1007          * all events in a group must monitor
1008          * the same cgroup because a task belongs
1009          * to only one perf cgroup at a time
1010          */
1011         if (group_leader && group_leader->cgrp != cgrp) {
1012                 perf_detach_cgroup(event);
1013                 ret = -EINVAL;
1014         }
1015 out:
1016         fdput(f);
1017         return ret;
1018 }
1019
1020 static inline void
1021 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1022 {
1023         struct perf_cpu_context *cpuctx;
1024
1025         if (!is_cgroup_event(event))
1026                 return;
1027
1028         /*
1029          * Because cgroup events are always per-cpu events,
1030          * @ctx == &cpuctx->ctx.
1031          */
1032         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1033
1034         /*
1035          * Since setting cpuctx->cgrp is conditional on the current @cgrp
1036          * matching the event's cgroup, we must do this for every new event,
1037          * because if the first would mismatch, the second would not try again
1038          * and we would leave cpuctx->cgrp unset.
1039          */
1040         if (ctx->is_active && !cpuctx->cgrp) {
1041                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1042
1043                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1044                         cpuctx->cgrp = cgrp;
1045         }
1046
1047         if (ctx->nr_cgroups++)
1048                 return;
1049
1050         list_add(&cpuctx->cgrp_cpuctx_entry,
1051                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1052 }
1053
1054 static inline void
1055 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1056 {
1057         struct perf_cpu_context *cpuctx;
1058
1059         if (!is_cgroup_event(event))
1060                 return;
1061
1062         /*
1063          * Because cgroup events are always per-cpu events,
1064          * @ctx == &cpuctx->ctx.
1065          */
1066         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1067
1068         if (--ctx->nr_cgroups)
1069                 return;
1070
1071         if (ctx->is_active && cpuctx->cgrp)
1072                 cpuctx->cgrp = NULL;
1073
1074         list_del(&cpuctx->cgrp_cpuctx_entry);
1075 }
1076
1077 #else /* !CONFIG_CGROUP_PERF */
1078
1079 static inline bool
1080 perf_cgroup_match(struct perf_event *event)
1081 {
1082         return true;
1083 }
1084
1085 static inline void perf_detach_cgroup(struct perf_event *event)
1086 {}
1087
1088 static inline int is_cgroup_event(struct perf_event *event)
1089 {
1090         return 0;
1091 }
1092
1093 static inline void update_cgrp_time_from_event(struct perf_event *event)
1094 {
1095 }
1096
1097 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
1098                                                 bool final)
1099 {
1100 }
1101
1102 static inline void perf_cgroup_sched_out(struct task_struct *task,
1103                                          struct task_struct *next)
1104 {
1105 }
1106
1107 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1108                                         struct task_struct *task)
1109 {
1110 }
1111
1112 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1113                                       struct perf_event_attr *attr,
1114                                       struct perf_event *group_leader)
1115 {
1116         return -EINVAL;
1117 }
1118
1119 static inline void
1120 perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
1121 {
1122 }
1123
1124 static inline void
1125 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1126 {
1127 }
1128
1129 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1130 {
1131         return 0;
1132 }
1133
1134 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
1135 {
1136         return 0;
1137 }
1138
1139 static inline void
1140 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1141 {
1142 }
1143
1144 static inline void
1145 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1146 {
1147 }
1148 #endif
1149
1150 /*
1151  * set default to be dependent on timer tick just
1152  * like original code
1153  */
1154 #define PERF_CPU_HRTIMER (1000 / HZ)
1155 /*
1156  * function must be called with interrupts disabled
1157  */
1158 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1159 {
1160         struct perf_cpu_context *cpuctx;
1161         bool rotations;
1162
1163         lockdep_assert_irqs_disabled();
1164
1165         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1166         rotations = perf_rotate_context(cpuctx);
1167
1168         raw_spin_lock(&cpuctx->hrtimer_lock);
1169         if (rotations)
1170                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1171         else
1172                 cpuctx->hrtimer_active = 0;
1173         raw_spin_unlock(&cpuctx->hrtimer_lock);
1174
1175         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1176 }
1177
1178 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1179 {
1180         struct hrtimer *timer = &cpuctx->hrtimer;
1181         struct pmu *pmu = cpuctx->ctx.pmu;
1182         u64 interval;
1183
1184         /* no multiplexing needed for SW PMU */
1185         if (pmu->task_ctx_nr == perf_sw_context)
1186                 return;
1187
1188         /*
1189          * check default is sane, if not set then force to
1190          * default interval (1/tick)
1191          */
1192         interval = pmu->hrtimer_interval_ms;
1193         if (interval < 1)
1194                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1195
1196         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1197
1198         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1199         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1200         timer->function = perf_mux_hrtimer_handler;
1201 }
1202
1203 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1204 {
1205         struct hrtimer *timer = &cpuctx->hrtimer;
1206         struct pmu *pmu = cpuctx->ctx.pmu;
1207         unsigned long flags;
1208
1209         /* not for SW PMU */
1210         if (pmu->task_ctx_nr == perf_sw_context)
1211                 return 0;
1212
1213         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1214         if (!cpuctx->hrtimer_active) {
1215                 cpuctx->hrtimer_active = 1;
1216                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1217                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1218         }
1219         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1220
1221         return 0;
1222 }
1223
1224 void perf_pmu_disable(struct pmu *pmu)
1225 {
1226         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1227         if (!(*count)++)
1228                 pmu->pmu_disable(pmu);
1229 }
1230
1231 void perf_pmu_enable(struct pmu *pmu)
1232 {
1233         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1234         if (!--(*count))
1235                 pmu->pmu_enable(pmu);
1236 }
1237
1238 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1239
1240 /*
1241  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1242  * perf_event_task_tick() are fully serialized because they're strictly cpu
1243  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1244  * disabled, while perf_event_task_tick is called from IRQ context.
1245  */
1246 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1247 {
1248         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1249
1250         lockdep_assert_irqs_disabled();
1251
1252         WARN_ON(!list_empty(&ctx->active_ctx_list));
1253
1254         list_add(&ctx->active_ctx_list, head);
1255 }
1256
1257 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1258 {
1259         lockdep_assert_irqs_disabled();
1260
1261         WARN_ON(list_empty(&ctx->active_ctx_list));
1262
1263         list_del_init(&ctx->active_ctx_list);
1264 }
1265
1266 static void get_ctx(struct perf_event_context *ctx)
1267 {
1268         refcount_inc(&ctx->refcount);
1269 }
1270
1271 static void *alloc_task_ctx_data(struct pmu *pmu)
1272 {
1273         if (pmu->task_ctx_cache)
1274                 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1275
1276         return NULL;
1277 }
1278
1279 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1280 {
1281         if (pmu->task_ctx_cache && task_ctx_data)
1282                 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1283 }
1284
1285 static void free_ctx(struct rcu_head *head)
1286 {
1287         struct perf_event_context *ctx;
1288
1289         ctx = container_of(head, struct perf_event_context, rcu_head);
1290         free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1291         kfree(ctx);
1292 }
1293
1294 static void put_ctx(struct perf_event_context *ctx)
1295 {
1296         if (refcount_dec_and_test(&ctx->refcount)) {
1297                 if (ctx->parent_ctx)
1298                         put_ctx(ctx->parent_ctx);
1299                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1300                         put_task_struct(ctx->task);
1301                 call_rcu(&ctx->rcu_head, free_ctx);
1302         }
1303 }
1304
1305 /*
1306  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1307  * perf_pmu_migrate_context() we need some magic.
1308  *
1309  * Those places that change perf_event::ctx will hold both
1310  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1311  *
1312  * Lock ordering is by mutex address. There are two other sites where
1313  * perf_event_context::mutex nests and those are:
1314  *
1315  *  - perf_event_exit_task_context()    [ child , 0 ]
1316  *      perf_event_exit_event()
1317  *        put_event()                   [ parent, 1 ]
1318  *
1319  *  - perf_event_init_context()         [ parent, 0 ]
1320  *      inherit_task_group()
1321  *        inherit_group()
1322  *          inherit_event()
1323  *            perf_event_alloc()
1324  *              perf_init_event()
1325  *                perf_try_init_event() [ child , 1 ]
1326  *
1327  * While it appears there is an obvious deadlock here -- the parent and child
1328  * nesting levels are inverted between the two. This is in fact safe because
1329  * life-time rules separate them. That is an exiting task cannot fork, and a
1330  * spawning task cannot (yet) exit.
1331  *
1332  * But remember that these are parent<->child context relations, and
1333  * migration does not affect children, therefore these two orderings should not
1334  * interact.
1335  *
1336  * The change in perf_event::ctx does not affect children (as claimed above)
1337  * because the sys_perf_event_open() case will install a new event and break
1338  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1339  * concerned with cpuctx and that doesn't have children.
1340  *
1341  * The places that change perf_event::ctx will issue:
1342  *
1343  *   perf_remove_from_context();
1344  *   synchronize_rcu();
1345  *   perf_install_in_context();
1346  *
1347  * to affect the change. The remove_from_context() + synchronize_rcu() should
1348  * quiesce the event, after which we can install it in the new location. This
1349  * means that only external vectors (perf_fops, prctl) can perturb the event
1350  * while in transit. Therefore all such accessors should also acquire
1351  * perf_event_context::mutex to serialize against this.
1352  *
1353  * However; because event->ctx can change while we're waiting to acquire
1354  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1355  * function.
1356  *
1357  * Lock order:
1358  *    exec_update_lock
1359  *      task_struct::perf_event_mutex
1360  *        perf_event_context::mutex
1361  *          perf_event::child_mutex;
1362  *            perf_event_context::lock
1363  *          perf_event::mmap_mutex
1364  *          mmap_lock
1365  *            perf_addr_filters_head::lock
1366  *
1367  *    cpu_hotplug_lock
1368  *      pmus_lock
1369  *        cpuctx->mutex / perf_event_context::mutex
1370  */
1371 static struct perf_event_context *
1372 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1373 {
1374         struct perf_event_context *ctx;
1375
1376 again:
1377         rcu_read_lock();
1378         ctx = READ_ONCE(event->ctx);
1379         if (!refcount_inc_not_zero(&ctx->refcount)) {
1380                 rcu_read_unlock();
1381                 goto again;
1382         }
1383         rcu_read_unlock();
1384
1385         mutex_lock_nested(&ctx->mutex, nesting);
1386         if (event->ctx != ctx) {
1387                 mutex_unlock(&ctx->mutex);
1388                 put_ctx(ctx);
1389                 goto again;
1390         }
1391
1392         return ctx;
1393 }
1394
1395 static inline struct perf_event_context *
1396 perf_event_ctx_lock(struct perf_event *event)
1397 {
1398         return perf_event_ctx_lock_nested(event, 0);
1399 }
1400
1401 static void perf_event_ctx_unlock(struct perf_event *event,
1402                                   struct perf_event_context *ctx)
1403 {
1404         mutex_unlock(&ctx->mutex);
1405         put_ctx(ctx);
1406 }
1407
1408 /*
1409  * This must be done under the ctx->lock, such as to serialize against
1410  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1411  * calling scheduler related locks and ctx->lock nests inside those.
1412  */
1413 static __must_check struct perf_event_context *
1414 unclone_ctx(struct perf_event_context *ctx)
1415 {
1416         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1417
1418         lockdep_assert_held(&ctx->lock);
1419
1420         if (parent_ctx)
1421                 ctx->parent_ctx = NULL;
1422         ctx->generation++;
1423
1424         return parent_ctx;
1425 }
1426
1427 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1428                                 enum pid_type type)
1429 {
1430         u32 nr;
1431         /*
1432          * only top level events have the pid namespace they were created in
1433          */
1434         if (event->parent)
1435                 event = event->parent;
1436
1437         nr = __task_pid_nr_ns(p, type, event->ns);
1438         /* avoid -1 if it is idle thread or runs in another ns */
1439         if (!nr && !pid_alive(p))
1440                 nr = -1;
1441         return nr;
1442 }
1443
1444 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1445 {
1446         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1447 }
1448
1449 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1450 {
1451         return perf_event_pid_type(event, p, PIDTYPE_PID);
1452 }
1453
1454 /*
1455  * If we inherit events we want to return the parent event id
1456  * to userspace.
1457  */
1458 static u64 primary_event_id(struct perf_event *event)
1459 {
1460         u64 id = event->id;
1461
1462         if (event->parent)
1463                 id = event->parent->id;
1464
1465         return id;
1466 }
1467
1468 /*
1469  * Get the perf_event_context for a task and lock it.
1470  *
1471  * This has to cope with the fact that until it is locked,
1472  * the context could get moved to another task.
1473  */
1474 static struct perf_event_context *
1475 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1476 {
1477         struct perf_event_context *ctx;
1478
1479 retry:
1480         /*
1481          * One of the few rules of preemptible RCU is that one cannot do
1482          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1483          * part of the read side critical section was irqs-enabled -- see
1484          * rcu_read_unlock_special().
1485          *
1486          * Since ctx->lock nests under rq->lock we must ensure the entire read
1487          * side critical section has interrupts disabled.
1488          */
1489         local_irq_save(*flags);
1490         rcu_read_lock();
1491         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1492         if (ctx) {
1493                 /*
1494                  * If this context is a clone of another, it might
1495                  * get swapped for another underneath us by
1496                  * perf_event_task_sched_out, though the
1497                  * rcu_read_lock() protects us from any context
1498                  * getting freed.  Lock the context and check if it
1499                  * got swapped before we could get the lock, and retry
1500                  * if so.  If we locked the right context, then it
1501                  * can't get swapped on us any more.
1502                  */
1503                 raw_spin_lock(&ctx->lock);
1504                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1505                         raw_spin_unlock(&ctx->lock);
1506                         rcu_read_unlock();
1507                         local_irq_restore(*flags);
1508                         goto retry;
1509                 }
1510
1511                 if (ctx->task == TASK_TOMBSTONE ||
1512                     !refcount_inc_not_zero(&ctx->refcount)) {
1513                         raw_spin_unlock(&ctx->lock);
1514                         ctx = NULL;
1515                 } else {
1516                         WARN_ON_ONCE(ctx->task != task);
1517                 }
1518         }
1519         rcu_read_unlock();
1520         if (!ctx)
1521                 local_irq_restore(*flags);
1522         return ctx;
1523 }
1524
1525 /*
1526  * Get the context for a task and increment its pin_count so it
1527  * can't get swapped to another task.  This also increments its
1528  * reference count so that the context can't get freed.
1529  */
1530 static struct perf_event_context *
1531 perf_pin_task_context(struct task_struct *task, int ctxn)
1532 {
1533         struct perf_event_context *ctx;
1534         unsigned long flags;
1535
1536         ctx = perf_lock_task_context(task, ctxn, &flags);
1537         if (ctx) {
1538                 ++ctx->pin_count;
1539                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1540         }
1541         return ctx;
1542 }
1543
1544 static void perf_unpin_context(struct perf_event_context *ctx)
1545 {
1546         unsigned long flags;
1547
1548         raw_spin_lock_irqsave(&ctx->lock, flags);
1549         --ctx->pin_count;
1550         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1551 }
1552
1553 /*
1554  * Update the record of the current time in a context.
1555  */
1556 static void __update_context_time(struct perf_event_context *ctx, bool adv)
1557 {
1558         u64 now = perf_clock();
1559
1560         if (adv)
1561                 ctx->time += now - ctx->timestamp;
1562         ctx->timestamp = now;
1563
1564         /*
1565          * The above: time' = time + (now - timestamp), can be re-arranged
1566          * into: time` = now + (time - timestamp), which gives a single value
1567          * offset to compute future time without locks on.
1568          *
1569          * See perf_event_time_now(), which can be used from NMI context where
1570          * it's (obviously) not possible to acquire ctx->lock in order to read
1571          * both the above values in a consistent manner.
1572          */
1573         WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
1574 }
1575
1576 static void update_context_time(struct perf_event_context *ctx)
1577 {
1578         __update_context_time(ctx, true);
1579 }
1580
1581 static u64 perf_event_time(struct perf_event *event)
1582 {
1583         struct perf_event_context *ctx = event->ctx;
1584
1585         if (unlikely(!ctx))
1586                 return 0;
1587
1588         if (is_cgroup_event(event))
1589                 return perf_cgroup_event_time(event);
1590
1591         return ctx->time;
1592 }
1593
1594 static u64 perf_event_time_now(struct perf_event *event, u64 now)
1595 {
1596         struct perf_event_context *ctx = event->ctx;
1597
1598         if (unlikely(!ctx))
1599                 return 0;
1600
1601         if (is_cgroup_event(event))
1602                 return perf_cgroup_event_time_now(event, now);
1603
1604         if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
1605                 return ctx->time;
1606
1607         now += READ_ONCE(ctx->timeoffset);
1608         return now;
1609 }
1610
1611 static enum event_type_t get_event_type(struct perf_event *event)
1612 {
1613         struct perf_event_context *ctx = event->ctx;
1614         enum event_type_t event_type;
1615
1616         lockdep_assert_held(&ctx->lock);
1617
1618         /*
1619          * It's 'group type', really, because if our group leader is
1620          * pinned, so are we.
1621          */
1622         if (event->group_leader != event)
1623                 event = event->group_leader;
1624
1625         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1626         if (!ctx->task)
1627                 event_type |= EVENT_CPU;
1628
1629         return event_type;
1630 }
1631
1632 /*
1633  * Helper function to initialize event group nodes.
1634  */
1635 static void init_event_group(struct perf_event *event)
1636 {
1637         RB_CLEAR_NODE(&event->group_node);
1638         event->group_index = 0;
1639 }
1640
1641 /*
1642  * Extract pinned or flexible groups from the context
1643  * based on event attrs bits.
1644  */
1645 static struct perf_event_groups *
1646 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1647 {
1648         if (event->attr.pinned)
1649                 return &ctx->pinned_groups;
1650         else
1651                 return &ctx->flexible_groups;
1652 }
1653
1654 /*
1655  * Helper function to initializes perf_event_group trees.
1656  */
1657 static void perf_event_groups_init(struct perf_event_groups *groups)
1658 {
1659         groups->tree = RB_ROOT;
1660         groups->index = 0;
1661 }
1662
1663 static inline struct cgroup *event_cgroup(const struct perf_event *event)
1664 {
1665         struct cgroup *cgroup = NULL;
1666
1667 #ifdef CONFIG_CGROUP_PERF
1668         if (event->cgrp)
1669                 cgroup = event->cgrp->css.cgroup;
1670 #endif
1671
1672         return cgroup;
1673 }
1674
1675 /*
1676  * Compare function for event groups;
1677  *
1678  * Implements complex key that first sorts by CPU and then by virtual index
1679  * which provides ordering when rotating groups for the same CPU.
1680  */
1681 static __always_inline int
1682 perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
1683                       const u64 left_group_index, const struct perf_event *right)
1684 {
1685         if (left_cpu < right->cpu)
1686                 return -1;
1687         if (left_cpu > right->cpu)
1688                 return 1;
1689
1690 #ifdef CONFIG_CGROUP_PERF
1691         {
1692                 const struct cgroup *right_cgroup = event_cgroup(right);
1693
1694                 if (left_cgroup != right_cgroup) {
1695                         if (!left_cgroup) {
1696                                 /*
1697                                  * Left has no cgroup but right does, no
1698                                  * cgroups come first.
1699                                  */
1700                                 return -1;
1701                         }
1702                         if (!right_cgroup) {
1703                                 /*
1704                                  * Right has no cgroup but left does, no
1705                                  * cgroups come first.
1706                                  */
1707                                 return 1;
1708                         }
1709                         /* Two dissimilar cgroups, order by id. */
1710                         if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1711                                 return -1;
1712
1713                         return 1;
1714                 }
1715         }
1716 #endif
1717
1718         if (left_group_index < right->group_index)
1719                 return -1;
1720         if (left_group_index > right->group_index)
1721                 return 1;
1722
1723         return 0;
1724 }
1725
1726 #define __node_2_pe(node) \
1727         rb_entry((node), struct perf_event, group_node)
1728
1729 static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1730 {
1731         struct perf_event *e = __node_2_pe(a);
1732         return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
1733                                      __node_2_pe(b)) < 0;
1734 }
1735
1736 struct __group_key {
1737         int cpu;
1738         struct cgroup *cgroup;
1739 };
1740
1741 static inline int __group_cmp(const void *key, const struct rb_node *node)
1742 {
1743         const struct __group_key *a = key;
1744         const struct perf_event *b = __node_2_pe(node);
1745
1746         /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
1747         return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
1748 }
1749
1750 /*
1751  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1752  * key (see perf_event_groups_less). This places it last inside the CPU
1753  * subtree.
1754  */
1755 static void
1756 perf_event_groups_insert(struct perf_event_groups *groups,
1757                          struct perf_event *event)
1758 {
1759         event->group_index = ++groups->index;
1760
1761         rb_add(&event->group_node, &groups->tree, __group_less);
1762 }
1763
1764 /*
1765  * Helper function to insert event into the pinned or flexible groups.
1766  */
1767 static void
1768 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1769 {
1770         struct perf_event_groups *groups;
1771
1772         groups = get_event_groups(event, ctx);
1773         perf_event_groups_insert(groups, event);
1774 }
1775
1776 /*
1777  * Delete a group from a tree.
1778  */
1779 static void
1780 perf_event_groups_delete(struct perf_event_groups *groups,
1781                          struct perf_event *event)
1782 {
1783         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1784                      RB_EMPTY_ROOT(&groups->tree));
1785
1786         rb_erase(&event->group_node, &groups->tree);
1787         init_event_group(event);
1788 }
1789
1790 /*
1791  * Helper function to delete event from its groups.
1792  */
1793 static void
1794 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1795 {
1796         struct perf_event_groups *groups;
1797
1798         groups = get_event_groups(event, ctx);
1799         perf_event_groups_delete(groups, event);
1800 }
1801
1802 /*
1803  * Get the leftmost event in the cpu/cgroup subtree.
1804  */
1805 static struct perf_event *
1806 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1807                         struct cgroup *cgrp)
1808 {
1809         struct __group_key key = {
1810                 .cpu = cpu,
1811                 .cgroup = cgrp,
1812         };
1813         struct rb_node *node;
1814
1815         node = rb_find_first(&key, &groups->tree, __group_cmp);
1816         if (node)
1817                 return __node_2_pe(node);
1818
1819         return NULL;
1820 }
1821
1822 /*
1823  * Like rb_entry_next_safe() for the @cpu subtree.
1824  */
1825 static struct perf_event *
1826 perf_event_groups_next(struct perf_event *event)
1827 {
1828         struct __group_key key = {
1829                 .cpu = event->cpu,
1830                 .cgroup = event_cgroup(event),
1831         };
1832         struct rb_node *next;
1833
1834         next = rb_next_match(&key, &event->group_node, __group_cmp);
1835         if (next)
1836                 return __node_2_pe(next);
1837
1838         return NULL;
1839 }
1840
1841 /*
1842  * Iterate through the whole groups tree.
1843  */
1844 #define perf_event_groups_for_each(event, groups)                       \
1845         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1846                                 typeof(*event), group_node); event;     \
1847                 event = rb_entry_safe(rb_next(&event->group_node),      \
1848                                 typeof(*event), group_node))
1849
1850 /*
1851  * Add an event from the lists for its context.
1852  * Must be called with ctx->mutex and ctx->lock held.
1853  */
1854 static void
1855 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1856 {
1857         lockdep_assert_held(&ctx->lock);
1858
1859         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1860         event->attach_state |= PERF_ATTACH_CONTEXT;
1861
1862         event->tstamp = perf_event_time(event);
1863
1864         /*
1865          * If we're a stand alone event or group leader, we go to the context
1866          * list, group events are kept attached to the group so that
1867          * perf_group_detach can, at all times, locate all siblings.
1868          */
1869         if (event->group_leader == event) {
1870                 event->group_caps = event->event_caps;
1871                 add_event_to_groups(event, ctx);
1872         }
1873
1874         list_add_rcu(&event->event_entry, &ctx->event_list);
1875         ctx->nr_events++;
1876         if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
1877                 ctx->nr_user++;
1878         if (event->attr.inherit_stat)
1879                 ctx->nr_stat++;
1880
1881         if (event->state > PERF_EVENT_STATE_OFF)
1882                 perf_cgroup_event_enable(event, ctx);
1883
1884         ctx->generation++;
1885 }
1886
1887 /*
1888  * Initialize event state based on the perf_event_attr::disabled.
1889  */
1890 static inline void perf_event__state_init(struct perf_event *event)
1891 {
1892         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1893                                               PERF_EVENT_STATE_INACTIVE;
1894 }
1895
1896 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1897 {
1898         int entry = sizeof(u64); /* value */
1899         int size = 0;
1900         int nr = 1;
1901
1902         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1903                 size += sizeof(u64);
1904
1905         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1906                 size += sizeof(u64);
1907
1908         if (event->attr.read_format & PERF_FORMAT_ID)
1909                 entry += sizeof(u64);
1910
1911         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1912                 nr += nr_siblings;
1913                 size += sizeof(u64);
1914         }
1915
1916         size += entry * nr;
1917         event->read_size = size;
1918 }
1919
1920 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1921 {
1922         struct perf_sample_data *data;
1923         u16 size = 0;
1924
1925         if (sample_type & PERF_SAMPLE_IP)
1926                 size += sizeof(data->ip);
1927
1928         if (sample_type & PERF_SAMPLE_ADDR)
1929                 size += sizeof(data->addr);
1930
1931         if (sample_type & PERF_SAMPLE_PERIOD)
1932                 size += sizeof(data->period);
1933
1934         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1935                 size += sizeof(data->weight.full);
1936
1937         if (sample_type & PERF_SAMPLE_READ)
1938                 size += event->read_size;
1939
1940         if (sample_type & PERF_SAMPLE_DATA_SRC)
1941                 size += sizeof(data->data_src.val);
1942
1943         if (sample_type & PERF_SAMPLE_TRANSACTION)
1944                 size += sizeof(data->txn);
1945
1946         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1947                 size += sizeof(data->phys_addr);
1948
1949         if (sample_type & PERF_SAMPLE_CGROUP)
1950                 size += sizeof(data->cgroup);
1951
1952         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1953                 size += sizeof(data->data_page_size);
1954
1955         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1956                 size += sizeof(data->code_page_size);
1957
1958         event->header_size = size;
1959 }
1960
1961 /*
1962  * Called at perf_event creation and when events are attached/detached from a
1963  * group.
1964  */
1965 static void perf_event__header_size(struct perf_event *event)
1966 {
1967         __perf_event_read_size(event,
1968                                event->group_leader->nr_siblings);
1969         __perf_event_header_size(event, event->attr.sample_type);
1970 }
1971
1972 static void perf_event__id_header_size(struct perf_event *event)
1973 {
1974         struct perf_sample_data *data;
1975         u64 sample_type = event->attr.sample_type;
1976         u16 size = 0;
1977
1978         if (sample_type & PERF_SAMPLE_TID)
1979                 size += sizeof(data->tid_entry);
1980
1981         if (sample_type & PERF_SAMPLE_TIME)
1982                 size += sizeof(data->time);
1983
1984         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1985                 size += sizeof(data->id);
1986
1987         if (sample_type & PERF_SAMPLE_ID)
1988                 size += sizeof(data->id);
1989
1990         if (sample_type & PERF_SAMPLE_STREAM_ID)
1991                 size += sizeof(data->stream_id);
1992
1993         if (sample_type & PERF_SAMPLE_CPU)
1994                 size += sizeof(data->cpu_entry);
1995
1996         event->id_header_size = size;
1997 }
1998
1999 static bool perf_event_validate_size(struct perf_event *event)
2000 {
2001         /*
2002          * The values computed here will be over-written when we actually
2003          * attach the event.
2004          */
2005         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
2006         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
2007         perf_event__id_header_size(event);
2008
2009         /*
2010          * Sum the lot; should not exceed the 64k limit we have on records.
2011          * Conservative limit to allow for callchains and other variable fields.
2012          */
2013         if (event->read_size + event->header_size +
2014             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
2015                 return false;
2016
2017         return true;
2018 }
2019
2020 static void perf_group_attach(struct perf_event *event)
2021 {
2022         struct perf_event *group_leader = event->group_leader, *pos;
2023
2024         lockdep_assert_held(&event->ctx->lock);
2025
2026         /*
2027          * We can have double attach due to group movement in perf_event_open.
2028          */
2029         if (event->attach_state & PERF_ATTACH_GROUP)
2030                 return;
2031
2032         event->attach_state |= PERF_ATTACH_GROUP;
2033
2034         if (group_leader == event)
2035                 return;
2036
2037         WARN_ON_ONCE(group_leader->ctx != event->ctx);
2038
2039         group_leader->group_caps &= event->event_caps;
2040
2041         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
2042         group_leader->nr_siblings++;
2043
2044         perf_event__header_size(group_leader);
2045
2046         for_each_sibling_event(pos, group_leader)
2047                 perf_event__header_size(pos);
2048 }
2049
2050 /*
2051  * Remove an event from the lists for its context.
2052  * Must be called with ctx->mutex and ctx->lock held.
2053  */
2054 static void
2055 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
2056 {
2057         WARN_ON_ONCE(event->ctx != ctx);
2058         lockdep_assert_held(&ctx->lock);
2059
2060         /*
2061          * We can have double detach due to exit/hot-unplug + close.
2062          */
2063         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2064                 return;
2065
2066         event->attach_state &= ~PERF_ATTACH_CONTEXT;
2067
2068         ctx->nr_events--;
2069         if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
2070                 ctx->nr_user--;
2071         if (event->attr.inherit_stat)
2072                 ctx->nr_stat--;
2073
2074         list_del_rcu(&event->event_entry);
2075
2076         if (event->group_leader == event)
2077                 del_event_from_groups(event, ctx);
2078
2079         /*
2080          * If event was in error state, then keep it
2081          * that way, otherwise bogus counts will be
2082          * returned on read(). The only way to get out
2083          * of error state is by explicit re-enabling
2084          * of the event
2085          */
2086         if (event->state > PERF_EVENT_STATE_OFF) {
2087                 perf_cgroup_event_disable(event, ctx);
2088                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2089         }
2090
2091         ctx->generation++;
2092 }
2093
2094 static int
2095 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2096 {
2097         if (!has_aux(aux_event))
2098                 return 0;
2099
2100         if (!event->pmu->aux_output_match)
2101                 return 0;
2102
2103         return event->pmu->aux_output_match(aux_event);
2104 }
2105
2106 static void put_event(struct perf_event *event);
2107 static void event_sched_out(struct perf_event *event,
2108                             struct perf_cpu_context *cpuctx,
2109                             struct perf_event_context *ctx);
2110
2111 static void perf_put_aux_event(struct perf_event *event)
2112 {
2113         struct perf_event_context *ctx = event->ctx;
2114         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2115         struct perf_event *iter;
2116
2117         /*
2118          * If event uses aux_event tear down the link
2119          */
2120         if (event->aux_event) {
2121                 iter = event->aux_event;
2122                 event->aux_event = NULL;
2123                 put_event(iter);
2124                 return;
2125         }
2126
2127         /*
2128          * If the event is an aux_event, tear down all links to
2129          * it from other events.
2130          */
2131         for_each_sibling_event(iter, event->group_leader) {
2132                 if (iter->aux_event != event)
2133                         continue;
2134
2135                 iter->aux_event = NULL;
2136                 put_event(event);
2137
2138                 /*
2139                  * If it's ACTIVE, schedule it out and put it into ERROR
2140                  * state so that we don't try to schedule it again. Note
2141                  * that perf_event_enable() will clear the ERROR status.
2142                  */
2143                 event_sched_out(iter, cpuctx, ctx);
2144                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2145         }
2146 }
2147
2148 static bool perf_need_aux_event(struct perf_event *event)
2149 {
2150         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2151 }
2152
2153 static int perf_get_aux_event(struct perf_event *event,
2154                               struct perf_event *group_leader)
2155 {
2156         /*
2157          * Our group leader must be an aux event if we want to be
2158          * an aux_output. This way, the aux event will precede its
2159          * aux_output events in the group, and therefore will always
2160          * schedule first.
2161          */
2162         if (!group_leader)
2163                 return 0;
2164
2165         /*
2166          * aux_output and aux_sample_size are mutually exclusive.
2167          */
2168         if (event->attr.aux_output && event->attr.aux_sample_size)
2169                 return 0;
2170
2171         if (event->attr.aux_output &&
2172             !perf_aux_output_match(event, group_leader))
2173                 return 0;
2174
2175         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2176                 return 0;
2177
2178         if (!atomic_long_inc_not_zero(&group_leader->refcount))
2179                 return 0;
2180
2181         /*
2182          * Link aux_outputs to their aux event; this is undone in
2183          * perf_group_detach() by perf_put_aux_event(). When the
2184          * group in torn down, the aux_output events loose their
2185          * link to the aux_event and can't schedule any more.
2186          */
2187         event->aux_event = group_leader;
2188
2189         return 1;
2190 }
2191
2192 static inline struct list_head *get_event_list(struct perf_event *event)
2193 {
2194         struct perf_event_context *ctx = event->ctx;
2195         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2196 }
2197
2198 /*
2199  * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2200  * cannot exist on their own, schedule them out and move them into the ERROR
2201  * state. Also see _perf_event_enable(), it will not be able to recover
2202  * this ERROR state.
2203  */
2204 static inline void perf_remove_sibling_event(struct perf_event *event)
2205 {
2206         struct perf_event_context *ctx = event->ctx;
2207         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2208
2209         event_sched_out(event, cpuctx, ctx);
2210         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2211 }
2212
2213 static void perf_group_detach(struct perf_event *event)
2214 {
2215         struct perf_event *leader = event->group_leader;
2216         struct perf_event *sibling, *tmp;
2217         struct perf_event_context *ctx = event->ctx;
2218
2219         lockdep_assert_held(&ctx->lock);
2220
2221         /*
2222          * We can have double detach due to exit/hot-unplug + close.
2223          */
2224         if (!(event->attach_state & PERF_ATTACH_GROUP))
2225                 return;
2226
2227         event->attach_state &= ~PERF_ATTACH_GROUP;
2228
2229         perf_put_aux_event(event);
2230
2231         /*
2232          * If this is a sibling, remove it from its group.
2233          */
2234         if (leader != event) {
2235                 list_del_init(&event->sibling_list);
2236                 event->group_leader->nr_siblings--;
2237                 goto out;
2238         }
2239
2240         /*
2241          * If this was a group event with sibling events then
2242          * upgrade the siblings to singleton events by adding them
2243          * to whatever list we are on.
2244          */
2245         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2246
2247                 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2248                         perf_remove_sibling_event(sibling);
2249
2250                 sibling->group_leader = sibling;
2251                 list_del_init(&sibling->sibling_list);
2252
2253                 /* Inherit group flags from the previous leader */
2254                 sibling->group_caps = event->group_caps;
2255
2256                 if (!RB_EMPTY_NODE(&event->group_node)) {
2257                         add_event_to_groups(sibling, event->ctx);
2258
2259                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2260                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
2261                 }
2262
2263                 WARN_ON_ONCE(sibling->ctx != event->ctx);
2264         }
2265
2266 out:
2267         for_each_sibling_event(tmp, leader)
2268                 perf_event__header_size(tmp);
2269
2270         perf_event__header_size(leader);
2271 }
2272
2273 static void sync_child_event(struct perf_event *child_event);
2274
2275 static void perf_child_detach(struct perf_event *event)
2276 {
2277         struct perf_event *parent_event = event->parent;
2278
2279         if (!(event->attach_state & PERF_ATTACH_CHILD))
2280                 return;
2281
2282         event->attach_state &= ~PERF_ATTACH_CHILD;
2283
2284         if (WARN_ON_ONCE(!parent_event))
2285                 return;
2286
2287         lockdep_assert_held(&parent_event->child_mutex);
2288
2289         sync_child_event(event);
2290         list_del_init(&event->child_list);
2291 }
2292
2293 static bool is_orphaned_event(struct perf_event *event)
2294 {
2295         return event->state == PERF_EVENT_STATE_DEAD;
2296 }
2297
2298 static inline int __pmu_filter_match(struct perf_event *event)
2299 {
2300         struct pmu *pmu = event->pmu;
2301         return pmu->filter_match ? pmu->filter_match(event) : 1;
2302 }
2303
2304 /*
2305  * Check whether we should attempt to schedule an event group based on
2306  * PMU-specific filtering. An event group can consist of HW and SW events,
2307  * potentially with a SW leader, so we must check all the filters, to
2308  * determine whether a group is schedulable:
2309  */
2310 static inline int pmu_filter_match(struct perf_event *event)
2311 {
2312         struct perf_event *sibling;
2313
2314         if (!__pmu_filter_match(event))
2315                 return 0;
2316
2317         for_each_sibling_event(sibling, event) {
2318                 if (!__pmu_filter_match(sibling))
2319                         return 0;
2320         }
2321
2322         return 1;
2323 }
2324
2325 static inline int
2326 event_filter_match(struct perf_event *event)
2327 {
2328         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2329                perf_cgroup_match(event) && pmu_filter_match(event);
2330 }
2331
2332 static void
2333 event_sched_out(struct perf_event *event,
2334                   struct perf_cpu_context *cpuctx,
2335                   struct perf_event_context *ctx)
2336 {
2337         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2338
2339         WARN_ON_ONCE(event->ctx != ctx);
2340         lockdep_assert_held(&ctx->lock);
2341
2342         if (event->state != PERF_EVENT_STATE_ACTIVE)
2343                 return;
2344
2345         /*
2346          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2347          * we can schedule events _OUT_ individually through things like
2348          * __perf_remove_from_context().
2349          */
2350         list_del_init(&event->active_list);
2351
2352         perf_pmu_disable(event->pmu);
2353
2354         event->pmu->del(event, 0);
2355         event->oncpu = -1;
2356
2357         if (READ_ONCE(event->pending_disable) >= 0) {
2358                 WRITE_ONCE(event->pending_disable, -1);
2359                 perf_cgroup_event_disable(event, ctx);
2360                 state = PERF_EVENT_STATE_OFF;
2361         }
2362         perf_event_set_state(event, state);
2363
2364         if (!is_software_event(event))
2365                 cpuctx->active_oncpu--;
2366         if (!--ctx->nr_active)
2367                 perf_event_ctx_deactivate(ctx);
2368         if (event->attr.freq && event->attr.sample_freq)
2369                 ctx->nr_freq--;
2370         if (event->attr.exclusive || !cpuctx->active_oncpu)
2371                 cpuctx->exclusive = 0;
2372
2373         perf_pmu_enable(event->pmu);
2374 }
2375
2376 static void
2377 group_sched_out(struct perf_event *group_event,
2378                 struct perf_cpu_context *cpuctx,
2379                 struct perf_event_context *ctx)
2380 {
2381         struct perf_event *event;
2382
2383         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2384                 return;
2385
2386         perf_pmu_disable(ctx->pmu);
2387
2388         event_sched_out(group_event, cpuctx, ctx);
2389
2390         /*
2391          * Schedule out siblings (if any):
2392          */
2393         for_each_sibling_event(event, group_event)
2394                 event_sched_out(event, cpuctx, ctx);
2395
2396         perf_pmu_enable(ctx->pmu);
2397 }
2398
2399 #define DETACH_GROUP    0x01UL
2400 #define DETACH_CHILD    0x02UL
2401
2402 /*
2403  * Cross CPU call to remove a performance event
2404  *
2405  * We disable the event on the hardware level first. After that we
2406  * remove it from the context list.
2407  */
2408 static void
2409 __perf_remove_from_context(struct perf_event *event,
2410                            struct perf_cpu_context *cpuctx,
2411                            struct perf_event_context *ctx,
2412                            void *info)
2413 {
2414         unsigned long flags = (unsigned long)info;
2415
2416         if (ctx->is_active & EVENT_TIME) {
2417                 update_context_time(ctx);
2418                 update_cgrp_time_from_cpuctx(cpuctx, false);
2419         }
2420
2421         event_sched_out(event, cpuctx, ctx);
2422         if (flags & DETACH_GROUP)
2423                 perf_group_detach(event);
2424         if (flags & DETACH_CHILD)
2425                 perf_child_detach(event);
2426         list_del_event(event, ctx);
2427
2428         if (!ctx->nr_events && ctx->is_active) {
2429                 if (ctx == &cpuctx->ctx)
2430                         update_cgrp_time_from_cpuctx(cpuctx, true);
2431
2432                 ctx->is_active = 0;
2433                 ctx->rotate_necessary = 0;
2434                 if (ctx->task) {
2435                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2436                         cpuctx->task_ctx = NULL;
2437                 }
2438         }
2439 }
2440
2441 /*
2442  * Remove the event from a task's (or a CPU's) list of events.
2443  *
2444  * If event->ctx is a cloned context, callers must make sure that
2445  * every task struct that event->ctx->task could possibly point to
2446  * remains valid.  This is OK when called from perf_release since
2447  * that only calls us on the top-level context, which can't be a clone.
2448  * When called from perf_event_exit_task, it's OK because the
2449  * context has been detached from its task.
2450  */
2451 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2452 {
2453         struct perf_event_context *ctx = event->ctx;
2454
2455         lockdep_assert_held(&ctx->mutex);
2456
2457         /*
2458          * Because of perf_event_exit_task(), perf_remove_from_context() ought
2459          * to work in the face of TASK_TOMBSTONE, unlike every other
2460          * event_function_call() user.
2461          */
2462         raw_spin_lock_irq(&ctx->lock);
2463         /*
2464          * Cgroup events are per-cpu events, and must IPI because of
2465          * cgrp_cpuctx_list.
2466          */
2467         if (!ctx->is_active && !is_cgroup_event(event)) {
2468                 __perf_remove_from_context(event, __get_cpu_context(ctx),
2469                                            ctx, (void *)flags);
2470                 raw_spin_unlock_irq(&ctx->lock);
2471                 return;
2472         }
2473         raw_spin_unlock_irq(&ctx->lock);
2474
2475         event_function_call(event, __perf_remove_from_context, (void *)flags);
2476 }
2477
2478 /*
2479  * Cross CPU call to disable a performance event
2480  */
2481 static void __perf_event_disable(struct perf_event *event,
2482                                  struct perf_cpu_context *cpuctx,
2483                                  struct perf_event_context *ctx,
2484                                  void *info)
2485 {
2486         if (event->state < PERF_EVENT_STATE_INACTIVE)
2487                 return;
2488
2489         if (ctx->is_active & EVENT_TIME) {
2490                 update_context_time(ctx);
2491                 update_cgrp_time_from_event(event);
2492         }
2493
2494         if (event == event->group_leader)
2495                 group_sched_out(event, cpuctx, ctx);
2496         else
2497                 event_sched_out(event, cpuctx, ctx);
2498
2499         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2500         perf_cgroup_event_disable(event, ctx);
2501 }
2502
2503 /*
2504  * Disable an event.
2505  *
2506  * If event->ctx is a cloned context, callers must make sure that
2507  * every task struct that event->ctx->task could possibly point to
2508  * remains valid.  This condition is satisfied when called through
2509  * perf_event_for_each_child or perf_event_for_each because they
2510  * hold the top-level event's child_mutex, so any descendant that
2511  * goes to exit will block in perf_event_exit_event().
2512  *
2513  * When called from perf_pending_event it's OK because event->ctx
2514  * is the current context on this CPU and preemption is disabled,
2515  * hence we can't get into perf_event_task_sched_out for this context.
2516  */
2517 static void _perf_event_disable(struct perf_event *event)
2518 {
2519         struct perf_event_context *ctx = event->ctx;
2520
2521         raw_spin_lock_irq(&ctx->lock);
2522         if (event->state <= PERF_EVENT_STATE_OFF) {
2523                 raw_spin_unlock_irq(&ctx->lock);
2524                 return;
2525         }
2526         raw_spin_unlock_irq(&ctx->lock);
2527
2528         event_function_call(event, __perf_event_disable, NULL);
2529 }
2530
2531 void perf_event_disable_local(struct perf_event *event)
2532 {
2533         event_function_local(event, __perf_event_disable, NULL);
2534 }
2535
2536 /*
2537  * Strictly speaking kernel users cannot create groups and therefore this
2538  * interface does not need the perf_event_ctx_lock() magic.
2539  */
2540 void perf_event_disable(struct perf_event *event)
2541 {
2542         struct perf_event_context *ctx;
2543
2544         ctx = perf_event_ctx_lock(event);
2545         _perf_event_disable(event);
2546         perf_event_ctx_unlock(event, ctx);
2547 }
2548 EXPORT_SYMBOL_GPL(perf_event_disable);
2549
2550 void perf_event_disable_inatomic(struct perf_event *event)
2551 {
2552         WRITE_ONCE(event->pending_disable, smp_processor_id());
2553         /* can fail, see perf_pending_event_disable() */
2554         irq_work_queue(&event->pending);
2555 }
2556
2557 #define MAX_INTERRUPTS (~0ULL)
2558
2559 static void perf_log_throttle(struct perf_event *event, int enable);
2560 static void perf_log_itrace_start(struct perf_event *event);
2561
2562 static int
2563 event_sched_in(struct perf_event *event,
2564                  struct perf_cpu_context *cpuctx,
2565                  struct perf_event_context *ctx)
2566 {
2567         int ret = 0;
2568
2569         WARN_ON_ONCE(event->ctx != ctx);
2570
2571         lockdep_assert_held(&ctx->lock);
2572
2573         if (event->state <= PERF_EVENT_STATE_OFF)
2574                 return 0;
2575
2576         WRITE_ONCE(event->oncpu, smp_processor_id());
2577         /*
2578          * Order event::oncpu write to happen before the ACTIVE state is
2579          * visible. This allows perf_event_{stop,read}() to observe the correct
2580          * ->oncpu if it sees ACTIVE.
2581          */
2582         smp_wmb();
2583         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2584
2585         /*
2586          * Unthrottle events, since we scheduled we might have missed several
2587          * ticks already, also for a heavily scheduling task there is little
2588          * guarantee it'll get a tick in a timely manner.
2589          */
2590         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2591                 perf_log_throttle(event, 1);
2592                 event->hw.interrupts = 0;
2593         }
2594
2595         perf_pmu_disable(event->pmu);
2596
2597         perf_log_itrace_start(event);
2598
2599         if (event->pmu->add(event, PERF_EF_START)) {
2600                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2601                 event->oncpu = -1;
2602                 ret = -EAGAIN;
2603                 goto out;
2604         }
2605
2606         if (!is_software_event(event))
2607                 cpuctx->active_oncpu++;
2608         if (!ctx->nr_active++)
2609                 perf_event_ctx_activate(ctx);
2610         if (event->attr.freq && event->attr.sample_freq)
2611                 ctx->nr_freq++;
2612
2613         if (event->attr.exclusive)
2614                 cpuctx->exclusive = 1;
2615
2616 out:
2617         perf_pmu_enable(event->pmu);
2618
2619         return ret;
2620 }
2621
2622 static int
2623 group_sched_in(struct perf_event *group_event,
2624                struct perf_cpu_context *cpuctx,
2625                struct perf_event_context *ctx)
2626 {
2627         struct perf_event *event, *partial_group = NULL;
2628         struct pmu *pmu = ctx->pmu;
2629
2630         if (group_event->state == PERF_EVENT_STATE_OFF)
2631                 return 0;
2632
2633         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2634
2635         if (event_sched_in(group_event, cpuctx, ctx))
2636                 goto error;
2637
2638         /*
2639          * Schedule in siblings as one group (if any):
2640          */
2641         for_each_sibling_event(event, group_event) {
2642                 if (event_sched_in(event, cpuctx, ctx)) {
2643                         partial_group = event;
2644                         goto group_error;
2645                 }
2646         }
2647
2648         if (!pmu->commit_txn(pmu))
2649                 return 0;
2650
2651 group_error:
2652         /*
2653          * Groups can be scheduled in as one unit only, so undo any
2654          * partial group before returning:
2655          * The events up to the failed event are scheduled out normally.
2656          */
2657         for_each_sibling_event(event, group_event) {
2658                 if (event == partial_group)
2659                         break;
2660
2661                 event_sched_out(event, cpuctx, ctx);
2662         }
2663         event_sched_out(group_event, cpuctx, ctx);
2664
2665 error:
2666         pmu->cancel_txn(pmu);
2667         return -EAGAIN;
2668 }
2669
2670 /*
2671  * Work out whether we can put this event group on the CPU now.
2672  */
2673 static int group_can_go_on(struct perf_event *event,
2674                            struct perf_cpu_context *cpuctx,
2675                            int can_add_hw)
2676 {
2677         /*
2678          * Groups consisting entirely of software events can always go on.
2679          */
2680         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2681                 return 1;
2682         /*
2683          * If an exclusive group is already on, no other hardware
2684          * events can go on.
2685          */
2686         if (cpuctx->exclusive)
2687                 return 0;
2688         /*
2689          * If this group is exclusive and there are already
2690          * events on the CPU, it can't go on.
2691          */
2692         if (event->attr.exclusive && !list_empty(get_event_list(event)))
2693                 return 0;
2694         /*
2695          * Otherwise, try to add it if all previous groups were able
2696          * to go on.
2697          */
2698         return can_add_hw;
2699 }
2700
2701 static void add_event_to_ctx(struct perf_event *event,
2702                                struct perf_event_context *ctx)
2703 {
2704         list_add_event(event, ctx);
2705         perf_group_attach(event);
2706 }
2707
2708 static void ctx_sched_out(struct perf_event_context *ctx,
2709                           struct perf_cpu_context *cpuctx,
2710                           enum event_type_t event_type);
2711 static void
2712 ctx_sched_in(struct perf_event_context *ctx,
2713              struct perf_cpu_context *cpuctx,
2714              enum event_type_t event_type);
2715
2716 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2717                                struct perf_event_context *ctx,
2718                                enum event_type_t event_type)
2719 {
2720         if (!cpuctx->task_ctx)
2721                 return;
2722
2723         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2724                 return;
2725
2726         ctx_sched_out(ctx, cpuctx, event_type);
2727 }
2728
2729 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2730                                 struct perf_event_context *ctx)
2731 {
2732         cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
2733         if (ctx)
2734                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
2735         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
2736         if (ctx)
2737                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
2738 }
2739
2740 /*
2741  * We want to maintain the following priority of scheduling:
2742  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2743  *  - task pinned (EVENT_PINNED)
2744  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2745  *  - task flexible (EVENT_FLEXIBLE).
2746  *
2747  * In order to avoid unscheduling and scheduling back in everything every
2748  * time an event is added, only do it for the groups of equal priority and
2749  * below.
2750  *
2751  * This can be called after a batch operation on task events, in which case
2752  * event_type is a bit mask of the types of events involved. For CPU events,
2753  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2754  */
2755 static void ctx_resched(struct perf_cpu_context *cpuctx,
2756                         struct perf_event_context *task_ctx,
2757                         enum event_type_t event_type)
2758 {
2759         enum event_type_t ctx_event_type;
2760         bool cpu_event = !!(event_type & EVENT_CPU);
2761
2762         /*
2763          * If pinned groups are involved, flexible groups also need to be
2764          * scheduled out.
2765          */
2766         if (event_type & EVENT_PINNED)
2767                 event_type |= EVENT_FLEXIBLE;
2768
2769         ctx_event_type = event_type & EVENT_ALL;
2770
2771         perf_pmu_disable(cpuctx->ctx.pmu);
2772         if (task_ctx)
2773                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2774
2775         /*
2776          * Decide which cpu ctx groups to schedule out based on the types
2777          * of events that caused rescheduling:
2778          *  - EVENT_CPU: schedule out corresponding groups;
2779          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2780          *  - otherwise, do nothing more.
2781          */
2782         if (cpu_event)
2783                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2784         else if (ctx_event_type & EVENT_PINNED)
2785                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2786
2787         perf_event_sched_in(cpuctx, task_ctx);
2788         perf_pmu_enable(cpuctx->ctx.pmu);
2789 }
2790
2791 void perf_pmu_resched(struct pmu *pmu)
2792 {
2793         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2794         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2795
2796         perf_ctx_lock(cpuctx, task_ctx);
2797         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2798         perf_ctx_unlock(cpuctx, task_ctx);
2799 }
2800
2801 /*
2802  * Cross CPU call to install and enable a performance event
2803  *
2804  * Very similar to remote_function() + event_function() but cannot assume that
2805  * things like ctx->is_active and cpuctx->task_ctx are set.
2806  */
2807 static int  __perf_install_in_context(void *info)
2808 {
2809         struct perf_event *event = info;
2810         struct perf_event_context *ctx = event->ctx;
2811         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2812         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2813         bool reprogram = true;
2814         int ret = 0;
2815
2816         raw_spin_lock(&cpuctx->ctx.lock);
2817         if (ctx->task) {
2818                 raw_spin_lock(&ctx->lock);
2819                 task_ctx = ctx;
2820
2821                 reprogram = (ctx->task == current);
2822
2823                 /*
2824                  * If the task is running, it must be running on this CPU,
2825                  * otherwise we cannot reprogram things.
2826                  *
2827                  * If its not running, we don't care, ctx->lock will
2828                  * serialize against it becoming runnable.
2829                  */
2830                 if (task_curr(ctx->task) && !reprogram) {
2831                         ret = -ESRCH;
2832                         goto unlock;
2833                 }
2834
2835                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2836         } else if (task_ctx) {
2837                 raw_spin_lock(&task_ctx->lock);
2838         }
2839
2840 #ifdef CONFIG_CGROUP_PERF
2841         if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2842                 /*
2843                  * If the current cgroup doesn't match the event's
2844                  * cgroup, we should not try to schedule it.
2845                  */
2846                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2847                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2848                                         event->cgrp->css.cgroup);
2849         }
2850 #endif
2851
2852         if (reprogram) {
2853                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2854                 add_event_to_ctx(event, ctx);
2855                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2856         } else {
2857                 add_event_to_ctx(event, ctx);
2858         }
2859
2860 unlock:
2861         perf_ctx_unlock(cpuctx, task_ctx);
2862
2863         return ret;
2864 }
2865
2866 static bool exclusive_event_installable(struct perf_event *event,
2867                                         struct perf_event_context *ctx);
2868
2869 /*
2870  * Attach a performance event to a context.
2871  *
2872  * Very similar to event_function_call, see comment there.
2873  */
2874 static void
2875 perf_install_in_context(struct perf_event_context *ctx,
2876                         struct perf_event *event,
2877                         int cpu)
2878 {
2879         struct task_struct *task = READ_ONCE(ctx->task);
2880
2881         lockdep_assert_held(&ctx->mutex);
2882
2883         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2884
2885         if (event->cpu != -1)
2886                 event->cpu = cpu;
2887
2888         /*
2889          * Ensures that if we can observe event->ctx, both the event and ctx
2890          * will be 'complete'. See perf_iterate_sb_cpu().
2891          */
2892         smp_store_release(&event->ctx, ctx);
2893
2894         /*
2895          * perf_event_attr::disabled events will not run and can be initialized
2896          * without IPI. Except when this is the first event for the context, in
2897          * that case we need the magic of the IPI to set ctx->is_active.
2898          * Similarly, cgroup events for the context also needs the IPI to
2899          * manipulate the cgrp_cpuctx_list.
2900          *
2901          * The IOC_ENABLE that is sure to follow the creation of a disabled
2902          * event will issue the IPI and reprogram the hardware.
2903          */
2904         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
2905             ctx->nr_events && !is_cgroup_event(event)) {
2906                 raw_spin_lock_irq(&ctx->lock);
2907                 if (ctx->task == TASK_TOMBSTONE) {
2908                         raw_spin_unlock_irq(&ctx->lock);
2909                         return;
2910                 }
2911                 add_event_to_ctx(event, ctx);
2912                 raw_spin_unlock_irq(&ctx->lock);
2913                 return;
2914         }
2915
2916         if (!task) {
2917                 cpu_function_call(cpu, __perf_install_in_context, event);
2918                 return;
2919         }
2920
2921         /*
2922          * Should not happen, we validate the ctx is still alive before calling.
2923          */
2924         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2925                 return;
2926
2927         /*
2928          * Installing events is tricky because we cannot rely on ctx->is_active
2929          * to be set in case this is the nr_events 0 -> 1 transition.
2930          *
2931          * Instead we use task_curr(), which tells us if the task is running.
2932          * However, since we use task_curr() outside of rq::lock, we can race
2933          * against the actual state. This means the result can be wrong.
2934          *
2935          * If we get a false positive, we retry, this is harmless.
2936          *
2937          * If we get a false negative, things are complicated. If we are after
2938          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2939          * value must be correct. If we're before, it doesn't matter since
2940          * perf_event_context_sched_in() will program the counter.
2941          *
2942          * However, this hinges on the remote context switch having observed
2943          * our task->perf_event_ctxp[] store, such that it will in fact take
2944          * ctx::lock in perf_event_context_sched_in().
2945          *
2946          * We do this by task_function_call(), if the IPI fails to hit the task
2947          * we know any future context switch of task must see the
2948          * perf_event_ctpx[] store.
2949          */
2950
2951         /*
2952          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2953          * task_cpu() load, such that if the IPI then does not find the task
2954          * running, a future context switch of that task must observe the
2955          * store.
2956          */
2957         smp_mb();
2958 again:
2959         if (!task_function_call(task, __perf_install_in_context, event))
2960                 return;
2961
2962         raw_spin_lock_irq(&ctx->lock);
2963         task = ctx->task;
2964         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2965                 /*
2966                  * Cannot happen because we already checked above (which also
2967                  * cannot happen), and we hold ctx->mutex, which serializes us
2968                  * against perf_event_exit_task_context().
2969                  */
2970                 raw_spin_unlock_irq(&ctx->lock);
2971                 return;
2972         }
2973         /*
2974          * If the task is not running, ctx->lock will avoid it becoming so,
2975          * thus we can safely install the event.
2976          */
2977         if (task_curr(task)) {
2978                 raw_spin_unlock_irq(&ctx->lock);
2979                 goto again;
2980         }
2981         add_event_to_ctx(event, ctx);
2982         raw_spin_unlock_irq(&ctx->lock);
2983 }
2984
2985 /*
2986  * Cross CPU call to enable a performance event
2987  */
2988 static void __perf_event_enable(struct perf_event *event,
2989                                 struct perf_cpu_context *cpuctx,
2990                                 struct perf_event_context *ctx,
2991                                 void *info)
2992 {
2993         struct perf_event *leader = event->group_leader;
2994         struct perf_event_context *task_ctx;
2995
2996         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2997             event->state <= PERF_EVENT_STATE_ERROR)
2998                 return;
2999
3000         if (ctx->is_active)
3001                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3002
3003         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3004         perf_cgroup_event_enable(event, ctx);
3005
3006         if (!ctx->is_active)
3007                 return;
3008
3009         if (!event_filter_match(event)) {
3010                 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
3011                 return;
3012         }
3013
3014         /*
3015          * If the event is in a group and isn't the group leader,
3016          * then don't put it on unless the group is on.
3017          */
3018         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
3019                 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
3020                 return;
3021         }
3022
3023         task_ctx = cpuctx->task_ctx;
3024         if (ctx->task)
3025                 WARN_ON_ONCE(task_ctx != ctx);
3026
3027         ctx_resched(cpuctx, task_ctx, get_event_type(event));
3028 }
3029
3030 /*
3031  * Enable an event.
3032  *
3033  * If event->ctx is a cloned context, callers must make sure that
3034  * every task struct that event->ctx->task could possibly point to
3035  * remains valid.  This condition is satisfied when called through
3036  * perf_event_for_each_child or perf_event_for_each as described
3037  * for perf_event_disable.
3038  */
3039 static void _perf_event_enable(struct perf_event *event)
3040 {
3041         struct perf_event_context *ctx = event->ctx;
3042
3043         raw_spin_lock_irq(&ctx->lock);
3044         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3045             event->state <  PERF_EVENT_STATE_ERROR) {
3046 out:
3047                 raw_spin_unlock_irq(&ctx->lock);
3048                 return;
3049         }
3050
3051         /*
3052          * If the event is in error state, clear that first.
3053          *
3054          * That way, if we see the event in error state below, we know that it
3055          * has gone back into error state, as distinct from the task having
3056          * been scheduled away before the cross-call arrived.
3057          */
3058         if (event->state == PERF_EVENT_STATE_ERROR) {
3059                 /*
3060                  * Detached SIBLING events cannot leave ERROR state.
3061                  */
3062                 if (event->event_caps & PERF_EV_CAP_SIBLING &&
3063                     event->group_leader == event)
3064                         goto out;
3065
3066                 event->state = PERF_EVENT_STATE_OFF;
3067         }
3068         raw_spin_unlock_irq(&ctx->lock);
3069
3070         event_function_call(event, __perf_event_enable, NULL);
3071 }
3072
3073 /*
3074  * See perf_event_disable();
3075  */
3076 void perf_event_enable(struct perf_event *event)
3077 {
3078         struct perf_event_context *ctx;
3079
3080         ctx = perf_event_ctx_lock(event);
3081         _perf_event_enable(event);
3082         perf_event_ctx_unlock(event, ctx);
3083 }
3084 EXPORT_SYMBOL_GPL(perf_event_enable);
3085
3086 struct stop_event_data {
3087         struct perf_event       *event;
3088         unsigned int            restart;
3089 };
3090
3091 static int __perf_event_stop(void *info)
3092 {
3093         struct stop_event_data *sd = info;
3094         struct perf_event *event = sd->event;
3095
3096         /* if it's already INACTIVE, do nothing */
3097         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3098                 return 0;
3099
3100         /* matches smp_wmb() in event_sched_in() */
3101         smp_rmb();
3102
3103         /*
3104          * There is a window with interrupts enabled before we get here,
3105          * so we need to check again lest we try to stop another CPU's event.
3106          */
3107         if (READ_ONCE(event->oncpu) != smp_processor_id())
3108                 return -EAGAIN;
3109
3110         event->pmu->stop(event, PERF_EF_UPDATE);
3111
3112         /*
3113          * May race with the actual stop (through perf_pmu_output_stop()),
3114          * but it is only used for events with AUX ring buffer, and such
3115          * events will refuse to restart because of rb::aux_mmap_count==0,
3116          * see comments in perf_aux_output_begin().
3117          *
3118          * Since this is happening on an event-local CPU, no trace is lost
3119          * while restarting.
3120          */
3121         if (sd->restart)
3122                 event->pmu->start(event, 0);
3123
3124         return 0;
3125 }
3126
3127 static int perf_event_stop(struct perf_event *event, int restart)
3128 {
3129         struct stop_event_data sd = {
3130                 .event          = event,
3131                 .restart        = restart,
3132         };
3133         int ret = 0;
3134
3135         do {
3136                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3137                         return 0;
3138
3139                 /* matches smp_wmb() in event_sched_in() */
3140                 smp_rmb();
3141
3142                 /*
3143                  * We only want to restart ACTIVE events, so if the event goes
3144                  * inactive here (event->oncpu==-1), there's nothing more to do;
3145                  * fall through with ret==-ENXIO.
3146                  */
3147                 ret = cpu_function_call(READ_ONCE(event->oncpu),
3148                                         __perf_event_stop, &sd);
3149         } while (ret == -EAGAIN);
3150
3151         return ret;
3152 }
3153
3154 /*
3155  * In order to contain the amount of racy and tricky in the address filter
3156  * configuration management, it is a two part process:
3157  *
3158  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3159  *      we update the addresses of corresponding vmas in
3160  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
3161  * (p2) when an event is scheduled in (pmu::add), it calls
3162  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3163  *      if the generation has changed since the previous call.
3164  *
3165  * If (p1) happens while the event is active, we restart it to force (p2).
3166  *
3167  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3168  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3169  *     ioctl;
3170  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3171  *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3172  *     for reading;
3173  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3174  *     of exec.
3175  */
3176 void perf_event_addr_filters_sync(struct perf_event *event)
3177 {
3178         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3179
3180         if (!has_addr_filter(event))
3181                 return;
3182
3183         raw_spin_lock(&ifh->lock);
3184         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3185                 event->pmu->addr_filters_sync(event);
3186                 event->hw.addr_filters_gen = event->addr_filters_gen;
3187         }
3188         raw_spin_unlock(&ifh->lock);
3189 }
3190 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3191
3192 static int _perf_event_refresh(struct perf_event *event, int refresh)
3193 {
3194         /*
3195          * not supported on inherited events
3196          */
3197         if (event->attr.inherit || !is_sampling_event(event))
3198                 return -EINVAL;
3199
3200         atomic_add(refresh, &event->event_limit);
3201         _perf_event_enable(event);
3202
3203         return 0;
3204 }
3205
3206 /*
3207  * See perf_event_disable()
3208  */
3209 int perf_event_refresh(struct perf_event *event, int refresh)
3210 {
3211         struct perf_event_context *ctx;
3212         int ret;
3213
3214         ctx = perf_event_ctx_lock(event);
3215         ret = _perf_event_refresh(event, refresh);
3216         perf_event_ctx_unlock(event, ctx);
3217
3218         return ret;
3219 }
3220 EXPORT_SYMBOL_GPL(perf_event_refresh);
3221
3222 static int perf_event_modify_breakpoint(struct perf_event *bp,
3223                                          struct perf_event_attr *attr)
3224 {
3225         int err;
3226
3227         _perf_event_disable(bp);
3228
3229         err = modify_user_hw_breakpoint_check(bp, attr, true);
3230
3231         if (!bp->attr.disabled)
3232                 _perf_event_enable(bp);
3233
3234         return err;
3235 }
3236
3237 /*
3238  * Copy event-type-independent attributes that may be modified.
3239  */
3240 static void perf_event_modify_copy_attr(struct perf_event_attr *to,
3241                                         const struct perf_event_attr *from)
3242 {
3243         to->sig_data = from->sig_data;
3244 }
3245
3246 static int perf_event_modify_attr(struct perf_event *event,
3247                                   struct perf_event_attr *attr)
3248 {
3249         int (*func)(struct perf_event *, struct perf_event_attr *);
3250         struct perf_event *child;
3251         int err;
3252
3253         if (event->attr.type != attr->type)
3254                 return -EINVAL;
3255
3256         switch (event->attr.type) {
3257         case PERF_TYPE_BREAKPOINT:
3258                 func = perf_event_modify_breakpoint;
3259                 break;
3260         default:
3261                 /* Place holder for future additions. */
3262                 return -EOPNOTSUPP;
3263         }
3264
3265         WARN_ON_ONCE(event->ctx->parent_ctx);
3266
3267         mutex_lock(&event->child_mutex);
3268         /*
3269          * Event-type-independent attributes must be copied before event-type
3270          * modification, which will validate that final attributes match the
3271          * source attributes after all relevant attributes have been copied.
3272          */
3273         perf_event_modify_copy_attr(&event->attr, attr);
3274         err = func(event, attr);
3275         if (err)
3276                 goto out;
3277         list_for_each_entry(child, &event->child_list, child_list) {
3278                 perf_event_modify_copy_attr(&child->attr, attr);
3279                 err = func(child, attr);
3280                 if (err)
3281                         goto out;
3282         }
3283 out:
3284         mutex_unlock(&event->child_mutex);
3285         return err;
3286 }
3287
3288 static void ctx_sched_out(struct perf_event_context *ctx,
3289                           struct perf_cpu_context *cpuctx,
3290                           enum event_type_t event_type)
3291 {
3292         struct perf_event *event, *tmp;
3293         int is_active = ctx->is_active;
3294
3295         lockdep_assert_held(&ctx->lock);
3296
3297         if (likely(!ctx->nr_events)) {
3298                 /*
3299                  * See __perf_remove_from_context().
3300                  */
3301                 WARN_ON_ONCE(ctx->is_active);
3302                 if (ctx->task)
3303                         WARN_ON_ONCE(cpuctx->task_ctx);
3304                 return;
3305         }
3306
3307         /*
3308          * Always update time if it was set; not only when it changes.
3309          * Otherwise we can 'forget' to update time for any but the last
3310          * context we sched out. For example:
3311          *
3312          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3313          *   ctx_sched_out(.event_type = EVENT_PINNED)
3314          *
3315          * would only update time for the pinned events.
3316          */
3317         if (is_active & EVENT_TIME) {
3318                 /* update (and stop) ctx time */
3319                 update_context_time(ctx);
3320                 update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
3321                 /*
3322                  * CPU-release for the below ->is_active store,
3323                  * see __load_acquire() in perf_event_time_now()
3324                  */
3325                 barrier();
3326         }
3327
3328         ctx->is_active &= ~event_type;
3329         if (!(ctx->is_active & EVENT_ALL))
3330                 ctx->is_active = 0;
3331
3332         if (ctx->task) {
3333                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3334                 if (!ctx->is_active)
3335                         cpuctx->task_ctx = NULL;
3336         }
3337
3338         is_active ^= ctx->is_active; /* changed bits */
3339
3340         if (!ctx->nr_active || !(is_active & EVENT_ALL))
3341                 return;
3342
3343         perf_pmu_disable(ctx->pmu);
3344         if (is_active & EVENT_PINNED) {
3345                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3346                         group_sched_out(event, cpuctx, ctx);
3347         }
3348
3349         if (is_active & EVENT_FLEXIBLE) {
3350                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3351                         group_sched_out(event, cpuctx, ctx);
3352
3353                 /*
3354                  * Since we cleared EVENT_FLEXIBLE, also clear
3355                  * rotate_necessary, is will be reset by
3356                  * ctx_flexible_sched_in() when needed.
3357                  */
3358                 ctx->rotate_necessary = 0;
3359         }
3360         perf_pmu_enable(ctx->pmu);
3361 }
3362
3363 /*
3364  * Test whether two contexts are equivalent, i.e. whether they have both been
3365  * cloned from the same version of the same context.
3366  *
3367  * Equivalence is measured using a generation number in the context that is
3368  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3369  * and list_del_event().
3370  */
3371 static int context_equiv(struct perf_event_context *ctx1,
3372                          struct perf_event_context *ctx2)
3373 {
3374         lockdep_assert_held(&ctx1->lock);
3375         lockdep_assert_held(&ctx2->lock);
3376
3377         /* Pinning disables the swap optimization */
3378         if (ctx1->pin_count || ctx2->pin_count)
3379                 return 0;
3380
3381         /* If ctx1 is the parent of ctx2 */
3382         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3383                 return 1;
3384
3385         /* If ctx2 is the parent of ctx1 */
3386         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3387                 return 1;
3388
3389         /*
3390          * If ctx1 and ctx2 have the same parent; we flatten the parent
3391          * hierarchy, see perf_event_init_context().
3392          */
3393         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3394                         ctx1->parent_gen == ctx2->parent_gen)
3395                 return 1;
3396
3397         /* Unmatched */
3398         return 0;
3399 }
3400
3401 static void __perf_event_sync_stat(struct perf_event *event,
3402                                      struct perf_event *next_event)
3403 {
3404         u64 value;
3405
3406         if (!event->attr.inherit_stat)
3407                 return;
3408
3409         /*
3410          * Update the event value, we cannot use perf_event_read()
3411          * because we're in the middle of a context switch and have IRQs
3412          * disabled, which upsets smp_call_function_single(), however
3413          * we know the event must be on the current CPU, therefore we
3414          * don't need to use it.
3415          */
3416         if (event->state == PERF_EVENT_STATE_ACTIVE)
3417                 event->pmu->read(event);
3418
3419         perf_event_update_time(event);
3420
3421         /*
3422          * In order to keep per-task stats reliable we need to flip the event
3423          * values when we flip the contexts.
3424          */
3425         value = local64_read(&next_event->count);
3426         value = local64_xchg(&event->count, value);
3427         local64_set(&next_event->count, value);
3428
3429         swap(event->total_time_enabled, next_event->total_time_enabled);
3430         swap(event->total_time_running, next_event->total_time_running);
3431
3432         /*
3433          * Since we swizzled the values, update the user visible data too.
3434          */
3435         perf_event_update_userpage(event);
3436         perf_event_update_userpage(next_event);
3437 }
3438
3439 static void perf_event_sync_stat(struct perf_event_context *ctx,
3440                                    struct perf_event_context *next_ctx)
3441 {
3442         struct perf_event *event, *next_event;
3443
3444         if (!ctx->nr_stat)
3445                 return;
3446
3447         update_context_time(ctx);
3448
3449         event = list_first_entry(&ctx->event_list,
3450                                    struct perf_event, event_entry);
3451
3452         next_event = list_first_entry(&next_ctx->event_list,
3453                                         struct perf_event, event_entry);
3454
3455         while (&event->event_entry != &ctx->event_list &&
3456                &next_event->event_entry != &next_ctx->event_list) {
3457
3458                 __perf_event_sync_stat(event, next_event);
3459
3460                 event = list_next_entry(event, event_entry);
3461                 next_event = list_next_entry(next_event, event_entry);
3462         }
3463 }
3464
3465 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3466                                          struct task_struct *next)
3467 {
3468         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3469         struct perf_event_context *next_ctx;
3470         struct perf_event_context *parent, *next_parent;
3471         struct perf_cpu_context *cpuctx;
3472         int do_switch = 1;
3473         struct pmu *pmu;
3474
3475         if (likely(!ctx))
3476                 return;
3477
3478         pmu = ctx->pmu;
3479         cpuctx = __get_cpu_context(ctx);
3480         if (!cpuctx->task_ctx)
3481                 return;
3482
3483         rcu_read_lock();
3484         next_ctx = next->perf_event_ctxp[ctxn];
3485         if (!next_ctx)
3486                 goto unlock;
3487
3488         parent = rcu_dereference(ctx->parent_ctx);
3489         next_parent = rcu_dereference(next_ctx->parent_ctx);
3490
3491         /* If neither context have a parent context; they cannot be clones. */
3492         if (!parent && !next_parent)
3493                 goto unlock;
3494
3495         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3496                 /*
3497                  * Looks like the two contexts are clones, so we might be
3498                  * able to optimize the context switch.  We lock both
3499                  * contexts and check that they are clones under the
3500                  * lock (including re-checking that neither has been
3501                  * uncloned in the meantime).  It doesn't matter which
3502                  * order we take the locks because no other cpu could
3503                  * be trying to lock both of these tasks.
3504                  */
3505                 raw_spin_lock(&ctx->lock);
3506                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3507                 if (context_equiv(ctx, next_ctx)) {
3508
3509                         WRITE_ONCE(ctx->task, next);
3510                         WRITE_ONCE(next_ctx->task, task);
3511
3512                         perf_pmu_disable(pmu);
3513
3514                         if (cpuctx->sched_cb_usage && pmu->sched_task)
3515                                 pmu->sched_task(ctx, false);
3516
3517                         /*
3518                          * PMU specific parts of task perf context can require
3519                          * additional synchronization. As an example of such
3520                          * synchronization see implementation details of Intel
3521                          * LBR call stack data profiling;
3522                          */
3523                         if (pmu->swap_task_ctx)
3524                                 pmu->swap_task_ctx(ctx, next_ctx);
3525                         else
3526                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3527
3528                         perf_pmu_enable(pmu);
3529
3530                         /*
3531                          * RCU_INIT_POINTER here is safe because we've not
3532                          * modified the ctx and the above modification of
3533                          * ctx->task and ctx->task_ctx_data are immaterial
3534                          * since those values are always verified under
3535                          * ctx->lock which we're now holding.
3536                          */
3537                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3538                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3539
3540                         do_switch = 0;
3541
3542                         perf_event_sync_stat(ctx, next_ctx);
3543                 }
3544                 raw_spin_unlock(&next_ctx->lock);
3545                 raw_spin_unlock(&ctx->lock);
3546         }
3547 unlock:
3548         rcu_read_unlock();
3549
3550         if (do_switch) {
3551                 raw_spin_lock(&ctx->lock);
3552                 perf_pmu_disable(pmu);
3553
3554                 if (cpuctx->sched_cb_usage && pmu->sched_task)
3555                         pmu->sched_task(ctx, false);
3556                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3557
3558                 perf_pmu_enable(pmu);
3559                 raw_spin_unlock(&ctx->lock);
3560         }
3561 }
3562
3563 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3564
3565 void perf_sched_cb_dec(struct pmu *pmu)
3566 {
3567         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3568
3569         this_cpu_dec(perf_sched_cb_usages);
3570
3571         if (!--cpuctx->sched_cb_usage)
3572                 list_del(&cpuctx->sched_cb_entry);
3573 }
3574
3575
3576 void perf_sched_cb_inc(struct pmu *pmu)
3577 {
3578         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3579
3580         if (!cpuctx->sched_cb_usage++)
3581                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3582
3583         this_cpu_inc(perf_sched_cb_usages);
3584 }
3585
3586 /*
3587  * This function provides the context switch callback to the lower code
3588  * layer. It is invoked ONLY when the context switch callback is enabled.
3589  *
3590  * This callback is relevant even to per-cpu events; for example multi event
3591  * PEBS requires this to provide PID/TID information. This requires we flush
3592  * all queued PEBS records before we context switch to a new task.
3593  */
3594 static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3595 {
3596         struct pmu *pmu;
3597
3598         pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3599
3600         if (WARN_ON_ONCE(!pmu->sched_task))
3601                 return;
3602
3603         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3604         perf_pmu_disable(pmu);
3605
3606         pmu->sched_task(cpuctx->task_ctx, sched_in);
3607
3608         perf_pmu_enable(pmu);
3609         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3610 }
3611
3612 static void perf_pmu_sched_task(struct task_struct *prev,
3613                                 struct task_struct *next,
3614                                 bool sched_in)
3615 {
3616         struct perf_cpu_context *cpuctx;
3617
3618         if (prev == next)
3619                 return;
3620
3621         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3622                 /* will be handled in perf_event_context_sched_in/out */
3623                 if (cpuctx->task_ctx)
3624                         continue;
3625
3626                 __perf_pmu_sched_task(cpuctx, sched_in);
3627         }
3628 }
3629
3630 static void perf_event_switch(struct task_struct *task,
3631                               struct task_struct *next_prev, bool sched_in);
3632
3633 #define for_each_task_context_nr(ctxn)                                  \
3634         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3635
3636 /*
3637  * Called from scheduler to remove the events of the current task,
3638  * with interrupts disabled.
3639  *
3640  * We stop each event and update the event value in event->count.
3641  *
3642  * This does not protect us against NMI, but disable()
3643  * sets the disabled bit in the control field of event _before_
3644  * accessing the event control register. If a NMI hits, then it will
3645  * not restart the event.
3646  */
3647 void __perf_event_task_sched_out(struct task_struct *task,
3648                                  struct task_struct *next)
3649 {
3650         int ctxn;
3651
3652         if (__this_cpu_read(perf_sched_cb_usages))
3653                 perf_pmu_sched_task(task, next, false);
3654
3655         if (atomic_read(&nr_switch_events))
3656                 perf_event_switch(task, next, false);
3657
3658         for_each_task_context_nr(ctxn)
3659                 perf_event_context_sched_out(task, ctxn, next);
3660
3661         /*
3662          * if cgroup events exist on this CPU, then we need
3663          * to check if we have to switch out PMU state.
3664          * cgroup event are system-wide mode only
3665          */
3666         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3667                 perf_cgroup_sched_out(task, next);
3668 }
3669
3670 /*
3671  * Called with IRQs disabled
3672  */
3673 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3674                               enum event_type_t event_type)
3675 {
3676         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3677 }
3678
3679 static bool perf_less_group_idx(const void *l, const void *r)
3680 {
3681         const struct perf_event *le = *(const struct perf_event **)l;
3682         const struct perf_event *re = *(const struct perf_event **)r;
3683
3684         return le->group_index < re->group_index;
3685 }
3686
3687 static void swap_ptr(void *l, void *r)
3688 {
3689         void **lp = l, **rp = r;
3690
3691         swap(*lp, *rp);
3692 }
3693
3694 static const struct min_heap_callbacks perf_min_heap = {
3695         .elem_size = sizeof(struct perf_event *),
3696         .less = perf_less_group_idx,
3697         .swp = swap_ptr,
3698 };
3699
3700 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3701 {
3702         struct perf_event **itrs = heap->data;
3703
3704         if (event) {
3705                 itrs[heap->nr] = event;
3706                 heap->nr++;
3707         }
3708 }
3709
3710 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3711                                 struct perf_event_groups *groups, int cpu,
3712                                 int (*func)(struct perf_event *, void *),
3713                                 void *data)
3714 {
3715 #ifdef CONFIG_CGROUP_PERF
3716         struct cgroup_subsys_state *css = NULL;
3717 #endif
3718         /* Space for per CPU and/or any CPU event iterators. */
3719         struct perf_event *itrs[2];
3720         struct min_heap event_heap;
3721         struct perf_event **evt;
3722         int ret;
3723
3724         if (cpuctx) {
3725                 event_heap = (struct min_heap){
3726                         .data = cpuctx->heap,
3727                         .nr = 0,
3728                         .size = cpuctx->heap_size,
3729                 };
3730
3731                 lockdep_assert_held(&cpuctx->ctx.lock);
3732
3733 #ifdef CONFIG_CGROUP_PERF
3734                 if (cpuctx->cgrp)
3735                         css = &cpuctx->cgrp->css;
3736 #endif
3737         } else {
3738                 event_heap = (struct min_heap){
3739                         .data = itrs,
3740                         .nr = 0,
3741                         .size = ARRAY_SIZE(itrs),
3742                 };
3743                 /* Events not within a CPU context may be on any CPU. */
3744                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3745         }
3746         evt = event_heap.data;
3747
3748         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3749
3750 #ifdef CONFIG_CGROUP_PERF
3751         for (; css; css = css->parent)
3752                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3753 #endif
3754
3755         min_heapify_all(&event_heap, &perf_min_heap);
3756
3757         while (event_heap.nr) {
3758                 ret = func(*evt, data);
3759                 if (ret)
3760                         return ret;
3761
3762                 *evt = perf_event_groups_next(*evt);
3763                 if (*evt)
3764                         min_heapify(&event_heap, 0, &perf_min_heap);
3765                 else
3766                         min_heap_pop(&event_heap, &perf_min_heap);
3767         }
3768
3769         return 0;
3770 }
3771
3772 /*
3773  * Because the userpage is strictly per-event (there is no concept of context,
3774  * so there cannot be a context indirection), every userpage must be updated
3775  * when context time starts :-(
3776  *
3777  * IOW, we must not miss EVENT_TIME edges.
3778  */
3779 static inline bool event_update_userpage(struct perf_event *event)
3780 {
3781         if (likely(!atomic_read(&event->mmap_count)))
3782                 return false;
3783
3784         perf_event_update_time(event);
3785         perf_event_update_userpage(event);
3786
3787         return true;
3788 }
3789
3790 static inline void group_update_userpage(struct perf_event *group_event)
3791 {
3792         struct perf_event *event;
3793
3794         if (!event_update_userpage(group_event))
3795                 return;
3796
3797         for_each_sibling_event(event, group_event)
3798                 event_update_userpage(event);
3799 }
3800
3801 static int merge_sched_in(struct perf_event *event, void *data)
3802 {
3803         struct perf_event_context *ctx = event->ctx;
3804         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3805         int *can_add_hw = data;
3806
3807         if (event->state <= PERF_EVENT_STATE_OFF)
3808                 return 0;
3809
3810         if (!event_filter_match(event))
3811                 return 0;
3812
3813         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3814                 if (!group_sched_in(event, cpuctx, ctx))
3815                         list_add_tail(&event->active_list, get_event_list(event));
3816         }
3817
3818         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3819                 *can_add_hw = 0;
3820                 if (event->attr.pinned) {
3821                         perf_cgroup_event_disable(event, ctx);
3822                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3823                 } else {
3824                         ctx->rotate_necessary = 1;
3825                         perf_mux_hrtimer_restart(cpuctx);
3826                         group_update_userpage(event);
3827                 }
3828         }
3829
3830         return 0;
3831 }
3832
3833 static void
3834 ctx_pinned_sched_in(struct perf_event_context *ctx,
3835                     struct perf_cpu_context *cpuctx)
3836 {
3837         int can_add_hw = 1;
3838
3839         if (ctx != &cpuctx->ctx)
3840                 cpuctx = NULL;
3841
3842         visit_groups_merge(cpuctx, &ctx->pinned_groups,
3843                            smp_processor_id(),
3844                            merge_sched_in, &can_add_hw);
3845 }
3846
3847 static void
3848 ctx_flexible_sched_in(struct perf_event_context *ctx,
3849                       struct perf_cpu_context *cpuctx)
3850 {
3851         int can_add_hw = 1;
3852
3853         if (ctx != &cpuctx->ctx)
3854                 cpuctx = NULL;
3855
3856         visit_groups_merge(cpuctx, &ctx->flexible_groups,
3857                            smp_processor_id(),
3858                            merge_sched_in, &can_add_hw);
3859 }
3860
3861 static void
3862 ctx_sched_in(struct perf_event_context *ctx,
3863              struct perf_cpu_context *cpuctx,
3864              enum event_type_t event_type)
3865 {
3866         int is_active = ctx->is_active;
3867
3868         lockdep_assert_held(&ctx->lock);
3869
3870         if (likely(!ctx->nr_events))
3871                 return;
3872
3873         if (is_active ^ EVENT_TIME) {
3874                 /* start ctx time */
3875                 __update_context_time(ctx, false);
3876                 perf_cgroup_set_timestamp(cpuctx);
3877                 /*
3878                  * CPU-release for the below ->is_active store,
3879                  * see __load_acquire() in perf_event_time_now()
3880                  */
3881                 barrier();
3882         }
3883
3884         ctx->is_active |= (event_type | EVENT_TIME);
3885         if (ctx->task) {
3886                 if (!is_active)
3887                         cpuctx->task_ctx = ctx;
3888                 else
3889                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3890         }
3891
3892         is_active ^= ctx->is_active; /* changed bits */
3893
3894         /*
3895          * First go through the list and put on any pinned groups
3896          * in order to give them the best chance of going on.
3897          */
3898         if (is_active & EVENT_PINNED)
3899                 ctx_pinned_sched_in(ctx, cpuctx);
3900
3901         /* Then walk through the lower prio flexible groups */
3902         if (is_active & EVENT_FLEXIBLE)
3903                 ctx_flexible_sched_in(ctx, cpuctx);
3904 }
3905
3906 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3907                              enum event_type_t event_type)
3908 {
3909         struct perf_event_context *ctx = &cpuctx->ctx;
3910
3911         ctx_sched_in(ctx, cpuctx, event_type);
3912 }
3913
3914 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3915                                         struct task_struct *task)
3916 {
3917         struct perf_cpu_context *cpuctx;
3918         struct pmu *pmu;
3919
3920         cpuctx = __get_cpu_context(ctx);
3921
3922         /*
3923          * HACK: for HETEROGENEOUS the task context might have switched to a
3924          * different PMU, force (re)set the context,
3925          */
3926         pmu = ctx->pmu = cpuctx->ctx.pmu;
3927
3928         if (cpuctx->task_ctx == ctx) {
3929                 if (cpuctx->sched_cb_usage)
3930                         __perf_pmu_sched_task(cpuctx, true);
3931                 return;
3932         }
3933
3934         perf_ctx_lock(cpuctx, ctx);
3935         /*
3936          * We must check ctx->nr_events while holding ctx->lock, such
3937          * that we serialize against perf_install_in_context().
3938          */
3939         if (!ctx->nr_events)
3940                 goto unlock;
3941
3942         perf_pmu_disable(pmu);
3943         /*
3944          * We want to keep the following priority order:
3945          * cpu pinned (that don't need to move), task pinned,
3946          * cpu flexible, task flexible.
3947          *
3948          * However, if task's ctx is not carrying any pinned
3949          * events, no need to flip the cpuctx's events around.
3950          */
3951         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3952                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3953         perf_event_sched_in(cpuctx, ctx);
3954
3955         if (cpuctx->sched_cb_usage && pmu->sched_task)
3956                 pmu->sched_task(cpuctx->task_ctx, true);
3957
3958         perf_pmu_enable(pmu);
3959
3960 unlock:
3961         perf_ctx_unlock(cpuctx, ctx);
3962 }
3963
3964 /*
3965  * Called from scheduler to add the events of the current task
3966  * with interrupts disabled.
3967  *
3968  * We restore the event value and then enable it.
3969  *
3970  * This does not protect us against NMI, but enable()
3971  * sets the enabled bit in the control field of event _before_
3972  * accessing the event control register. If a NMI hits, then it will
3973  * keep the event running.
3974  */
3975 void __perf_event_task_sched_in(struct task_struct *prev,
3976                                 struct task_struct *task)
3977 {
3978         struct perf_event_context *ctx;
3979         int ctxn;
3980
3981         /*
3982          * If cgroup events exist on this CPU, then we need to check if we have
3983          * to switch in PMU state; cgroup event are system-wide mode only.
3984          *
3985          * Since cgroup events are CPU events, we must schedule these in before
3986          * we schedule in the task events.
3987          */
3988         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3989                 perf_cgroup_sched_in(prev, task);
3990
3991         for_each_task_context_nr(ctxn) {
3992                 ctx = task->perf_event_ctxp[ctxn];
3993                 if (likely(!ctx))
3994                         continue;
3995
3996                 perf_event_context_sched_in(ctx, task);
3997         }
3998
3999         if (atomic_read(&nr_switch_events))
4000                 perf_event_switch(task, prev, true);
4001
4002         if (__this_cpu_read(perf_sched_cb_usages))
4003                 perf_pmu_sched_task(prev, task, true);
4004 }
4005
4006 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
4007 {
4008         u64 frequency = event->attr.sample_freq;
4009         u64 sec = NSEC_PER_SEC;
4010         u64 divisor, dividend;
4011
4012         int count_fls, nsec_fls, frequency_fls, sec_fls;
4013
4014         count_fls = fls64(count);
4015         nsec_fls = fls64(nsec);
4016         frequency_fls = fls64(frequency);
4017         sec_fls = 30;
4018
4019         /*
4020          * We got @count in @nsec, with a target of sample_freq HZ
4021          * the target period becomes:
4022          *
4023          *             @count * 10^9
4024          * period = -------------------
4025          *          @nsec * sample_freq
4026          *
4027          */
4028
4029         /*
4030          * Reduce accuracy by one bit such that @a and @b converge
4031          * to a similar magnitude.
4032          */
4033 #define REDUCE_FLS(a, b)                \
4034 do {                                    \
4035         if (a##_fls > b##_fls) {        \
4036                 a >>= 1;                \
4037                 a##_fls--;              \
4038         } else {                        \
4039                 b >>= 1;                \
4040                 b##_fls--;              \
4041         }                               \
4042 } while (0)
4043
4044         /*
4045          * Reduce accuracy until either term fits in a u64, then proceed with
4046          * the other, so that finally we can do a u64/u64 division.
4047          */
4048         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
4049                 REDUCE_FLS(nsec, frequency);
4050                 REDUCE_FLS(sec, count);
4051         }
4052
4053         if (count_fls + sec_fls > 64) {
4054                 divisor = nsec * frequency;
4055
4056                 while (count_fls + sec_fls > 64) {
4057                         REDUCE_FLS(count, sec);
4058                         divisor >>= 1;
4059                 }
4060
4061                 dividend = count * sec;
4062         } else {
4063                 dividend = count * sec;
4064
4065                 while (nsec_fls + frequency_fls > 64) {
4066                         REDUCE_FLS(nsec, frequency);
4067                         dividend >>= 1;
4068                 }
4069
4070                 divisor = nsec * frequency;
4071         }
4072
4073         if (!divisor)
4074                 return dividend;
4075
4076         return div64_u64(dividend, divisor);
4077 }
4078
4079 static DEFINE_PER_CPU(int, perf_throttled_count);
4080 static DEFINE_PER_CPU(u64, perf_throttled_seq);
4081
4082 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
4083 {
4084         struct hw_perf_event *hwc = &event->hw;
4085         s64 period, sample_period;
4086         s64 delta;
4087
4088         period = perf_calculate_period(event, nsec, count);
4089
4090         delta = (s64)(period - hwc->sample_period);
4091         delta = (delta + 7) / 8; /* low pass filter */
4092
4093         sample_period = hwc->sample_period + delta;
4094
4095         if (!sample_period)
4096                 sample_period = 1;
4097
4098         hwc->sample_period = sample_period;
4099
4100         if (local64_read(&hwc->period_left) > 8*sample_period) {
4101                 if (disable)
4102                         event->pmu->stop(event, PERF_EF_UPDATE);
4103
4104                 local64_set(&hwc->period_left, 0);
4105
4106                 if (disable)
4107                         event->pmu->start(event, PERF_EF_RELOAD);
4108         }
4109 }
4110
4111 /*
4112  * combine freq adjustment with unthrottling to avoid two passes over the
4113  * events. At the same time, make sure, having freq events does not change
4114  * the rate of unthrottling as that would introduce bias.
4115  */
4116 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4117                                            int needs_unthr)
4118 {
4119         struct perf_event *event;
4120         struct hw_perf_event *hwc;
4121         u64 now, period = TICK_NSEC;
4122         s64 delta;
4123
4124         /*
4125          * only need to iterate over all events iff:
4126          * - context have events in frequency mode (needs freq adjust)
4127          * - there are events to unthrottle on this cpu
4128          */
4129         if (!(ctx->nr_freq || needs_unthr))
4130                 return;
4131
4132         raw_spin_lock(&ctx->lock);
4133         perf_pmu_disable(ctx->pmu);
4134
4135         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4136                 if (event->state != PERF_EVENT_STATE_ACTIVE)
4137                         continue;
4138
4139                 if (!event_filter_match(event))
4140                         continue;
4141
4142                 perf_pmu_disable(event->pmu);
4143
4144                 hwc = &event->hw;
4145
4146                 if (hwc->interrupts == MAX_INTERRUPTS) {
4147                         hwc->interrupts = 0;
4148                         perf_log_throttle(event, 1);
4149                         event->pmu->start(event, 0);
4150                 }
4151
4152                 if (!event->attr.freq || !event->attr.sample_freq)
4153                         goto next;
4154
4155                 /*
4156                  * stop the event and update event->count
4157                  */
4158                 event->pmu->stop(event, PERF_EF_UPDATE);
4159
4160                 now = local64_read(&event->count);
4161                 delta = now - hwc->freq_count_stamp;
4162                 hwc->freq_count_stamp = now;
4163
4164                 /*
4165                  * restart the event
4166                  * reload only if value has changed
4167                  * we have stopped the event so tell that
4168                  * to perf_adjust_period() to avoid stopping it
4169                  * twice.
4170                  */
4171                 if (delta > 0)
4172                         perf_adjust_period(event, period, delta, false);
4173
4174                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4175         next:
4176                 perf_pmu_enable(event->pmu);
4177         }
4178
4179         perf_pmu_enable(ctx->pmu);
4180         raw_spin_unlock(&ctx->lock);
4181 }
4182
4183 /*
4184  * Move @event to the tail of the @ctx's elegible events.
4185  */
4186 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4187 {
4188         /*
4189          * Rotate the first entry last of non-pinned groups. Rotation might be
4190          * disabled by the inheritance code.
4191          */
4192         if (ctx->rotate_disable)
4193                 return;
4194
4195         perf_event_groups_delete(&ctx->flexible_groups, event);
4196         perf_event_groups_insert(&ctx->flexible_groups, event);
4197 }
4198
4199 /* pick an event from the flexible_groups to rotate */
4200 static inline struct perf_event *
4201 ctx_event_to_rotate(struct perf_event_context *ctx)
4202 {
4203         struct perf_event *event;
4204
4205         /* pick the first active flexible event */
4206         event = list_first_entry_or_null(&ctx->flexible_active,
4207                                          struct perf_event, active_list);
4208
4209         /* if no active flexible event, pick the first event */
4210         if (!event) {
4211                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4212                                       typeof(*event), group_node);
4213         }
4214
4215         /*
4216          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4217          * finds there are unschedulable events, it will set it again.
4218          */
4219         ctx->rotate_necessary = 0;
4220
4221         return event;
4222 }
4223
4224 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4225 {
4226         struct perf_event *cpu_event = NULL, *task_event = NULL;
4227         struct perf_event_context *task_ctx = NULL;
4228         int cpu_rotate, task_rotate;
4229
4230         /*
4231          * Since we run this from IRQ context, nobody can install new
4232          * events, thus the event count values are stable.
4233          */
4234
4235         cpu_rotate = cpuctx->ctx.rotate_necessary;
4236         task_ctx = cpuctx->task_ctx;
4237         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4238
4239         if (!(cpu_rotate || task_rotate))
4240                 return false;
4241
4242         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4243         perf_pmu_disable(cpuctx->ctx.pmu);
4244
4245         if (task_rotate)
4246                 task_event = ctx_event_to_rotate(task_ctx);
4247         if (cpu_rotate)
4248                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4249
4250         /*
4251          * As per the order given at ctx_resched() first 'pop' task flexible
4252          * and then, if needed CPU flexible.
4253          */
4254         if (task_event || (task_ctx && cpu_event))
4255                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4256         if (cpu_event)
4257                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4258
4259         if (task_event)
4260                 rotate_ctx(task_ctx, task_event);
4261         if (cpu_event)
4262                 rotate_ctx(&cpuctx->ctx, cpu_event);
4263
4264         perf_event_sched_in(cpuctx, task_ctx);
4265
4266         perf_pmu_enable(cpuctx->ctx.pmu);
4267         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4268
4269         return true;
4270 }
4271
4272 void perf_event_task_tick(void)
4273 {
4274         struct list_head *head = this_cpu_ptr(&active_ctx_list);
4275         struct perf_event_context *ctx, *tmp;
4276         int throttled;
4277
4278         lockdep_assert_irqs_disabled();
4279
4280         __this_cpu_inc(perf_throttled_seq);
4281         throttled = __this_cpu_xchg(perf_throttled_count, 0);
4282         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4283
4284         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4285                 perf_adjust_freq_unthr_context(ctx, throttled);
4286 }
4287
4288 static int event_enable_on_exec(struct perf_event *event,
4289                                 struct perf_event_context *ctx)
4290 {
4291         if (!event->attr.enable_on_exec)
4292                 return 0;
4293
4294         event->attr.enable_on_exec = 0;
4295         if (event->state >= PERF_EVENT_STATE_INACTIVE)
4296                 return 0;
4297
4298         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4299
4300         return 1;
4301 }
4302
4303 /*
4304  * Enable all of a task's events that have been marked enable-on-exec.
4305  * This expects task == current.
4306  */
4307 static void perf_event_enable_on_exec(int ctxn)
4308 {
4309         struct perf_event_context *ctx, *clone_ctx = NULL;
4310         enum event_type_t event_type = 0;
4311         struct perf_cpu_context *cpuctx;
4312         struct perf_event *event;
4313         unsigned long flags;
4314         int enabled = 0;
4315
4316         local_irq_save(flags);
4317         ctx = current->perf_event_ctxp[ctxn];
4318         if (!ctx || !ctx->nr_events)
4319                 goto out;
4320
4321         cpuctx = __get_cpu_context(ctx);
4322         perf_ctx_lock(cpuctx, ctx);
4323         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4324         list_for_each_entry(event, &ctx->event_list, event_entry) {
4325                 enabled |= event_enable_on_exec(event, ctx);
4326                 event_type |= get_event_type(event);
4327         }
4328
4329         /*
4330          * Unclone and reschedule this context if we enabled any event.
4331          */
4332         if (enabled) {
4333                 clone_ctx = unclone_ctx(ctx);
4334                 ctx_resched(cpuctx, ctx, event_type);
4335         } else {
4336                 ctx_sched_in(ctx, cpuctx, EVENT_TIME);
4337         }
4338         perf_ctx_unlock(cpuctx, ctx);
4339
4340 out:
4341         local_irq_restore(flags);
4342
4343         if (clone_ctx)
4344                 put_ctx(clone_ctx);
4345 }
4346
4347 static void perf_remove_from_owner(struct perf_event *event);
4348 static void perf_event_exit_event(struct perf_event *event,
4349                                   struct perf_event_context *ctx);
4350
4351 /*
4352  * Removes all events from the current task that have been marked
4353  * remove-on-exec, and feeds their values back to parent events.
4354  */
4355 static void perf_event_remove_on_exec(int ctxn)
4356 {
4357         struct perf_event_context *ctx, *clone_ctx = NULL;
4358         struct perf_event *event, *next;
4359         LIST_HEAD(free_list);
4360         unsigned long flags;
4361         bool modified = false;
4362
4363         ctx = perf_pin_task_context(current, ctxn);
4364         if (!ctx)
4365                 return;
4366
4367         mutex_lock(&ctx->mutex);
4368
4369         if (WARN_ON_ONCE(ctx->task != current))
4370                 goto unlock;
4371
4372         list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
4373                 if (!event->attr.remove_on_exec)
4374                         continue;
4375
4376                 if (!is_kernel_event(event))
4377                         perf_remove_from_owner(event);
4378
4379                 modified = true;
4380
4381                 perf_event_exit_event(event, ctx);
4382         }
4383
4384         raw_spin_lock_irqsave(&ctx->lock, flags);
4385         if (modified)
4386                 clone_ctx = unclone_ctx(ctx);
4387         --ctx->pin_count;
4388         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4389
4390 unlock:
4391         mutex_unlock(&ctx->mutex);
4392
4393         put_ctx(ctx);
4394         if (clone_ctx)
4395                 put_ctx(clone_ctx);
4396 }
4397
4398 struct perf_read_data {
4399         struct perf_event *event;
4400         bool group;
4401         int ret;
4402 };
4403
4404 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4405 {
4406         u16 local_pkg, event_pkg;
4407
4408         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4409                 int local_cpu = smp_processor_id();
4410
4411                 event_pkg = topology_physical_package_id(event_cpu);
4412                 local_pkg = topology_physical_package_id(local_cpu);
4413
4414                 if (event_pkg == local_pkg)
4415                         return local_cpu;
4416         }
4417
4418         return event_cpu;
4419 }
4420
4421 /*
4422  * Cross CPU call to read the hardware event
4423  */
4424 static void __perf_event_read(void *info)
4425 {
4426         struct perf_read_data *data = info;
4427         struct perf_event *sub, *event = data->event;
4428         struct perf_event_context *ctx = event->ctx;
4429         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4430         struct pmu *pmu = event->pmu;
4431
4432         /*
4433          * If this is a task context, we need to check whether it is
4434          * the current task context of this cpu.  If not it has been
4435          * scheduled out before the smp call arrived.  In that case
4436          * event->count would have been updated to a recent sample
4437          * when the event was scheduled out.
4438          */
4439         if (ctx->task && cpuctx->task_ctx != ctx)
4440                 return;
4441
4442         raw_spin_lock(&ctx->lock);
4443         if (ctx->is_active & EVENT_TIME) {
4444                 update_context_time(ctx);
4445                 update_cgrp_time_from_event(event);
4446         }
4447
4448         perf_event_update_time(event);
4449         if (data->group)
4450                 perf_event_update_sibling_time(event);
4451
4452         if (event->state != PERF_EVENT_STATE_ACTIVE)
4453                 goto unlock;
4454
4455         if (!data->group) {
4456                 pmu->read(event);
4457                 data->ret = 0;
4458                 goto unlock;
4459         }
4460
4461         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4462
4463         pmu->read(event);
4464
4465         for_each_sibling_event(sub, event) {
4466                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4467                         /*
4468                          * Use sibling's PMU rather than @event's since
4469                          * sibling could be on different (eg: software) PMU.
4470                          */
4471                         sub->pmu->read(sub);
4472                 }
4473         }
4474
4475         data->ret = pmu->commit_txn(pmu);
4476
4477 unlock:
4478         raw_spin_unlock(&ctx->lock);
4479 }
4480
4481 static inline u64 perf_event_count(struct perf_event *event)
4482 {
4483         return local64_read(&event->count) + atomic64_read(&event->child_count);
4484 }
4485
4486 static void calc_timer_values(struct perf_event *event,
4487                                 u64 *now,
4488                                 u64 *enabled,
4489                                 u64 *running)
4490 {
4491         u64 ctx_time;
4492
4493         *now = perf_clock();
4494         ctx_time = perf_event_time_now(event, *now);
4495         __perf_update_times(event, ctx_time, enabled, running);
4496 }
4497
4498 /*
4499  * NMI-safe method to read a local event, that is an event that
4500  * is:
4501  *   - either for the current task, or for this CPU
4502  *   - does not have inherit set, for inherited task events
4503  *     will not be local and we cannot read them atomically
4504  *   - must not have a pmu::count method
4505  */
4506 int perf_event_read_local(struct perf_event *event, u64 *value,
4507                           u64 *enabled, u64 *running)
4508 {
4509         unsigned long flags;
4510         int ret = 0;
4511
4512         /*
4513          * Disabling interrupts avoids all counter scheduling (context
4514          * switches, timer based rotation and IPIs).
4515          */
4516         local_irq_save(flags);
4517
4518         /*
4519          * It must not be an event with inherit set, we cannot read
4520          * all child counters from atomic context.
4521          */
4522         if (event->attr.inherit) {
4523                 ret = -EOPNOTSUPP;
4524                 goto out;
4525         }
4526
4527         /* If this is a per-task event, it must be for current */
4528         if ((event->attach_state & PERF_ATTACH_TASK) &&
4529             event->hw.target != current) {
4530                 ret = -EINVAL;
4531                 goto out;
4532         }
4533
4534         /* If this is a per-CPU event, it must be for this CPU */
4535         if (!(event->attach_state & PERF_ATTACH_TASK) &&
4536             event->cpu != smp_processor_id()) {
4537                 ret = -EINVAL;
4538                 goto out;
4539         }
4540
4541         /* If this is a pinned event it must be running on this CPU */
4542         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4543                 ret = -EBUSY;
4544                 goto out;
4545         }
4546
4547         /*
4548          * If the event is currently on this CPU, its either a per-task event,
4549          * or local to this CPU. Furthermore it means its ACTIVE (otherwise
4550          * oncpu == -1).
4551          */
4552         if (event->oncpu == smp_processor_id())
4553                 event->pmu->read(event);
4554
4555         *value = local64_read(&event->count);
4556         if (enabled || running) {
4557                 u64 __enabled, __running, __now;;
4558
4559                 calc_timer_values(event, &__now, &__enabled, &__running);
4560                 if (enabled)
4561                         *enabled = __enabled;
4562                 if (running)
4563                         *running = __running;
4564         }
4565 out:
4566         local_irq_restore(flags);
4567
4568         return ret;
4569 }
4570
4571 static int perf_event_read(struct perf_event *event, bool group)
4572 {
4573         enum perf_event_state state = READ_ONCE(event->state);
4574         int event_cpu, ret = 0;
4575
4576         /*
4577          * If event is enabled and currently active on a CPU, update the
4578          * value in the event structure:
4579          */
4580 again:
4581         if (state == PERF_EVENT_STATE_ACTIVE) {
4582                 struct perf_read_data data;
4583
4584                 /*
4585                  * Orders the ->state and ->oncpu loads such that if we see
4586                  * ACTIVE we must also see the right ->oncpu.
4587                  *
4588                  * Matches the smp_wmb() from event_sched_in().
4589                  */
4590                 smp_rmb();
4591
4592                 event_cpu = READ_ONCE(event->oncpu);
4593                 if ((unsigned)event_cpu >= nr_cpu_ids)
4594                         return 0;
4595
4596                 data = (struct perf_read_data){
4597                         .event = event,
4598                         .group = group,
4599                         .ret = 0,
4600                 };
4601
4602                 preempt_disable();
4603                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4604
4605                 /*
4606                  * Purposely ignore the smp_call_function_single() return
4607                  * value.
4608                  *
4609                  * If event_cpu isn't a valid CPU it means the event got
4610                  * scheduled out and that will have updated the event count.
4611                  *
4612                  * Therefore, either way, we'll have an up-to-date event count
4613                  * after this.
4614                  */
4615                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4616                 preempt_enable();
4617                 ret = data.ret;
4618
4619         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4620                 struct perf_event_context *ctx = event->ctx;
4621                 unsigned long flags;
4622
4623                 raw_spin_lock_irqsave(&ctx->lock, flags);
4624                 state = event->state;
4625                 if (state != PERF_EVENT_STATE_INACTIVE) {
4626                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4627                         goto again;
4628                 }
4629
4630                 /*
4631                  * May read while context is not active (e.g., thread is
4632                  * blocked), in that case we cannot update context time
4633                  */
4634                 if (ctx->is_active & EVENT_TIME) {
4635                         update_context_time(ctx);
4636                         update_cgrp_time_from_event(event);
4637                 }
4638
4639                 perf_event_update_time(event);
4640                 if (group)
4641                         perf_event_update_sibling_time(event);
4642                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4643         }
4644
4645         return ret;
4646 }
4647
4648 /*
4649  * Initialize the perf_event context in a task_struct:
4650  */
4651 static void __perf_event_init_context(struct perf_event_context *ctx)
4652 {
4653         raw_spin_lock_init(&ctx->lock);
4654         mutex_init(&ctx->mutex);
4655         INIT_LIST_HEAD(&ctx->active_ctx_list);
4656         perf_event_groups_init(&ctx->pinned_groups);
4657         perf_event_groups_init(&ctx->flexible_groups);
4658         INIT_LIST_HEAD(&ctx->event_list);
4659         INIT_LIST_HEAD(&ctx->pinned_active);
4660         INIT_LIST_HEAD(&ctx->flexible_active);
4661         refcount_set(&ctx->refcount, 1);
4662 }
4663
4664 static struct perf_event_context *
4665 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4666 {
4667         struct perf_event_context *ctx;
4668
4669         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4670         if (!ctx)
4671                 return NULL;
4672
4673         __perf_event_init_context(ctx);
4674         if (task)
4675                 ctx->task = get_task_struct(task);
4676         ctx->pmu = pmu;
4677
4678         return ctx;
4679 }
4680
4681 static struct task_struct *
4682 find_lively_task_by_vpid(pid_t vpid)
4683 {
4684         struct task_struct *task;
4685
4686         rcu_read_lock();
4687         if (!vpid)
4688                 task = current;
4689         else
4690                 task = find_task_by_vpid(vpid);
4691         if (task)
4692                 get_task_struct(task);
4693         rcu_read_unlock();
4694
4695         if (!task)
4696                 return ERR_PTR(-ESRCH);
4697
4698         return task;
4699 }
4700
4701 /*
4702  * Returns a matching context with refcount and pincount.
4703  */
4704 static struct perf_event_context *
4705 find_get_context(struct pmu *pmu, struct task_struct *task,
4706                 struct perf_event *event)
4707 {
4708         struct perf_event_context *ctx, *clone_ctx = NULL;
4709         struct perf_cpu_context *cpuctx;
4710         void *task_ctx_data = NULL;
4711         unsigned long flags;
4712         int ctxn, err;
4713         int cpu = event->cpu;
4714
4715         if (!task) {
4716                 /* Must be root to operate on a CPU event: */
4717                 err = perf_allow_cpu(&event->attr);
4718                 if (err)
4719                         return ERR_PTR(err);
4720
4721                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4722                 ctx = &cpuctx->ctx;
4723                 get_ctx(ctx);
4724                 raw_spin_lock_irqsave(&ctx->lock, flags);
4725                 ++ctx->pin_count;
4726                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4727
4728                 return ctx;
4729         }
4730
4731         err = -EINVAL;
4732         ctxn = pmu->task_ctx_nr;
4733         if (ctxn < 0)
4734                 goto errout;
4735
4736         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4737                 task_ctx_data = alloc_task_ctx_data(pmu);
4738                 if (!task_ctx_data) {
4739                         err = -ENOMEM;
4740                         goto errout;
4741                 }
4742         }
4743
4744 retry:
4745         ctx = perf_lock_task_context(task, ctxn, &flags);
4746         if (ctx) {
4747                 clone_ctx = unclone_ctx(ctx);
4748                 ++ctx->pin_count;
4749
4750                 if (task_ctx_data && !ctx->task_ctx_data) {
4751                         ctx->task_ctx_data = task_ctx_data;
4752                         task_ctx_data = NULL;
4753                 }
4754                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4755
4756                 if (clone_ctx)
4757                         put_ctx(clone_ctx);
4758         } else {
4759                 ctx = alloc_perf_context(pmu, task);
4760                 err = -ENOMEM;
4761                 if (!ctx)
4762                         goto errout;
4763
4764                 if (task_ctx_data) {
4765                         ctx->task_ctx_data = task_ctx_data;
4766                         task_ctx_data = NULL;
4767                 }
4768
4769                 err = 0;
4770                 mutex_lock(&task->perf_event_mutex);
4771                 /*
4772                  * If it has already passed perf_event_exit_task().
4773                  * we must see PF_EXITING, it takes this mutex too.
4774                  */
4775                 if (task->flags & PF_EXITING)
4776                         err = -ESRCH;
4777                 else if (task->perf_event_ctxp[ctxn])
4778                         err = -EAGAIN;
4779                 else {
4780                         get_ctx(ctx);
4781                         ++ctx->pin_count;
4782                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4783                 }
4784                 mutex_unlock(&task->perf_event_mutex);
4785
4786                 if (unlikely(err)) {
4787                         put_ctx(ctx);
4788
4789                         if (err == -EAGAIN)
4790                                 goto retry;
4791                         goto errout;
4792                 }
4793         }
4794
4795         free_task_ctx_data(pmu, task_ctx_data);
4796         return ctx;
4797
4798 errout:
4799         free_task_ctx_data(pmu, task_ctx_data);
4800         return ERR_PTR(err);
4801 }
4802
4803 static void perf_event_free_filter(struct perf_event *event);
4804
4805 static void free_event_rcu(struct rcu_head *head)
4806 {
4807         struct perf_event *event;
4808
4809         event = container_of(head, struct perf_event, rcu_head);
4810         if (event->ns)
4811                 put_pid_ns(event->ns);
4812         perf_event_free_filter(event);
4813         kmem_cache_free(perf_event_cache, event);
4814 }
4815
4816 static void ring_buffer_attach(struct perf_event *event,
4817                                struct perf_buffer *rb);
4818
4819 static void detach_sb_event(struct perf_event *event)
4820 {
4821         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4822
4823         raw_spin_lock(&pel->lock);
4824         list_del_rcu(&event->sb_list);
4825         raw_spin_unlock(&pel->lock);
4826 }
4827
4828 static bool is_sb_event(struct perf_event *event)
4829 {
4830         struct perf_event_attr *attr = &event->attr;
4831
4832         if (event->parent)
4833                 return false;
4834
4835         if (event->attach_state & PERF_ATTACH_TASK)
4836                 return false;
4837
4838         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4839             attr->comm || attr->comm_exec ||
4840             attr->task || attr->ksymbol ||
4841             attr->context_switch || attr->text_poke ||
4842             attr->bpf_event)
4843                 return true;
4844         return false;
4845 }
4846
4847 static void unaccount_pmu_sb_event(struct perf_event *event)
4848 {
4849         if (is_sb_event(event))
4850                 detach_sb_event(event);
4851 }
4852
4853 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4854 {
4855         if (event->parent)
4856                 return;
4857
4858         if (is_cgroup_event(event))
4859                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4860 }
4861
4862 #ifdef CONFIG_NO_HZ_FULL
4863 static DEFINE_SPINLOCK(nr_freq_lock);
4864 #endif
4865
4866 static void unaccount_freq_event_nohz(void)
4867 {
4868 #ifdef CONFIG_NO_HZ_FULL
4869         spin_lock(&nr_freq_lock);
4870         if (atomic_dec_and_test(&nr_freq_events))
4871                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4872         spin_unlock(&nr_freq_lock);
4873 #endif
4874 }
4875
4876 static void unaccount_freq_event(void)
4877 {
4878         if (tick_nohz_full_enabled())
4879                 unaccount_freq_event_nohz();
4880         else
4881                 atomic_dec(&nr_freq_events);
4882 }
4883
4884 static void unaccount_event(struct perf_event *event)
4885 {
4886         bool dec = false;
4887
4888         if (event->parent)
4889                 return;
4890
4891         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4892                 dec = true;
4893         if (event->attr.mmap || event->attr.mmap_data)
4894                 atomic_dec(&nr_mmap_events);
4895         if (event->attr.build_id)
4896                 atomic_dec(&nr_build_id_events);
4897         if (event->attr.comm)
4898                 atomic_dec(&nr_comm_events);
4899         if (event->attr.namespaces)
4900                 atomic_dec(&nr_namespaces_events);
4901         if (event->attr.cgroup)
4902                 atomic_dec(&nr_cgroup_events);
4903         if (event->attr.task)
4904                 atomic_dec(&nr_task_events);
4905         if (event->attr.freq)
4906                 unaccount_freq_event();
4907         if (event->attr.context_switch) {
4908                 dec = true;
4909                 atomic_dec(&nr_switch_events);
4910         }
4911         if (is_cgroup_event(event))
4912                 dec = true;
4913         if (has_branch_stack(event))
4914                 dec = true;
4915         if (event->attr.ksymbol)
4916                 atomic_dec(&nr_ksymbol_events);
4917         if (event->attr.bpf_event)
4918                 atomic_dec(&nr_bpf_events);
4919         if (event->attr.text_poke)
4920                 atomic_dec(&nr_text_poke_events);
4921
4922         if (dec) {
4923                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4924                         schedule_delayed_work(&perf_sched_work, HZ);
4925         }
4926
4927         unaccount_event_cpu(event, event->cpu);
4928
4929         unaccount_pmu_sb_event(event);
4930 }
4931
4932 static void perf_sched_delayed(struct work_struct *work)
4933 {
4934         mutex_lock(&perf_sched_mutex);
4935         if (atomic_dec_and_test(&perf_sched_count))
4936                 static_branch_disable(&perf_sched_events);
4937         mutex_unlock(&perf_sched_mutex);
4938 }
4939
4940 /*
4941  * The following implement mutual exclusion of events on "exclusive" pmus
4942  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4943  * at a time, so we disallow creating events that might conflict, namely:
4944  *
4945  *  1) cpu-wide events in the presence of per-task events,
4946  *  2) per-task events in the presence of cpu-wide events,
4947  *  3) two matching events on the same context.
4948  *
4949  * The former two cases are handled in the allocation path (perf_event_alloc(),
4950  * _free_event()), the latter -- before the first perf_install_in_context().
4951  */
4952 static int exclusive_event_init(struct perf_event *event)
4953 {
4954         struct pmu *pmu = event->pmu;
4955
4956         if (!is_exclusive_pmu(pmu))
4957                 return 0;
4958
4959         /*
4960          * Prevent co-existence of per-task and cpu-wide events on the
4961          * same exclusive pmu.
4962          *
4963          * Negative pmu::exclusive_cnt means there are cpu-wide
4964          * events on this "exclusive" pmu, positive means there are
4965          * per-task events.
4966          *
4967          * Since this is called in perf_event_alloc() path, event::ctx
4968          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4969          * to mean "per-task event", because unlike other attach states it
4970          * never gets cleared.
4971          */
4972         if (event->attach_state & PERF_ATTACH_TASK) {
4973                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4974                         return -EBUSY;
4975         } else {
4976                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4977                         return -EBUSY;
4978         }
4979
4980         return 0;
4981 }
4982
4983 static void exclusive_event_destroy(struct perf_event *event)
4984 {
4985         struct pmu *pmu = event->pmu;
4986
4987         if (!is_exclusive_pmu(pmu))
4988                 return;
4989
4990         /* see comment in exclusive_event_init() */
4991         if (event->attach_state & PERF_ATTACH_TASK)
4992                 atomic_dec(&pmu->exclusive_cnt);
4993         else
4994                 atomic_inc(&pmu->exclusive_cnt);
4995 }
4996
4997 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4998 {
4999         if ((e1->pmu == e2->pmu) &&
5000             (e1->cpu == e2->cpu ||
5001              e1->cpu == -1 ||
5002              e2->cpu == -1))
5003                 return true;
5004         return false;
5005 }
5006
5007 static bool exclusive_event_installable(struct perf_event *event,
5008                                         struct perf_event_context *ctx)
5009 {
5010         struct perf_event *iter_event;
5011         struct pmu *pmu = event->pmu;
5012
5013         lockdep_assert_held(&ctx->mutex);
5014
5015         if (!is_exclusive_pmu(pmu))
5016                 return true;
5017
5018         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
5019                 if (exclusive_event_match(iter_event, event))
5020                         return false;
5021         }
5022
5023         return true;
5024 }
5025
5026 static void perf_addr_filters_splice(struct perf_event *event,
5027                                        struct list_head *head);
5028
5029 static void _free_event(struct perf_event *event)
5030 {
5031         irq_work_sync(&event->pending);
5032
5033         unaccount_event(event);
5034
5035         security_perf_event_free(event);
5036
5037         if (event->rb) {
5038                 /*
5039                  * Can happen when we close an event with re-directed output.
5040                  *
5041                  * Since we have a 0 refcount, perf_mmap_close() will skip
5042                  * over us; possibly making our ring_buffer_put() the last.
5043                  */
5044                 mutex_lock(&event->mmap_mutex);
5045                 ring_buffer_attach(event, NULL);
5046                 mutex_unlock(&event->mmap_mutex);
5047         }
5048
5049         if (is_cgroup_event(event))
5050                 perf_detach_cgroup(event);
5051
5052         if (!event->parent) {
5053                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
5054                         put_callchain_buffers();
5055         }
5056
5057         perf_event_free_bpf_prog(event);
5058         perf_addr_filters_splice(event, NULL);
5059         kfree(event->addr_filter_ranges);
5060
5061         if (event->destroy)
5062                 event->destroy(event);
5063
5064         /*
5065          * Must be after ->destroy(), due to uprobe_perf_close() using
5066          * hw.target.
5067          */
5068         if (event->hw.target)
5069                 put_task_struct(event->hw.target);
5070
5071         /*
5072          * perf_event_free_task() relies on put_ctx() being 'last', in particular
5073          * all task references must be cleaned up.
5074          */
5075         if (event->ctx)
5076                 put_ctx(event->ctx);
5077
5078         exclusive_event_destroy(event);
5079         module_put(event->pmu->module);
5080
5081         call_rcu(&event->rcu_head, free_event_rcu);
5082 }
5083
5084 /*
5085  * Used to free events which have a known refcount of 1, such as in error paths
5086  * where the event isn't exposed yet and inherited events.
5087  */
5088 static void free_event(struct perf_event *event)
5089 {
5090         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
5091                                 "unexpected event refcount: %ld; ptr=%p\n",
5092                                 atomic_long_read(&event->refcount), event)) {
5093                 /* leak to avoid use-after-free */
5094                 return;
5095         }
5096
5097         _free_event(event);
5098 }
5099
5100 /*
5101  * Remove user event from the owner task.
5102  */
5103 static void perf_remove_from_owner(struct perf_event *event)
5104 {
5105         struct task_struct *owner;
5106
5107         rcu_read_lock();
5108         /*
5109          * Matches the smp_store_release() in perf_event_exit_task(). If we
5110          * observe !owner it means the list deletion is complete and we can
5111          * indeed free this event, otherwise we need to serialize on
5112          * owner->perf_event_mutex.
5113          */
5114         owner = READ_ONCE(event->owner);
5115         if (owner) {
5116                 /*
5117                  * Since delayed_put_task_struct() also drops the last
5118                  * task reference we can safely take a new reference
5119                  * while holding the rcu_read_lock().
5120                  */
5121                 get_task_struct(owner);
5122         }
5123         rcu_read_unlock();
5124
5125         if (owner) {
5126                 /*
5127                  * If we're here through perf_event_exit_task() we're already
5128                  * holding ctx->mutex which would be an inversion wrt. the
5129                  * normal lock order.
5130                  *
5131                  * However we can safely take this lock because its the child
5132                  * ctx->mutex.
5133                  */
5134                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
5135
5136                 /*
5137                  * We have to re-check the event->owner field, if it is cleared
5138                  * we raced with perf_event_exit_task(), acquiring the mutex
5139                  * ensured they're done, and we can proceed with freeing the
5140                  * event.
5141                  */
5142                 if (event->owner) {
5143                         list_del_init(&event->owner_entry);
5144                         smp_store_release(&event->owner, NULL);
5145                 }
5146                 mutex_unlock(&owner->perf_event_mutex);
5147                 put_task_struct(owner);
5148         }
5149 }
5150
5151 static void put_event(struct perf_event *event)
5152 {
5153         if (!atomic_long_dec_and_test(&event->refcount))
5154                 return;
5155
5156         _free_event(event);
5157 }
5158
5159 /*
5160  * Kill an event dead; while event:refcount will preserve the event
5161  * object, it will not preserve its functionality. Once the last 'user'
5162  * gives up the object, we'll destroy the thing.
5163  */
5164 int perf_event_release_kernel(struct perf_event *event)
5165 {
5166         struct perf_event_context *ctx = event->ctx;
5167         struct perf_event *child, *tmp;
5168         LIST_HEAD(free_list);
5169
5170         /*
5171          * If we got here through err_file: fput(event_file); we will not have
5172          * attached to a context yet.
5173          */
5174         if (!ctx) {
5175                 WARN_ON_ONCE(event->attach_state &
5176                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
5177                 goto no_ctx;
5178         }
5179
5180         if (!is_kernel_event(event))
5181                 perf_remove_from_owner(event);
5182
5183         ctx = perf_event_ctx_lock(event);
5184         WARN_ON_ONCE(ctx->parent_ctx);
5185         perf_remove_from_context(event, DETACH_GROUP);
5186
5187         raw_spin_lock_irq(&ctx->lock);
5188         /*
5189          * Mark this event as STATE_DEAD, there is no external reference to it
5190          * anymore.
5191          *
5192          * Anybody acquiring event->child_mutex after the below loop _must_
5193          * also see this, most importantly inherit_event() which will avoid
5194          * placing more children on the list.
5195          *
5196          * Thus this guarantees that we will in fact observe and kill _ALL_
5197          * child events.
5198          */
5199         event->state = PERF_EVENT_STATE_DEAD;
5200         raw_spin_unlock_irq(&ctx->lock);
5201
5202         perf_event_ctx_unlock(event, ctx);
5203
5204 again:
5205         mutex_lock(&event->child_mutex);
5206         list_for_each_entry(child, &event->child_list, child_list) {
5207
5208                 /*
5209                  * Cannot change, child events are not migrated, see the
5210                  * comment with perf_event_ctx_lock_nested().
5211                  */
5212                 ctx = READ_ONCE(child->ctx);
5213                 /*
5214                  * Since child_mutex nests inside ctx::mutex, we must jump
5215                  * through hoops. We start by grabbing a reference on the ctx.
5216                  *
5217                  * Since the event cannot get freed while we hold the
5218                  * child_mutex, the context must also exist and have a !0
5219                  * reference count.
5220                  */
5221                 get_ctx(ctx);
5222
5223                 /*
5224                  * Now that we have a ctx ref, we can drop child_mutex, and
5225                  * acquire ctx::mutex without fear of it going away. Then we
5226                  * can re-acquire child_mutex.
5227                  */
5228                 mutex_unlock(&event->child_mutex);
5229                 mutex_lock(&ctx->mutex);
5230                 mutex_lock(&event->child_mutex);
5231
5232                 /*
5233                  * Now that we hold ctx::mutex and child_mutex, revalidate our
5234                  * state, if child is still the first entry, it didn't get freed
5235                  * and we can continue doing so.
5236                  */
5237                 tmp = list_first_entry_or_null(&event->child_list,
5238                                                struct perf_event, child_list);
5239                 if (tmp == child) {
5240                         perf_remove_from_context(child, DETACH_GROUP);
5241                         list_move(&child->child_list, &free_list);
5242                         /*
5243                          * This matches the refcount bump in inherit_event();
5244                          * this can't be the last reference.
5245                          */
5246                         put_event(event);
5247                 }
5248
5249                 mutex_unlock(&event->child_mutex);
5250                 mutex_unlock(&ctx->mutex);
5251                 put_ctx(ctx);
5252                 goto again;
5253         }
5254         mutex_unlock(&event->child_mutex);
5255
5256         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5257                 void *var = &child->ctx->refcount;
5258
5259                 list_del(&child->child_list);
5260                 free_event(child);
5261
5262                 /*
5263                  * Wake any perf_event_free_task() waiting for this event to be
5264                  * freed.
5265                  */
5266                 smp_mb(); /* pairs with wait_var_event() */
5267                 wake_up_var(var);
5268         }
5269
5270 no_ctx:
5271         put_event(event); /* Must be the 'last' reference */
5272         return 0;
5273 }
5274 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5275
5276 /*
5277  * Called when the last reference to the file is gone.
5278  */
5279 static int perf_release(struct inode *inode, struct file *file)
5280 {
5281         perf_event_release_kernel(file->private_data);
5282         return 0;
5283 }
5284
5285 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5286 {
5287         struct perf_event *child;
5288         u64 total = 0;
5289
5290         *enabled = 0;
5291         *running = 0;
5292
5293         mutex_lock(&event->child_mutex);
5294
5295         (void)perf_event_read(event, false);
5296         total += perf_event_count(event);
5297
5298         *enabled += event->total_time_enabled +
5299                         atomic64_read(&event->child_total_time_enabled);
5300         *running += event->total_time_running +
5301                         atomic64_read(&event->child_total_time_running);
5302
5303         list_for_each_entry(child, &event->child_list, child_list) {
5304                 (void)perf_event_read(child, false);
5305                 total += perf_event_count(child);
5306                 *enabled += child->total_time_enabled;
5307                 *running += child->total_time_running;
5308         }
5309         mutex_unlock(&event->child_mutex);
5310
5311         return total;
5312 }
5313
5314 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5315 {
5316         struct perf_event_context *ctx;
5317         u64 count;
5318
5319         ctx = perf_event_ctx_lock(event);
5320         count = __perf_event_read_value(event, enabled, running);
5321         perf_event_ctx_unlock(event, ctx);
5322
5323         return count;
5324 }
5325 EXPORT_SYMBOL_GPL(perf_event_read_value);
5326
5327 static int __perf_read_group_add(struct perf_event *leader,
5328                                         u64 read_format, u64 *values)
5329 {
5330         struct perf_event_context *ctx = leader->ctx;
5331         struct perf_event *sub;
5332         unsigned long flags;
5333         int n = 1; /* skip @nr */
5334         int ret;
5335
5336         ret = perf_event_read(leader, true);
5337         if (ret)
5338                 return ret;
5339
5340         raw_spin_lock_irqsave(&ctx->lock, flags);
5341
5342         /*
5343          * Since we co-schedule groups, {enabled,running} times of siblings
5344          * will be identical to those of the leader, so we only publish one
5345          * set.
5346          */
5347         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5348                 values[n++] += leader->total_time_enabled +
5349                         atomic64_read(&leader->child_total_time_enabled);
5350         }
5351
5352         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5353                 values[n++] += leader->total_time_running +
5354                         atomic64_read(&leader->child_total_time_running);
5355         }
5356
5357         /*
5358          * Write {count,id} tuples for every sibling.
5359          */
5360         values[n++] += perf_event_count(leader);
5361         if (read_format & PERF_FORMAT_ID)
5362                 values[n++] = primary_event_id(leader);
5363
5364         for_each_sibling_event(sub, leader) {
5365                 values[n++] += perf_event_count(sub);
5366                 if (read_format & PERF_FORMAT_ID)
5367                         values[n++] = primary_event_id(sub);
5368         }
5369
5370         raw_spin_unlock_irqrestore(&ctx->lock, flags);
5371         return 0;
5372 }
5373
5374 static int perf_read_group(struct perf_event *event,
5375                                    u64 read_format, char __user *buf)
5376 {
5377         struct perf_event *leader = event->group_leader, *child;
5378         struct perf_event_context *ctx = leader->ctx;
5379         int ret;
5380         u64 *values;
5381
5382         lockdep_assert_held(&ctx->mutex);
5383
5384         values = kzalloc(event->read_size, GFP_KERNEL);
5385         if (!values)
5386                 return -ENOMEM;
5387
5388         values[0] = 1 + leader->nr_siblings;
5389
5390         /*
5391          * By locking the child_mutex of the leader we effectively
5392          * lock the child list of all siblings.. XXX explain how.
5393          */
5394         mutex_lock(&leader->child_mutex);
5395
5396         ret = __perf_read_group_add(leader, read_format, values);
5397         if (ret)
5398                 goto unlock;
5399
5400         list_for_each_entry(child, &leader->child_list, child_list) {
5401                 ret = __perf_read_group_add(child, read_format, values);
5402                 if (ret)
5403                         goto unlock;
5404         }
5405
5406         mutex_unlock(&leader->child_mutex);
5407
5408         ret = event->read_size;
5409         if (copy_to_user(buf, values, event->read_size))
5410                 ret = -EFAULT;
5411         goto out;
5412
5413 unlock:
5414         mutex_unlock(&leader->child_mutex);
5415 out:
5416         kfree(values);
5417         return ret;
5418 }
5419
5420 static int perf_read_one(struct perf_event *event,
5421                                  u64 read_format, char __user *buf)
5422 {
5423         u64 enabled, running;
5424         u64 values[4];
5425         int n = 0;
5426
5427         values[n++] = __perf_event_read_value(event, &enabled, &running);
5428         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5429                 values[n++] = enabled;
5430         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5431                 values[n++] = running;
5432         if (read_format & PERF_FORMAT_ID)
5433                 values[n++] = primary_event_id(event);
5434
5435         if (copy_to_user(buf, values, n * sizeof(u64)))
5436                 return -EFAULT;
5437
5438         return n * sizeof(u64);
5439 }
5440
5441 static bool is_event_hup(struct perf_event *event)
5442 {
5443         bool no_children;
5444
5445         if (event->state > PERF_EVENT_STATE_EXIT)
5446                 return false;
5447
5448         mutex_lock(&event->child_mutex);
5449         no_children = list_empty(&event->child_list);
5450         mutex_unlock(&event->child_mutex);
5451         return no_children;
5452 }
5453
5454 /*
5455  * Read the performance event - simple non blocking version for now
5456  */
5457 static ssize_t
5458 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5459 {
5460         u64 read_format = event->attr.read_format;
5461         int ret;
5462
5463         /*
5464          * Return end-of-file for a read on an event that is in
5465          * error state (i.e. because it was pinned but it couldn't be
5466          * scheduled on to the CPU at some point).
5467          */
5468         if (event->state == PERF_EVENT_STATE_ERROR)
5469                 return 0;
5470
5471         if (count < event->read_size)
5472                 return -ENOSPC;
5473
5474         WARN_ON_ONCE(event->ctx->parent_ctx);
5475         if (read_format & PERF_FORMAT_GROUP)
5476                 ret = perf_read_group(event, read_format, buf);
5477         else
5478                 ret = perf_read_one(event, read_format, buf);
5479
5480         return ret;
5481 }
5482
5483 static ssize_t
5484 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5485 {
5486         struct perf_event *event = file->private_data;
5487         struct perf_event_context *ctx;
5488         int ret;
5489
5490         ret = security_perf_event_read(event);
5491         if (ret)
5492                 return ret;
5493
5494         ctx = perf_event_ctx_lock(event);
5495         ret = __perf_read(event, buf, count);
5496         perf_event_ctx_unlock(event, ctx);
5497
5498         return ret;
5499 }
5500
5501 static __poll_t perf_poll(struct file *file, poll_table *wait)
5502 {
5503         struct perf_event *event = file->private_data;
5504         struct perf_buffer *rb;
5505         __poll_t events = EPOLLHUP;
5506
5507         poll_wait(file, &event->waitq, wait);
5508
5509         if (is_event_hup(event))
5510                 return events;
5511
5512         /*
5513          * Pin the event->rb by taking event->mmap_mutex; otherwise
5514          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5515          */
5516         mutex_lock(&event->mmap_mutex);
5517         rb = event->rb;
5518         if (rb)
5519                 events = atomic_xchg(&rb->poll, 0);
5520         mutex_unlock(&event->mmap_mutex);
5521         return events;
5522 }
5523
5524 static void _perf_event_reset(struct perf_event *event)
5525 {
5526         (void)perf_event_read(event, false);
5527         local64_set(&event->count, 0);
5528         perf_event_update_userpage(event);
5529 }
5530
5531 /* Assume it's not an event with inherit set. */
5532 u64 perf_event_pause(struct perf_event *event, bool reset)
5533 {
5534         struct perf_event_context *ctx;
5535         u64 count;
5536
5537         ctx = perf_event_ctx_lock(event);
5538         WARN_ON_ONCE(event->attr.inherit);
5539         _perf_event_disable(event);
5540         count = local64_read(&event->count);
5541         if (reset)
5542                 local64_set(&event->count, 0);
5543         perf_event_ctx_unlock(event, ctx);
5544
5545         return count;
5546 }
5547 EXPORT_SYMBOL_GPL(perf_event_pause);
5548
5549 /*
5550  * Holding the top-level event's child_mutex means that any
5551  * descendant process that has inherited this event will block
5552  * in perf_event_exit_event() if it goes to exit, thus satisfying the
5553  * task existence requirements of perf_event_enable/disable.
5554  */
5555 static void perf_event_for_each_child(struct perf_event *event,
5556                                         void (*func)(struct perf_event *))
5557 {
5558         struct perf_event *child;
5559
5560         WARN_ON_ONCE(event->ctx->parent_ctx);
5561
5562         mutex_lock(&event->child_mutex);
5563         func(event);
5564         list_for_each_entry(child, &event->child_list, child_list)
5565                 func(child);
5566         mutex_unlock(&event->child_mutex);
5567 }
5568
5569 static void perf_event_for_each(struct perf_event *event,
5570                                   void (*func)(struct perf_event *))
5571 {
5572         struct perf_event_context *ctx = event->ctx;
5573         struct perf_event *sibling;
5574
5575         lockdep_assert_held(&ctx->mutex);
5576
5577         event = event->group_leader;
5578
5579         perf_event_for_each_child(event, func);
5580         for_each_sibling_event(sibling, event)
5581                 perf_event_for_each_child(sibling, func);
5582 }
5583
5584 static void __perf_event_period(struct perf_event *event,
5585                                 struct perf_cpu_context *cpuctx,
5586                                 struct perf_event_context *ctx,
5587                                 void *info)
5588 {
5589         u64 value = *((u64 *)info);
5590         bool active;
5591
5592         if (event->attr.freq) {
5593                 event->attr.sample_freq = value;
5594         } else {
5595                 event->attr.sample_period = value;
5596                 event->hw.sample_period = value;
5597         }
5598
5599         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5600         if (active) {
5601                 perf_pmu_disable(ctx->pmu);
5602                 /*
5603                  * We could be throttled; unthrottle now to avoid the tick
5604                  * trying to unthrottle while we already re-started the event.
5605                  */
5606                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5607                         event->hw.interrupts = 0;
5608                         perf_log_throttle(event, 1);
5609                 }
5610                 event->pmu->stop(event, PERF_EF_UPDATE);
5611         }
5612
5613         local64_set(&event->hw.period_left, 0);
5614
5615         if (active) {
5616                 event->pmu->start(event, PERF_EF_RELOAD);
5617                 perf_pmu_enable(ctx->pmu);
5618         }
5619 }
5620
5621 static int perf_event_check_period(struct perf_event *event, u64 value)
5622 {
5623         return event->pmu->check_period(event, value);
5624 }
5625
5626 static int _perf_event_period(struct perf_event *event, u64 value)
5627 {
5628         if (!is_sampling_event(event))
5629                 return -EINVAL;
5630
5631         if (!value)
5632                 return -EINVAL;
5633
5634         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5635                 return -EINVAL;
5636
5637         if (perf_event_check_period(event, value))
5638                 return -EINVAL;
5639
5640         if (!event->attr.freq && (value & (1ULL << 63)))
5641                 return -EINVAL;
5642
5643         event_function_call(event, __perf_event_period, &value);
5644
5645         return 0;
5646 }
5647
5648 int perf_event_period(struct perf_event *event, u64 value)
5649 {
5650         struct perf_event_context *ctx;
5651         int ret;
5652
5653         ctx = perf_event_ctx_lock(event);
5654         ret = _perf_event_period(event, value);
5655         perf_event_ctx_unlock(event, ctx);
5656
5657         return ret;
5658 }
5659 EXPORT_SYMBOL_GPL(perf_event_period);
5660
5661 static const struct file_operations perf_fops;
5662
5663 static inline int perf_fget_light(int fd, struct fd *p)
5664 {
5665         struct fd f = fdget(fd);
5666         if (!f.file)
5667                 return -EBADF;
5668
5669         if (f.file->f_op != &perf_fops) {
5670                 fdput(f);
5671                 return -EBADF;
5672         }
5673         *p = f;
5674         return 0;
5675 }
5676
5677 static int perf_event_set_output(struct perf_event *event,
5678                                  struct perf_event *output_event);
5679 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5680 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5681                           struct perf_event_attr *attr);
5682
5683 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5684 {
5685         void (*func)(struct perf_event *);
5686         u32 flags = arg;
5687
5688         switch (cmd) {
5689         case PERF_EVENT_IOC_ENABLE:
5690                 func = _perf_event_enable;
5691                 break;
5692         case PERF_EVENT_IOC_DISABLE:
5693                 func = _perf_event_disable;
5694                 break;
5695         case PERF_EVENT_IOC_RESET:
5696                 func = _perf_event_reset;
5697                 break;
5698
5699         case PERF_EVENT_IOC_REFRESH:
5700                 return _perf_event_refresh(event, arg);
5701
5702         case PERF_EVENT_IOC_PERIOD:
5703         {
5704                 u64 value;
5705
5706                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5707                         return -EFAULT;
5708
5709                 return _perf_event_period(event, value);
5710         }
5711         case PERF_EVENT_IOC_ID:
5712         {
5713                 u64 id = primary_event_id(event);
5714
5715                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5716                         return -EFAULT;
5717                 return 0;
5718         }
5719
5720         case PERF_EVENT_IOC_SET_OUTPUT:
5721         {
5722                 int ret;
5723                 if (arg != -1) {
5724                         struct perf_event *output_event;
5725                         struct fd output;
5726                         ret = perf_fget_light(arg, &output);
5727                         if (ret)
5728                                 return ret;
5729                         output_event = output.file->private_data;
5730                         ret = perf_event_set_output(event, output_event);
5731                         fdput(output);
5732                 } else {
5733                         ret = perf_event_set_output(event, NULL);
5734                 }
5735                 return ret;
5736         }
5737
5738         case PERF_EVENT_IOC_SET_FILTER:
5739                 return perf_event_set_filter(event, (void __user *)arg);
5740
5741         case PERF_EVENT_IOC_SET_BPF:
5742         {
5743                 struct bpf_prog *prog;
5744                 int err;
5745
5746                 prog = bpf_prog_get(arg);
5747                 if (IS_ERR(prog))
5748                         return PTR_ERR(prog);
5749
5750                 err = perf_event_set_bpf_prog(event, prog, 0);
5751                 if (err) {
5752                         bpf_prog_put(prog);
5753                         return err;
5754                 }
5755
5756                 return 0;
5757         }
5758
5759         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5760                 struct perf_buffer *rb;
5761
5762                 rcu_read_lock();
5763                 rb = rcu_dereference(event->rb);
5764                 if (!rb || !rb->nr_pages) {
5765                         rcu_read_unlock();
5766                         return -EINVAL;
5767                 }
5768                 rb_toggle_paused(rb, !!arg);
5769                 rcu_read_unlock();
5770                 return 0;
5771         }
5772
5773         case PERF_EVENT_IOC_QUERY_BPF:
5774                 return perf_event_query_prog_array(event, (void __user *)arg);
5775
5776         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5777                 struct perf_event_attr new_attr;
5778                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5779                                          &new_attr);
5780
5781                 if (err)
5782                         return err;
5783
5784                 return perf_event_modify_attr(event,  &new_attr);
5785         }
5786         default:
5787                 return -ENOTTY;
5788         }
5789
5790         if (flags & PERF_IOC_FLAG_GROUP)
5791                 perf_event_for_each(event, func);
5792         else
5793                 perf_event_for_each_child(event, func);
5794
5795         return 0;
5796 }
5797
5798 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5799 {
5800         struct perf_event *event = file->private_data;
5801         struct perf_event_context *ctx;
5802         long ret;
5803
5804         /* Treat ioctl like writes as it is likely a mutating operation. */
5805         ret = security_perf_event_write(event);
5806         if (ret)
5807                 return ret;
5808
5809         ctx = perf_event_ctx_lock(event);
5810         ret = _perf_ioctl(event, cmd, arg);
5811         perf_event_ctx_unlock(event, ctx);
5812
5813         return ret;
5814 }
5815
5816 #ifdef CONFIG_COMPAT
5817 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5818                                 unsigned long arg)
5819 {
5820         switch (_IOC_NR(cmd)) {
5821         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5822         case _IOC_NR(PERF_EVENT_IOC_ID):
5823         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5824         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5825                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
5826                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5827                         cmd &= ~IOCSIZE_MASK;
5828                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5829                 }
5830                 break;
5831         }
5832         return perf_ioctl(file, cmd, arg);
5833 }
5834 #else
5835 # define perf_compat_ioctl NULL
5836 #endif
5837
5838 int perf_event_task_enable(void)
5839 {
5840         struct perf_event_context *ctx;
5841         struct perf_event *event;
5842
5843         mutex_lock(&current->perf_event_mutex);
5844         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5845                 ctx = perf_event_ctx_lock(event);
5846                 perf_event_for_each_child(event, _perf_event_enable);
5847                 perf_event_ctx_unlock(event, ctx);
5848         }
5849         mutex_unlock(&current->perf_event_mutex);
5850
5851         return 0;
5852 }
5853
5854 int perf_event_task_disable(void)
5855 {
5856         struct perf_event_context *ctx;
5857         struct perf_event *event;
5858
5859         mutex_lock(&current->perf_event_mutex);
5860         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5861                 ctx = perf_event_ctx_lock(event);
5862                 perf_event_for_each_child(event, _perf_event_disable);
5863                 perf_event_ctx_unlock(event, ctx);
5864         }
5865         mutex_unlock(&current->perf_event_mutex);
5866
5867         return 0;
5868 }
5869
5870 static int perf_event_index(struct perf_event *event)
5871 {
5872         if (event->hw.state & PERF_HES_STOPPED)
5873                 return 0;
5874
5875         if (event->state != PERF_EVENT_STATE_ACTIVE)
5876                 return 0;
5877
5878         return event->pmu->event_idx(event);
5879 }
5880
5881 static void perf_event_init_userpage(struct perf_event *event)
5882 {
5883         struct perf_event_mmap_page *userpg;
5884         struct perf_buffer *rb;
5885
5886         rcu_read_lock();
5887         rb = rcu_dereference(event->rb);
5888         if (!rb)
5889                 goto unlock;
5890
5891         userpg = rb->user_page;
5892
5893         /* Allow new userspace to detect that bit 0 is deprecated */
5894         userpg->cap_bit0_is_deprecated = 1;
5895         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5896         userpg->data_offset = PAGE_SIZE;
5897         userpg->data_size = perf_data_size(rb);
5898
5899 unlock:
5900         rcu_read_unlock();
5901 }
5902
5903 void __weak arch_perf_update_userpage(
5904         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5905 {
5906 }
5907
5908 /*
5909  * Callers need to ensure there can be no nesting of this function, otherwise
5910  * the seqlock logic goes bad. We can not serialize this because the arch
5911  * code calls this from NMI context.
5912  */
5913 void perf_event_update_userpage(struct perf_event *event)
5914 {
5915         struct perf_event_mmap_page *userpg;
5916         struct perf_buffer *rb;
5917         u64 enabled, running, now;
5918
5919         rcu_read_lock();
5920         rb = rcu_dereference(event->rb);
5921         if (!rb)
5922                 goto unlock;
5923
5924         /*
5925          * compute total_time_enabled, total_time_running
5926          * based on snapshot values taken when the event
5927          * was last scheduled in.
5928          *
5929          * we cannot simply called update_context_time()
5930          * because of locking issue as we can be called in
5931          * NMI context
5932          */
5933         calc_timer_values(event, &now, &enabled, &running);
5934
5935         userpg = rb->user_page;
5936         /*
5937          * Disable preemption to guarantee consistent time stamps are stored to
5938          * the user page.
5939          */
5940         preempt_disable();
5941         ++userpg->lock;
5942         barrier();
5943         userpg->index = perf_event_index(event);
5944         userpg->offset = perf_event_count(event);
5945         if (userpg->index)
5946                 userpg->offset -= local64_read(&event->hw.prev_count);
5947
5948         userpg->time_enabled = enabled +
5949                         atomic64_read(&event->child_total_time_enabled);
5950
5951         userpg->time_running = running +
5952                         atomic64_read(&event->child_total_time_running);
5953
5954         arch_perf_update_userpage(event, userpg, now);
5955
5956         barrier();
5957         ++userpg->lock;
5958         preempt_enable();
5959 unlock:
5960         rcu_read_unlock();
5961 }
5962 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5963
5964 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5965 {
5966         struct perf_event *event = vmf->vma->vm_file->private_data;
5967         struct perf_buffer *rb;
5968         vm_fault_t ret = VM_FAULT_SIGBUS;
5969
5970         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5971                 if (vmf->pgoff == 0)
5972                         ret = 0;
5973                 return ret;
5974         }
5975
5976         rcu_read_lock();
5977         rb = rcu_dereference(event->rb);
5978         if (!rb)
5979                 goto unlock;
5980
5981         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5982                 goto unlock;
5983
5984         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5985         if (!vmf->page)
5986                 goto unlock;
5987
5988         get_page(vmf->page);
5989         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5990         vmf->page->index   = vmf->pgoff;
5991
5992         ret = 0;
5993 unlock:
5994         rcu_read_unlock();
5995
5996         return ret;
5997 }
5998
5999 static void ring_buffer_attach(struct perf_event *event,
6000                                struct perf_buffer *rb)
6001 {
6002         struct perf_buffer *old_rb = NULL;
6003         unsigned long flags;
6004
6005         WARN_ON_ONCE(event->parent);
6006
6007         if (event->rb) {
6008                 /*
6009                  * Should be impossible, we set this when removing
6010                  * event->rb_entry and wait/clear when adding event->rb_entry.
6011                  */
6012                 WARN_ON_ONCE(event->rcu_pending);
6013
6014                 old_rb = event->rb;
6015                 spin_lock_irqsave(&old_rb->event_lock, flags);
6016                 list_del_rcu(&event->rb_entry);
6017                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
6018
6019                 event->rcu_batches = get_state_synchronize_rcu();
6020                 event->rcu_pending = 1;
6021         }
6022
6023         if (rb) {
6024                 if (event->rcu_pending) {
6025                         cond_synchronize_rcu(event->rcu_batches);
6026                         event->rcu_pending = 0;
6027                 }
6028
6029                 spin_lock_irqsave(&rb->event_lock, flags);
6030                 list_add_rcu(&event->rb_entry, &rb->event_list);
6031                 spin_unlock_irqrestore(&rb->event_lock, flags);
6032         }
6033
6034         /*
6035          * Avoid racing with perf_mmap_close(AUX): stop the event
6036          * before swizzling the event::rb pointer; if it's getting
6037          * unmapped, its aux_mmap_count will be 0 and it won't
6038          * restart. See the comment in __perf_pmu_output_stop().
6039          *
6040          * Data will inevitably be lost when set_output is done in
6041          * mid-air, but then again, whoever does it like this is
6042          * not in for the data anyway.
6043          */
6044         if (has_aux(event))
6045                 perf_event_stop(event, 0);
6046
6047         rcu_assign_pointer(event->rb, rb);
6048
6049         if (old_rb) {
6050                 ring_buffer_put(old_rb);
6051                 /*
6052                  * Since we detached before setting the new rb, so that we
6053                  * could attach the new rb, we could have missed a wakeup.
6054                  * Provide it now.
6055                  */
6056                 wake_up_all(&event->waitq);
6057         }
6058 }
6059
6060 static void ring_buffer_wakeup(struct perf_event *event)
6061 {
6062         struct perf_buffer *rb;
6063
6064         if (event->parent)
6065                 event = event->parent;
6066
6067         rcu_read_lock();
6068         rb = rcu_dereference(event->rb);
6069         if (rb) {
6070                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
6071                         wake_up_all(&event->waitq);
6072         }
6073         rcu_read_unlock();
6074 }
6075
6076 struct perf_buffer *ring_buffer_get(struct perf_event *event)
6077 {
6078         struct perf_buffer *rb;
6079
6080         if (event->parent)
6081                 event = event->parent;
6082
6083         rcu_read_lock();
6084         rb = rcu_dereference(event->rb);
6085         if (rb) {
6086                 if (!refcount_inc_not_zero(&rb->refcount))
6087                         rb = NULL;
6088         }
6089         rcu_read_unlock();
6090
6091         return rb;
6092 }
6093
6094 void ring_buffer_put(struct perf_buffer *rb)
6095 {
6096         if (!refcount_dec_and_test(&rb->refcount))
6097                 return;
6098
6099         WARN_ON_ONCE(!list_empty(&rb->event_list));
6100
6101         call_rcu(&rb->rcu_head, rb_free_rcu);
6102 }
6103
6104 static void perf_mmap_open(struct vm_area_struct *vma)
6105 {
6106         struct perf_event *event = vma->vm_file->private_data;
6107
6108         atomic_inc(&event->mmap_count);
6109         atomic_inc(&event->rb->mmap_count);
6110
6111         if (vma->vm_pgoff)
6112                 atomic_inc(&event->rb->aux_mmap_count);
6113
6114         if (event->pmu->event_mapped)
6115                 event->pmu->event_mapped(event, vma->vm_mm);
6116 }
6117
6118 static void perf_pmu_output_stop(struct perf_event *event);
6119
6120 /*
6121  * A buffer can be mmap()ed multiple times; either directly through the same
6122  * event, or through other events by use of perf_event_set_output().
6123  *
6124  * In order to undo the VM accounting done by perf_mmap() we need to destroy
6125  * the buffer here, where we still have a VM context. This means we need
6126  * to detach all events redirecting to us.
6127  */
6128 static void perf_mmap_close(struct vm_area_struct *vma)
6129 {
6130         struct perf_event *event = vma->vm_file->private_data;
6131         struct perf_buffer *rb = ring_buffer_get(event);
6132         struct user_struct *mmap_user = rb->mmap_user;
6133         int mmap_locked = rb->mmap_locked;
6134         unsigned long size = perf_data_size(rb);
6135         bool detach_rest = false;
6136
6137         if (event->pmu->event_unmapped)
6138                 event->pmu->event_unmapped(event, vma->vm_mm);
6139
6140         /*
6141          * rb->aux_mmap_count will always drop before rb->mmap_count and
6142          * event->mmap_count, so it is ok to use event->mmap_mutex to
6143          * serialize with perf_mmap here.
6144          */
6145         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
6146             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
6147                 /*
6148                  * Stop all AUX events that are writing to this buffer,
6149                  * so that we can free its AUX pages and corresponding PMU
6150                  * data. Note that after rb::aux_mmap_count dropped to zero,
6151                  * they won't start any more (see perf_aux_output_begin()).
6152                  */
6153                 perf_pmu_output_stop(event);
6154
6155                 /* now it's safe to free the pages */
6156                 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
6157                 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
6158
6159                 /* this has to be the last one */
6160                 rb_free_aux(rb);
6161                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
6162
6163                 mutex_unlock(&event->mmap_mutex);
6164         }
6165
6166         if (atomic_dec_and_test(&rb->mmap_count))
6167                 detach_rest = true;
6168
6169         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
6170                 goto out_put;
6171
6172         ring_buffer_attach(event, NULL);
6173         mutex_unlock(&event->mmap_mutex);
6174
6175         /* If there's still other mmap()s of this buffer, we're done. */
6176         if (!detach_rest)
6177                 goto out_put;
6178
6179         /*
6180          * No other mmap()s, detach from all other events that might redirect
6181          * into the now unreachable buffer. Somewhat complicated by the
6182          * fact that rb::event_lock otherwise nests inside mmap_mutex.
6183          */
6184 again:
6185         rcu_read_lock();
6186         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
6187                 if (!atomic_long_inc_not_zero(&event->refcount)) {
6188                         /*
6189                          * This event is en-route to free_event() which will
6190                          * detach it and remove it from the list.
6191                          */
6192                         continue;
6193                 }
6194                 rcu_read_unlock();
6195
6196                 mutex_lock(&event->mmap_mutex);
6197                 /*
6198                  * Check we didn't race with perf_event_set_output() which can
6199                  * swizzle the rb from under us while we were waiting to
6200                  * acquire mmap_mutex.
6201                  *
6202                  * If we find a different rb; ignore this event, a next
6203                  * iteration will no longer find it on the list. We have to
6204                  * still restart the iteration to make sure we're not now
6205                  * iterating the wrong list.
6206                  */
6207                 if (event->rb == rb)
6208                         ring_buffer_attach(event, NULL);
6209
6210                 mutex_unlock(&event->mmap_mutex);
6211                 put_event(event);
6212
6213                 /*
6214                  * Restart the iteration; either we're on the wrong list or
6215                  * destroyed its integrity by doing a deletion.
6216                  */
6217                 goto again;
6218         }
6219         rcu_read_unlock();
6220
6221         /*
6222          * It could be there's still a few 0-ref events on the list; they'll
6223          * get cleaned up by free_event() -- they'll also still have their
6224          * ref on the rb and will free it whenever they are done with it.
6225          *
6226          * Aside from that, this buffer is 'fully' detached and unmapped,
6227          * undo the VM accounting.
6228          */
6229
6230         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6231                         &mmap_user->locked_vm);
6232         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6233         free_uid(mmap_user);
6234
6235 out_put:
6236         ring_buffer_put(rb); /* could be last */
6237 }
6238
6239 static const struct vm_operations_struct perf_mmap_vmops = {
6240         .open           = perf_mmap_open,
6241         .close          = perf_mmap_close, /* non mergeable */
6242         .fault          = perf_mmap_fault,
6243         .page_mkwrite   = perf_mmap_fault,
6244 };
6245
6246 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6247 {
6248         struct perf_event *event = file->private_data;
6249         unsigned long user_locked, user_lock_limit;
6250         struct user_struct *user = current_user();
6251         struct perf_buffer *rb = NULL;
6252         unsigned long locked, lock_limit;
6253         unsigned long vma_size;
6254         unsigned long nr_pages;
6255         long user_extra = 0, extra = 0;
6256         int ret = 0, flags = 0;
6257
6258         /*
6259          * Don't allow mmap() of inherited per-task counters. This would
6260          * create a performance issue due to all children writing to the
6261          * same rb.
6262          */
6263         if (event->cpu == -1 && event->attr.inherit)
6264                 return -EINVAL;
6265
6266         if (!(vma->vm_flags & VM_SHARED))
6267                 return -EINVAL;
6268
6269         ret = security_perf_event_read(event);
6270         if (ret)
6271                 return ret;
6272
6273         vma_size = vma->vm_end - vma->vm_start;
6274
6275         if (vma->vm_pgoff == 0) {
6276                 nr_pages = (vma_size / PAGE_SIZE) - 1;
6277         } else {
6278                 /*
6279                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6280                  * mapped, all subsequent mappings should have the same size
6281                  * and offset. Must be above the normal perf buffer.
6282                  */
6283                 u64 aux_offset, aux_size;
6284
6285                 if (!event->rb)
6286                         return -EINVAL;
6287
6288                 nr_pages = vma_size / PAGE_SIZE;
6289
6290                 mutex_lock(&event->mmap_mutex);
6291                 ret = -EINVAL;
6292
6293                 rb = event->rb;
6294                 if (!rb)
6295                         goto aux_unlock;
6296
6297                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6298                 aux_size = READ_ONCE(rb->user_page->aux_size);
6299
6300                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6301                         goto aux_unlock;
6302
6303                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6304                         goto aux_unlock;
6305
6306                 /* already mapped with a different offset */
6307                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6308                         goto aux_unlock;
6309
6310                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6311                         goto aux_unlock;
6312
6313                 /* already mapped with a different size */
6314                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6315                         goto aux_unlock;
6316
6317                 if (!is_power_of_2(nr_pages))
6318                         goto aux_unlock;
6319
6320                 if (!atomic_inc_not_zero(&rb->mmap_count))
6321                         goto aux_unlock;
6322
6323                 if (rb_has_aux(rb)) {
6324                         atomic_inc(&rb->aux_mmap_count);
6325                         ret = 0;
6326                         goto unlock;
6327                 }
6328
6329                 atomic_set(&rb->aux_mmap_count, 1);
6330                 user_extra = nr_pages;
6331
6332                 goto accounting;
6333         }
6334
6335         /*
6336          * If we have rb pages ensure they're a power-of-two number, so we
6337          * can do bitmasks instead of modulo.
6338          */
6339         if (nr_pages != 0 && !is_power_of_2(nr_pages))
6340                 return -EINVAL;
6341
6342         if (vma_size != PAGE_SIZE * (1 + nr_pages))
6343                 return -EINVAL;
6344
6345         WARN_ON_ONCE(event->ctx->parent_ctx);
6346 again:
6347         mutex_lock(&event->mmap_mutex);
6348         if (event->rb) {
6349                 if (event->rb->nr_pages != nr_pages) {
6350                         ret = -EINVAL;
6351                         goto unlock;
6352                 }
6353
6354                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6355                         /*
6356                          * Raced against perf_mmap_close() through
6357                          * perf_event_set_output(). Try again, hope for better
6358                          * luck.
6359                          */
6360                         mutex_unlock(&event->mmap_mutex);
6361                         goto again;
6362                 }
6363
6364                 goto unlock;
6365         }
6366
6367         user_extra = nr_pages + 1;
6368
6369 accounting:
6370         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6371
6372         /*
6373          * Increase the limit linearly with more CPUs:
6374          */
6375         user_lock_limit *= num_online_cpus();
6376
6377         user_locked = atomic_long_read(&user->locked_vm);
6378
6379         /*
6380          * sysctl_perf_event_mlock may have changed, so that
6381          *     user->locked_vm > user_lock_limit
6382          */
6383         if (user_locked > user_lock_limit)
6384                 user_locked = user_lock_limit;
6385         user_locked += user_extra;
6386
6387         if (user_locked > user_lock_limit) {
6388                 /*
6389                  * charge locked_vm until it hits user_lock_limit;
6390                  * charge the rest from pinned_vm
6391                  */
6392                 extra = user_locked - user_lock_limit;
6393                 user_extra -= extra;
6394         }
6395
6396         lock_limit = rlimit(RLIMIT_MEMLOCK);
6397         lock_limit >>= PAGE_SHIFT;
6398         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6399
6400         if ((locked > lock_limit) && perf_is_paranoid() &&
6401                 !capable(CAP_IPC_LOCK)) {
6402                 ret = -EPERM;
6403                 goto unlock;
6404         }
6405
6406         WARN_ON(!rb && event->rb);
6407
6408         if (vma->vm_flags & VM_WRITE)
6409                 flags |= RING_BUFFER_WRITABLE;
6410
6411         if (!rb) {
6412                 rb = rb_alloc(nr_pages,
6413                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
6414                               event->cpu, flags);
6415
6416                 if (!rb) {
6417                         ret = -ENOMEM;
6418                         goto unlock;
6419                 }
6420
6421                 atomic_set(&rb->mmap_count, 1);
6422                 rb->mmap_user = get_current_user();
6423                 rb->mmap_locked = extra;
6424
6425                 ring_buffer_attach(event, rb);
6426
6427                 perf_event_update_time(event);
6428                 perf_event_init_userpage(event);
6429                 perf_event_update_userpage(event);
6430         } else {
6431                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6432                                    event->attr.aux_watermark, flags);
6433                 if (!ret)
6434                         rb->aux_mmap_locked = extra;
6435         }
6436
6437 unlock:
6438         if (!ret) {
6439                 atomic_long_add(user_extra, &user->locked_vm);
6440                 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6441
6442                 atomic_inc(&event->mmap_count);
6443         } else if (rb) {
6444                 atomic_dec(&rb->mmap_count);
6445         }
6446 aux_unlock:
6447         mutex_unlock(&event->mmap_mutex);
6448
6449         /*
6450          * Since pinned accounting is per vm we cannot allow fork() to copy our
6451          * vma.
6452          */
6453         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6454         vma->vm_ops = &perf_mmap_vmops;
6455
6456         if (event->pmu->event_mapped)
6457                 event->pmu->event_mapped(event, vma->vm_mm);
6458
6459         return ret;
6460 }
6461
6462 static int perf_fasync(int fd, struct file *filp, int on)
6463 {
6464         struct inode *inode = file_inode(filp);
6465         struct perf_event *event = filp->private_data;
6466         int retval;
6467
6468         inode_lock(inode);
6469         retval = fasync_helper(fd, filp, on, &event->fasync);
6470         inode_unlock(inode);
6471
6472         if (retval < 0)
6473                 return retval;
6474
6475         return 0;
6476 }
6477
6478 static const struct file_operations perf_fops = {
6479         .llseek                 = no_llseek,
6480         .release                = perf_release,
6481         .read                   = perf_read,
6482         .poll                   = perf_poll,
6483         .unlocked_ioctl         = perf_ioctl,
6484         .compat_ioctl           = perf_compat_ioctl,
6485         .mmap                   = perf_mmap,
6486         .fasync                 = perf_fasync,
6487 };
6488
6489 /*
6490  * Perf event wakeup
6491  *
6492  * If there's data, ensure we set the poll() state and publish everything
6493  * to user-space before waking everybody up.
6494  */
6495
6496 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6497 {
6498         /* only the parent has fasync state */
6499         if (event->parent)
6500                 event = event->parent;
6501         return &event->fasync;
6502 }
6503
6504 void perf_event_wakeup(struct perf_event *event)
6505 {
6506         ring_buffer_wakeup(event);
6507
6508         if (event->pending_kill) {
6509                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6510                 event->pending_kill = 0;
6511         }
6512 }
6513
6514 static void perf_sigtrap(struct perf_event *event)
6515 {
6516         /*
6517          * We'd expect this to only occur if the irq_work is delayed and either
6518          * ctx->task or current has changed in the meantime. This can be the
6519          * case on architectures that do not implement arch_irq_work_raise().
6520          */
6521         if (WARN_ON_ONCE(event->ctx->task != current))
6522                 return;
6523
6524         /*
6525          * perf_pending_event() can race with the task exiting.
6526          */
6527         if (current->flags & PF_EXITING)
6528                 return;
6529
6530         force_sig_perf((void __user *)event->pending_addr,
6531                        event->attr.type, event->attr.sig_data);
6532 }
6533
6534 static void perf_pending_event_disable(struct perf_event *event)
6535 {
6536         int cpu = READ_ONCE(event->pending_disable);
6537
6538         if (cpu < 0)
6539                 return;
6540
6541         if (cpu == smp_processor_id()) {
6542                 WRITE_ONCE(event->pending_disable, -1);
6543
6544                 if (event->attr.sigtrap) {
6545                         perf_sigtrap(event);
6546                         atomic_set_release(&event->event_limit, 1); /* rearm event */
6547                         return;
6548                 }
6549
6550                 perf_event_disable_local(event);
6551                 return;
6552         }
6553
6554         /*
6555          *  CPU-A                       CPU-B
6556          *
6557          *  perf_event_disable_inatomic()
6558          *    @pending_disable = CPU-A;
6559          *    irq_work_queue();
6560          *
6561          *  sched-out
6562          *    @pending_disable = -1;
6563          *
6564          *                              sched-in
6565          *                              perf_event_disable_inatomic()
6566          *                                @pending_disable = CPU-B;
6567          *                                irq_work_queue(); // FAILS
6568          *
6569          *  irq_work_run()
6570          *    perf_pending_event()
6571          *
6572          * But the event runs on CPU-B and wants disabling there.
6573          */
6574         irq_work_queue_on(&event->pending, cpu);
6575 }
6576
6577 static void perf_pending_event(struct irq_work *entry)
6578 {
6579         struct perf_event *event = container_of(entry, struct perf_event, pending);
6580         int rctx;
6581
6582         rctx = perf_swevent_get_recursion_context();
6583         /*
6584          * If we 'fail' here, that's OK, it means recursion is already disabled
6585          * and we won't recurse 'further'.
6586          */
6587
6588         perf_pending_event_disable(event);
6589
6590         if (event->pending_wakeup) {
6591                 event->pending_wakeup = 0;
6592                 perf_event_wakeup(event);
6593         }
6594
6595         if (rctx >= 0)
6596                 perf_swevent_put_recursion_context(rctx);
6597 }
6598
6599 #ifdef CONFIG_GUEST_PERF_EVENTS
6600 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
6601
6602 DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
6603 DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
6604 DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
6605
6606 void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6607 {
6608         if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
6609                 return;
6610
6611         rcu_assign_pointer(perf_guest_cbs, cbs);
6612         static_call_update(__perf_guest_state, cbs->state);
6613         static_call_update(__perf_guest_get_ip, cbs->get_ip);
6614
6615         /* Implementing ->handle_intel_pt_intr is optional. */
6616         if (cbs->handle_intel_pt_intr)
6617                 static_call_update(__perf_guest_handle_intel_pt_intr,
6618                                    cbs->handle_intel_pt_intr);
6619 }
6620 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6621
6622 void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6623 {
6624         if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
6625                 return;
6626
6627         rcu_assign_pointer(perf_guest_cbs, NULL);
6628         static_call_update(__perf_guest_state, (void *)&__static_call_return0);
6629         static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
6630         static_call_update(__perf_guest_handle_intel_pt_intr,
6631                            (void *)&__static_call_return0);
6632         synchronize_rcu();
6633 }
6634 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6635 #endif
6636
6637 static void
6638 perf_output_sample_regs(struct perf_output_handle *handle,
6639                         struct pt_regs *regs, u64 mask)
6640 {
6641         int bit;
6642         DECLARE_BITMAP(_mask, 64);
6643
6644         bitmap_from_u64(_mask, mask);
6645         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6646                 u64 val;
6647
6648                 val = perf_reg_value(regs, bit);
6649                 perf_output_put(handle, val);
6650         }
6651 }
6652
6653 static void perf_sample_regs_user(struct perf_regs *regs_user,
6654                                   struct pt_regs *regs)
6655 {
6656         if (user_mode(regs)) {
6657                 regs_user->abi = perf_reg_abi(current);
6658                 regs_user->regs = regs;
6659         } else if (!(current->flags & PF_KTHREAD)) {
6660                 perf_get_regs_user(regs_user, regs);
6661         } else {
6662                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6663                 regs_user->regs = NULL;
6664         }
6665 }
6666
6667 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6668                                   struct pt_regs *regs)
6669 {
6670         regs_intr->regs = regs;
6671         regs_intr->abi  = perf_reg_abi(current);
6672 }
6673
6674
6675 /*
6676  * Get remaining task size from user stack pointer.
6677  *
6678  * It'd be better to take stack vma map and limit this more
6679  * precisely, but there's no way to get it safely under interrupt,
6680  * so using TASK_SIZE as limit.
6681  */
6682 static u64 perf_ustack_task_size(struct pt_regs *regs)
6683 {
6684         unsigned long addr = perf_user_stack_pointer(regs);
6685
6686         if (!addr || addr >= TASK_SIZE)
6687                 return 0;
6688
6689         return TASK_SIZE - addr;
6690 }
6691
6692 static u16
6693 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6694                         struct pt_regs *regs)
6695 {
6696         u64 task_size;
6697
6698         /* No regs, no stack pointer, no dump. */
6699         if (!regs)
6700                 return 0;
6701
6702         /*
6703          * Check if we fit in with the requested stack size into the:
6704          * - TASK_SIZE
6705          *   If we don't, we limit the size to the TASK_SIZE.
6706          *
6707          * - remaining sample size
6708          *   If we don't, we customize the stack size to
6709          *   fit in to the remaining sample size.
6710          */
6711
6712         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6713         stack_size = min(stack_size, (u16) task_size);
6714
6715         /* Current header size plus static size and dynamic size. */
6716         header_size += 2 * sizeof(u64);
6717
6718         /* Do we fit in with the current stack dump size? */
6719         if ((u16) (header_size + stack_size) < header_size) {
6720                 /*
6721                  * If we overflow the maximum size for the sample,
6722                  * we customize the stack dump size to fit in.
6723                  */
6724                 stack_size = USHRT_MAX - header_size - sizeof(u64);
6725                 stack_size = round_up(stack_size, sizeof(u64));
6726         }
6727
6728         return stack_size;
6729 }
6730
6731 static void
6732 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6733                           struct pt_regs *regs)
6734 {
6735         /* Case of a kernel thread, nothing to dump */
6736         if (!regs) {
6737                 u64 size = 0;
6738                 perf_output_put(handle, size);
6739         } else {
6740                 unsigned long sp;
6741                 unsigned int rem;
6742                 u64 dyn_size;
6743
6744                 /*
6745                  * We dump:
6746                  * static size
6747                  *   - the size requested by user or the best one we can fit
6748                  *     in to the sample max size
6749                  * data
6750                  *   - user stack dump data
6751                  * dynamic size
6752                  *   - the actual dumped size
6753                  */
6754
6755                 /* Static size. */
6756                 perf_output_put(handle, dump_size);
6757
6758                 /* Data. */
6759                 sp = perf_user_stack_pointer(regs);
6760                 rem = __output_copy_user(handle, (void *) sp, dump_size);
6761                 dyn_size = dump_size - rem;
6762
6763                 perf_output_skip(handle, rem);
6764
6765                 /* Dynamic size. */
6766                 perf_output_put(handle, dyn_size);
6767         }
6768 }
6769
6770 static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6771                                           struct perf_sample_data *data,
6772                                           size_t size)
6773 {
6774         struct perf_event *sampler = event->aux_event;
6775         struct perf_buffer *rb;
6776
6777         data->aux_size = 0;
6778
6779         if (!sampler)
6780                 goto out;
6781
6782         if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6783                 goto out;
6784
6785         if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6786                 goto out;
6787
6788         rb = ring_buffer_get(sampler);
6789         if (!rb)
6790                 goto out;
6791
6792         /*
6793          * If this is an NMI hit inside sampling code, don't take
6794          * the sample. See also perf_aux_sample_output().
6795          */
6796         if (READ_ONCE(rb->aux_in_sampling)) {
6797                 data->aux_size = 0;
6798         } else {
6799                 size = min_t(size_t, size, perf_aux_size(rb));
6800                 data->aux_size = ALIGN(size, sizeof(u64));
6801         }
6802         ring_buffer_put(rb);
6803
6804 out:
6805         return data->aux_size;
6806 }
6807
6808 static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6809                                  struct perf_event *event,
6810                                  struct perf_output_handle *handle,
6811                                  unsigned long size)
6812 {
6813         unsigned long flags;
6814         long ret;
6815
6816         /*
6817          * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6818          * paths. If we start calling them in NMI context, they may race with
6819          * the IRQ ones, that is, for example, re-starting an event that's just
6820          * been stopped, which is why we're using a separate callback that
6821          * doesn't change the event state.
6822          *
6823          * IRQs need to be disabled to prevent IPIs from racing with us.
6824          */
6825         local_irq_save(flags);
6826         /*
6827          * Guard against NMI hits inside the critical section;
6828          * see also perf_prepare_sample_aux().
6829          */
6830         WRITE_ONCE(rb->aux_in_sampling, 1);
6831         barrier();
6832
6833         ret = event->pmu->snapshot_aux(event, handle, size);
6834
6835         barrier();
6836         WRITE_ONCE(rb->aux_in_sampling, 0);
6837         local_irq_restore(flags);
6838
6839         return ret;
6840 }
6841
6842 static void perf_aux_sample_output(struct perf_event *event,
6843                                    struct perf_output_handle *handle,
6844                                    struct perf_sample_data *data)
6845 {
6846         struct perf_event *sampler = event->aux_event;
6847         struct perf_buffer *rb;
6848         unsigned long pad;
6849         long size;
6850
6851         if (WARN_ON_ONCE(!sampler || !data->aux_size))
6852                 return;
6853
6854         rb = ring_buffer_get(sampler);
6855         if (!rb)
6856                 return;
6857
6858         size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6859
6860         /*
6861          * An error here means that perf_output_copy() failed (returned a
6862          * non-zero surplus that it didn't copy), which in its current
6863          * enlightened implementation is not possible. If that changes, we'd
6864          * like to know.
6865          */
6866         if (WARN_ON_ONCE(size < 0))
6867                 goto out_put;
6868
6869         /*
6870          * The pad comes from ALIGN()ing data->aux_size up to u64 in
6871          * perf_prepare_sample_aux(), so should not be more than that.
6872          */
6873         pad = data->aux_size - size;
6874         if (WARN_ON_ONCE(pad >= sizeof(u64)))
6875                 pad = 8;
6876
6877         if (pad) {
6878                 u64 zero = 0;
6879                 perf_output_copy(handle, &zero, pad);
6880         }
6881
6882 out_put:
6883         ring_buffer_put(rb);
6884 }
6885
6886 static void __perf_event_header__init_id(struct perf_event_header *header,
6887                                          struct perf_sample_data *data,
6888                                          struct perf_event *event)
6889 {
6890         u64 sample_type = event->attr.sample_type;
6891
6892         data->type = sample_type;
6893         header->size += event->id_header_size;
6894
6895         if (sample_type & PERF_SAMPLE_TID) {
6896                 /* namespace issues */
6897                 data->tid_entry.pid = perf_event_pid(event, current);
6898                 data->tid_entry.tid = perf_event_tid(event, current);
6899         }
6900
6901         if (sample_type & PERF_SAMPLE_TIME)
6902                 data->time = perf_event_clock(event);
6903
6904         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6905                 data->id = primary_event_id(event);
6906
6907         if (sample_type & PERF_SAMPLE_STREAM_ID)
6908                 data->stream_id = event->id;
6909
6910         if (sample_type & PERF_SAMPLE_CPU) {
6911                 data->cpu_entry.cpu      = raw_smp_processor_id();
6912                 data->cpu_entry.reserved = 0;
6913         }
6914 }
6915
6916 void perf_event_header__init_id(struct perf_event_header *header,
6917                                 struct perf_sample_data *data,
6918                                 struct perf_event *event)
6919 {
6920         if (event->attr.sample_id_all)
6921                 __perf_event_header__init_id(header, data, event);
6922 }
6923
6924 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6925                                            struct perf_sample_data *data)
6926 {
6927         u64 sample_type = data->type;
6928
6929         if (sample_type & PERF_SAMPLE_TID)
6930                 perf_output_put(handle, data->tid_entry);
6931
6932         if (sample_type & PERF_SAMPLE_TIME)
6933                 perf_output_put(handle, data->time);
6934
6935         if (sample_type & PERF_SAMPLE_ID)
6936                 perf_output_put(handle, data->id);
6937
6938         if (sample_type & PERF_SAMPLE_STREAM_ID)
6939                 perf_output_put(handle, data->stream_id);
6940
6941         if (sample_type & PERF_SAMPLE_CPU)
6942                 perf_output_put(handle, data->cpu_entry);
6943
6944         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6945                 perf_output_put(handle, data->id);
6946 }
6947
6948 void perf_event__output_id_sample(struct perf_event *event,
6949                                   struct perf_output_handle *handle,
6950                                   struct perf_sample_data *sample)
6951 {
6952         if (event->attr.sample_id_all)
6953                 __perf_event__output_id_sample(handle, sample);
6954 }
6955
6956 static void perf_output_read_one(struct perf_output_handle *handle,
6957                                  struct perf_event *event,
6958                                  u64 enabled, u64 running)
6959 {
6960         u64 read_format = event->attr.read_format;
6961         u64 values[4];
6962         int n = 0;
6963
6964         values[n++] = perf_event_count(event);
6965         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6966                 values[n++] = enabled +
6967                         atomic64_read(&event->child_total_time_enabled);
6968         }
6969         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6970                 values[n++] = running +
6971                         atomic64_read(&event->child_total_time_running);
6972         }
6973         if (read_format & PERF_FORMAT_ID)
6974                 values[n++] = primary_event_id(event);
6975
6976         __output_copy(handle, values, n * sizeof(u64));
6977 }
6978
6979 static void perf_output_read_group(struct perf_output_handle *handle,
6980                             struct perf_event *event,
6981                             u64 enabled, u64 running)
6982 {
6983         struct perf_event *leader = event->group_leader, *sub;
6984         u64 read_format = event->attr.read_format;
6985         u64 values[5];
6986         int n = 0;
6987
6988         values[n++] = 1 + leader->nr_siblings;
6989
6990         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6991                 values[n++] = enabled;
6992
6993         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6994                 values[n++] = running;
6995
6996         if ((leader != event) &&
6997             (leader->state == PERF_EVENT_STATE_ACTIVE))
6998                 leader->pmu->read(leader);
6999
7000         values[n++] = perf_event_count(leader);
7001         if (read_format & PERF_FORMAT_ID)
7002                 values[n++] = primary_event_id(leader);
7003
7004         __output_copy(handle, values, n * sizeof(u64));
7005
7006         for_each_sibling_event(sub, leader) {
7007                 n = 0;
7008
7009                 if ((sub != event) &&
7010                     (sub->state == PERF_EVENT_STATE_ACTIVE))
7011                         sub->pmu->read(sub);
7012
7013                 values[n++] = perf_event_count(sub);
7014                 if (read_format & PERF_FORMAT_ID)
7015                         values[n++] = primary_event_id(sub);
7016
7017                 __output_copy(handle, values, n * sizeof(u64));
7018         }
7019 }
7020
7021 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
7022                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
7023
7024 /*
7025  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
7026  *
7027  * The problem is that its both hard and excessively expensive to iterate the
7028  * child list, not to mention that its impossible to IPI the children running
7029  * on another CPU, from interrupt/NMI context.
7030  */
7031 static void perf_output_read(struct perf_output_handle *handle,
7032                              struct perf_event *event)
7033 {
7034         u64 enabled = 0, running = 0, now;
7035         u64 read_format = event->attr.read_format;
7036
7037         /*
7038          * compute total_time_enabled, total_time_running
7039          * based on snapshot values taken when the event
7040          * was last scheduled in.
7041          *
7042          * we cannot simply called update_context_time()
7043          * because of locking issue as we are called in
7044          * NMI context
7045          */
7046         if (read_format & PERF_FORMAT_TOTAL_TIMES)
7047                 calc_timer_values(event, &now, &enabled, &running);
7048
7049         if (event->attr.read_format & PERF_FORMAT_GROUP)
7050                 perf_output_read_group(handle, event, enabled, running);
7051         else
7052                 perf_output_read_one(handle, event, enabled, running);
7053 }
7054
7055 static inline bool perf_sample_save_hw_index(struct perf_event *event)
7056 {
7057         return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
7058 }
7059
7060 void perf_output_sample(struct perf_output_handle *handle,
7061                         struct perf_event_header *header,
7062                         struct perf_sample_data *data,
7063                         struct perf_event *event)
7064 {
7065         u64 sample_type = data->type;
7066
7067         perf_output_put(handle, *header);
7068
7069         if (sample_type & PERF_SAMPLE_IDENTIFIER)
7070                 perf_output_put(handle, data->id);
7071
7072         if (sample_type & PERF_SAMPLE_IP)
7073                 perf_output_put(handle, data->ip);
7074
7075         if (sample_type & PERF_SAMPLE_TID)
7076                 perf_output_put(handle, data->tid_entry);
7077
7078         if (sample_type & PERF_SAMPLE_TIME)
7079                 perf_output_put(handle, data->time);
7080
7081         if (sample_type & PERF_SAMPLE_ADDR)
7082                 perf_output_put(handle, data->addr);
7083
7084         if (sample_type & PERF_SAMPLE_ID)
7085                 perf_output_put(handle, data->id);
7086
7087         if (sample_type & PERF_SAMPLE_STREAM_ID)
7088                 perf_output_put(handle, data->stream_id);
7089
7090         if (sample_type & PERF_SAMPLE_CPU)
7091                 perf_output_put(handle, data->cpu_entry);
7092
7093         if (sample_type & PERF_SAMPLE_PERIOD)
7094                 perf_output_put(handle, data->period);
7095
7096         if (sample_type & PERF_SAMPLE_READ)
7097                 perf_output_read(handle, event);
7098
7099         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7100                 int size = 1;
7101
7102                 size += data->callchain->nr;
7103                 size *= sizeof(u64);
7104                 __output_copy(handle, data->callchain, size);
7105         }
7106
7107         if (sample_type & PERF_SAMPLE_RAW) {
7108                 struct perf_raw_record *raw = data->raw;
7109
7110                 if (raw) {
7111                         struct perf_raw_frag *frag = &raw->frag;
7112
7113                         perf_output_put(handle, raw->size);
7114                         do {
7115                                 if (frag->copy) {
7116                                         __output_custom(handle, frag->copy,
7117                                                         frag->data, frag->size);
7118                                 } else {
7119                                         __output_copy(handle, frag->data,
7120                                                       frag->size);
7121                                 }
7122                                 if (perf_raw_frag_last(frag))
7123                                         break;
7124                                 frag = frag->next;
7125                         } while (1);
7126                         if (frag->pad)
7127                                 __output_skip(handle, NULL, frag->pad);
7128                 } else {
7129                         struct {
7130                                 u32     size;
7131                                 u32     data;
7132                         } raw = {
7133                                 .size = sizeof(u32),
7134                                 .data = 0,
7135                         };
7136                         perf_output_put(handle, raw);
7137                 }
7138         }
7139
7140         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7141                 if (data->br_stack) {
7142                         size_t size;
7143
7144                         size = data->br_stack->nr
7145                              * sizeof(struct perf_branch_entry);
7146
7147                         perf_output_put(handle, data->br_stack->nr);
7148                         if (perf_sample_save_hw_index(event))
7149                                 perf_output_put(handle, data->br_stack->hw_idx);
7150                         perf_output_copy(handle, data->br_stack->entries, size);
7151                 } else {
7152                         /*
7153                          * we always store at least the value of nr
7154                          */
7155                         u64 nr = 0;
7156                         perf_output_put(handle, nr);
7157                 }
7158         }
7159
7160         if (sample_type & PERF_SAMPLE_REGS_USER) {
7161                 u64 abi = data->regs_user.abi;
7162
7163                 /*
7164                  * If there are no regs to dump, notice it through
7165                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
7166                  */
7167                 perf_output_put(handle, abi);
7168
7169                 if (abi) {
7170                         u64 mask = event->attr.sample_regs_user;
7171                         perf_output_sample_regs(handle,
7172                                                 data->regs_user.regs,
7173                                                 mask);
7174                 }
7175         }
7176
7177         if (sample_type & PERF_SAMPLE_STACK_USER) {
7178                 perf_output_sample_ustack(handle,
7179                                           data->stack_user_size,
7180                                           data->regs_user.regs);
7181         }
7182
7183         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
7184                 perf_output_put(handle, data->weight.full);
7185
7186         if (sample_type & PERF_SAMPLE_DATA_SRC)
7187                 perf_output_put(handle, data->data_src.val);
7188
7189         if (sample_type & PERF_SAMPLE_TRANSACTION)
7190                 perf_output_put(handle, data->txn);
7191
7192         if (sample_type & PERF_SAMPLE_REGS_INTR) {
7193                 u64 abi = data->regs_intr.abi;
7194                 /*
7195                  * If there are no regs to dump, notice it through
7196                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
7197                  */
7198                 perf_output_put(handle, abi);
7199
7200                 if (abi) {
7201                         u64 mask = event->attr.sample_regs_intr;
7202
7203                         perf_output_sample_regs(handle,
7204                                                 data->regs_intr.regs,
7205                                                 mask);
7206                 }
7207         }
7208
7209         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7210                 perf_output_put(handle, data->phys_addr);
7211
7212         if (sample_type & PERF_SAMPLE_CGROUP)
7213                 perf_output_put(handle, data->cgroup);
7214
7215         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7216                 perf_output_put(handle, data->data_page_size);
7217
7218         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7219                 perf_output_put(handle, data->code_page_size);
7220
7221         if (sample_type & PERF_SAMPLE_AUX) {
7222                 perf_output_put(handle, data->aux_size);
7223
7224                 if (data->aux_size)
7225                         perf_aux_sample_output(event, handle, data);
7226         }
7227
7228         if (!event->attr.watermark) {
7229                 int wakeup_events = event->attr.wakeup_events;
7230
7231                 if (wakeup_events) {
7232                         struct perf_buffer *rb = handle->rb;
7233                         int events = local_inc_return(&rb->events);
7234
7235                         if (events >= wakeup_events) {
7236                                 local_sub(wakeup_events, &rb->events);
7237                                 local_inc(&rb->wakeup);
7238                         }
7239                 }
7240         }
7241 }
7242
7243 static u64 perf_virt_to_phys(u64 virt)
7244 {
7245         u64 phys_addr = 0;
7246
7247         if (!virt)
7248                 return 0;
7249
7250         if (virt >= TASK_SIZE) {
7251                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
7252                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
7253                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
7254                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
7255         } else {
7256                 /*
7257                  * Walking the pages tables for user address.
7258                  * Interrupts are disabled, so it prevents any tear down
7259                  * of the page tables.
7260                  * Try IRQ-safe get_user_page_fast_only first.
7261                  * If failed, leave phys_addr as 0.
7262                  */
7263                 if (current->mm != NULL) {
7264                         struct page *p;
7265
7266                         pagefault_disable();
7267                         if (get_user_page_fast_only(virt, 0, &p)) {
7268                                 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
7269                                 put_page(p);
7270                         }
7271                         pagefault_enable();
7272                 }
7273         }
7274
7275         return phys_addr;
7276 }
7277
7278 /*
7279  * Return the pagetable size of a given virtual address.
7280  */
7281 static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
7282 {
7283         u64 size = 0;
7284
7285 #ifdef CONFIG_HAVE_FAST_GUP
7286         pgd_t *pgdp, pgd;
7287         p4d_t *p4dp, p4d;
7288         pud_t *pudp, pud;
7289         pmd_t *pmdp, pmd;
7290         pte_t *ptep, pte;
7291
7292         pgdp = pgd_offset(mm, addr);
7293         pgd = READ_ONCE(*pgdp);
7294         if (pgd_none(pgd))
7295                 return 0;
7296
7297         if (pgd_leaf(pgd))
7298                 return pgd_leaf_size(pgd);
7299
7300         p4dp = p4d_offset_lockless(pgdp, pgd, addr);
7301         p4d = READ_ONCE(*p4dp);
7302         if (!p4d_present(p4d))
7303                 return 0;
7304
7305         if (p4d_leaf(p4d))
7306                 return p4d_leaf_size(p4d);
7307
7308         pudp = pud_offset_lockless(p4dp, p4d, addr);
7309         pud = READ_ONCE(*pudp);
7310         if (!pud_present(pud))
7311                 return 0;
7312
7313         if (pud_leaf(pud))
7314                 return pud_leaf_size(pud);
7315
7316         pmdp = pmd_offset_lockless(pudp, pud, addr);
7317         pmd = READ_ONCE(*pmdp);
7318         if (!pmd_present(pmd))
7319                 return 0;
7320
7321         if (pmd_leaf(pmd))
7322                 return pmd_leaf_size(pmd);
7323
7324         ptep = pte_offset_map(&pmd, addr);
7325         pte = ptep_get_lockless(ptep);
7326         if (pte_present(pte))
7327                 size = pte_leaf_size(pte);
7328         pte_unmap(ptep);
7329 #endif /* CONFIG_HAVE_FAST_GUP */
7330
7331         return size;
7332 }
7333
7334 static u64 perf_get_page_size(unsigned long addr)
7335 {
7336         struct mm_struct *mm;
7337         unsigned long flags;
7338         u64 size;
7339
7340         if (!addr)
7341                 return 0;
7342
7343         /*
7344          * Software page-table walkers must disable IRQs,
7345          * which prevents any tear down of the page tables.
7346          */
7347         local_irq_save(flags);
7348
7349         mm = current->mm;
7350         if (!mm) {
7351                 /*
7352                  * For kernel threads and the like, use init_mm so that
7353                  * we can find kernel memory.
7354                  */
7355                 mm = &init_mm;
7356         }
7357
7358         size = perf_get_pgtable_size(mm, addr);
7359
7360         local_irq_restore(flags);
7361
7362         return size;
7363 }
7364
7365 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
7366
7367 struct perf_callchain_entry *
7368 perf_callchain(struct perf_event *event, struct pt_regs *regs)
7369 {
7370         bool kernel = !event->attr.exclude_callchain_kernel;
7371         bool user   = !event->attr.exclude_callchain_user;
7372         /* Disallow cross-task user callchains. */
7373         bool crosstask = event->ctx->task && event->ctx->task != current;
7374         const u32 max_stack = event->attr.sample_max_stack;
7375         struct perf_callchain_entry *callchain;
7376
7377         if (!kernel && !user)
7378                 return &__empty_callchain;
7379
7380         callchain = get_perf_callchain(regs, 0, kernel, user,
7381                                        max_stack, crosstask, true);
7382         return callchain ?: &__empty_callchain;
7383 }
7384
7385 void perf_prepare_sample(struct perf_event_header *header,
7386                          struct perf_sample_data *data,
7387                          struct perf_event *event,
7388                          struct pt_regs *regs)
7389 {
7390         u64 sample_type = event->attr.sample_type;
7391
7392         header->type = PERF_RECORD_SAMPLE;
7393         header->size = sizeof(*header) + event->header_size;
7394
7395         header->misc = 0;
7396         header->misc |= perf_misc_flags(regs);
7397
7398         __perf_event_header__init_id(header, data, event);
7399
7400         if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
7401                 data->ip = perf_instruction_pointer(regs);
7402
7403         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7404                 int size = 1;
7405
7406                 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
7407                         data->callchain = perf_callchain(event, regs);
7408
7409                 size += data->callchain->nr;
7410
7411                 header->size += size * sizeof(u64);
7412         }
7413
7414         if (sample_type & PERF_SAMPLE_RAW) {
7415                 struct perf_raw_record *raw = data->raw;
7416                 int size;
7417
7418                 if (raw) {
7419                         struct perf_raw_frag *frag = &raw->frag;
7420                         u32 sum = 0;
7421
7422                         do {
7423                                 sum += frag->size;
7424                                 if (perf_raw_frag_last(frag))
7425                                         break;
7426                                 frag = frag->next;
7427                         } while (1);
7428
7429                         size = round_up(sum + sizeof(u32), sizeof(u64));
7430                         raw->size = size - sizeof(u32);
7431                         frag->pad = raw->size - sum;
7432                 } else {
7433                         size = sizeof(u64);
7434                 }
7435
7436                 header->size += size;
7437         }
7438
7439         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7440                 int size = sizeof(u64); /* nr */
7441                 if (data->br_stack) {
7442                         if (perf_sample_save_hw_index(event))
7443                                 size += sizeof(u64);
7444
7445                         size += data->br_stack->nr
7446                               * sizeof(struct perf_branch_entry);
7447                 }
7448                 header->size += size;
7449         }
7450
7451         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7452                 perf_sample_regs_user(&data->regs_user, regs);
7453
7454         if (sample_type & PERF_SAMPLE_REGS_USER) {
7455                 /* regs dump ABI info */
7456                 int size = sizeof(u64);
7457
7458                 if (data->regs_user.regs) {
7459                         u64 mask = event->attr.sample_regs_user;
7460                         size += hweight64(mask) * sizeof(u64);
7461                 }
7462
7463                 header->size += size;
7464         }
7465
7466         if (sample_type & PERF_SAMPLE_STACK_USER) {
7467                 /*
7468                  * Either we need PERF_SAMPLE_STACK_USER bit to be always
7469                  * processed as the last one or have additional check added
7470                  * in case new sample type is added, because we could eat
7471                  * up the rest of the sample size.
7472                  */
7473                 u16 stack_size = event->attr.sample_stack_user;
7474                 u16 size = sizeof(u64);
7475
7476                 stack_size = perf_sample_ustack_size(stack_size, header->size,
7477                                                      data->regs_user.regs);
7478
7479                 /*
7480                  * If there is something to dump, add space for the dump
7481                  * itself and for the field that tells the dynamic size,
7482                  * which is how many have been actually dumped.
7483                  */
7484                 if (stack_size)
7485                         size += sizeof(u64) + stack_size;
7486
7487                 data->stack_user_size = stack_size;
7488                 header->size += size;
7489         }
7490
7491         if (sample_type & PERF_SAMPLE_REGS_INTR) {
7492                 /* regs dump ABI info */
7493                 int size = sizeof(u64);
7494
7495                 perf_sample_regs_intr(&data->regs_intr, regs);
7496
7497                 if (data->regs_intr.regs) {
7498                         u64 mask = event->attr.sample_regs_intr;
7499
7500                         size += hweight64(mask) * sizeof(u64);
7501                 }
7502
7503                 header->size += size;
7504         }
7505
7506         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7507                 data->phys_addr = perf_virt_to_phys(data->addr);
7508
7509 #ifdef CONFIG_CGROUP_PERF
7510         if (sample_type & PERF_SAMPLE_CGROUP) {
7511                 struct cgroup *cgrp;
7512
7513                 /* protected by RCU */
7514                 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7515                 data->cgroup = cgroup_id(cgrp);
7516         }
7517 #endif
7518
7519         /*
7520          * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
7521          * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
7522          * but the value will not dump to the userspace.
7523          */
7524         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7525                 data->data_page_size = perf_get_page_size(data->addr);
7526
7527         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7528                 data->code_page_size = perf_get_page_size(data->ip);
7529
7530         if (sample_type & PERF_SAMPLE_AUX) {
7531                 u64 size;
7532
7533                 header->size += sizeof(u64); /* size */
7534
7535                 /*
7536                  * Given the 16bit nature of header::size, an AUX sample can
7537                  * easily overflow it, what with all the preceding sample bits.
7538                  * Make sure this doesn't happen by using up to U16_MAX bytes
7539                  * per sample in total (rounded down to 8 byte boundary).
7540                  */
7541                 size = min_t(size_t, U16_MAX - header->size,
7542                              event->attr.aux_sample_size);
7543                 size = rounddown(size, 8);
7544                 size = perf_prepare_sample_aux(event, data, size);
7545
7546                 WARN_ON_ONCE(size + header->size > U16_MAX);
7547                 header->size += size;
7548         }
7549         /*
7550          * If you're adding more sample types here, you likely need to do
7551          * something about the overflowing header::size, like repurpose the
7552          * lowest 3 bits of size, which should be always zero at the moment.
7553          * This raises a more important question, do we really need 512k sized
7554          * samples and why, so good argumentation is in order for whatever you
7555          * do here next.
7556          */
7557         WARN_ON_ONCE(header->size & 7);
7558 }
7559
7560 static __always_inline int
7561 __perf_event_output(struct perf_event *event,
7562                     struct perf_sample_data *data,
7563                     struct pt_regs *regs,
7564                     int (*output_begin)(struct perf_output_handle *,
7565                                         struct perf_sample_data *,
7566                                         struct perf_event *,
7567                                         unsigned int))
7568 {
7569         struct perf_output_handle handle;
7570         struct perf_event_header header;
7571         int err;
7572
7573         /* protect the callchain buffers */
7574         rcu_read_lock();
7575
7576         perf_prepare_sample(&header, data, event, regs);
7577
7578         err = output_begin(&handle, data, event, header.size);
7579         if (err)
7580                 goto exit;
7581
7582         perf_output_sample(&handle, &header, data, event);
7583
7584         perf_output_end(&handle);
7585
7586 exit:
7587         rcu_read_unlock();
7588         return err;
7589 }
7590
7591 void
7592 perf_event_output_forward(struct perf_event *event,
7593                          struct perf_sample_data *data,
7594                          struct pt_regs *regs)
7595 {
7596         __perf_event_output(event, data, regs, perf_output_begin_forward);
7597 }
7598
7599 void
7600 perf_event_output_backward(struct perf_event *event,
7601                            struct perf_sample_data *data,
7602                            struct pt_regs *regs)
7603 {
7604         __perf_event_output(event, data, regs, perf_output_begin_backward);
7605 }
7606
7607 int
7608 perf_event_output(struct perf_event *event,
7609                   struct perf_sample_data *data,
7610                   struct pt_regs *regs)
7611 {
7612         return __perf_event_output(event, data, regs, perf_output_begin);
7613 }
7614
7615 /*
7616  * read event_id
7617  */
7618
7619 struct perf_read_event {
7620         struct perf_event_header        header;
7621
7622         u32                             pid;
7623         u32                             tid;
7624 };
7625
7626 static void
7627 perf_event_read_event(struct perf_event *event,
7628                         struct task_struct *task)
7629 {
7630         struct perf_output_handle handle;
7631         struct perf_sample_data sample;
7632         struct perf_read_event read_event = {
7633                 .header = {
7634                         .type = PERF_RECORD_READ,
7635                         .misc = 0,
7636                         .size = sizeof(read_event) + event->read_size,
7637                 },
7638                 .pid = perf_event_pid(event, task),
7639                 .tid = perf_event_tid(event, task),
7640         };
7641         int ret;
7642
7643         perf_event_header__init_id(&read_event.header, &sample, event);
7644         ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7645         if (ret)
7646                 return;
7647
7648         perf_output_put(&handle, read_event);
7649         perf_output_read(&handle, event);
7650         perf_event__output_id_sample(event, &handle, &sample);
7651
7652         perf_output_end(&handle);
7653 }
7654
7655 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7656
7657 static void
7658 perf_iterate_ctx(struct perf_event_context *ctx,
7659                    perf_iterate_f output,
7660                    void *data, bool all)
7661 {
7662         struct perf_event *event;
7663
7664         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7665                 if (!all) {
7666                         if (event->state < PERF_EVENT_STATE_INACTIVE)
7667                                 continue;
7668                         if (!event_filter_match(event))
7669                                 continue;
7670                 }
7671
7672                 output(event, data);
7673         }
7674 }
7675
7676 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7677 {
7678         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7679         struct perf_event *event;
7680
7681         list_for_each_entry_rcu(event, &pel->list, sb_list) {
7682                 /*
7683                  * Skip events that are not fully formed yet; ensure that
7684                  * if we observe event->ctx, both event and ctx will be
7685                  * complete enough. See perf_install_in_context().
7686                  */
7687                 if (!smp_load_acquire(&event->ctx))
7688                         continue;
7689
7690                 if (event->state < PERF_EVENT_STATE_INACTIVE)
7691                         continue;
7692                 if (!event_filter_match(event))
7693                         continue;
7694                 output(event, data);
7695         }
7696 }
7697
7698 /*
7699  * Iterate all events that need to receive side-band events.
7700  *
7701  * For new callers; ensure that account_pmu_sb_event() includes
7702  * your event, otherwise it might not get delivered.
7703  */
7704 static void
7705 perf_iterate_sb(perf_iterate_f output, void *data,
7706                struct perf_event_context *task_ctx)
7707 {
7708         struct perf_event_context *ctx;
7709         int ctxn;
7710
7711         rcu_read_lock();
7712         preempt_disable();
7713
7714         /*
7715          * If we have task_ctx != NULL we only notify the task context itself.
7716          * The task_ctx is set only for EXIT events before releasing task
7717          * context.
7718          */
7719         if (task_ctx) {
7720                 perf_iterate_ctx(task_ctx, output, data, false);
7721                 goto done;
7722         }
7723
7724         perf_iterate_sb_cpu(output, data);
7725
7726         for_each_task_context_nr(ctxn) {
7727                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7728                 if (ctx)
7729                         perf_iterate_ctx(ctx, output, data, false);
7730         }
7731 done:
7732         preempt_enable();
7733         rcu_read_unlock();
7734 }
7735
7736 /*
7737  * Clear all file-based filters at exec, they'll have to be
7738  * re-instated when/if these objects are mmapped again.
7739  */
7740 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7741 {
7742         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7743         struct perf_addr_filter *filter;
7744         unsigned int restart = 0, count = 0;
7745         unsigned long flags;
7746
7747         if (!has_addr_filter(event))
7748                 return;
7749
7750         raw_spin_lock_irqsave(&ifh->lock, flags);
7751         list_for_each_entry(filter, &ifh->list, entry) {
7752                 if (filter->path.dentry) {
7753                         event->addr_filter_ranges[count].start = 0;
7754                         event->addr_filter_ranges[count].size = 0;
7755                         restart++;
7756                 }
7757
7758                 count++;
7759         }
7760
7761         if (restart)
7762                 event->addr_filters_gen++;
7763         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7764
7765         if (restart)
7766                 perf_event_stop(event, 1);
7767 }
7768
7769 void perf_event_exec(void)
7770 {
7771         struct perf_event_context *ctx;
7772         int ctxn;
7773
7774         for_each_task_context_nr(ctxn) {
7775                 perf_event_enable_on_exec(ctxn);
7776                 perf_event_remove_on_exec(ctxn);
7777
7778                 rcu_read_lock();
7779                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7780                 if (ctx) {
7781                         perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
7782                                          NULL, true);
7783                 }
7784                 rcu_read_unlock();
7785         }
7786 }
7787
7788 struct remote_output {
7789         struct perf_buffer      *rb;
7790         int                     err;
7791 };
7792
7793 static void __perf_event_output_stop(struct perf_event *event, void *data)
7794 {
7795         struct perf_event *parent = event->parent;
7796         struct remote_output *ro = data;
7797         struct perf_buffer *rb = ro->rb;
7798         struct stop_event_data sd = {
7799                 .event  = event,
7800         };
7801
7802         if (!has_aux(event))
7803                 return;
7804
7805         if (!parent)
7806                 parent = event;
7807
7808         /*
7809          * In case of inheritance, it will be the parent that links to the
7810          * ring-buffer, but it will be the child that's actually using it.
7811          *
7812          * We are using event::rb to determine if the event should be stopped,
7813          * however this may race with ring_buffer_attach() (through set_output),
7814          * which will make us skip the event that actually needs to be stopped.
7815          * So ring_buffer_attach() has to stop an aux event before re-assigning
7816          * its rb pointer.
7817          */
7818         if (rcu_dereference(parent->rb) == rb)
7819                 ro->err = __perf_event_stop(&sd);
7820 }
7821
7822 static int __perf_pmu_output_stop(void *info)
7823 {
7824         struct perf_event *event = info;
7825         struct pmu *pmu = event->ctx->pmu;
7826         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7827         struct remote_output ro = {
7828                 .rb     = event->rb,
7829         };
7830
7831         rcu_read_lock();
7832         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7833         if (cpuctx->task_ctx)
7834                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7835                                    &ro, false);
7836         rcu_read_unlock();
7837
7838         return ro.err;
7839 }
7840
7841 static void perf_pmu_output_stop(struct perf_event *event)
7842 {
7843         struct perf_event *iter;
7844         int err, cpu;
7845
7846 restart:
7847         rcu_read_lock();
7848         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7849                 /*
7850                  * For per-CPU events, we need to make sure that neither they
7851                  * nor their children are running; for cpu==-1 events it's
7852                  * sufficient to stop the event itself if it's active, since
7853                  * it can't have children.
7854                  */
7855                 cpu = iter->cpu;
7856                 if (cpu == -1)
7857                         cpu = READ_ONCE(iter->oncpu);
7858
7859                 if (cpu == -1)
7860                         continue;
7861
7862                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7863                 if (err == -EAGAIN) {
7864                         rcu_read_unlock();
7865                         goto restart;
7866                 }
7867         }
7868         rcu_read_unlock();
7869 }
7870
7871 /*
7872  * task tracking -- fork/exit
7873  *
7874  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
7875  */
7876
7877 struct perf_task_event {
7878         struct task_struct              *task;
7879         struct perf_event_context       *task_ctx;
7880
7881         struct {
7882                 struct perf_event_header        header;
7883
7884                 u32                             pid;
7885                 u32                             ppid;
7886                 u32                             tid;
7887                 u32                             ptid;
7888                 u64                             time;
7889         } event_id;
7890 };
7891
7892 static int perf_event_task_match(struct perf_event *event)
7893 {
7894         return event->attr.comm  || event->attr.mmap ||
7895                event->attr.mmap2 || event->attr.mmap_data ||
7896                event->attr.task;
7897 }
7898
7899 static void perf_event_task_output(struct perf_event *event,
7900                                    void *data)
7901 {
7902         struct perf_task_event *task_event = data;
7903         struct perf_output_handle handle;
7904         struct perf_sample_data sample;
7905         struct task_struct *task = task_event->task;
7906         int ret, size = task_event->event_id.header.size;
7907
7908         if (!perf_event_task_match(event))
7909                 return;
7910
7911         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7912
7913         ret = perf_output_begin(&handle, &sample, event,
7914                                 task_event->event_id.header.size);
7915         if (ret)
7916                 goto out;
7917
7918         task_event->event_id.pid = perf_event_pid(event, task);
7919         task_event->event_id.tid = perf_event_tid(event, task);
7920
7921         if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
7922                 task_event->event_id.ppid = perf_event_pid(event,
7923                                                         task->real_parent);
7924                 task_event->event_id.ptid = perf_event_pid(event,
7925                                                         task->real_parent);
7926         } else {  /* PERF_RECORD_FORK */
7927                 task_event->event_id.ppid = perf_event_pid(event, current);
7928                 task_event->event_id.ptid = perf_event_tid(event, current);
7929         }
7930
7931         task_event->event_id.time = perf_event_clock(event);
7932
7933         perf_output_put(&handle, task_event->event_id);
7934
7935         perf_event__output_id_sample(event, &handle, &sample);
7936
7937         perf_output_end(&handle);
7938 out:
7939         task_event->event_id.header.size = size;
7940 }
7941
7942 static void perf_event_task(struct task_struct *task,
7943                               struct perf_event_context *task_ctx,
7944                               int new)
7945 {
7946         struct perf_task_event task_event;
7947
7948         if (!atomic_read(&nr_comm_events) &&
7949             !atomic_read(&nr_mmap_events) &&
7950             !atomic_read(&nr_task_events))
7951                 return;
7952
7953         task_event = (struct perf_task_event){
7954                 .task     = task,
7955                 .task_ctx = task_ctx,
7956                 .event_id    = {
7957                         .header = {
7958                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7959                                 .misc = 0,
7960                                 .size = sizeof(task_event.event_id),
7961                         },
7962                         /* .pid  */
7963                         /* .ppid */
7964                         /* .tid  */
7965                         /* .ptid */
7966                         /* .time */
7967                 },
7968         };
7969
7970         perf_iterate_sb(perf_event_task_output,
7971                        &task_event,
7972                        task_ctx);
7973 }
7974
7975 void perf_event_fork(struct task_struct *task)
7976 {
7977         perf_event_task(task, NULL, 1);
7978         perf_event_namespaces(task);
7979 }
7980
7981 /*
7982  * comm tracking
7983  */
7984
7985 struct perf_comm_event {
7986         struct task_struct      *task;
7987         char                    *comm;
7988         int                     comm_size;
7989
7990         struct {
7991                 struct perf_event_header        header;
7992
7993                 u32                             pid;
7994                 u32                             tid;
7995         } event_id;
7996 };
7997
7998 static int perf_event_comm_match(struct perf_event *event)
7999 {
8000         return event->attr.comm;
8001 }
8002
8003 static void perf_event_comm_output(struct perf_event *event,
8004                                    void *data)
8005 {
8006         struct perf_comm_event *comm_event = data;
8007         struct perf_output_handle handle;
8008         struct perf_sample_data sample;
8009         int size = comm_event->event_id.header.size;
8010         int ret;
8011
8012         if (!perf_event_comm_match(event))
8013                 return;
8014
8015         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
8016         ret = perf_output_begin(&handle, &sample, event,
8017                                 comm_event->event_id.header.size);
8018
8019         if (ret)
8020                 goto out;
8021
8022         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
8023         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
8024
8025         perf_output_put(&handle, comm_event->event_id);
8026         __output_copy(&handle, comm_event->comm,
8027                                    comm_event->comm_size);
8028
8029         perf_event__output_id_sample(event, &handle, &sample);
8030
8031         perf_output_end(&handle);
8032 out:
8033         comm_event->event_id.header.size = size;
8034 }
8035
8036 static void perf_event_comm_event(struct perf_comm_event *comm_event)
8037 {
8038         char comm[TASK_COMM_LEN];
8039         unsigned int size;
8040
8041         memset(comm, 0, sizeof(comm));
8042         strlcpy(comm, comm_event->task->comm, sizeof(comm));
8043         size = ALIGN(strlen(comm)+1, sizeof(u64));
8044
8045         comm_event->comm = comm;
8046         comm_event->comm_size = size;
8047
8048         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
8049
8050         perf_iterate_sb(perf_event_comm_output,
8051                        comm_event,
8052                        NULL);
8053 }
8054
8055 void perf_event_comm(struct task_struct *task, bool exec)
8056 {
8057         struct perf_comm_event comm_event;
8058
8059         if (!atomic_read(&nr_comm_events))
8060                 return;
8061
8062         comm_event = (struct perf_comm_event){
8063                 .task   = task,
8064                 /* .comm      */
8065                 /* .comm_size */
8066                 .event_id  = {
8067                         .header = {
8068                                 .type = PERF_RECORD_COMM,
8069                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
8070                                 /* .size */
8071                         },
8072                         /* .pid */
8073                         /* .tid */
8074                 },
8075         };
8076
8077         perf_event_comm_event(&comm_event);
8078 }
8079
8080 /*
8081  * namespaces tracking
8082  */
8083
8084 struct perf_namespaces_event {
8085         struct task_struct              *task;
8086
8087         struct {
8088                 struct perf_event_header        header;
8089
8090                 u32                             pid;
8091                 u32                             tid;
8092                 u64                             nr_namespaces;
8093                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
8094         } event_id;
8095 };
8096
8097 static int perf_event_namespaces_match(struct perf_event *event)
8098 {
8099         return event->attr.namespaces;
8100 }
8101
8102 static void perf_event_namespaces_output(struct perf_event *event,
8103                                          void *data)
8104 {
8105         struct perf_namespaces_event *namespaces_event = data;
8106         struct perf_output_handle handle;
8107         struct perf_sample_data sample;
8108         u16 header_size = namespaces_event->event_id.header.size;
8109         int ret;
8110
8111         if (!perf_event_namespaces_match(event))
8112                 return;
8113
8114         perf_event_header__init_id(&namespaces_event->event_id.header,
8115                                    &sample, event);
8116         ret = perf_output_begin(&handle, &sample, event,
8117                                 namespaces_event->event_id.header.size);
8118         if (ret)
8119                 goto out;
8120
8121         namespaces_event->event_id.pid = perf_event_pid(event,
8122                                                         namespaces_event->task);
8123         namespaces_event->event_id.tid = perf_event_tid(event,
8124                                                         namespaces_event->task);
8125
8126         perf_output_put(&handle, namespaces_event->event_id);
8127
8128         perf_event__output_id_sample(event, &handle, &sample);
8129
8130         perf_output_end(&handle);
8131 out:
8132         namespaces_event->event_id.header.size = header_size;
8133 }
8134
8135 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
8136                                    struct task_struct *task,
8137                                    const struct proc_ns_operations *ns_ops)
8138 {
8139         struct path ns_path;
8140         struct inode *ns_inode;
8141         int error;
8142
8143         error = ns_get_path(&ns_path, task, ns_ops);
8144         if (!error) {
8145                 ns_inode = ns_path.dentry->d_inode;
8146                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
8147                 ns_link_info->ino = ns_inode->i_ino;
8148                 path_put(&ns_path);
8149         }
8150 }
8151
8152 void perf_event_namespaces(struct task_struct *task)
8153 {
8154         struct perf_namespaces_event namespaces_event;
8155         struct perf_ns_link_info *ns_link_info;
8156
8157         if (!atomic_read(&nr_namespaces_events))
8158                 return;
8159
8160         namespaces_event = (struct perf_namespaces_event){
8161                 .task   = task,
8162                 .event_id  = {
8163                         .header = {
8164                                 .type = PERF_RECORD_NAMESPACES,
8165                                 .misc = 0,
8166                                 .size = sizeof(namespaces_event.event_id),
8167                         },
8168                         /* .pid */
8169                         /* .tid */
8170                         .nr_namespaces = NR_NAMESPACES,
8171                         /* .link_info[NR_NAMESPACES] */
8172                 },
8173         };
8174
8175         ns_link_info = namespaces_event.event_id.link_info;
8176
8177         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
8178                                task, &mntns_operations);
8179
8180 #ifdef CONFIG_USER_NS
8181         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
8182                                task, &userns_operations);
8183 #endif
8184 #ifdef CONFIG_NET_NS
8185         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
8186                                task, &netns_operations);
8187 #endif
8188 #ifdef CONFIG_UTS_NS
8189         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
8190                                task, &utsns_operations);
8191 #endif
8192 #ifdef CONFIG_IPC_NS
8193         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
8194                                task, &ipcns_operations);
8195 #endif
8196 #ifdef CONFIG_PID_NS
8197         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
8198                                task, &pidns_operations);
8199 #endif
8200 #ifdef CONFIG_CGROUPS
8201         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
8202                                task, &cgroupns_operations);
8203 #endif
8204
8205         perf_iterate_sb(perf_event_namespaces_output,
8206                         &namespaces_event,
8207                         NULL);
8208 }
8209
8210 /*
8211  * cgroup tracking
8212  */
8213 #ifdef CONFIG_CGROUP_PERF
8214
8215 struct perf_cgroup_event {
8216         char                            *path;
8217         int                             path_size;
8218         struct {
8219                 struct perf_event_header        header;
8220                 u64                             id;
8221                 char                            path[];
8222         } event_id;
8223 };
8224
8225 static int perf_event_cgroup_match(struct perf_event *event)
8226 {
8227         return event->attr.cgroup;
8228 }
8229
8230 static void perf_event_cgroup_output(struct perf_event *event, void *data)
8231 {
8232         struct perf_cgroup_event *cgroup_event = data;
8233         struct perf_output_handle handle;
8234         struct perf_sample_data sample;
8235         u16 header_size = cgroup_event->event_id.header.size;
8236         int ret;
8237
8238         if (!perf_event_cgroup_match(event))
8239                 return;
8240
8241         perf_event_header__init_id(&cgroup_event->event_id.header,
8242                                    &sample, event);
8243         ret = perf_output_begin(&handle, &sample, event,
8244                                 cgroup_event->event_id.header.size);
8245         if (ret)
8246                 goto out;
8247
8248         perf_output_put(&handle, cgroup_event->event_id);
8249         __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
8250
8251         perf_event__output_id_sample(event, &handle, &sample);
8252
8253         perf_output_end(&handle);
8254 out:
8255         cgroup_event->event_id.header.size = header_size;
8256 }
8257
8258 static void perf_event_cgroup(struct cgroup *cgrp)
8259 {
8260         struct perf_cgroup_event cgroup_event;
8261         char path_enomem[16] = "//enomem";
8262         char *pathname;
8263         size_t size;
8264
8265         if (!atomic_read(&nr_cgroup_events))
8266                 return;
8267
8268         cgroup_event = (struct perf_cgroup_event){
8269                 .event_id  = {
8270                         .header = {
8271                                 .type = PERF_RECORD_CGROUP,
8272                                 .misc = 0,
8273                                 .size = sizeof(cgroup_event.event_id),
8274                         },
8275                         .id = cgroup_id(cgrp),
8276                 },
8277         };
8278
8279         pathname = kmalloc(PATH_MAX, GFP_KERNEL);
8280         if (pathname == NULL) {
8281                 cgroup_event.path = path_enomem;
8282         } else {
8283                 /* just to be sure to have enough space for alignment */
8284                 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
8285                 cgroup_event.path = pathname;
8286         }
8287
8288         /*
8289          * Since our buffer works in 8 byte units we need to align our string
8290          * size to a multiple of 8. However, we must guarantee the tail end is
8291          * zero'd out to avoid leaking random bits to userspace.
8292          */
8293         size = strlen(cgroup_event.path) + 1;
8294         while (!IS_ALIGNED(size, sizeof(u64)))
8295                 cgroup_event.path[size++] = '\0';
8296
8297         cgroup_event.event_id.header.size += size;
8298         cgroup_event.path_size = size;
8299
8300         perf_iterate_sb(perf_event_cgroup_output,
8301                         &cgroup_event,
8302                         NULL);
8303
8304         kfree(pathname);
8305 }
8306
8307 #endif
8308
8309 /*
8310  * mmap tracking
8311  */
8312
8313 struct perf_mmap_event {
8314         struct vm_area_struct   *vma;
8315
8316         const char              *file_name;
8317         int                     file_size;
8318         int                     maj, min;
8319         u64                     ino;
8320         u64                     ino_generation;
8321         u32                     prot, flags;
8322         u8                      build_id[BUILD_ID_SIZE_MAX];
8323         u32                     build_id_size;
8324
8325         struct {
8326                 struct perf_event_header        header;
8327
8328                 u32                             pid;
8329                 u32                             tid;
8330                 u64                             start;
8331                 u64                             len;
8332                 u64                             pgoff;
8333         } event_id;
8334 };
8335
8336 static int perf_event_mmap_match(struct perf_event *event,
8337                                  void *data)
8338 {
8339         struct perf_mmap_event *mmap_event = data;
8340         struct vm_area_struct *vma = mmap_event->vma;
8341         int executable = vma->vm_flags & VM_EXEC;
8342
8343         return (!executable && event->attr.mmap_data) ||
8344                (executable && (event->attr.mmap || event->attr.mmap2));
8345 }
8346
8347 static void perf_event_mmap_output(struct perf_event *event,
8348                                    void *data)
8349 {
8350         struct perf_mmap_event *mmap_event = data;
8351         struct perf_output_handle handle;
8352         struct perf_sample_data sample;
8353         int size = mmap_event->event_id.header.size;
8354         u32 type = mmap_event->event_id.header.type;
8355         bool use_build_id;
8356         int ret;
8357
8358         if (!perf_event_mmap_match(event, data))
8359                 return;
8360
8361         if (event->attr.mmap2) {
8362                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
8363                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
8364                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
8365                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
8366                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
8367                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
8368                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
8369         }
8370
8371         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
8372         ret = perf_output_begin(&handle, &sample, event,
8373                                 mmap_event->event_id.header.size);
8374         if (ret)
8375                 goto out;
8376
8377         mmap_event->event_id.pid = perf_event_pid(event, current);
8378         mmap_event->event_id.tid = perf_event_tid(event, current);
8379
8380         use_build_id = event->attr.build_id && mmap_event->build_id_size;
8381
8382         if (event->attr.mmap2 && use_build_id)
8383                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
8384
8385         perf_output_put(&handle, mmap_event->event_id);
8386
8387         if (event->attr.mmap2) {
8388                 if (use_build_id) {
8389                         u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
8390
8391                         __output_copy(&handle, size, 4);
8392                         __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
8393                 } else {
8394                         perf_output_put(&handle, mmap_event->maj);
8395                         perf_output_put(&handle, mmap_event->min);
8396                         perf_output_put(&handle, mmap_event->ino);
8397                         perf_output_put(&handle, mmap_event->ino_generation);
8398                 }
8399                 perf_output_put(&handle, mmap_event->prot);
8400                 perf_output_put(&handle, mmap_event->flags);
8401         }
8402
8403         __output_copy(&handle, mmap_event->file_name,
8404                                    mmap_event->file_size);
8405
8406         perf_event__output_id_sample(event, &handle, &sample);
8407
8408         perf_output_end(&handle);
8409 out:
8410         mmap_event->event_id.header.size = size;
8411         mmap_event->event_id.header.type = type;
8412 }
8413
8414 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8415 {
8416         struct vm_area_struct *vma = mmap_event->vma;
8417         struct file *file = vma->vm_file;
8418         int maj = 0, min = 0;
8419         u64 ino = 0, gen = 0;
8420         u32 prot = 0, flags = 0;
8421         unsigned int size;
8422         char tmp[16];
8423         char *buf = NULL;
8424         char *name;
8425
8426         if (vma->vm_flags & VM_READ)
8427                 prot |= PROT_READ;
8428         if (vma->vm_flags & VM_WRITE)
8429                 prot |= PROT_WRITE;
8430         if (vma->vm_flags & VM_EXEC)
8431                 prot |= PROT_EXEC;
8432
8433         if (vma->vm_flags & VM_MAYSHARE)
8434                 flags = MAP_SHARED;
8435         else
8436                 flags = MAP_PRIVATE;
8437
8438         if (vma->vm_flags & VM_LOCKED)
8439                 flags |= MAP_LOCKED;
8440         if (is_vm_hugetlb_page(vma))
8441                 flags |= MAP_HUGETLB;
8442
8443         if (file) {
8444                 struct inode *inode;
8445                 dev_t dev;
8446
8447                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
8448                 if (!buf) {
8449                         name = "//enomem";
8450                         goto cpy_name;
8451                 }
8452                 /*
8453                  * d_path() works from the end of the rb backwards, so we
8454                  * need to add enough zero bytes after the string to handle
8455                  * the 64bit alignment we do later.
8456                  */
8457                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
8458                 if (IS_ERR(name)) {
8459                         name = "//toolong";
8460                         goto cpy_name;
8461                 }
8462                 inode = file_inode(vma->vm_file);
8463                 dev = inode->i_sb->s_dev;
8464                 ino = inode->i_ino;
8465                 gen = inode->i_generation;
8466                 maj = MAJOR(dev);
8467                 min = MINOR(dev);
8468
8469                 goto got_name;
8470         } else {
8471                 if (vma->vm_ops && vma->vm_ops->name) {
8472                         name = (char *) vma->vm_ops->name(vma);
8473                         if (name)
8474                                 goto cpy_name;
8475                 }
8476
8477                 name = (char *)arch_vma_name(vma);
8478                 if (name)
8479                         goto cpy_name;
8480
8481                 if (vma->vm_start <= vma->vm_mm->start_brk &&
8482                                 vma->vm_end >= vma->vm_mm->brk) {
8483                         name = "[heap]";
8484                         goto cpy_name;
8485                 }
8486                 if (vma->vm_start <= vma->vm_mm->start_stack &&
8487                                 vma->vm_end >= vma->vm_mm->start_stack) {
8488                         name = "[stack]";
8489                         goto cpy_name;
8490                 }
8491
8492                 name = "//anon";
8493                 goto cpy_name;
8494         }
8495
8496 cpy_name:
8497         strlcpy(tmp, name, sizeof(tmp));
8498         name = tmp;
8499 got_name:
8500         /*
8501          * Since our buffer works in 8 byte units we need to align our string
8502          * size to a multiple of 8. However, we must guarantee the tail end is
8503          * zero'd out to avoid leaking random bits to userspace.
8504          */
8505         size = strlen(name)+1;
8506         while (!IS_ALIGNED(size, sizeof(u64)))
8507                 name[size++] = '\0';
8508
8509         mmap_event->file_name = name;
8510         mmap_event->file_size = size;
8511         mmap_event->maj = maj;
8512         mmap_event->min = min;
8513         mmap_event->ino = ino;
8514         mmap_event->ino_generation = gen;
8515         mmap_event->prot = prot;
8516         mmap_event->flags = flags;
8517
8518         if (!(vma->vm_flags & VM_EXEC))
8519                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8520
8521         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8522
8523         if (atomic_read(&nr_build_id_events))
8524                 build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
8525
8526         perf_iterate_sb(perf_event_mmap_output,
8527                        mmap_event,
8528                        NULL);
8529
8530         kfree(buf);
8531 }
8532
8533 /*
8534  * Check whether inode and address range match filter criteria.
8535  */
8536 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8537                                      struct file *file, unsigned long offset,
8538                                      unsigned long size)
8539 {
8540         /* d_inode(NULL) won't be equal to any mapped user-space file */
8541         if (!filter->path.dentry)
8542                 return false;
8543
8544         if (d_inode(filter->path.dentry) != file_inode(file))
8545                 return false;
8546
8547         if (filter->offset > offset + size)
8548                 return false;
8549
8550         if (filter->offset + filter->size < offset)
8551                 return false;
8552
8553         return true;
8554 }
8555
8556 static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8557                                         struct vm_area_struct *vma,
8558                                         struct perf_addr_filter_range *fr)
8559 {
8560         unsigned long vma_size = vma->vm_end - vma->vm_start;
8561         unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8562         struct file *file = vma->vm_file;
8563
8564         if (!perf_addr_filter_match(filter, file, off, vma_size))
8565                 return false;
8566
8567         if (filter->offset < off) {
8568                 fr->start = vma->vm_start;
8569                 fr->size = min(vma_size, filter->size - (off - filter->offset));
8570         } else {
8571                 fr->start = vma->vm_start + filter->offset - off;
8572                 fr->size = min(vma->vm_end - fr->start, filter->size);
8573         }
8574
8575         return true;
8576 }
8577
8578 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8579 {
8580         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8581         struct vm_area_struct *vma = data;
8582         struct perf_addr_filter *filter;
8583         unsigned int restart = 0, count = 0;
8584         unsigned long flags;
8585
8586         if (!has_addr_filter(event))
8587                 return;
8588
8589         if (!vma->vm_file)
8590                 return;
8591
8592         raw_spin_lock_irqsave(&ifh->lock, flags);
8593         list_for_each_entry(filter, &ifh->list, entry) {
8594                 if (perf_addr_filter_vma_adjust(filter, vma,
8595                                                 &event->addr_filter_ranges[count]))
8596                         restart++;
8597
8598                 count++;
8599         }
8600
8601         if (restart)
8602                 event->addr_filters_gen++;
8603         raw_spin_unlock_irqrestore(&ifh->lock, flags);
8604
8605         if (restart)
8606                 perf_event_stop(event, 1);
8607 }
8608
8609 /*
8610  * Adjust all task's events' filters to the new vma
8611  */
8612 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8613 {
8614         struct perf_event_context *ctx;
8615         int ctxn;
8616
8617         /*
8618          * Data tracing isn't supported yet and as such there is no need
8619          * to keep track of anything that isn't related to executable code:
8620          */
8621         if (!(vma->vm_flags & VM_EXEC))
8622                 return;
8623
8624         rcu_read_lock();
8625         for_each_task_context_nr(ctxn) {
8626                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8627                 if (!ctx)
8628                         continue;
8629
8630                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8631         }
8632         rcu_read_unlock();
8633 }
8634
8635 void perf_event_mmap(struct vm_area_struct *vma)
8636 {
8637         struct perf_mmap_event mmap_event;
8638
8639         if (!atomic_read(&nr_mmap_events))
8640                 return;
8641
8642         mmap_event = (struct perf_mmap_event){
8643                 .vma    = vma,
8644                 /* .file_name */
8645                 /* .file_size */
8646                 .event_id  = {
8647                         .header = {
8648                                 .type = PERF_RECORD_MMAP,
8649                                 .misc = PERF_RECORD_MISC_USER,
8650                                 /* .size */
8651                         },
8652                         /* .pid */
8653                         /* .tid */
8654                         .start  = vma->vm_start,
8655                         .len    = vma->vm_end - vma->vm_start,
8656                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
8657                 },
8658                 /* .maj (attr_mmap2 only) */
8659                 /* .min (attr_mmap2 only) */
8660                 /* .ino (attr_mmap2 only) */
8661                 /* .ino_generation (attr_mmap2 only) */
8662                 /* .prot (attr_mmap2 only) */
8663                 /* .flags (attr_mmap2 only) */
8664         };
8665
8666         perf_addr_filters_adjust(vma);
8667         perf_event_mmap_event(&mmap_event);
8668 }
8669
8670 void perf_event_aux_event(struct perf_event *event, unsigned long head,
8671                           unsigned long size, u64 flags)
8672 {
8673         struct perf_output_handle handle;
8674         struct perf_sample_data sample;
8675         struct perf_aux_event {
8676                 struct perf_event_header        header;
8677                 u64                             offset;
8678                 u64                             size;
8679                 u64                             flags;
8680         } rec = {
8681                 .header = {
8682                         .type = PERF_RECORD_AUX,
8683                         .misc = 0,
8684                         .size = sizeof(rec),
8685                 },
8686                 .offset         = head,
8687                 .size           = size,
8688                 .flags          = flags,
8689         };
8690         int ret;
8691
8692         perf_event_header__init_id(&rec.header, &sample, event);
8693         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8694
8695         if (ret)
8696                 return;
8697
8698         perf_output_put(&handle, rec);
8699         perf_event__output_id_sample(event, &handle, &sample);
8700
8701         perf_output_end(&handle);
8702 }
8703
8704 /*
8705  * Lost/dropped samples logging
8706  */
8707 void perf_log_lost_samples(struct perf_event *event, u64 lost)
8708 {
8709         struct perf_output_handle handle;
8710         struct perf_sample_data sample;
8711         int ret;
8712
8713         struct {
8714                 struct perf_event_header        header;
8715                 u64                             lost;
8716         } lost_samples_event = {
8717                 .header = {
8718                         .type = PERF_RECORD_LOST_SAMPLES,
8719                         .misc = 0,
8720                         .size = sizeof(lost_samples_event),
8721                 },
8722                 .lost           = lost,
8723         };
8724
8725         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8726
8727         ret = perf_output_begin(&handle, &sample, event,
8728                                 lost_samples_event.header.size);
8729         if (ret)
8730                 return;
8731
8732         perf_output_put(&handle, lost_samples_event);
8733         perf_event__output_id_sample(event, &handle, &sample);
8734         perf_output_end(&handle);
8735 }
8736
8737 /*
8738  * context_switch tracking
8739  */
8740
8741 struct perf_switch_event {
8742         struct task_struct      *task;
8743         struct task_struct      *next_prev;
8744
8745         struct {
8746                 struct perf_event_header        header;
8747                 u32                             next_prev_pid;
8748                 u32                             next_prev_tid;
8749         } event_id;
8750 };
8751
8752 static int perf_event_switch_match(struct perf_event *event)
8753 {
8754         return event->attr.context_switch;
8755 }
8756
8757 static void perf_event_switch_output(struct perf_event *event, void *data)
8758 {
8759         struct perf_switch_event *se = data;
8760         struct perf_output_handle handle;
8761         struct perf_sample_data sample;
8762         int ret;
8763
8764         if (!perf_event_switch_match(event))
8765                 return;
8766
8767         /* Only CPU-wide events are allowed to see next/prev pid/tid */
8768         if (event->ctx->task) {
8769                 se->event_id.header.type = PERF_RECORD_SWITCH;
8770                 se->event_id.header.size = sizeof(se->event_id.header);
8771         } else {
8772                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8773                 se->event_id.header.size = sizeof(se->event_id);
8774                 se->event_id.next_prev_pid =
8775                                         perf_event_pid(event, se->next_prev);
8776                 se->event_id.next_prev_tid =
8777                                         perf_event_tid(event, se->next_prev);
8778         }
8779
8780         perf_event_header__init_id(&se->event_id.header, &sample, event);
8781
8782         ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
8783         if (ret)
8784                 return;
8785
8786         if (event->ctx->task)
8787                 perf_output_put(&handle, se->event_id.header);
8788         else
8789                 perf_output_put(&handle, se->event_id);
8790
8791         perf_event__output_id_sample(event, &handle, &sample);
8792
8793         perf_output_end(&handle);
8794 }
8795
8796 static void perf_event_switch(struct task_struct *task,
8797                               struct task_struct *next_prev, bool sched_in)
8798 {
8799         struct perf_switch_event switch_event;
8800
8801         /* N.B. caller checks nr_switch_events != 0 */
8802
8803         switch_event = (struct perf_switch_event){
8804                 .task           = task,
8805                 .next_prev      = next_prev,
8806                 .event_id       = {
8807                         .header = {
8808                                 /* .type */
8809                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8810                                 /* .size */
8811                         },
8812                         /* .next_prev_pid */
8813                         /* .next_prev_tid */
8814                 },
8815         };
8816
8817         if (!sched_in && task->on_rq) {
8818                 switch_event.event_id.header.misc |=
8819                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8820         }
8821
8822         perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
8823 }
8824
8825 /*
8826  * IRQ throttle logging
8827  */
8828
8829 static void perf_log_throttle(struct perf_event *event, int enable)
8830 {
8831         struct perf_output_handle handle;
8832         struct perf_sample_data sample;
8833         int ret;
8834
8835         struct {
8836                 struct perf_event_header        header;
8837                 u64                             time;
8838                 u64                             id;
8839                 u64                             stream_id;
8840         } throttle_event = {
8841                 .header = {
8842                         .type = PERF_RECORD_THROTTLE,
8843                         .misc = 0,
8844                         .size = sizeof(throttle_event),
8845                 },
8846                 .time           = perf_event_clock(event),
8847                 .id             = primary_event_id(event),
8848                 .stream_id      = event->id,
8849         };
8850
8851         if (enable)
8852                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8853
8854         perf_event_header__init_id(&throttle_event.header, &sample, event);
8855
8856         ret = perf_output_begin(&handle, &sample, event,
8857                                 throttle_event.header.size);
8858         if (ret)
8859                 return;
8860
8861         perf_output_put(&handle, throttle_event);
8862         perf_event__output_id_sample(event, &handle, &sample);
8863         perf_output_end(&handle);
8864 }
8865
8866 /*
8867  * ksymbol register/unregister tracking
8868  */
8869
8870 struct perf_ksymbol_event {
8871         const char      *name;
8872         int             name_len;
8873         struct {
8874                 struct perf_event_header        header;
8875                 u64                             addr;
8876                 u32                             len;
8877                 u16                             ksym_type;
8878                 u16                             flags;
8879         } event_id;
8880 };
8881
8882 static int perf_event_ksymbol_match(struct perf_event *event)
8883 {
8884         return event->attr.ksymbol;
8885 }
8886
8887 static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8888 {
8889         struct perf_ksymbol_event *ksymbol_event = data;
8890         struct perf_output_handle handle;
8891         struct perf_sample_data sample;
8892         int ret;
8893
8894         if (!perf_event_ksymbol_match(event))
8895                 return;
8896
8897         perf_event_header__init_id(&ksymbol_event->event_id.header,
8898                                    &sample, event);
8899         ret = perf_output_begin(&handle, &sample, event,
8900                                 ksymbol_event->event_id.header.size);
8901         if (ret)
8902                 return;
8903
8904         perf_output_put(&handle, ksymbol_event->event_id);
8905         __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8906         perf_event__output_id_sample(event, &handle, &sample);
8907
8908         perf_output_end(&handle);
8909 }
8910
8911 void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8912                         const char *sym)
8913 {
8914         struct perf_ksymbol_event ksymbol_event;
8915         char name[KSYM_NAME_LEN];
8916         u16 flags = 0;
8917         int name_len;
8918
8919         if (!atomic_read(&nr_ksymbol_events))
8920                 return;
8921
8922         if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8923             ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8924                 goto err;
8925
8926         strlcpy(name, sym, KSYM_NAME_LEN);
8927         name_len = strlen(name) + 1;
8928         while (!IS_ALIGNED(name_len, sizeof(u64)))
8929                 name[name_len++] = '\0';
8930         BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8931
8932         if (unregister)
8933                 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8934
8935         ksymbol_event = (struct perf_ksymbol_event){
8936                 .name = name,
8937                 .name_len = name_len,
8938                 .event_id = {
8939                         .header = {
8940                                 .type = PERF_RECORD_KSYMBOL,
8941                                 .size = sizeof(ksymbol_event.event_id) +
8942                                         name_len,
8943                         },
8944                         .addr = addr,
8945                         .len = len,
8946                         .ksym_type = ksym_type,
8947                         .flags = flags,
8948                 },
8949         };
8950
8951         perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8952         return;
8953 err:
8954         WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8955 }
8956
8957 /*
8958  * bpf program load/unload tracking
8959  */
8960
8961 struct perf_bpf_event {
8962         struct bpf_prog *prog;
8963         struct {
8964                 struct perf_event_header        header;
8965                 u16                             type;
8966                 u16                             flags;
8967                 u32                             id;
8968                 u8                              tag[BPF_TAG_SIZE];
8969         } event_id;
8970 };
8971
8972 static int perf_event_bpf_match(struct perf_event *event)
8973 {
8974         return event->attr.bpf_event;
8975 }
8976
8977 static void perf_event_bpf_output(struct perf_event *event, void *data)
8978 {
8979         struct perf_bpf_event *bpf_event = data;
8980         struct perf_output_handle handle;
8981         struct perf_sample_data sample;
8982         int ret;
8983
8984         if (!perf_event_bpf_match(event))
8985                 return;
8986
8987         perf_event_header__init_id(&bpf_event->event_id.header,
8988                                    &sample, event);
8989         ret = perf_output_begin(&handle, data, event,
8990                                 bpf_event->event_id.header.size);
8991         if (ret)
8992                 return;
8993
8994         perf_output_put(&handle, bpf_event->event_id);
8995         perf_event__output_id_sample(event, &handle, &sample);
8996
8997         perf_output_end(&handle);
8998 }
8999
9000 static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
9001                                          enum perf_bpf_event_type type)
9002 {
9003         bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
9004         int i;
9005
9006         if (prog->aux->func_cnt == 0) {
9007                 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
9008                                    (u64)(unsigned long)prog->bpf_func,
9009                                    prog->jited_len, unregister,
9010                                    prog->aux->ksym.name);
9011         } else {
9012                 for (i = 0; i < prog->aux->func_cnt; i++) {
9013                         struct bpf_prog *subprog = prog->aux->func[i];
9014
9015                         perf_event_ksymbol(
9016                                 PERF_RECORD_KSYMBOL_TYPE_BPF,
9017                                 (u64)(unsigned long)subprog->bpf_func,
9018                                 subprog->jited_len, unregister,
9019                                 prog->aux->ksym.name);
9020                 }
9021         }
9022 }
9023
9024 void perf_event_bpf_event(struct bpf_prog *prog,
9025                           enum perf_bpf_event_type type,
9026                           u16 flags)
9027 {
9028         struct perf_bpf_event bpf_event;
9029
9030         if (type <= PERF_BPF_EVENT_UNKNOWN ||
9031             type >= PERF_BPF_EVENT_MAX)
9032                 return;
9033
9034         switch (type) {
9035         case PERF_BPF_EVENT_PROG_LOAD:
9036         case PERF_BPF_EVENT_PROG_UNLOAD:
9037                 if (atomic_read(&nr_ksymbol_events))
9038                         perf_event_bpf_emit_ksymbols(prog, type);
9039                 break;
9040         default:
9041                 break;
9042         }
9043
9044         if (!atomic_read(&nr_bpf_events))
9045                 return;
9046
9047         bpf_event = (struct perf_bpf_event){
9048                 .prog = prog,
9049                 .event_id = {
9050                         .header = {
9051                                 .type = PERF_RECORD_BPF_EVENT,
9052                                 .size = sizeof(bpf_event.event_id),
9053                         },
9054                         .type = type,
9055                         .flags = flags,
9056                         .id = prog->aux->id,
9057                 },
9058         };
9059
9060         BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
9061
9062         memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
9063         perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
9064 }
9065
9066 struct perf_text_poke_event {
9067         const void              *old_bytes;
9068         const void              *new_bytes;
9069         size_t                  pad;
9070         u16                     old_len;
9071         u16                     new_len;
9072
9073         struct {
9074                 struct perf_event_header        header;
9075
9076                 u64                             addr;
9077         } event_id;
9078 };
9079
9080 static int perf_event_text_poke_match(struct perf_event *event)
9081 {
9082         return event->attr.text_poke;
9083 }
9084
9085 static void perf_event_text_poke_output(struct perf_event *event, void *data)
9086 {
9087         struct perf_text_poke_event *text_poke_event = data;
9088         struct perf_output_handle handle;
9089         struct perf_sample_data sample;
9090         u64 padding = 0;
9091         int ret;
9092
9093         if (!perf_event_text_poke_match(event))
9094                 return;
9095
9096         perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
9097
9098         ret = perf_output_begin(&handle, &sample, event,
9099                                 text_poke_event->event_id.header.size);
9100         if (ret)
9101                 return;
9102
9103         perf_output_put(&handle, text_poke_event->event_id);
9104         perf_output_put(&handle, text_poke_event->old_len);
9105         perf_output_put(&handle, text_poke_event->new_len);
9106
9107         __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
9108         __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
9109
9110         if (text_poke_event->pad)
9111                 __output_copy(&handle, &padding, text_poke_event->pad);
9112
9113         perf_event__output_id_sample(event, &handle, &sample);
9114
9115         perf_output_end(&handle);
9116 }
9117
9118 void perf_event_text_poke(const void *addr, const void *old_bytes,
9119                           size_t old_len, const void *new_bytes, size_t new_len)
9120 {
9121         struct perf_text_poke_event text_poke_event;
9122         size_t tot, pad;
9123
9124         if (!atomic_read(&nr_text_poke_events))
9125                 return;
9126
9127         tot  = sizeof(text_poke_event.old_len) + old_len;
9128         tot += sizeof(text_poke_event.new_len) + new_len;
9129         pad  = ALIGN(tot, sizeof(u64)) - tot;
9130
9131         text_poke_event = (struct perf_text_poke_event){
9132                 .old_bytes    = old_bytes,
9133                 .new_bytes    = new_bytes,
9134                 .pad          = pad,
9135                 .old_len      = old_len,
9136                 .new_len      = new_len,
9137                 .event_id  = {
9138                         .header = {
9139                                 .type = PERF_RECORD_TEXT_POKE,
9140                                 .misc = PERF_RECORD_MISC_KERNEL,
9141                                 .size = sizeof(text_poke_event.event_id) + tot + pad,
9142                         },
9143                         .addr = (unsigned long)addr,
9144                 },
9145         };
9146
9147         perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
9148 }
9149
9150 void perf_event_itrace_started(struct perf_event *event)
9151 {
9152         event->attach_state |= PERF_ATTACH_ITRACE;
9153 }
9154
9155 static void perf_log_itrace_start(struct perf_event *event)
9156 {
9157         struct perf_output_handle handle;
9158         struct perf_sample_data sample;
9159         struct perf_aux_event {
9160                 struct perf_event_header        header;
9161                 u32                             pid;
9162                 u32                             tid;
9163         } rec;
9164         int ret;
9165
9166         if (event->parent)
9167                 event = event->parent;
9168
9169         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
9170             event->attach_state & PERF_ATTACH_ITRACE)
9171                 return;
9172
9173         rec.header.type = PERF_RECORD_ITRACE_START;
9174         rec.header.misc = 0;
9175         rec.header.size = sizeof(rec);
9176         rec.pid = perf_event_pid(event, current);
9177         rec.tid = perf_event_tid(event, current);
9178
9179         perf_event_header__init_id(&rec.header, &sample, event);
9180         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9181
9182         if (ret)
9183                 return;
9184
9185         perf_output_put(&handle, rec);
9186         perf_event__output_id_sample(event, &handle, &sample);
9187
9188         perf_output_end(&handle);
9189 }
9190
9191 void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
9192 {
9193         struct perf_output_handle handle;
9194         struct perf_sample_data sample;
9195         struct perf_aux_event {
9196                 struct perf_event_header        header;
9197                 u64                             hw_id;
9198         } rec;
9199         int ret;
9200
9201         if (event->parent)
9202                 event = event->parent;
9203
9204         rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
9205         rec.header.misc = 0;
9206         rec.header.size = sizeof(rec);
9207         rec.hw_id       = hw_id;
9208
9209         perf_event_header__init_id(&rec.header, &sample, event);
9210         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
9211
9212         if (ret)
9213                 return;
9214
9215         perf_output_put(&handle, rec);
9216         perf_event__output_id_sample(event, &handle, &sample);
9217
9218         perf_output_end(&handle);
9219 }
9220
9221 static int
9222 __perf_event_account_interrupt(struct perf_event *event, int throttle)
9223 {
9224         struct hw_perf_event *hwc = &event->hw;
9225         int ret = 0;
9226         u64 seq;
9227
9228         seq = __this_cpu_read(perf_throttled_seq);
9229         if (seq != hwc->interrupts_seq) {
9230                 hwc->interrupts_seq = seq;
9231                 hwc->interrupts = 1;
9232         } else {
9233                 hwc->interrupts++;
9234                 if (unlikely(throttle
9235                              && hwc->interrupts >= max_samples_per_tick)) {
9236                         __this_cpu_inc(perf_throttled_count);
9237                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
9238                         hwc->interrupts = MAX_INTERRUPTS;
9239                         perf_log_throttle(event, 0);
9240                         ret = 1;
9241                 }
9242         }
9243
9244         if (event->attr.freq) {
9245                 u64 now = perf_clock();
9246                 s64 delta = now - hwc->freq_time_stamp;
9247
9248                 hwc->freq_time_stamp = now;
9249
9250                 if (delta > 0 && delta < 2*TICK_NSEC)
9251                         perf_adjust_period(event, delta, hwc->last_period, true);
9252         }
9253
9254         return ret;
9255 }
9256
9257 int perf_event_account_interrupt(struct perf_event *event)
9258 {
9259         return __perf_event_account_interrupt(event, 1);
9260 }
9261
9262 /*
9263  * Generic event overflow handling, sampling.
9264  */
9265
9266 static int __perf_event_overflow(struct perf_event *event,
9267                                    int throttle, struct perf_sample_data *data,
9268                                    struct pt_regs *regs)
9269 {
9270         int events = atomic_read(&event->event_limit);
9271         int ret = 0;
9272
9273         /*
9274          * Non-sampling counters might still use the PMI to fold short
9275          * hardware counters, ignore those.
9276          */
9277         if (unlikely(!is_sampling_event(event)))
9278                 return 0;
9279
9280         ret = __perf_event_account_interrupt(event, throttle);
9281
9282         /*
9283          * XXX event_limit might not quite work as expected on inherited
9284          * events
9285          */
9286
9287         event->pending_kill = POLL_IN;
9288         if (events && atomic_dec_and_test(&event->event_limit)) {
9289                 ret = 1;
9290                 event->pending_kill = POLL_HUP;
9291                 event->pending_addr = data->addr;
9292
9293                 perf_event_disable_inatomic(event);
9294         }
9295
9296         READ_ONCE(event->overflow_handler)(event, data, regs);
9297
9298         if (*perf_event_fasync(event) && event->pending_kill) {
9299                 event->pending_wakeup = 1;
9300                 irq_work_queue(&event->pending);
9301         }
9302
9303         return ret;
9304 }
9305
9306 int perf_event_overflow(struct perf_event *event,
9307                           struct perf_sample_data *data,
9308                           struct pt_regs *regs)
9309 {
9310         return __perf_event_overflow(event, 1, data, regs);
9311 }
9312
9313 /*
9314  * Generic software event infrastructure
9315  */
9316
9317 struct swevent_htable {
9318         struct swevent_hlist            *swevent_hlist;
9319         struct mutex                    hlist_mutex;
9320         int                             hlist_refcount;
9321
9322         /* Recursion avoidance in each contexts */
9323         int                             recursion[PERF_NR_CONTEXTS];
9324 };
9325
9326 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
9327
9328 /*
9329  * We directly increment event->count and keep a second value in
9330  * event->hw.period_left to count intervals. This period event
9331  * is kept in the range [-sample_period, 0] so that we can use the
9332  * sign as trigger.
9333  */
9334
9335 u64 perf_swevent_set_period(struct perf_event *event)
9336 {
9337         struct hw_perf_event *hwc = &event->hw;
9338         u64 period = hwc->last_period;
9339         u64 nr, offset;
9340         s64 old, val;
9341
9342         hwc->last_period = hwc->sample_period;
9343
9344 again:
9345         old = val = local64_read(&hwc->period_left);
9346         if (val < 0)
9347                 return 0;
9348
9349         nr = div64_u64(period + val, period);
9350         offset = nr * period;
9351         val -= offset;
9352         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
9353                 goto again;
9354
9355         return nr;
9356 }
9357
9358 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
9359                                     struct perf_sample_data *data,
9360                                     struct pt_regs *regs)
9361 {
9362         struct hw_perf_event *hwc = &event->hw;
9363         int throttle = 0;
9364
9365         if (!overflow)
9366                 overflow = perf_swevent_set_period(event);
9367
9368         if (hwc->interrupts == MAX_INTERRUPTS)
9369                 return;
9370
9371         for (; overflow; overflow--) {
9372                 if (__perf_event_overflow(event, throttle,
9373                                             data, regs)) {
9374                         /*
9375                          * We inhibit the overflow from happening when
9376                          * hwc->interrupts == MAX_INTERRUPTS.
9377                          */
9378                         break;
9379                 }
9380                 throttle = 1;
9381         }
9382 }
9383
9384 static void perf_swevent_event(struct perf_event *event, u64 nr,
9385                                struct perf_sample_data *data,
9386                                struct pt_regs *regs)
9387 {
9388         struct hw_perf_event *hwc = &event->hw;
9389
9390         local64_add(nr, &event->count);
9391
9392         if (!regs)
9393                 return;
9394
9395         if (!is_sampling_event(event))
9396                 return;
9397
9398         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
9399                 data->period = nr;
9400                 return perf_swevent_overflow(event, 1, data, regs);
9401         } else
9402                 data->period = event->hw.last_period;
9403
9404         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
9405                 return perf_swevent_overflow(event, 1, data, regs);
9406
9407         if (local64_add_negative(nr, &hwc->period_left))
9408                 return;
9409
9410         perf_swevent_overflow(event, 0, data, regs);
9411 }
9412
9413 static int perf_exclude_event(struct perf_event *event,
9414                               struct pt_regs *regs)
9415 {
9416         if (event->hw.state & PERF_HES_STOPPED)
9417                 return 1;
9418
9419         if (regs) {
9420                 if (event->attr.exclude_user && user_mode(regs))
9421                         return 1;
9422
9423                 if (event->attr.exclude_kernel && !user_mode(regs))
9424                         return 1;
9425         }
9426
9427         return 0;
9428 }
9429
9430 static int perf_swevent_match(struct perf_event *event,
9431                                 enum perf_type_id type,
9432                                 u32 event_id,
9433                                 struct perf_sample_data *data,
9434                                 struct pt_regs *regs)
9435 {
9436         if (event->attr.type != type)
9437                 return 0;
9438
9439         if (event->attr.config != event_id)
9440                 return 0;
9441
9442         if (perf_exclude_event(event, regs))
9443                 return 0;
9444
9445         return 1;
9446 }
9447
9448 static inline u64 swevent_hash(u64 type, u32 event_id)
9449 {
9450         u64 val = event_id | (type << 32);
9451
9452         return hash_64(val, SWEVENT_HLIST_BITS);
9453 }
9454
9455 static inline struct hlist_head *
9456 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9457 {
9458         u64 hash = swevent_hash(type, event_id);
9459
9460         return &hlist->heads[hash];
9461 }
9462
9463 /* For the read side: events when they trigger */
9464 static inline struct hlist_head *
9465 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9466 {
9467         struct swevent_hlist *hlist;
9468
9469         hlist = rcu_dereference(swhash->swevent_hlist);
9470         if (!hlist)
9471                 return NULL;
9472
9473         return __find_swevent_head(hlist, type, event_id);
9474 }
9475
9476 /* For the event head insertion and removal in the hlist */
9477 static inline struct hlist_head *
9478 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9479 {
9480         struct swevent_hlist *hlist;
9481         u32 event_id = event->attr.config;
9482         u64 type = event->attr.type;
9483
9484         /*
9485          * Event scheduling is always serialized against hlist allocation
9486          * and release. Which makes the protected version suitable here.
9487          * The context lock guarantees that.
9488          */
9489         hlist = rcu_dereference_protected(swhash->swevent_hlist,
9490                                           lockdep_is_held(&event->ctx->lock));
9491         if (!hlist)
9492                 return NULL;
9493
9494         return __find_swevent_head(hlist, type, event_id);
9495 }
9496
9497 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9498                                     u64 nr,
9499                                     struct perf_sample_data *data,
9500                                     struct pt_regs *regs)
9501 {
9502         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9503         struct perf_event *event;
9504         struct hlist_head *head;
9505
9506         rcu_read_lock();
9507         head = find_swevent_head_rcu(swhash, type, event_id);
9508         if (!head)
9509                 goto end;
9510
9511         hlist_for_each_entry_rcu(event, head, hlist_entry) {
9512                 if (perf_swevent_match(event, type, event_id, data, regs))
9513                         perf_swevent_event(event, nr, data, regs);
9514         }
9515 end:
9516         rcu_read_unlock();
9517 }
9518
9519 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9520
9521 int perf_swevent_get_recursion_context(void)
9522 {
9523         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9524
9525         return get_recursion_context(swhash->recursion);
9526 }
9527 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9528
9529 void perf_swevent_put_recursion_context(int rctx)
9530 {
9531         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9532
9533         put_recursion_context(swhash->recursion, rctx);
9534 }
9535
9536 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9537 {
9538         struct perf_sample_data data;
9539
9540         if (WARN_ON_ONCE(!regs))
9541                 return;
9542
9543         perf_sample_data_init(&data, addr, 0);
9544         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9545 }
9546
9547 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9548 {
9549         int rctx;
9550
9551         preempt_disable_notrace();
9552         rctx = perf_swevent_get_recursion_context();
9553         if (unlikely(rctx < 0))
9554                 goto fail;
9555
9556         ___perf_sw_event(event_id, nr, regs, addr);
9557
9558         perf_swevent_put_recursion_context(rctx);
9559 fail:
9560         preempt_enable_notrace();
9561 }
9562
9563 static void perf_swevent_read(struct perf_event *event)
9564 {
9565 }
9566
9567 static int perf_swevent_add(struct perf_event *event, int flags)
9568 {
9569         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9570         struct hw_perf_event *hwc = &event->hw;
9571         struct hlist_head *head;
9572
9573         if (is_sampling_event(event)) {
9574                 hwc->last_period = hwc->sample_period;
9575                 perf_swevent_set_period(event);
9576         }
9577
9578         hwc->state = !(flags & PERF_EF_START);
9579
9580         head = find_swevent_head(swhash, event);
9581         if (WARN_ON_ONCE(!head))
9582                 return -EINVAL;
9583
9584         hlist_add_head_rcu(&event->hlist_entry, head);
9585         perf_event_update_userpage(event);
9586
9587         return 0;
9588 }
9589
9590 static void perf_swevent_del(struct perf_event *event, int flags)
9591 {
9592         hlist_del_rcu(&event->hlist_entry);
9593 }
9594
9595 static void perf_swevent_start(struct perf_event *event, int flags)
9596 {
9597         event->hw.state = 0;
9598 }
9599
9600 static void perf_swevent_stop(struct perf_event *event, int flags)
9601 {
9602         event->hw.state = PERF_HES_STOPPED;
9603 }
9604
9605 /* Deref the hlist from the update side */
9606 static inline struct swevent_hlist *
9607 swevent_hlist_deref(struct swevent_htable *swhash)
9608 {
9609         return rcu_dereference_protected(swhash->swevent_hlist,
9610                                          lockdep_is_held(&swhash->hlist_mutex));
9611 }
9612
9613 static void swevent_hlist_release(struct swevent_htable *swhash)
9614 {
9615         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9616
9617         if (!hlist)
9618                 return;
9619
9620         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9621         kfree_rcu(hlist, rcu_head);
9622 }
9623
9624 static void swevent_hlist_put_cpu(int cpu)
9625 {
9626         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9627
9628         mutex_lock(&swhash->hlist_mutex);
9629
9630         if (!--swhash->hlist_refcount)
9631                 swevent_hlist_release(swhash);
9632
9633         mutex_unlock(&swhash->hlist_mutex);
9634 }
9635
9636 static void swevent_hlist_put(void)
9637 {
9638         int cpu;
9639
9640         for_each_possible_cpu(cpu)
9641                 swevent_hlist_put_cpu(cpu);
9642 }
9643
9644 static int swevent_hlist_get_cpu(int cpu)
9645 {
9646         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9647         int err = 0;
9648
9649         mutex_lock(&swhash->hlist_mutex);
9650         if (!swevent_hlist_deref(swhash) &&
9651             cpumask_test_cpu(cpu, perf_online_mask)) {
9652                 struct swevent_hlist *hlist;
9653
9654                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9655                 if (!hlist) {
9656                         err = -ENOMEM;
9657                         goto exit;
9658                 }
9659                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9660         }
9661         swhash->hlist_refcount++;
9662 exit:
9663         mutex_unlock(&swhash->hlist_mutex);
9664
9665         return err;
9666 }
9667
9668 static int swevent_hlist_get(void)
9669 {
9670         int err, cpu, failed_cpu;
9671
9672         mutex_lock(&pmus_lock);
9673         for_each_possible_cpu(cpu) {
9674                 err = swevent_hlist_get_cpu(cpu);
9675                 if (err) {
9676                         failed_cpu = cpu;
9677                         goto fail;
9678                 }
9679         }
9680         mutex_unlock(&pmus_lock);
9681         return 0;
9682 fail:
9683         for_each_possible_cpu(cpu) {
9684                 if (cpu == failed_cpu)
9685                         break;
9686                 swevent_hlist_put_cpu(cpu);
9687         }
9688         mutex_unlock(&pmus_lock);
9689         return err;
9690 }
9691
9692 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9693
9694 static void sw_perf_event_destroy(struct perf_event *event)
9695 {
9696         u64 event_id = event->attr.config;
9697
9698         WARN_ON(event->parent);
9699
9700         static_key_slow_dec(&perf_swevent_enabled[event_id]);
9701         swevent_hlist_put();
9702 }
9703
9704 static int perf_swevent_init(struct perf_event *event)
9705 {
9706         u64 event_id = event->attr.config;
9707
9708         if (event->attr.type != PERF_TYPE_SOFTWARE)
9709                 return -ENOENT;
9710
9711         /*
9712          * no branch sampling for software events
9713          */
9714         if (has_branch_stack(event))
9715                 return -EOPNOTSUPP;
9716
9717         switch (event_id) {
9718         case PERF_COUNT_SW_CPU_CLOCK:
9719         case PERF_COUNT_SW_TASK_CLOCK:
9720                 return -ENOENT;
9721
9722         default:
9723                 break;
9724         }
9725
9726         if (event_id >= PERF_COUNT_SW_MAX)
9727                 return -ENOENT;
9728
9729         if (!event->parent) {
9730                 int err;
9731
9732                 err = swevent_hlist_get();
9733                 if (err)
9734                         return err;
9735
9736                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9737                 event->destroy = sw_perf_event_destroy;
9738         }
9739
9740         return 0;
9741 }
9742
9743 static struct pmu perf_swevent = {
9744         .task_ctx_nr    = perf_sw_context,
9745
9746         .capabilities   = PERF_PMU_CAP_NO_NMI,
9747
9748         .event_init     = perf_swevent_init,
9749         .add            = perf_swevent_add,
9750         .del            = perf_swevent_del,
9751         .start          = perf_swevent_start,
9752         .stop           = perf_swevent_stop,
9753         .read           = perf_swevent_read,
9754 };
9755
9756 #ifdef CONFIG_EVENT_TRACING
9757
9758 static int perf_tp_filter_match(struct perf_event *event,
9759                                 struct perf_sample_data *data)
9760 {
9761         void *record = data->raw->frag.data;
9762
9763         /* only top level events have filters set */
9764         if (event->parent)
9765                 event = event->parent;
9766
9767         if (likely(!event->filter) || filter_match_preds(event->filter, record))
9768                 return 1;
9769         return 0;
9770 }
9771
9772 static int perf_tp_event_match(struct perf_event *event,
9773                                 struct perf_sample_data *data,
9774                                 struct pt_regs *regs)
9775 {
9776         if (event->hw.state & PERF_HES_STOPPED)
9777                 return 0;
9778         /*
9779          * If exclude_kernel, only trace user-space tracepoints (uprobes)
9780          */
9781         if (event->attr.exclude_kernel && !user_mode(regs))
9782                 return 0;
9783
9784         if (!perf_tp_filter_match(event, data))
9785                 return 0;
9786
9787         return 1;
9788 }
9789
9790 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9791                                struct trace_event_call *call, u64 count,
9792                                struct pt_regs *regs, struct hlist_head *head,
9793                                struct task_struct *task)
9794 {
9795         if (bpf_prog_array_valid(call)) {
9796                 *(struct pt_regs **)raw_data = regs;
9797                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9798                         perf_swevent_put_recursion_context(rctx);
9799                         return;
9800                 }
9801         }
9802         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9803                       rctx, task);
9804 }
9805 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9806
9807 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9808                    struct pt_regs *regs, struct hlist_head *head, int rctx,
9809                    struct task_struct *task)
9810 {
9811         struct perf_sample_data data;
9812         struct perf_event *event;
9813
9814         struct perf_raw_record raw = {
9815                 .frag = {
9816                         .size = entry_size,
9817                         .data = record,
9818                 },
9819         };
9820
9821         perf_sample_data_init(&data, 0, 0);
9822         data.raw = &raw;
9823
9824         perf_trace_buf_update(record, event_type);
9825
9826         hlist_for_each_entry_rcu(event, head, hlist_entry) {
9827                 if (perf_tp_event_match(event, &data, regs))
9828                         perf_swevent_event(event, count, &data, regs);
9829         }
9830
9831         /*
9832          * If we got specified a target task, also iterate its context and
9833          * deliver this event there too.
9834          */
9835         if (task && task != current) {
9836                 struct perf_event_context *ctx;
9837                 struct trace_entry *entry = record;
9838
9839                 rcu_read_lock();
9840                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9841                 if (!ctx)
9842                         goto unlock;
9843
9844                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9845                         if (event->cpu != smp_processor_id())
9846                                 continue;
9847                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
9848                                 continue;
9849                         if (event->attr.config != entry->type)
9850                                 continue;
9851                         /* Cannot deliver synchronous signal to other task. */
9852                         if (event->attr.sigtrap)
9853                                 continue;
9854                         if (perf_tp_event_match(event, &data, regs))
9855                                 perf_swevent_event(event, count, &data, regs);
9856                 }
9857 unlock:
9858                 rcu_read_unlock();
9859         }
9860
9861         perf_swevent_put_recursion_context(rctx);
9862 }
9863 EXPORT_SYMBOL_GPL(perf_tp_event);
9864
9865 static void tp_perf_event_destroy(struct perf_event *event)
9866 {
9867         perf_trace_destroy(event);
9868 }
9869
9870 static int perf_tp_event_init(struct perf_event *event)
9871 {
9872         int err;
9873
9874         if (event->attr.type != PERF_TYPE_TRACEPOINT)
9875                 return -ENOENT;
9876
9877         /*
9878          * no branch sampling for tracepoint events
9879          */
9880         if (has_branch_stack(event))
9881                 return -EOPNOTSUPP;
9882
9883         err = perf_trace_init(event);
9884         if (err)
9885                 return err;
9886
9887         event->destroy = tp_perf_event_destroy;
9888
9889         return 0;
9890 }
9891
9892 static struct pmu perf_tracepoint = {
9893         .task_ctx_nr    = perf_sw_context,
9894
9895         .event_init     = perf_tp_event_init,
9896         .add            = perf_trace_add,
9897         .del            = perf_trace_del,
9898         .start          = perf_swevent_start,
9899         .stop           = perf_swevent_stop,
9900         .read           = perf_swevent_read,
9901 };
9902
9903 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9904 /*
9905  * Flags in config, used by dynamic PMU kprobe and uprobe
9906  * The flags should match following PMU_FORMAT_ATTR().
9907  *
9908  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
9909  *                               if not set, create kprobe/uprobe
9910  *
9911  * The following values specify a reference counter (or semaphore in the
9912  * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
9913  * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
9914  *
9915  * PERF_UPROBE_REF_CTR_OFFSET_BITS      # of bits in config as th offset
9916  * PERF_UPROBE_REF_CTR_OFFSET_SHIFT     # of bits to shift left
9917  */
9918 enum perf_probe_config {
9919         PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
9920         PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9921         PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
9922 };
9923
9924 PMU_FORMAT_ATTR(retprobe, "config:0");
9925 #endif
9926
9927 #ifdef CONFIG_KPROBE_EVENTS
9928 static struct attribute *kprobe_attrs[] = {
9929         &format_attr_retprobe.attr,
9930         NULL,
9931 };
9932
9933 static struct attribute_group kprobe_format_group = {
9934         .name = "format",
9935         .attrs = kprobe_attrs,
9936 };
9937
9938 static const struct attribute_group *kprobe_attr_groups[] = {
9939         &kprobe_format_group,
9940         NULL,
9941 };
9942
9943 static int perf_kprobe_event_init(struct perf_event *event);
9944 static struct pmu perf_kprobe = {
9945         .task_ctx_nr    = perf_sw_context,
9946         .event_init     = perf_kprobe_event_init,
9947         .add            = perf_trace_add,
9948         .del            = perf_trace_del,
9949         .start          = perf_swevent_start,
9950         .stop           = perf_swevent_stop,
9951         .read           = perf_swevent_read,
9952         .attr_groups    = kprobe_attr_groups,
9953 };
9954
9955 static int perf_kprobe_event_init(struct perf_event *event)
9956 {
9957         int err;
9958         bool is_retprobe;
9959
9960         if (event->attr.type != perf_kprobe.type)
9961                 return -ENOENT;
9962
9963         if (!perfmon_capable())
9964                 return -EACCES;
9965
9966         /*
9967          * no branch sampling for probe events
9968          */
9969         if (has_branch_stack(event))
9970                 return -EOPNOTSUPP;
9971
9972         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9973         err = perf_kprobe_init(event, is_retprobe);
9974         if (err)
9975                 return err;
9976
9977         event->destroy = perf_kprobe_destroy;
9978
9979         return 0;
9980 }
9981 #endif /* CONFIG_KPROBE_EVENTS */
9982
9983 #ifdef CONFIG_UPROBE_EVENTS
9984 PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9985
9986 static struct attribute *uprobe_attrs[] = {
9987         &format_attr_retprobe.attr,
9988         &format_attr_ref_ctr_offset.attr,
9989         NULL,
9990 };
9991
9992 static struct attribute_group uprobe_format_group = {
9993         .name = "format",
9994         .attrs = uprobe_attrs,
9995 };
9996
9997 static const struct attribute_group *uprobe_attr_groups[] = {
9998         &uprobe_format_group,
9999         NULL,
10000 };
10001
10002 static int perf_uprobe_event_init(struct perf_event *event);
10003 static struct pmu perf_uprobe = {
10004         .task_ctx_nr    = perf_sw_context,
10005         .event_init     = perf_uprobe_event_init,
10006         .add            = perf_trace_add,
10007         .del            = perf_trace_del,
10008         .start          = perf_swevent_start,
10009         .stop           = perf_swevent_stop,
10010         .read           = perf_swevent_read,
10011         .attr_groups    = uprobe_attr_groups,
10012 };
10013
10014 static int perf_uprobe_event_init(struct perf_event *event)
10015 {
10016         int err;
10017         unsigned long ref_ctr_offset;
10018         bool is_retprobe;
10019
10020         if (event->attr.type != perf_uprobe.type)
10021                 return -ENOENT;
10022
10023         if (!perfmon_capable())
10024                 return -EACCES;
10025
10026         /*
10027          * no branch sampling for probe events
10028          */
10029         if (has_branch_stack(event))
10030                 return -EOPNOTSUPP;
10031
10032         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
10033         ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
10034         err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
10035         if (err)
10036                 return err;
10037
10038         event->destroy = perf_uprobe_destroy;
10039
10040         return 0;
10041 }
10042 #endif /* CONFIG_UPROBE_EVENTS */
10043
10044 static inline void perf_tp_register(void)
10045 {
10046         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
10047 #ifdef CONFIG_KPROBE_EVENTS
10048         perf_pmu_register(&perf_kprobe, "kprobe", -1);
10049 #endif
10050 #ifdef CONFIG_UPROBE_EVENTS
10051         perf_pmu_register(&perf_uprobe, "uprobe", -1);
10052 #endif
10053 }
10054
10055 static void perf_event_free_filter(struct perf_event *event)
10056 {
10057         ftrace_profile_free_filter(event);
10058 }
10059
10060 #ifdef CONFIG_BPF_SYSCALL
10061 static void bpf_overflow_handler(struct perf_event *event,
10062                                  struct perf_sample_data *data,
10063                                  struct pt_regs *regs)
10064 {
10065         struct bpf_perf_event_data_kern ctx = {
10066                 .data = data,
10067                 .event = event,
10068         };
10069         struct bpf_prog *prog;
10070         int ret = 0;
10071
10072         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
10073         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
10074                 goto out;
10075         rcu_read_lock();
10076         prog = READ_ONCE(event->prog);
10077         if (prog)
10078                 ret = bpf_prog_run(prog, &ctx);
10079         rcu_read_unlock();
10080 out:
10081         __this_cpu_dec(bpf_prog_active);
10082         if (!ret)
10083                 return;
10084
10085         event->orig_overflow_handler(event, data, regs);
10086 }
10087
10088 static int perf_event_set_bpf_handler(struct perf_event *event,
10089                                       struct bpf_prog *prog,
10090                                       u64 bpf_cookie)
10091 {
10092         if (event->overflow_handler_context)
10093                 /* hw breakpoint or kernel counter */
10094                 return -EINVAL;
10095
10096         if (event->prog)
10097                 return -EEXIST;
10098
10099         if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
10100                 return -EINVAL;
10101
10102         if (event->attr.precise_ip &&
10103             prog->call_get_stack &&
10104             (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
10105              event->attr.exclude_callchain_kernel ||
10106              event->attr.exclude_callchain_user)) {
10107                 /*
10108                  * On perf_event with precise_ip, calling bpf_get_stack()
10109                  * may trigger unwinder warnings and occasional crashes.
10110                  * bpf_get_[stack|stackid] works around this issue by using
10111                  * callchain attached to perf_sample_data. If the
10112                  * perf_event does not full (kernel and user) callchain
10113                  * attached to perf_sample_data, do not allow attaching BPF
10114                  * program that calls bpf_get_[stack|stackid].
10115                  */
10116                 return -EPROTO;
10117         }
10118
10119         event->prog = prog;
10120         event->bpf_cookie = bpf_cookie;
10121         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
10122         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
10123         return 0;
10124 }
10125
10126 static void perf_event_free_bpf_handler(struct perf_event *event)
10127 {
10128         struct bpf_prog *prog = event->prog;
10129
10130         if (!prog)
10131                 return;
10132
10133         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
10134         event->prog = NULL;
10135         bpf_prog_put(prog);
10136 }
10137 #else
10138 static int perf_event_set_bpf_handler(struct perf_event *event,
10139                                       struct bpf_prog *prog,
10140                                       u64 bpf_cookie)
10141 {
10142         return -EOPNOTSUPP;
10143 }
10144 static void perf_event_free_bpf_handler(struct perf_event *event)
10145 {
10146 }
10147 #endif
10148
10149 /*
10150  * returns true if the event is a tracepoint, or a kprobe/upprobe created
10151  * with perf_event_open()
10152  */
10153 static inline bool perf_event_is_tracing(struct perf_event *event)
10154 {
10155         if (event->pmu == &perf_tracepoint)
10156                 return true;
10157 #ifdef CONFIG_KPROBE_EVENTS
10158         if (event->pmu == &perf_kprobe)
10159                 return true;
10160 #endif
10161 #ifdef CONFIG_UPROBE_EVENTS
10162         if (event->pmu == &perf_uprobe)
10163                 return true;
10164 #endif
10165         return false;
10166 }
10167
10168 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10169                             u64 bpf_cookie)
10170 {
10171         bool is_kprobe, is_tracepoint, is_syscall_tp;
10172
10173         if (!perf_event_is_tracing(event))
10174                 return perf_event_set_bpf_handler(event, prog, bpf_cookie);
10175
10176         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
10177         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
10178         is_syscall_tp = is_syscall_trace_event(event->tp_event);
10179         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
10180                 /* bpf programs can only be attached to u/kprobe or tracepoint */
10181                 return -EINVAL;
10182
10183         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
10184             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
10185             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
10186                 return -EINVAL;
10187
10188         /* Kprobe override only works for kprobes, not uprobes. */
10189         if (prog->kprobe_override &&
10190             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
10191                 return -EINVAL;
10192
10193         if (is_tracepoint || is_syscall_tp) {
10194                 int off = trace_event_get_offsets(event->tp_event);
10195
10196                 if (prog->aux->max_ctx_offset > off)
10197                         return -EACCES;
10198         }
10199
10200         return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
10201 }
10202
10203 void perf_event_free_bpf_prog(struct perf_event *event)
10204 {
10205         if (!perf_event_is_tracing(event)) {
10206                 perf_event_free_bpf_handler(event);
10207                 return;
10208         }
10209         perf_event_detach_bpf_prog(event);
10210 }
10211
10212 #else
10213
10214 static inline void perf_tp_register(void)
10215 {
10216 }
10217
10218 static void perf_event_free_filter(struct perf_event *event)
10219 {
10220 }
10221
10222 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
10223                             u64 bpf_cookie)
10224 {
10225         return -ENOENT;
10226 }
10227
10228 void perf_event_free_bpf_prog(struct perf_event *event)
10229 {
10230 }
10231 #endif /* CONFIG_EVENT_TRACING */
10232
10233 #ifdef CONFIG_HAVE_HW_BREAKPOINT
10234 void perf_bp_event(struct perf_event *bp, void *data)
10235 {
10236         struct perf_sample_data sample;
10237         struct pt_regs *regs = data;
10238
10239         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
10240
10241         if (!bp->hw.state && !perf_exclude_event(bp, regs))
10242                 perf_swevent_event(bp, 1, &sample, regs);
10243 }
10244 #endif
10245
10246 /*
10247  * Allocate a new address filter
10248  */
10249 static struct perf_addr_filter *
10250 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
10251 {
10252         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
10253         struct perf_addr_filter *filter;
10254
10255         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
10256         if (!filter)
10257                 return NULL;
10258
10259         INIT_LIST_HEAD(&filter->entry);
10260         list_add_tail(&filter->entry, filters);
10261
10262         return filter;
10263 }
10264
10265 static void free_filters_list(struct list_head *filters)
10266 {
10267         struct perf_addr_filter *filter, *iter;
10268
10269         list_for_each_entry_safe(filter, iter, filters, entry) {
10270                 path_put(&filter->path);
10271                 list_del(&filter->entry);
10272                 kfree(filter);
10273         }
10274 }
10275
10276 /*
10277  * Free existing address filters and optionally install new ones
10278  */
10279 static void perf_addr_filters_splice(struct perf_event *event,
10280                                      struct list_head *head)
10281 {
10282         unsigned long flags;
10283         LIST_HEAD(list);
10284
10285         if (!has_addr_filter(event))
10286                 return;
10287
10288         /* don't bother with children, they don't have their own filters */
10289         if (event->parent)
10290                 return;
10291
10292         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
10293
10294         list_splice_init(&event->addr_filters.list, &list);
10295         if (head)
10296                 list_splice(head, &event->addr_filters.list);
10297
10298         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
10299
10300         free_filters_list(&list);
10301 }
10302
10303 /*
10304  * Scan through mm's vmas and see if one of them matches the
10305  * @filter; if so, adjust filter's address range.
10306  * Called with mm::mmap_lock down for reading.
10307  */
10308 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
10309                                    struct mm_struct *mm,
10310                                    struct perf_addr_filter_range *fr)
10311 {
10312         struct vm_area_struct *vma;
10313
10314         for (vma = mm->mmap; vma; vma = vma->vm_next) {
10315                 if (!vma->vm_file)
10316                         continue;
10317
10318                 if (perf_addr_filter_vma_adjust(filter, vma, fr))
10319                         return;
10320         }
10321 }
10322
10323 /*
10324  * Update event's address range filters based on the
10325  * task's existing mappings, if any.
10326  */
10327 static void perf_event_addr_filters_apply(struct perf_event *event)
10328 {
10329         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10330         struct task_struct *task = READ_ONCE(event->ctx->task);
10331         struct perf_addr_filter *filter;
10332         struct mm_struct *mm = NULL;
10333         unsigned int count = 0;
10334         unsigned long flags;
10335
10336         /*
10337          * We may observe TASK_TOMBSTONE, which means that the event tear-down
10338          * will stop on the parent's child_mutex that our caller is also holding
10339          */
10340         if (task == TASK_TOMBSTONE)
10341                 return;
10342
10343         if (ifh->nr_file_filters) {
10344                 mm = get_task_mm(task);
10345                 if (!mm)
10346                         goto restart;
10347
10348                 mmap_read_lock(mm);
10349         }
10350
10351         raw_spin_lock_irqsave(&ifh->lock, flags);
10352         list_for_each_entry(filter, &ifh->list, entry) {
10353                 if (filter->path.dentry) {
10354                         /*
10355                          * Adjust base offset if the filter is associated to a
10356                          * binary that needs to be mapped:
10357                          */
10358                         event->addr_filter_ranges[count].start = 0;
10359                         event->addr_filter_ranges[count].size = 0;
10360
10361                         perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
10362                 } else {
10363                         event->addr_filter_ranges[count].start = filter->offset;
10364                         event->addr_filter_ranges[count].size  = filter->size;
10365                 }
10366
10367                 count++;
10368         }
10369
10370         event->addr_filters_gen++;
10371         raw_spin_unlock_irqrestore(&ifh->lock, flags);
10372
10373         if (ifh->nr_file_filters) {
10374                 mmap_read_unlock(mm);
10375
10376                 mmput(mm);
10377         }
10378
10379 restart:
10380         perf_event_stop(event, 1);
10381 }
10382
10383 /*
10384  * Address range filtering: limiting the data to certain
10385  * instruction address ranges. Filters are ioctl()ed to us from
10386  * userspace as ascii strings.
10387  *
10388  * Filter string format:
10389  *
10390  * ACTION RANGE_SPEC
10391  * where ACTION is one of the
10392  *  * "filter": limit the trace to this region
10393  *  * "start": start tracing from this address
10394  *  * "stop": stop tracing at this address/region;
10395  * RANGE_SPEC is
10396  *  * for kernel addresses: <start address>[/<size>]
10397  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
10398  *
10399  * if <size> is not specified or is zero, the range is treated as a single
10400  * address; not valid for ACTION=="filter".
10401  */
10402 enum {
10403         IF_ACT_NONE = -1,
10404         IF_ACT_FILTER,
10405         IF_ACT_START,
10406         IF_ACT_STOP,
10407         IF_SRC_FILE,
10408         IF_SRC_KERNEL,
10409         IF_SRC_FILEADDR,
10410         IF_SRC_KERNELADDR,
10411 };
10412
10413 enum {
10414         IF_STATE_ACTION = 0,
10415         IF_STATE_SOURCE,
10416         IF_STATE_END,
10417 };
10418
10419 static const match_table_t if_tokens = {
10420         { IF_ACT_FILTER,        "filter" },
10421         { IF_ACT_START,         "start" },
10422         { IF_ACT_STOP,          "stop" },
10423         { IF_SRC_FILE,          "%u/%u@%s" },
10424         { IF_SRC_KERNEL,        "%u/%u" },
10425         { IF_SRC_FILEADDR,      "%u@%s" },
10426         { IF_SRC_KERNELADDR,    "%u" },
10427         { IF_ACT_NONE,          NULL },
10428 };
10429
10430 /*
10431  * Address filter string parser
10432  */
10433 static int
10434 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10435                              struct list_head *filters)
10436 {
10437         struct perf_addr_filter *filter = NULL;
10438         char *start, *orig, *filename = NULL;
10439         substring_t args[MAX_OPT_ARGS];
10440         int state = IF_STATE_ACTION, token;
10441         unsigned int kernel = 0;
10442         int ret = -EINVAL;
10443
10444         orig = fstr = kstrdup(fstr, GFP_KERNEL);
10445         if (!fstr)
10446                 return -ENOMEM;
10447
10448         while ((start = strsep(&fstr, " ,\n")) != NULL) {
10449                 static const enum perf_addr_filter_action_t actions[] = {
10450                         [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
10451                         [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
10452                         [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
10453                 };
10454                 ret = -EINVAL;
10455
10456                 if (!*start)
10457                         continue;
10458
10459                 /* filter definition begins */
10460                 if (state == IF_STATE_ACTION) {
10461                         filter = perf_addr_filter_new(event, filters);
10462                         if (!filter)
10463                                 goto fail;
10464                 }
10465
10466                 token = match_token(start, if_tokens, args);
10467                 switch (token) {
10468                 case IF_ACT_FILTER:
10469                 case IF_ACT_START:
10470                 case IF_ACT_STOP:
10471                         if (state != IF_STATE_ACTION)
10472                                 goto fail;
10473
10474                         filter->action = actions[token];
10475                         state = IF_STATE_SOURCE;
10476                         break;
10477
10478                 case IF_SRC_KERNELADDR:
10479                 case IF_SRC_KERNEL:
10480                         kernel = 1;
10481                         fallthrough;
10482
10483                 case IF_SRC_FILEADDR:
10484                 case IF_SRC_FILE:
10485                         if (state != IF_STATE_SOURCE)
10486                                 goto fail;
10487
10488                         *args[0].to = 0;
10489                         ret = kstrtoul(args[0].from, 0, &filter->offset);
10490                         if (ret)
10491                                 goto fail;
10492
10493                         if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10494                                 *args[1].to = 0;
10495                                 ret = kstrtoul(args[1].from, 0, &filter->size);
10496                                 if (ret)
10497                                         goto fail;
10498                         }
10499
10500                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10501                                 int fpos = token == IF_SRC_FILE ? 2 : 1;
10502
10503                                 kfree(filename);
10504                                 filename = match_strdup(&args[fpos]);
10505                                 if (!filename) {
10506                                         ret = -ENOMEM;
10507                                         goto fail;
10508                                 }
10509                         }
10510
10511                         state = IF_STATE_END;
10512                         break;
10513
10514                 default:
10515                         goto fail;
10516                 }
10517
10518                 /*
10519                  * Filter definition is fully parsed, validate and install it.
10520                  * Make sure that it doesn't contradict itself or the event's
10521                  * attribute.
10522                  */
10523                 if (state == IF_STATE_END) {
10524                         ret = -EINVAL;
10525
10526                         /*
10527                          * ACTION "filter" must have a non-zero length region
10528                          * specified.
10529                          */
10530                         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10531                             !filter->size)
10532                                 goto fail;
10533
10534                         if (!kernel) {
10535                                 if (!filename)
10536                                         goto fail;
10537
10538                                 /*
10539                                  * For now, we only support file-based filters
10540                                  * in per-task events; doing so for CPU-wide
10541                                  * events requires additional context switching
10542                                  * trickery, since same object code will be
10543                                  * mapped at different virtual addresses in
10544                                  * different processes.
10545                                  */
10546                                 ret = -EOPNOTSUPP;
10547                                 if (!event->ctx->task)
10548                                         goto fail;
10549
10550                                 /* look up the path and grab its inode */
10551                                 ret = kern_path(filename, LOOKUP_FOLLOW,
10552                                                 &filter->path);
10553                                 if (ret)
10554                                         goto fail;
10555
10556                                 ret = -EINVAL;
10557                                 if (!filter->path.dentry ||
10558                                     !S_ISREG(d_inode(filter->path.dentry)
10559                                              ->i_mode))
10560                                         goto fail;
10561
10562                                 event->addr_filters.nr_file_filters++;
10563                         }
10564
10565                         /* ready to consume more filters */
10566                         kfree(filename);
10567                         filename = NULL;
10568                         state = IF_STATE_ACTION;
10569                         filter = NULL;
10570                         kernel = 0;
10571                 }
10572         }
10573
10574         if (state != IF_STATE_ACTION)
10575                 goto fail;
10576
10577         kfree(filename);
10578         kfree(orig);
10579
10580         return 0;
10581
10582 fail:
10583         kfree(filename);
10584         free_filters_list(filters);
10585         kfree(orig);
10586
10587         return ret;
10588 }
10589
10590 static int
10591 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10592 {
10593         LIST_HEAD(filters);
10594         int ret;
10595
10596         /*
10597          * Since this is called in perf_ioctl() path, we're already holding
10598          * ctx::mutex.
10599          */
10600         lockdep_assert_held(&event->ctx->mutex);
10601
10602         if (WARN_ON_ONCE(event->parent))
10603                 return -EINVAL;
10604
10605         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10606         if (ret)
10607                 goto fail_clear_files;
10608
10609         ret = event->pmu->addr_filters_validate(&filters);
10610         if (ret)
10611                 goto fail_free_filters;
10612
10613         /* remove existing filters, if any */
10614         perf_addr_filters_splice(event, &filters);
10615
10616         /* install new filters */
10617         perf_event_for_each_child(event, perf_event_addr_filters_apply);
10618
10619         return ret;
10620
10621 fail_free_filters:
10622         free_filters_list(&filters);
10623
10624 fail_clear_files:
10625         event->addr_filters.nr_file_filters = 0;
10626
10627         return ret;
10628 }
10629
10630 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10631 {
10632         int ret = -EINVAL;
10633         char *filter_str;
10634
10635         filter_str = strndup_user(arg, PAGE_SIZE);
10636         if (IS_ERR(filter_str))
10637                 return PTR_ERR(filter_str);
10638
10639 #ifdef CONFIG_EVENT_TRACING
10640         if (perf_event_is_tracing(event)) {
10641                 struct perf_event_context *ctx = event->ctx;
10642
10643                 /*
10644                  * Beware, here be dragons!!
10645                  *
10646                  * the tracepoint muck will deadlock against ctx->mutex, but
10647                  * the tracepoint stuff does not actually need it. So
10648                  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
10649                  * already have a reference on ctx.
10650                  *
10651                  * This can result in event getting moved to a different ctx,
10652                  * but that does not affect the tracepoint state.
10653                  */
10654                 mutex_unlock(&ctx->mutex);
10655                 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10656                 mutex_lock(&ctx->mutex);
10657         } else
10658 #endif
10659         if (has_addr_filter(event))
10660                 ret = perf_event_set_addr_filter(event, filter_str);
10661
10662         kfree(filter_str);
10663         return ret;
10664 }
10665
10666 /*
10667  * hrtimer based swevent callback
10668  */
10669
10670 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10671 {
10672         enum hrtimer_restart ret = HRTIMER_RESTART;
10673         struct perf_sample_data data;
10674         struct pt_regs *regs;
10675         struct perf_event *event;
10676         u64 period;
10677
10678         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10679
10680         if (event->state != PERF_EVENT_STATE_ACTIVE)
10681                 return HRTIMER_NORESTART;
10682
10683         event->pmu->read(event);
10684
10685         perf_sample_data_init(&data, 0, event->hw.last_period);
10686         regs = get_irq_regs();
10687
10688         if (regs && !perf_exclude_event(event, regs)) {
10689                 if (!(event->attr.exclude_idle && is_idle_task(current)))
10690                         if (__perf_event_overflow(event, 1, &data, regs))
10691                                 ret = HRTIMER_NORESTART;
10692         }
10693
10694         period = max_t(u64, 10000, event->hw.sample_period);
10695         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10696
10697         return ret;
10698 }
10699
10700 static void perf_swevent_start_hrtimer(struct perf_event *event)
10701 {
10702         struct hw_perf_event *hwc = &event->hw;
10703         s64 period;
10704
10705         if (!is_sampling_event(event))
10706                 return;
10707
10708         period = local64_read(&hwc->period_left);
10709         if (period) {
10710                 if (period < 0)
10711                         period = 10000;
10712
10713                 local64_set(&hwc->period_left, 0);
10714         } else {
10715                 period = max_t(u64, 10000, hwc->sample_period);
10716         }
10717         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10718                       HRTIMER_MODE_REL_PINNED_HARD);
10719 }
10720
10721 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10722 {
10723         struct hw_perf_event *hwc = &event->hw;
10724
10725         if (is_sampling_event(event)) {
10726                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10727                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10728
10729                 hrtimer_cancel(&hwc->hrtimer);
10730         }
10731 }
10732
10733 static void perf_swevent_init_hrtimer(struct perf_event *event)
10734 {
10735         struct hw_perf_event *hwc = &event->hw;
10736
10737         if (!is_sampling_event(event))
10738                 return;
10739
10740         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10741         hwc->hrtimer.function = perf_swevent_hrtimer;
10742
10743         /*
10744          * Since hrtimers have a fixed rate, we can do a static freq->period
10745          * mapping and avoid the whole period adjust feedback stuff.
10746          */
10747         if (event->attr.freq) {
10748                 long freq = event->attr.sample_freq;
10749
10750                 event->attr.sample_period = NSEC_PER_SEC / freq;
10751                 hwc->sample_period = event->attr.sample_period;
10752                 local64_set(&hwc->period_left, hwc->sample_period);
10753                 hwc->last_period = hwc->sample_period;
10754                 event->attr.freq = 0;
10755         }
10756 }
10757
10758 /*
10759  * Software event: cpu wall time clock
10760  */
10761
10762 static void cpu_clock_event_update(struct perf_event *event)
10763 {
10764         s64 prev;
10765         u64 now;
10766
10767         now = local_clock();
10768         prev = local64_xchg(&event->hw.prev_count, now);
10769         local64_add(now - prev, &event->count);
10770 }
10771
10772 static void cpu_clock_event_start(struct perf_event *event, int flags)
10773 {
10774         local64_set(&event->hw.prev_count, local_clock());
10775         perf_swevent_start_hrtimer(event);
10776 }
10777
10778 static void cpu_clock_event_stop(struct perf_event *event, int flags)
10779 {
10780         perf_swevent_cancel_hrtimer(event);
10781         cpu_clock_event_update(event);
10782 }
10783
10784 static int cpu_clock_event_add(struct perf_event *event, int flags)
10785 {
10786         if (flags & PERF_EF_START)
10787                 cpu_clock_event_start(event, flags);
10788         perf_event_update_userpage(event);
10789
10790         return 0;
10791 }
10792
10793 static void cpu_clock_event_del(struct perf_event *event, int flags)
10794 {
10795         cpu_clock_event_stop(event, flags);
10796 }
10797
10798 static void cpu_clock_event_read(struct perf_event *event)
10799 {
10800         cpu_clock_event_update(event);
10801 }
10802
10803 static int cpu_clock_event_init(struct perf_event *event)
10804 {
10805         if (event->attr.type != PERF_TYPE_SOFTWARE)
10806                 return -ENOENT;
10807
10808         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10809                 return -ENOENT;
10810
10811         /*
10812          * no branch sampling for software events
10813          */
10814         if (has_branch_stack(event))
10815                 return -EOPNOTSUPP;
10816
10817         perf_swevent_init_hrtimer(event);
10818
10819         return 0;
10820 }
10821
10822 static struct pmu perf_cpu_clock = {
10823         .task_ctx_nr    = perf_sw_context,
10824
10825         .capabilities   = PERF_PMU_CAP_NO_NMI,
10826
10827         .event_init     = cpu_clock_event_init,
10828         .add            = cpu_clock_event_add,
10829         .del            = cpu_clock_event_del,
10830         .start          = cpu_clock_event_start,
10831         .stop           = cpu_clock_event_stop,
10832         .read           = cpu_clock_event_read,
10833 };
10834
10835 /*
10836  * Software event: task time clock
10837  */
10838
10839 static void task_clock_event_update(struct perf_event *event, u64 now)
10840 {
10841         u64 prev;
10842         s64 delta;
10843
10844         prev = local64_xchg(&event->hw.prev_count, now);
10845         delta = now - prev;
10846         local64_add(delta, &event->count);
10847 }
10848
10849 static void task_clock_event_start(struct perf_event *event, int flags)
10850 {
10851         local64_set(&event->hw.prev_count, event->ctx->time);
10852         perf_swevent_start_hrtimer(event);
10853 }
10854
10855 static void task_clock_event_stop(struct perf_event *event, int flags)
10856 {
10857         perf_swevent_cancel_hrtimer(event);
10858         task_clock_event_update(event, event->ctx->time);
10859 }
10860
10861 static int task_clock_event_add(struct perf_event *event, int flags)
10862 {
10863         if (flags & PERF_EF_START)
10864                 task_clock_event_start(event, flags);
10865         perf_event_update_userpage(event);
10866
10867         return 0;
10868 }
10869
10870 static void task_clock_event_del(struct perf_event *event, int flags)
10871 {
10872         task_clock_event_stop(event, PERF_EF_UPDATE);
10873 }
10874
10875 static void task_clock_event_read(struct perf_event *event)
10876 {
10877         u64 now = perf_clock();
10878         u64 delta = now - event->ctx->timestamp;
10879         u64 time = event->ctx->time + delta;
10880
10881         task_clock_event_update(event, time);
10882 }
10883
10884 static int task_clock_event_init(struct perf_event *event)
10885 {
10886         if (event->attr.type != PERF_TYPE_SOFTWARE)
10887                 return -ENOENT;
10888
10889         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10890                 return -ENOENT;
10891
10892         /*
10893          * no branch sampling for software events
10894          */
10895         if (has_branch_stack(event))
10896                 return -EOPNOTSUPP;
10897
10898         perf_swevent_init_hrtimer(event);
10899
10900         return 0;
10901 }
10902
10903 static struct pmu perf_task_clock = {
10904         .task_ctx_nr    = perf_sw_context,
10905
10906         .capabilities   = PERF_PMU_CAP_NO_NMI,
10907
10908         .event_init     = task_clock_event_init,
10909         .add            = task_clock_event_add,
10910         .del            = task_clock_event_del,
10911         .start          = task_clock_event_start,
10912         .stop           = task_clock_event_stop,
10913         .read           = task_clock_event_read,
10914 };
10915
10916 static void perf_pmu_nop_void(struct pmu *pmu)
10917 {
10918 }
10919
10920 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10921 {
10922 }
10923
10924 static int perf_pmu_nop_int(struct pmu *pmu)
10925 {
10926         return 0;
10927 }
10928
10929 static int perf_event_nop_int(struct perf_event *event, u64 value)
10930 {
10931         return 0;
10932 }
10933
10934 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10935
10936 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10937 {
10938         __this_cpu_write(nop_txn_flags, flags);
10939
10940         if (flags & ~PERF_PMU_TXN_ADD)
10941                 return;
10942
10943         perf_pmu_disable(pmu);
10944 }
10945
10946 static int perf_pmu_commit_txn(struct pmu *pmu)
10947 {
10948         unsigned int flags = __this_cpu_read(nop_txn_flags);
10949
10950         __this_cpu_write(nop_txn_flags, 0);
10951
10952         if (flags & ~PERF_PMU_TXN_ADD)
10953                 return 0;
10954
10955         perf_pmu_enable(pmu);
10956         return 0;
10957 }
10958
10959 static void perf_pmu_cancel_txn(struct pmu *pmu)
10960 {
10961         unsigned int flags =  __this_cpu_read(nop_txn_flags);
10962
10963         __this_cpu_write(nop_txn_flags, 0);
10964
10965         if (flags & ~PERF_PMU_TXN_ADD)
10966                 return;
10967
10968         perf_pmu_enable(pmu);
10969 }
10970
10971 static int perf_event_idx_default(struct perf_event *event)
10972 {
10973         return 0;
10974 }
10975
10976 /*
10977  * Ensures all contexts with the same task_ctx_nr have the same
10978  * pmu_cpu_context too.
10979  */
10980 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10981 {
10982         struct pmu *pmu;
10983
10984         if (ctxn < 0)
10985                 return NULL;
10986
10987         list_for_each_entry(pmu, &pmus, entry) {
10988                 if (pmu->task_ctx_nr == ctxn)
10989                         return pmu->pmu_cpu_context;
10990         }
10991
10992         return NULL;
10993 }
10994
10995 static void free_pmu_context(struct pmu *pmu)
10996 {
10997         /*
10998          * Static contexts such as perf_sw_context have a global lifetime
10999          * and may be shared between different PMUs. Avoid freeing them
11000          * when a single PMU is going away.
11001          */
11002         if (pmu->task_ctx_nr > perf_invalid_context)
11003                 return;
11004
11005         free_percpu(pmu->pmu_cpu_context);
11006 }
11007
11008 /*
11009  * Let userspace know that this PMU supports address range filtering:
11010  */
11011 static ssize_t nr_addr_filters_show(struct device *dev,
11012                                     struct device_attribute *attr,
11013                                     char *page)
11014 {
11015         struct pmu *pmu = dev_get_drvdata(dev);
11016
11017         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
11018 }
11019 DEVICE_ATTR_RO(nr_addr_filters);
11020
11021 static struct idr pmu_idr;
11022
11023 static ssize_t
11024 type_show(struct device *dev, struct device_attribute *attr, char *page)
11025 {
11026         struct pmu *pmu = dev_get_drvdata(dev);
11027
11028         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
11029 }
11030 static DEVICE_ATTR_RO(type);
11031
11032 static ssize_t
11033 perf_event_mux_interval_ms_show(struct device *dev,
11034                                 struct device_attribute *attr,
11035                                 char *page)
11036 {
11037         struct pmu *pmu = dev_get_drvdata(dev);
11038
11039         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
11040 }
11041
11042 static DEFINE_MUTEX(mux_interval_mutex);
11043
11044 static ssize_t
11045 perf_event_mux_interval_ms_store(struct device *dev,
11046                                  struct device_attribute *attr,
11047                                  const char *buf, size_t count)
11048 {
11049         struct pmu *pmu = dev_get_drvdata(dev);
11050         int timer, cpu, ret;
11051
11052         ret = kstrtoint(buf, 0, &timer);
11053         if (ret)
11054                 return ret;
11055
11056         if (timer < 1)
11057                 return -EINVAL;
11058
11059         /* same value, noting to do */
11060         if (timer == pmu->hrtimer_interval_ms)
11061                 return count;
11062
11063         mutex_lock(&mux_interval_mutex);
11064         pmu->hrtimer_interval_ms = timer;
11065
11066         /* update all cpuctx for this PMU */
11067         cpus_read_lock();
11068         for_each_online_cpu(cpu) {
11069                 struct perf_cpu_context *cpuctx;
11070                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11071                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
11072
11073                 cpu_function_call(cpu,
11074                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
11075         }
11076         cpus_read_unlock();
11077         mutex_unlock(&mux_interval_mutex);
11078
11079         return count;
11080 }
11081 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
11082
11083 static struct attribute *pmu_dev_attrs[] = {
11084         &dev_attr_type.attr,
11085         &dev_attr_perf_event_mux_interval_ms.attr,
11086         NULL,
11087 };
11088 ATTRIBUTE_GROUPS(pmu_dev);
11089
11090 static int pmu_bus_running;
11091 static struct bus_type pmu_bus = {
11092         .name           = "event_source",
11093         .dev_groups     = pmu_dev_groups,
11094 };
11095
11096 static void pmu_dev_release(struct device *dev)
11097 {
11098         kfree(dev);
11099 }
11100
11101 static int pmu_dev_alloc(struct pmu *pmu)
11102 {
11103         int ret = -ENOMEM;
11104
11105         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
11106         if (!pmu->dev)
11107                 goto out;
11108
11109         pmu->dev->groups = pmu->attr_groups;
11110         device_initialize(pmu->dev);
11111         ret = dev_set_name(pmu->dev, "%s", pmu->name);
11112         if (ret)
11113                 goto free_dev;
11114
11115         dev_set_drvdata(pmu->dev, pmu);
11116         pmu->dev->bus = &pmu_bus;
11117         pmu->dev->release = pmu_dev_release;
11118         ret = device_add(pmu->dev);
11119         if (ret)
11120                 goto free_dev;
11121
11122         /* For PMUs with address filters, throw in an extra attribute: */
11123         if (pmu->nr_addr_filters)
11124                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
11125
11126         if (ret)
11127                 goto del_dev;
11128
11129         if (pmu->attr_update)
11130                 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
11131
11132         if (ret)
11133                 goto del_dev;
11134
11135 out:
11136         return ret;
11137
11138 del_dev:
11139         device_del(pmu->dev);
11140
11141 free_dev:
11142         put_device(pmu->dev);
11143         goto out;
11144 }
11145
11146 static struct lock_class_key cpuctx_mutex;
11147 static struct lock_class_key cpuctx_lock;
11148
11149 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
11150 {
11151         int cpu, ret, max = PERF_TYPE_MAX;
11152
11153         mutex_lock(&pmus_lock);
11154         ret = -ENOMEM;
11155         pmu->pmu_disable_count = alloc_percpu(int);
11156         if (!pmu->pmu_disable_count)
11157                 goto unlock;
11158
11159         pmu->type = -1;
11160         if (!name)
11161                 goto skip_type;
11162         pmu->name = name;
11163
11164         if (type != PERF_TYPE_SOFTWARE) {
11165                 if (type >= 0)
11166                         max = type;
11167
11168                 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
11169                 if (ret < 0)
11170                         goto free_pdc;
11171
11172                 WARN_ON(type >= 0 && ret != type);
11173
11174                 type = ret;
11175         }
11176         pmu->type = type;
11177
11178         if (pmu_bus_running) {
11179                 ret = pmu_dev_alloc(pmu);
11180                 if (ret)
11181                         goto free_idr;
11182         }
11183
11184 skip_type:
11185         if (pmu->task_ctx_nr == perf_hw_context) {
11186                 static int hw_context_taken = 0;
11187
11188                 /*
11189                  * Other than systems with heterogeneous CPUs, it never makes
11190                  * sense for two PMUs to share perf_hw_context. PMUs which are
11191                  * uncore must use perf_invalid_context.
11192                  */
11193                 if (WARN_ON_ONCE(hw_context_taken &&
11194                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
11195                         pmu->task_ctx_nr = perf_invalid_context;
11196
11197                 hw_context_taken = 1;
11198         }
11199
11200         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
11201         if (pmu->pmu_cpu_context)
11202                 goto got_cpu_context;
11203
11204         ret = -ENOMEM;
11205         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
11206         if (!pmu->pmu_cpu_context)
11207                 goto free_dev;
11208
11209         for_each_possible_cpu(cpu) {
11210                 struct perf_cpu_context *cpuctx;
11211
11212                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11213                 __perf_event_init_context(&cpuctx->ctx);
11214                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
11215                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
11216                 cpuctx->ctx.pmu = pmu;
11217                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
11218
11219                 __perf_mux_hrtimer_init(cpuctx, cpu);
11220
11221                 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
11222                 cpuctx->heap = cpuctx->heap_default;
11223         }
11224
11225 got_cpu_context:
11226         if (!pmu->start_txn) {
11227                 if (pmu->pmu_enable) {
11228                         /*
11229                          * If we have pmu_enable/pmu_disable calls, install
11230                          * transaction stubs that use that to try and batch
11231                          * hardware accesses.
11232                          */
11233                         pmu->start_txn  = perf_pmu_start_txn;
11234                         pmu->commit_txn = perf_pmu_commit_txn;
11235                         pmu->cancel_txn = perf_pmu_cancel_txn;
11236                 } else {
11237                         pmu->start_txn  = perf_pmu_nop_txn;
11238                         pmu->commit_txn = perf_pmu_nop_int;
11239                         pmu->cancel_txn = perf_pmu_nop_void;
11240                 }
11241         }
11242
11243         if (!pmu->pmu_enable) {
11244                 pmu->pmu_enable  = perf_pmu_nop_void;
11245                 pmu->pmu_disable = perf_pmu_nop_void;
11246         }
11247
11248         if (!pmu->check_period)
11249                 pmu->check_period = perf_event_nop_int;
11250
11251         if (!pmu->event_idx)
11252                 pmu->event_idx = perf_event_idx_default;
11253
11254         /*
11255          * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
11256          * since these cannot be in the IDR. This way the linear search
11257          * is fast, provided a valid software event is provided.
11258          */
11259         if (type == PERF_TYPE_SOFTWARE || !name)
11260                 list_add_rcu(&pmu->entry, &pmus);
11261         else
11262                 list_add_tail_rcu(&pmu->entry, &pmus);
11263
11264         atomic_set(&pmu->exclusive_cnt, 0);
11265         ret = 0;
11266 unlock:
11267         mutex_unlock(&pmus_lock);
11268
11269         return ret;
11270
11271 free_dev:
11272         device_del(pmu->dev);
11273         put_device(pmu->dev);
11274
11275 free_idr:
11276         if (pmu->type != PERF_TYPE_SOFTWARE)
11277                 idr_remove(&pmu_idr, pmu->type);
11278
11279 free_pdc:
11280         free_percpu(pmu->pmu_disable_count);
11281         goto unlock;
11282 }
11283 EXPORT_SYMBOL_GPL(perf_pmu_register);
11284
11285 void perf_pmu_unregister(struct pmu *pmu)
11286 {
11287         mutex_lock(&pmus_lock);
11288         list_del_rcu(&pmu->entry);
11289
11290         /*
11291          * We dereference the pmu list under both SRCU and regular RCU, so
11292          * synchronize against both of those.
11293          */
11294         synchronize_srcu(&pmus_srcu);
11295         synchronize_rcu();
11296
11297         free_percpu(pmu->pmu_disable_count);
11298         if (pmu->type != PERF_TYPE_SOFTWARE)
11299                 idr_remove(&pmu_idr, pmu->type);
11300         if (pmu_bus_running) {
11301                 if (pmu->nr_addr_filters)
11302                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
11303                 device_del(pmu->dev);
11304                 put_device(pmu->dev);
11305         }
11306         free_pmu_context(pmu);
11307         mutex_unlock(&pmus_lock);
11308 }
11309 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
11310
11311 static inline bool has_extended_regs(struct perf_event *event)
11312 {
11313         return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
11314                (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
11315 }
11316
11317 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
11318 {
11319         struct perf_event_context *ctx = NULL;
11320         int ret;
11321
11322         if (!try_module_get(pmu->module))
11323                 return -ENODEV;
11324
11325         /*
11326          * A number of pmu->event_init() methods iterate the sibling_list to,
11327          * for example, validate if the group fits on the PMU. Therefore,
11328          * if this is a sibling event, acquire the ctx->mutex to protect
11329          * the sibling_list.
11330          */
11331         if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
11332                 /*
11333                  * This ctx->mutex can nest when we're called through
11334                  * inheritance. See the perf_event_ctx_lock_nested() comment.
11335                  */
11336                 ctx = perf_event_ctx_lock_nested(event->group_leader,
11337                                                  SINGLE_DEPTH_NESTING);
11338                 BUG_ON(!ctx);
11339         }
11340
11341         event->pmu = pmu;
11342         ret = pmu->event_init(event);
11343
11344         if (ctx)
11345                 perf_event_ctx_unlock(event->group_leader, ctx);
11346
11347         if (!ret) {
11348                 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11349                     has_extended_regs(event))
11350                         ret = -EOPNOTSUPP;
11351
11352                 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11353                     event_has_any_exclude_flag(event))
11354                         ret = -EINVAL;
11355
11356                 if (ret && event->destroy)
11357                         event->destroy(event);
11358         }
11359
11360         if (ret)
11361                 module_put(pmu->module);
11362
11363         return ret;
11364 }
11365
11366 static struct pmu *perf_init_event(struct perf_event *event)
11367 {
11368         bool extended_type = false;
11369         int idx, type, ret;
11370         struct pmu *pmu;
11371
11372         idx = srcu_read_lock(&pmus_srcu);
11373
11374         /* Try parent's PMU first: */
11375         if (event->parent && event->parent->pmu) {
11376                 pmu = event->parent->pmu;
11377                 ret = perf_try_init_event(pmu, event);
11378                 if (!ret)
11379                         goto unlock;
11380         }
11381
11382         /*
11383          * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11384          * are often aliases for PERF_TYPE_RAW.
11385          */
11386         type = event->attr.type;
11387         if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
11388                 type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
11389                 if (!type) {
11390                         type = PERF_TYPE_RAW;
11391                 } else {
11392                         extended_type = true;
11393                         event->attr.config &= PERF_HW_EVENT_MASK;
11394                 }
11395         }
11396
11397 again:
11398         rcu_read_lock();
11399         pmu = idr_find(&pmu_idr, type);
11400         rcu_read_unlock();
11401         if (pmu) {
11402                 if (event->attr.type != type && type != PERF_TYPE_RAW &&
11403                     !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
11404                         goto fail;
11405
11406                 ret = perf_try_init_event(pmu, event);
11407                 if (ret == -ENOENT && event->attr.type != type && !extended_type) {
11408                         type = event->attr.type;
11409                         goto again;
11410                 }
11411
11412                 if (ret)
11413                         pmu = ERR_PTR(ret);
11414
11415                 goto unlock;
11416         }
11417
11418         list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
11419                 ret = perf_try_init_event(pmu, event);
11420                 if (!ret)
11421                         goto unlock;
11422
11423                 if (ret != -ENOENT) {
11424                         pmu = ERR_PTR(ret);
11425                         goto unlock;
11426                 }
11427         }
11428 fail:
11429         pmu = ERR_PTR(-ENOENT);
11430 unlock:
11431         srcu_read_unlock(&pmus_srcu, idx);
11432
11433         return pmu;
11434 }
11435
11436 static void attach_sb_event(struct perf_event *event)
11437 {
11438         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11439
11440         raw_spin_lock(&pel->lock);
11441         list_add_rcu(&event->sb_list, &pel->list);
11442         raw_spin_unlock(&pel->lock);
11443 }
11444
11445 /*
11446  * We keep a list of all !task (and therefore per-cpu) events
11447  * that need to receive side-band records.
11448  *
11449  * This avoids having to scan all the various PMU per-cpu contexts
11450  * looking for them.
11451  */
11452 static void account_pmu_sb_event(struct perf_event *event)
11453 {
11454         if (is_sb_event(event))
11455                 attach_sb_event(event);
11456 }
11457
11458 static void account_event_cpu(struct perf_event *event, int cpu)
11459 {
11460         if (event->parent)
11461                 return;
11462
11463         if (is_cgroup_event(event))
11464                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11465 }
11466
11467 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
11468 static void account_freq_event_nohz(void)
11469 {
11470 #ifdef CONFIG_NO_HZ_FULL
11471         /* Lock so we don't race with concurrent unaccount */
11472         spin_lock(&nr_freq_lock);
11473         if (atomic_inc_return(&nr_freq_events) == 1)
11474                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11475         spin_unlock(&nr_freq_lock);
11476 #endif
11477 }
11478
11479 static void account_freq_event(void)
11480 {
11481         if (tick_nohz_full_enabled())
11482                 account_freq_event_nohz();
11483         else
11484                 atomic_inc(&nr_freq_events);
11485 }
11486
11487
11488 static void account_event(struct perf_event *event)
11489 {
11490         bool inc = false;
11491
11492         if (event->parent)
11493                 return;
11494
11495         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
11496                 inc = true;
11497         if (event->attr.mmap || event->attr.mmap_data)
11498                 atomic_inc(&nr_mmap_events);
11499         if (event->attr.build_id)
11500                 atomic_inc(&nr_build_id_events);
11501         if (event->attr.comm)
11502                 atomic_inc(&nr_comm_events);
11503         if (event->attr.namespaces)
11504                 atomic_inc(&nr_namespaces_events);
11505         if (event->attr.cgroup)
11506                 atomic_inc(&nr_cgroup_events);
11507         if (event->attr.task)
11508                 atomic_inc(&nr_task_events);
11509         if (event->attr.freq)
11510                 account_freq_event();
11511         if (event->attr.context_switch) {
11512                 atomic_inc(&nr_switch_events);
11513                 inc = true;
11514         }
11515         if (has_branch_stack(event))
11516                 inc = true;
11517         if (is_cgroup_event(event))
11518                 inc = true;
11519         if (event->attr.ksymbol)
11520                 atomic_inc(&nr_ksymbol_events);
11521         if (event->attr.bpf_event)
11522                 atomic_inc(&nr_bpf_events);
11523         if (event->attr.text_poke)
11524                 atomic_inc(&nr_text_poke_events);
11525
11526         if (inc) {
11527                 /*
11528                  * We need the mutex here because static_branch_enable()
11529                  * must complete *before* the perf_sched_count increment
11530                  * becomes visible.
11531                  */
11532                 if (atomic_inc_not_zero(&perf_sched_count))
11533                         goto enabled;
11534
11535                 mutex_lock(&perf_sched_mutex);
11536                 if (!atomic_read(&perf_sched_count)) {
11537                         static_branch_enable(&perf_sched_events);
11538                         /*
11539                          * Guarantee that all CPUs observe they key change and
11540                          * call the perf scheduling hooks before proceeding to
11541                          * install events that need them.
11542                          */
11543                         synchronize_rcu();
11544                 }
11545                 /*
11546                  * Now that we have waited for the sync_sched(), allow further
11547                  * increments to by-pass the mutex.
11548                  */
11549                 atomic_inc(&perf_sched_count);
11550                 mutex_unlock(&perf_sched_mutex);
11551         }
11552 enabled:
11553
11554         account_event_cpu(event, event->cpu);
11555
11556         account_pmu_sb_event(event);
11557 }
11558
11559 /*
11560  * Allocate and initialize an event structure
11561  */
11562 static struct perf_event *
11563 perf_event_alloc(struct perf_event_attr *attr, int cpu,
11564                  struct task_struct *task,
11565                  struct perf_event *group_leader,
11566                  struct perf_event *parent_event,
11567                  perf_overflow_handler_t overflow_handler,
11568                  void *context, int cgroup_fd)
11569 {
11570         struct pmu *pmu;
11571         struct perf_event *event;
11572         struct hw_perf_event *hwc;
11573         long err = -EINVAL;
11574         int node;
11575
11576         if ((unsigned)cpu >= nr_cpu_ids) {
11577                 if (!task || cpu != -1)
11578                         return ERR_PTR(-EINVAL);
11579         }
11580         if (attr->sigtrap && !task) {
11581                 /* Requires a task: avoid signalling random tasks. */
11582                 return ERR_PTR(-EINVAL);
11583         }
11584
11585         node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
11586         event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
11587                                       node);
11588         if (!event)
11589                 return ERR_PTR(-ENOMEM);
11590
11591         /*
11592          * Single events are their own group leaders, with an
11593          * empty sibling list:
11594          */
11595         if (!group_leader)
11596                 group_leader = event;
11597
11598         mutex_init(&event->child_mutex);
11599         INIT_LIST_HEAD(&event->child_list);
11600
11601         INIT_LIST_HEAD(&event->event_entry);
11602         INIT_LIST_HEAD(&event->sibling_list);
11603         INIT_LIST_HEAD(&event->active_list);
11604         init_event_group(event);
11605         INIT_LIST_HEAD(&event->rb_entry);
11606         INIT_LIST_HEAD(&event->active_entry);
11607         INIT_LIST_HEAD(&event->addr_filters.list);
11608         INIT_HLIST_NODE(&event->hlist_entry);
11609
11610
11611         init_waitqueue_head(&event->waitq);
11612         event->pending_disable = -1;
11613         init_irq_work(&event->pending, perf_pending_event);
11614
11615         mutex_init(&event->mmap_mutex);
11616         raw_spin_lock_init(&event->addr_filters.lock);
11617
11618         atomic_long_set(&event->refcount, 1);
11619         event->cpu              = cpu;
11620         event->attr             = *attr;
11621         event->group_leader     = group_leader;
11622         event->pmu              = NULL;
11623         event->oncpu            = -1;
11624
11625         event->parent           = parent_event;
11626
11627         event->ns               = get_pid_ns(task_active_pid_ns(current));
11628         event->id               = atomic64_inc_return(&perf_event_id);
11629
11630         event->state            = PERF_EVENT_STATE_INACTIVE;
11631
11632         if (parent_event)
11633                 event->event_caps = parent_event->event_caps;
11634
11635         if (event->attr.sigtrap)
11636                 atomic_set(&event->event_limit, 1);
11637
11638         if (task) {
11639                 event->attach_state = PERF_ATTACH_TASK;
11640                 /*
11641                  * XXX pmu::event_init needs to know what task to account to
11642                  * and we cannot use the ctx information because we need the
11643                  * pmu before we get a ctx.
11644                  */
11645                 event->hw.target = get_task_struct(task);
11646         }
11647
11648         event->clock = &local_clock;
11649         if (parent_event)
11650                 event->clock = parent_event->clock;
11651
11652         if (!overflow_handler && parent_event) {
11653                 overflow_handler = parent_event->overflow_handler;
11654                 context = parent_event->overflow_handler_context;
11655 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11656                 if (overflow_handler == bpf_overflow_handler) {
11657                         struct bpf_prog *prog = parent_event->prog;
11658
11659                         bpf_prog_inc(prog);
11660                         event->prog = prog;
11661                         event->orig_overflow_handler =
11662                                 parent_event->orig_overflow_handler;
11663                 }
11664 #endif
11665         }
11666
11667         if (overflow_handler) {
11668                 event->overflow_handler = overflow_handler;
11669                 event->overflow_handler_context = context;
11670         } else if (is_write_backward(event)){
11671                 event->overflow_handler = perf_event_output_backward;
11672                 event->overflow_handler_context = NULL;
11673         } else {
11674                 event->overflow_handler = perf_event_output_forward;
11675                 event->overflow_handler_context = NULL;
11676         }
11677
11678         perf_event__state_init(event);
11679
11680         pmu = NULL;
11681
11682         hwc = &event->hw;
11683         hwc->sample_period = attr->sample_period;
11684         if (attr->freq && attr->sample_freq)
11685                 hwc->sample_period = 1;
11686         hwc->last_period = hwc->sample_period;
11687
11688         local64_set(&hwc->period_left, hwc->sample_period);
11689
11690         /*
11691          * We currently do not support PERF_SAMPLE_READ on inherited events.
11692          * See perf_output_read().
11693          */
11694         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11695                 goto err_ns;
11696
11697         if (!has_branch_stack(event))
11698                 event->attr.branch_sample_type = 0;
11699
11700         pmu = perf_init_event(event);
11701         if (IS_ERR(pmu)) {
11702                 err = PTR_ERR(pmu);
11703                 goto err_ns;
11704         }
11705
11706         /*
11707          * Disallow uncore-cgroup events, they don't make sense as the cgroup will
11708          * be different on other CPUs in the uncore mask.
11709          */
11710         if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11711                 err = -EINVAL;
11712                 goto err_pmu;
11713         }
11714
11715         if (event->attr.aux_output &&
11716             !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11717                 err = -EOPNOTSUPP;
11718                 goto err_pmu;
11719         }
11720
11721         if (cgroup_fd != -1) {
11722                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11723                 if (err)
11724                         goto err_pmu;
11725         }
11726
11727         err = exclusive_event_init(event);
11728         if (err)
11729                 goto err_pmu;
11730
11731         if (has_addr_filter(event)) {
11732                 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11733                                                     sizeof(struct perf_addr_filter_range),
11734                                                     GFP_KERNEL);
11735                 if (!event->addr_filter_ranges) {
11736                         err = -ENOMEM;
11737                         goto err_per_task;
11738                 }
11739
11740                 /*
11741                  * Clone the parent's vma offsets: they are valid until exec()
11742                  * even if the mm is not shared with the parent.
11743                  */
11744                 if (event->parent) {
11745                         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11746
11747                         raw_spin_lock_irq(&ifh->lock);
11748                         memcpy(event->addr_filter_ranges,
11749                                event->parent->addr_filter_ranges,
11750                                pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11751                         raw_spin_unlock_irq(&ifh->lock);
11752                 }
11753
11754                 /* force hw sync on the address filters */
11755                 event->addr_filters_gen = 1;
11756         }
11757
11758         if (!event->parent) {
11759                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11760                         err = get_callchain_buffers(attr->sample_max_stack);
11761                         if (err)
11762                                 goto err_addr_filters;
11763                 }
11764         }
11765
11766         err = security_perf_event_alloc(event);
11767         if (err)
11768                 goto err_callchain_buffer;
11769
11770         /* symmetric to unaccount_event() in _free_event() */
11771         account_event(event);
11772
11773         return event;
11774
11775 err_callchain_buffer:
11776         if (!event->parent) {
11777                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11778                         put_callchain_buffers();
11779         }
11780 err_addr_filters:
11781         kfree(event->addr_filter_ranges);
11782
11783 err_per_task:
11784         exclusive_event_destroy(event);
11785
11786 err_pmu:
11787         if (is_cgroup_event(event))
11788                 perf_detach_cgroup(event);
11789         if (event->destroy)
11790                 event->destroy(event);
11791         module_put(pmu->module);
11792 err_ns:
11793         if (event->ns)
11794                 put_pid_ns(event->ns);
11795         if (event->hw.target)
11796                 put_task_struct(event->hw.target);
11797         kmem_cache_free(perf_event_cache, event);
11798
11799         return ERR_PTR(err);
11800 }
11801
11802 static int perf_copy_attr(struct perf_event_attr __user *uattr,
11803                           struct perf_event_attr *attr)
11804 {
11805         u32 size;
11806         int ret;
11807
11808         /* Zero the full structure, so that a short copy will be nice. */
11809         memset(attr, 0, sizeof(*attr));
11810
11811         ret = get_user(size, &uattr->size);
11812         if (ret)
11813                 return ret;
11814
11815         /* ABI compatibility quirk: */
11816         if (!size)
11817                 size = PERF_ATTR_SIZE_VER0;
11818         if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
11819                 goto err_size;
11820
11821         ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11822         if (ret) {
11823                 if (ret == -E2BIG)
11824                         goto err_size;
11825                 return ret;
11826         }
11827
11828         attr->size = size;
11829
11830         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11831                 return -EINVAL;
11832
11833         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11834                 return -EINVAL;
11835
11836         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11837                 return -EINVAL;
11838
11839         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11840                 u64 mask = attr->branch_sample_type;
11841
11842                 /* only using defined bits */
11843                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11844                         return -EINVAL;
11845
11846                 /* at least one branch bit must be set */
11847                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11848                         return -EINVAL;
11849
11850                 /* propagate priv level, when not set for branch */
11851                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11852
11853                         /* exclude_kernel checked on syscall entry */
11854                         if (!attr->exclude_kernel)
11855                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11856
11857                         if (!attr->exclude_user)
11858                                 mask |= PERF_SAMPLE_BRANCH_USER;
11859
11860                         if (!attr->exclude_hv)
11861                                 mask |= PERF_SAMPLE_BRANCH_HV;
11862                         /*
11863                          * adjust user setting (for HW filter setup)
11864                          */
11865                         attr->branch_sample_type = mask;
11866                 }
11867                 /* privileged levels capture (kernel, hv): check permissions */
11868                 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11869                         ret = perf_allow_kernel(attr);
11870                         if (ret)
11871                                 return ret;
11872                 }
11873         }
11874
11875         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11876                 ret = perf_reg_validate(attr->sample_regs_user);
11877                 if (ret)
11878                         return ret;
11879         }
11880
11881         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11882                 if (!arch_perf_have_user_stack_dump())
11883                         return -ENOSYS;
11884
11885                 /*
11886                  * We have __u32 type for the size, but so far
11887                  * we can only use __u16 as maximum due to the
11888                  * __u16 sample size limit.
11889                  */
11890                 if (attr->sample_stack_user >= USHRT_MAX)
11891                         return -EINVAL;
11892                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11893                         return -EINVAL;
11894         }
11895
11896         if (!attr->sample_max_stack)
11897                 attr->sample_max_stack = sysctl_perf_event_max_stack;
11898
11899         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11900                 ret = perf_reg_validate(attr->sample_regs_intr);
11901
11902 #ifndef CONFIG_CGROUP_PERF
11903         if (attr->sample_type & PERF_SAMPLE_CGROUP)
11904                 return -EINVAL;
11905 #endif
11906         if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
11907             (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
11908                 return -EINVAL;
11909
11910         if (!attr->inherit && attr->inherit_thread)
11911                 return -EINVAL;
11912
11913         if (attr->remove_on_exec && attr->enable_on_exec)
11914                 return -EINVAL;
11915
11916         if (attr->sigtrap && !attr->remove_on_exec)
11917                 return -EINVAL;
11918
11919 out:
11920         return ret;
11921
11922 err_size:
11923         put_user(sizeof(*attr), &uattr->size);
11924         ret = -E2BIG;
11925         goto out;
11926 }
11927
11928 static int
11929 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11930 {
11931         struct perf_buffer *rb = NULL;
11932         int ret = -EINVAL;
11933
11934         if (!output_event)
11935                 goto set;
11936
11937         /* don't allow circular references */
11938         if (event == output_event)
11939                 goto out;
11940
11941         /*
11942          * Don't allow cross-cpu buffers
11943          */
11944         if (output_event->cpu != event->cpu)
11945                 goto out;
11946
11947         /*
11948          * If its not a per-cpu rb, it must be the same task.
11949          */
11950         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11951                 goto out;
11952
11953         /*
11954          * Mixing clocks in the same buffer is trouble you don't need.
11955          */
11956         if (output_event->clock != event->clock)
11957                 goto out;
11958
11959         /*
11960          * Either writing ring buffer from beginning or from end.
11961          * Mixing is not allowed.
11962          */
11963         if (is_write_backward(output_event) != is_write_backward(event))
11964                 goto out;
11965
11966         /*
11967          * If both events generate aux data, they must be on the same PMU
11968          */
11969         if (has_aux(event) && has_aux(output_event) &&
11970             event->pmu != output_event->pmu)
11971                 goto out;
11972
11973 set:
11974         mutex_lock(&event->mmap_mutex);
11975         /* Can't redirect output if we've got an active mmap() */
11976         if (atomic_read(&event->mmap_count))
11977                 goto unlock;
11978
11979         if (output_event) {
11980                 /* get the rb we want to redirect to */
11981                 rb = ring_buffer_get(output_event);
11982                 if (!rb)
11983                         goto unlock;
11984         }
11985
11986         ring_buffer_attach(event, rb);
11987
11988         ret = 0;
11989 unlock:
11990         mutex_unlock(&event->mmap_mutex);
11991
11992 out:
11993         return ret;
11994 }
11995
11996 static void mutex_lock_double(struct mutex *a, struct mutex *b)
11997 {
11998         if (b < a)
11999                 swap(a, b);
12000
12001         mutex_lock(a);
12002         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
12003 }
12004
12005 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
12006 {
12007         bool nmi_safe = false;
12008
12009         switch (clk_id) {
12010         case CLOCK_MONOTONIC:
12011                 event->clock = &ktime_get_mono_fast_ns;
12012                 nmi_safe = true;
12013                 break;
12014
12015         case CLOCK_MONOTONIC_RAW:
12016                 event->clock = &ktime_get_raw_fast_ns;
12017                 nmi_safe = true;
12018                 break;
12019
12020         case CLOCK_REALTIME:
12021                 event->clock = &ktime_get_real_ns;
12022                 break;
12023
12024         case CLOCK_BOOTTIME:
12025                 event->clock = &ktime_get_boottime_ns;
12026                 break;
12027
12028         case CLOCK_TAI:
12029                 event->clock = &ktime_get_clocktai_ns;
12030                 break;
12031
12032         default:
12033                 return -EINVAL;
12034         }
12035
12036         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
12037                 return -EINVAL;
12038
12039         return 0;
12040 }
12041
12042 /*
12043  * Variation on perf_event_ctx_lock_nested(), except we take two context
12044  * mutexes.
12045  */
12046 static struct perf_event_context *
12047 __perf_event_ctx_lock_double(struct perf_event *group_leader,
12048                              struct perf_event_context *ctx)
12049 {
12050         struct perf_event_context *gctx;
12051
12052 again:
12053         rcu_read_lock();
12054         gctx = READ_ONCE(group_leader->ctx);
12055         if (!refcount_inc_not_zero(&gctx->refcount)) {
12056                 rcu_read_unlock();
12057                 goto again;
12058         }
12059         rcu_read_unlock();
12060
12061         mutex_lock_double(&gctx->mutex, &ctx->mutex);
12062
12063         if (group_leader->ctx != gctx) {
12064                 mutex_unlock(&ctx->mutex);
12065                 mutex_unlock(&gctx->mutex);
12066                 put_ctx(gctx);
12067                 goto again;
12068         }
12069
12070         return gctx;
12071 }
12072
12073 static bool
12074 perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
12075 {
12076         unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
12077         bool is_capable = perfmon_capable();
12078
12079         if (attr->sigtrap) {
12080                 /*
12081                  * perf_event_attr::sigtrap sends signals to the other task.
12082                  * Require the current task to also have CAP_KILL.
12083                  */
12084                 rcu_read_lock();
12085                 is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
12086                 rcu_read_unlock();
12087
12088                 /*
12089                  * If the required capabilities aren't available, checks for
12090                  * ptrace permissions: upgrade to ATTACH, since sending signals
12091                  * can effectively change the target task.
12092                  */
12093                 ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
12094         }
12095
12096         /*
12097          * Preserve ptrace permission check for backwards compatibility. The
12098          * ptrace check also includes checks that the current task and other
12099          * task have matching uids, and is therefore not done here explicitly.
12100          */
12101         return is_capable || ptrace_may_access(task, ptrace_mode);
12102 }
12103
12104 /**
12105  * sys_perf_event_open - open a performance event, associate it to a task/cpu
12106  *
12107  * @attr_uptr:  event_id type attributes for monitoring/sampling
12108  * @pid:                target pid
12109  * @cpu:                target cpu
12110  * @group_fd:           group leader event fd
12111  * @flags:              perf event open flags
12112  */
12113 SYSCALL_DEFINE5(perf_event_open,
12114                 struct perf_event_attr __user *, attr_uptr,
12115                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
12116 {
12117         struct perf_event *group_leader = NULL, *output_event = NULL;
12118         struct perf_event *event, *sibling;
12119         struct perf_event_attr attr;
12120         struct perf_event_context *ctx, *gctx;
12121         struct file *event_file = NULL;
12122         struct fd group = {NULL, 0};
12123         struct task_struct *task = NULL;
12124         struct pmu *pmu;
12125         int event_fd;
12126         int move_group = 0;
12127         int err;
12128         int f_flags = O_RDWR;
12129         int cgroup_fd = -1;
12130
12131         /* for future expandability... */
12132         if (flags & ~PERF_FLAG_ALL)
12133                 return -EINVAL;
12134
12135         /* Do we allow access to perf_event_open(2) ? */
12136         err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
12137         if (err)
12138                 return err;
12139
12140         err = perf_copy_attr(attr_uptr, &attr);
12141         if (err)
12142                 return err;
12143
12144         if (!attr.exclude_kernel) {
12145                 err = perf_allow_kernel(&attr);
12146                 if (err)
12147                         return err;
12148         }
12149
12150         if (attr.namespaces) {
12151                 if (!perfmon_capable())
12152                         return -EACCES;
12153         }
12154
12155         if (attr.freq) {
12156                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
12157                         return -EINVAL;
12158         } else {
12159                 if (attr.sample_period & (1ULL << 63))
12160                         return -EINVAL;
12161         }
12162
12163         /* Only privileged users can get physical addresses */
12164         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
12165                 err = perf_allow_kernel(&attr);
12166                 if (err)
12167                         return err;
12168         }
12169
12170         /* REGS_INTR can leak data, lockdown must prevent this */
12171         if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
12172                 err = security_locked_down(LOCKDOWN_PERF);
12173                 if (err)
12174                         return err;
12175         }
12176
12177         /*
12178          * In cgroup mode, the pid argument is used to pass the fd
12179          * opened to the cgroup directory in cgroupfs. The cpu argument
12180          * designates the cpu on which to monitor threads from that
12181          * cgroup.
12182          */
12183         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
12184                 return -EINVAL;
12185
12186         if (flags & PERF_FLAG_FD_CLOEXEC)
12187                 f_flags |= O_CLOEXEC;
12188
12189         event_fd = get_unused_fd_flags(f_flags);
12190         if (event_fd < 0)
12191                 return event_fd;
12192
12193         if (group_fd != -1) {
12194                 err = perf_fget_light(group_fd, &group);
12195                 if (err)
12196                         goto err_fd;
12197                 group_leader = group.file->private_data;
12198                 if (flags & PERF_FLAG_FD_OUTPUT)
12199                         output_event = group_leader;
12200                 if (flags & PERF_FLAG_FD_NO_GROUP)
12201                         group_leader = NULL;
12202         }
12203
12204         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
12205                 task = find_lively_task_by_vpid(pid);
12206                 if (IS_ERR(task)) {
12207                         err = PTR_ERR(task);
12208                         goto err_group_fd;
12209                 }
12210         }
12211
12212         if (task && group_leader &&
12213             group_leader->attr.inherit != attr.inherit) {
12214                 err = -EINVAL;
12215                 goto err_task;
12216         }
12217
12218         if (flags & PERF_FLAG_PID_CGROUP)
12219                 cgroup_fd = pid;
12220
12221         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
12222                                  NULL, NULL, cgroup_fd);
12223         if (IS_ERR(event)) {
12224                 err = PTR_ERR(event);
12225                 goto err_task;
12226         }
12227
12228         if (is_sampling_event(event)) {
12229                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
12230                         err = -EOPNOTSUPP;
12231                         goto err_alloc;
12232                 }
12233         }
12234
12235         /*
12236          * Special case software events and allow them to be part of
12237          * any hardware group.
12238          */
12239         pmu = event->pmu;
12240
12241         if (attr.use_clockid) {
12242                 err = perf_event_set_clock(event, attr.clockid);
12243                 if (err)
12244                         goto err_alloc;
12245         }
12246
12247         if (pmu->task_ctx_nr == perf_sw_context)
12248                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
12249
12250         if (group_leader) {
12251                 if (is_software_event(event) &&
12252                     !in_software_context(group_leader)) {
12253                         /*
12254                          * If the event is a sw event, but the group_leader
12255                          * is on hw context.
12256                          *
12257                          * Allow the addition of software events to hw
12258                          * groups, this is safe because software events
12259                          * never fail to schedule.
12260                          */
12261                         pmu = group_leader->ctx->pmu;
12262                 } else if (!is_software_event(event) &&
12263                            is_software_event(group_leader) &&
12264                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12265                         /*
12266                          * In case the group is a pure software group, and we
12267                          * try to add a hardware event, move the whole group to
12268                          * the hardware context.
12269                          */
12270                         move_group = 1;
12271                 }
12272         }
12273
12274         /*
12275          * Get the target context (task or percpu):
12276          */
12277         ctx = find_get_context(pmu, task, event);
12278         if (IS_ERR(ctx)) {
12279                 err = PTR_ERR(ctx);
12280                 goto err_alloc;
12281         }
12282
12283         /*
12284          * Look up the group leader (we will attach this event to it):
12285          */
12286         if (group_leader) {
12287                 err = -EINVAL;
12288
12289                 /*
12290                  * Do not allow a recursive hierarchy (this new sibling
12291                  * becoming part of another group-sibling):
12292                  */
12293                 if (group_leader->group_leader != group_leader)
12294                         goto err_context;
12295
12296                 /* All events in a group should have the same clock */
12297                 if (group_leader->clock != event->clock)
12298                         goto err_context;
12299
12300                 /*
12301                  * Make sure we're both events for the same CPU;
12302                  * grouping events for different CPUs is broken; since
12303                  * you can never concurrently schedule them anyhow.
12304                  */
12305                 if (group_leader->cpu != event->cpu)
12306                         goto err_context;
12307
12308                 /*
12309                  * Make sure we're both on the same task, or both
12310                  * per-CPU events.
12311                  */
12312                 if (group_leader->ctx->task != ctx->task)
12313                         goto err_context;
12314
12315                 /*
12316                  * Do not allow to attach to a group in a different task
12317                  * or CPU context. If we're moving SW events, we'll fix
12318                  * this up later, so allow that.
12319                  */
12320                 if (!move_group && group_leader->ctx != ctx)
12321                         goto err_context;
12322
12323                 /*
12324                  * Only a group leader can be exclusive or pinned
12325                  */
12326                 if (attr.exclusive || attr.pinned)
12327                         goto err_context;
12328         }
12329
12330         if (output_event) {
12331                 err = perf_event_set_output(event, output_event);
12332                 if (err)
12333                         goto err_context;
12334         }
12335
12336         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
12337                                         f_flags);
12338         if (IS_ERR(event_file)) {
12339                 err = PTR_ERR(event_file);
12340                 event_file = NULL;
12341                 goto err_context;
12342         }
12343
12344         if (task) {
12345                 err = down_read_interruptible(&task->signal->exec_update_lock);
12346                 if (err)
12347                         goto err_file;
12348
12349                 /*
12350                  * We must hold exec_update_lock across this and any potential
12351                  * perf_install_in_context() call for this new event to
12352                  * serialize against exec() altering our credentials (and the
12353                  * perf_event_exit_task() that could imply).
12354                  */
12355                 err = -EACCES;
12356                 if (!perf_check_permission(&attr, task))
12357                         goto err_cred;
12358         }
12359
12360         if (move_group) {
12361                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
12362
12363                 if (gctx->task == TASK_TOMBSTONE) {
12364                         err = -ESRCH;
12365                         goto err_locked;
12366                 }
12367
12368                 /*
12369                  * Check if we raced against another sys_perf_event_open() call
12370                  * moving the software group underneath us.
12371                  */
12372                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12373                         /*
12374                          * If someone moved the group out from under us, check
12375                          * if this new event wound up on the same ctx, if so
12376                          * its the regular !move_group case, otherwise fail.
12377                          */
12378                         if (gctx != ctx) {
12379                                 err = -EINVAL;
12380                                 goto err_locked;
12381                         } else {
12382                                 perf_event_ctx_unlock(group_leader, gctx);
12383                                 move_group = 0;
12384                         }
12385                 }
12386
12387                 /*
12388                  * Failure to create exclusive events returns -EBUSY.
12389                  */
12390                 err = -EBUSY;
12391                 if (!exclusive_event_installable(group_leader, ctx))
12392                         goto err_locked;
12393
12394                 for_each_sibling_event(sibling, group_leader) {
12395                         if (!exclusive_event_installable(sibling, ctx))
12396                                 goto err_locked;
12397                 }
12398         } else {
12399                 mutex_lock(&ctx->mutex);
12400         }
12401
12402         if (ctx->task == TASK_TOMBSTONE) {
12403                 err = -ESRCH;
12404                 goto err_locked;
12405         }
12406
12407         if (!perf_event_validate_size(event)) {
12408                 err = -E2BIG;
12409                 goto err_locked;
12410         }
12411
12412         if (!task) {
12413                 /*
12414                  * Check if the @cpu we're creating an event for is online.
12415                  *
12416                  * We use the perf_cpu_context::ctx::mutex to serialize against
12417                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12418                  */
12419                 struct perf_cpu_context *cpuctx =
12420                         container_of(ctx, struct perf_cpu_context, ctx);
12421
12422                 if (!cpuctx->online) {
12423                         err = -ENODEV;
12424                         goto err_locked;
12425                 }
12426         }
12427
12428         if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12429                 err = -EINVAL;
12430                 goto err_locked;
12431         }
12432
12433         /*
12434          * Must be under the same ctx::mutex as perf_install_in_context(),
12435          * because we need to serialize with concurrent event creation.
12436          */
12437         if (!exclusive_event_installable(event, ctx)) {
12438                 err = -EBUSY;
12439                 goto err_locked;
12440         }
12441
12442         WARN_ON_ONCE(ctx->parent_ctx);
12443
12444         /*
12445          * This is the point on no return; we cannot fail hereafter. This is
12446          * where we start modifying current state.
12447          */
12448
12449         if (move_group) {
12450                 /*
12451                  * See perf_event_ctx_lock() for comments on the details
12452                  * of swizzling perf_event::ctx.
12453                  */
12454                 perf_remove_from_context(group_leader, 0);
12455                 put_ctx(gctx);
12456
12457                 for_each_sibling_event(sibling, group_leader) {
12458                         perf_remove_from_context(sibling, 0);
12459                         put_ctx(gctx);
12460                 }
12461
12462                 /*
12463                  * Wait for everybody to stop referencing the events through
12464                  * the old lists, before installing it on new lists.
12465                  */
12466                 synchronize_rcu();
12467
12468                 /*
12469                  * Install the group siblings before the group leader.
12470                  *
12471                  * Because a group leader will try and install the entire group
12472                  * (through the sibling list, which is still in-tact), we can
12473                  * end up with siblings installed in the wrong context.
12474                  *
12475                  * By installing siblings first we NO-OP because they're not
12476                  * reachable through the group lists.
12477                  */
12478                 for_each_sibling_event(sibling, group_leader) {
12479                         perf_event__state_init(sibling);
12480                         perf_install_in_context(ctx, sibling, sibling->cpu);
12481                         get_ctx(ctx);
12482                 }
12483
12484                 /*
12485                  * Removing from the context ends up with disabled
12486                  * event. What we want here is event in the initial
12487                  * startup state, ready to be add into new context.
12488                  */
12489                 perf_event__state_init(group_leader);
12490                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
12491                 get_ctx(ctx);
12492         }
12493
12494         /*
12495          * Precalculate sample_data sizes; do while holding ctx::mutex such
12496          * that we're serialized against further additions and before
12497          * perf_install_in_context() which is the point the event is active and
12498          * can use these values.
12499          */
12500         perf_event__header_size(event);
12501         perf_event__id_header_size(event);
12502
12503         event->owner = current;
12504
12505         perf_install_in_context(ctx, event, event->cpu);
12506         perf_unpin_context(ctx);
12507
12508         if (move_group)
12509                 perf_event_ctx_unlock(group_leader, gctx);
12510         mutex_unlock(&ctx->mutex);
12511
12512         if (task) {
12513                 up_read(&task->signal->exec_update_lock);
12514                 put_task_struct(task);
12515         }
12516
12517         mutex_lock(&current->perf_event_mutex);
12518         list_add_tail(&event->owner_entry, &current->perf_event_list);
12519         mutex_unlock(&current->perf_event_mutex);
12520
12521         /*
12522          * Drop the reference on the group_event after placing the
12523          * new event on the sibling_list. This ensures destruction
12524          * of the group leader will find the pointer to itself in
12525          * perf_group_detach().
12526          */
12527         fdput(group);
12528         fd_install(event_fd, event_file);
12529         return event_fd;
12530
12531 err_locked:
12532         if (move_group)
12533                 perf_event_ctx_unlock(group_leader, gctx);
12534         mutex_unlock(&ctx->mutex);
12535 err_cred:
12536         if (task)
12537                 up_read(&task->signal->exec_update_lock);
12538 err_file:
12539         fput(event_file);
12540 err_context:
12541         perf_unpin_context(ctx);
12542         put_ctx(ctx);
12543 err_alloc:
12544         /*
12545          * If event_file is set, the fput() above will have called ->release()
12546          * and that will take care of freeing the event.
12547          */
12548         if (!event_file)
12549                 free_event(event);
12550 err_task:
12551         if (task)
12552                 put_task_struct(task);
12553 err_group_fd:
12554         fdput(group);
12555 err_fd:
12556         put_unused_fd(event_fd);
12557         return err;
12558 }
12559
12560 /**
12561  * perf_event_create_kernel_counter
12562  *
12563  * @attr: attributes of the counter to create
12564  * @cpu: cpu in which the counter is bound
12565  * @task: task to profile (NULL for percpu)
12566  * @overflow_handler: callback to trigger when we hit the event
12567  * @context: context data could be used in overflow_handler callback
12568  */
12569 struct perf_event *
12570 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12571                                  struct task_struct *task,
12572                                  perf_overflow_handler_t overflow_handler,
12573                                  void *context)
12574 {
12575         struct perf_event_context *ctx;
12576         struct perf_event *event;
12577         int err;
12578
12579         /*
12580          * Grouping is not supported for kernel events, neither is 'AUX',
12581          * make sure the caller's intentions are adjusted.
12582          */
12583         if (attr->aux_output)
12584                 return ERR_PTR(-EINVAL);
12585
12586         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12587                                  overflow_handler, context, -1);
12588         if (IS_ERR(event)) {
12589                 err = PTR_ERR(event);
12590                 goto err;
12591         }
12592
12593         /* Mark owner so we could distinguish it from user events. */
12594         event->owner = TASK_TOMBSTONE;
12595
12596         /*
12597          * Get the target context (task or percpu):
12598          */
12599         ctx = find_get_context(event->pmu, task, event);
12600         if (IS_ERR(ctx)) {
12601                 err = PTR_ERR(ctx);
12602                 goto err_free;
12603         }
12604
12605         WARN_ON_ONCE(ctx->parent_ctx);
12606         mutex_lock(&ctx->mutex);
12607         if (ctx->task == TASK_TOMBSTONE) {
12608                 err = -ESRCH;
12609                 goto err_unlock;
12610         }
12611
12612         if (!task) {
12613                 /*
12614                  * Check if the @cpu we're creating an event for is online.
12615                  *
12616                  * We use the perf_cpu_context::ctx::mutex to serialize against
12617                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12618                  */
12619                 struct perf_cpu_context *cpuctx =
12620                         container_of(ctx, struct perf_cpu_context, ctx);
12621                 if (!cpuctx->online) {
12622                         err = -ENODEV;
12623                         goto err_unlock;
12624                 }
12625         }
12626
12627         if (!exclusive_event_installable(event, ctx)) {
12628                 err = -EBUSY;
12629                 goto err_unlock;
12630         }
12631
12632         perf_install_in_context(ctx, event, event->cpu);
12633         perf_unpin_context(ctx);
12634         mutex_unlock(&ctx->mutex);
12635
12636         return event;
12637
12638 err_unlock:
12639         mutex_unlock(&ctx->mutex);
12640         perf_unpin_context(ctx);
12641         put_ctx(ctx);
12642 err_free:
12643         free_event(event);
12644 err:
12645         return ERR_PTR(err);
12646 }
12647 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
12648
12649 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12650 {
12651         struct perf_event_context *src_ctx;
12652         struct perf_event_context *dst_ctx;
12653         struct perf_event *event, *tmp;
12654         LIST_HEAD(events);
12655
12656         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
12657         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
12658
12659         /*
12660          * See perf_event_ctx_lock() for comments on the details
12661          * of swizzling perf_event::ctx.
12662          */
12663         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
12664         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12665                                  event_entry) {
12666                 perf_remove_from_context(event, 0);
12667                 unaccount_event_cpu(event, src_cpu);
12668                 put_ctx(src_ctx);
12669                 list_add(&event->migrate_entry, &events);
12670         }
12671
12672         /*
12673          * Wait for the events to quiesce before re-instating them.
12674          */
12675         synchronize_rcu();
12676
12677         /*
12678          * Re-instate events in 2 passes.
12679          *
12680          * Skip over group leaders and only install siblings on this first
12681          * pass, siblings will not get enabled without a leader, however a
12682          * leader will enable its siblings, even if those are still on the old
12683          * context.
12684          */
12685         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12686                 if (event->group_leader == event)
12687                         continue;
12688
12689                 list_del(&event->migrate_entry);
12690                 if (event->state >= PERF_EVENT_STATE_OFF)
12691                         event->state = PERF_EVENT_STATE_INACTIVE;
12692                 account_event_cpu(event, dst_cpu);
12693                 perf_install_in_context(dst_ctx, event, dst_cpu);
12694                 get_ctx(dst_ctx);
12695         }
12696
12697         /*
12698          * Once all the siblings are setup properly, install the group leaders
12699          * to make it go.
12700          */
12701         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12702                 list_del(&event->migrate_entry);
12703                 if (event->state >= PERF_EVENT_STATE_OFF)
12704                         event->state = PERF_EVENT_STATE_INACTIVE;
12705                 account_event_cpu(event, dst_cpu);
12706                 perf_install_in_context(dst_ctx, event, dst_cpu);
12707                 get_ctx(dst_ctx);
12708         }
12709         mutex_unlock(&dst_ctx->mutex);
12710         mutex_unlock(&src_ctx->mutex);
12711 }
12712 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
12713
12714 static void sync_child_event(struct perf_event *child_event)
12715 {
12716         struct perf_event *parent_event = child_event->parent;
12717         u64 child_val;
12718
12719         if (child_event->attr.inherit_stat) {
12720                 struct task_struct *task = child_event->ctx->task;
12721
12722                 if (task && task != TASK_TOMBSTONE)
12723                         perf_event_read_event(child_event, task);
12724         }
12725
12726         child_val = perf_event_count(child_event);
12727
12728         /*
12729          * Add back the child's count to the parent's count:
12730          */
12731         atomic64_add(child_val, &parent_event->child_count);
12732         atomic64_add(child_event->total_time_enabled,
12733                      &parent_event->child_total_time_enabled);
12734         atomic64_add(child_event->total_time_running,
12735                      &parent_event->child_total_time_running);
12736 }
12737
12738 static void
12739 perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
12740 {
12741         struct perf_event *parent_event = event->parent;
12742         unsigned long detach_flags = 0;
12743
12744         if (parent_event) {
12745                 /*
12746                  * Do not destroy the 'original' grouping; because of the
12747                  * context switch optimization the original events could've
12748                  * ended up in a random child task.
12749                  *
12750                  * If we were to destroy the original group, all group related
12751                  * operations would cease to function properly after this
12752                  * random child dies.
12753                  *
12754                  * Do destroy all inherited groups, we don't care about those
12755                  * and being thorough is better.
12756                  */
12757                 detach_flags = DETACH_GROUP | DETACH_CHILD;
12758                 mutex_lock(&parent_event->child_mutex);
12759         }
12760
12761         perf_remove_from_context(event, detach_flags);
12762
12763         raw_spin_lock_irq(&ctx->lock);
12764         if (event->state > PERF_EVENT_STATE_EXIT)
12765                 perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
12766         raw_spin_unlock_irq(&ctx->lock);
12767
12768         /*
12769          * Child events can be freed.
12770          */
12771         if (parent_event) {
12772                 mutex_unlock(&parent_event->child_mutex);
12773                 /*
12774                  * Kick perf_poll() for is_event_hup();
12775                  */
12776                 perf_event_wakeup(parent_event);
12777                 free_event(event);
12778                 put_event(parent_event);
12779                 return;
12780         }
12781
12782         /*
12783          * Parent events are governed by their filedesc, retain them.
12784          */
12785         perf_event_wakeup(event);
12786 }
12787
12788 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12789 {
12790         struct perf_event_context *child_ctx, *clone_ctx = NULL;
12791         struct perf_event *child_event, *next;
12792
12793         WARN_ON_ONCE(child != current);
12794
12795         child_ctx = perf_pin_task_context(child, ctxn);
12796         if (!child_ctx)
12797                 return;
12798
12799         /*
12800          * In order to reduce the amount of tricky in ctx tear-down, we hold
12801          * ctx::mutex over the entire thing. This serializes against almost
12802          * everything that wants to access the ctx.
12803          *
12804          * The exception is sys_perf_event_open() /
12805          * perf_event_create_kernel_count() which does find_get_context()
12806          * without ctx::mutex (it cannot because of the move_group double mutex
12807          * lock thing). See the comments in perf_install_in_context().
12808          */
12809         mutex_lock(&child_ctx->mutex);
12810
12811         /*
12812          * In a single ctx::lock section, de-schedule the events and detach the
12813          * context from the task such that we cannot ever get it scheduled back
12814          * in.
12815          */
12816         raw_spin_lock_irq(&child_ctx->lock);
12817         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12818
12819         /*
12820          * Now that the context is inactive, destroy the task <-> ctx relation
12821          * and mark the context dead.
12822          */
12823         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12824         put_ctx(child_ctx); /* cannot be last */
12825         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12826         put_task_struct(current); /* cannot be last */
12827
12828         clone_ctx = unclone_ctx(child_ctx);
12829         raw_spin_unlock_irq(&child_ctx->lock);
12830
12831         if (clone_ctx)
12832                 put_ctx(clone_ctx);
12833
12834         /*
12835          * Report the task dead after unscheduling the events so that we
12836          * won't get any samples after PERF_RECORD_EXIT. We can however still
12837          * get a few PERF_RECORD_READ events.
12838          */
12839         perf_event_task(child, child_ctx, 0);
12840
12841         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12842                 perf_event_exit_event(child_event, child_ctx);
12843
12844         mutex_unlock(&child_ctx->mutex);
12845
12846         put_ctx(child_ctx);
12847 }
12848
12849 /*
12850  * When a child task exits, feed back event values to parent events.
12851  *
12852  * Can be called with exec_update_lock held when called from
12853  * setup_new_exec().
12854  */
12855 void perf_event_exit_task(struct task_struct *child)
12856 {
12857         struct perf_event *event, *tmp;
12858         int ctxn;
12859
12860         mutex_lock(&child->perf_event_mutex);
12861         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12862                                  owner_entry) {
12863                 list_del_init(&event->owner_entry);
12864
12865                 /*
12866                  * Ensure the list deletion is visible before we clear
12867                  * the owner, closes a race against perf_release() where
12868                  * we need to serialize on the owner->perf_event_mutex.
12869                  */
12870                 smp_store_release(&event->owner, NULL);
12871         }
12872         mutex_unlock(&child->perf_event_mutex);
12873
12874         for_each_task_context_nr(ctxn)
12875                 perf_event_exit_task_context(child, ctxn);
12876
12877         /*
12878          * The perf_event_exit_task_context calls perf_event_task
12879          * with child's task_ctx, which generates EXIT events for
12880          * child contexts and sets child->perf_event_ctxp[] to NULL.
12881          * At this point we need to send EXIT events to cpu contexts.
12882          */
12883         perf_event_task(child, NULL, 0);
12884 }
12885
12886 static void perf_free_event(struct perf_event *event,
12887                             struct perf_event_context *ctx)
12888 {
12889         struct perf_event *parent = event->parent;
12890
12891         if (WARN_ON_ONCE(!parent))
12892                 return;
12893
12894         mutex_lock(&parent->child_mutex);
12895         list_del_init(&event->child_list);
12896         mutex_unlock(&parent->child_mutex);
12897
12898         put_event(parent);
12899
12900         raw_spin_lock_irq(&ctx->lock);
12901         perf_group_detach(event);
12902         list_del_event(event, ctx);
12903         raw_spin_unlock_irq(&ctx->lock);
12904         free_event(event);
12905 }
12906
12907 /*
12908  * Free a context as created by inheritance by perf_event_init_task() below,
12909  * used by fork() in case of fail.
12910  *
12911  * Even though the task has never lived, the context and events have been
12912  * exposed through the child_list, so we must take care tearing it all down.
12913  */
12914 void perf_event_free_task(struct task_struct *task)
12915 {
12916         struct perf_event_context *ctx;
12917         struct perf_event *event, *tmp;
12918         int ctxn;
12919
12920         for_each_task_context_nr(ctxn) {
12921                 ctx = task->perf_event_ctxp[ctxn];
12922                 if (!ctx)
12923                         continue;
12924
12925                 mutex_lock(&ctx->mutex);
12926                 raw_spin_lock_irq(&ctx->lock);
12927                 /*
12928                  * Destroy the task <-> ctx relation and mark the context dead.
12929                  *
12930                  * This is important because even though the task hasn't been
12931                  * exposed yet the context has been (through child_list).
12932                  */
12933                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
12934                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
12935                 put_task_struct(task); /* cannot be last */
12936                 raw_spin_unlock_irq(&ctx->lock);
12937
12938                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
12939                         perf_free_event(event, ctx);
12940
12941                 mutex_unlock(&ctx->mutex);
12942
12943                 /*
12944                  * perf_event_release_kernel() could've stolen some of our
12945                  * child events and still have them on its free_list. In that
12946                  * case we must wait for these events to have been freed (in
12947                  * particular all their references to this task must've been
12948                  * dropped).
12949                  *
12950                  * Without this copy_process() will unconditionally free this
12951                  * task (irrespective of its reference count) and
12952                  * _free_event()'s put_task_struct(event->hw.target) will be a
12953                  * use-after-free.
12954                  *
12955                  * Wait for all events to drop their context reference.
12956                  */
12957                 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
12958                 put_ctx(ctx); /* must be last */
12959         }
12960 }
12961
12962 void perf_event_delayed_put(struct task_struct *task)
12963 {
12964         int ctxn;
12965
12966         for_each_task_context_nr(ctxn)
12967                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
12968 }
12969
12970 struct file *perf_event_get(unsigned int fd)
12971 {
12972         struct file *file = fget(fd);
12973         if (!file)
12974                 return ERR_PTR(-EBADF);
12975
12976         if (file->f_op != &perf_fops) {
12977                 fput(file);
12978                 return ERR_PTR(-EBADF);
12979         }
12980
12981         return file;
12982 }
12983
12984 const struct perf_event *perf_get_event(struct file *file)
12985 {
12986         if (file->f_op != &perf_fops)
12987                 return ERR_PTR(-EINVAL);
12988
12989         return file->private_data;
12990 }
12991
12992 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
12993 {
12994         if (!event)
12995                 return ERR_PTR(-EINVAL);
12996
12997         return &event->attr;
12998 }
12999
13000 /*
13001  * Inherit an event from parent task to child task.
13002  *
13003  * Returns:
13004  *  - valid pointer on success
13005  *  - NULL for orphaned events
13006  *  - IS_ERR() on error
13007  */
13008 static struct perf_event *
13009 inherit_event(struct perf_event *parent_event,
13010               struct task_struct *parent,
13011               struct perf_event_context *parent_ctx,
13012               struct task_struct *child,
13013               struct perf_event *group_leader,
13014               struct perf_event_context *child_ctx)
13015 {
13016         enum perf_event_state parent_state = parent_event->state;
13017         struct perf_event *child_event;
13018         unsigned long flags;
13019
13020         /*
13021          * Instead of creating recursive hierarchies of events,
13022          * we link inherited events back to the original parent,
13023          * which has a filp for sure, which we use as the reference
13024          * count:
13025          */
13026         if (parent_event->parent)
13027                 parent_event = parent_event->parent;
13028
13029         child_event = perf_event_alloc(&parent_event->attr,
13030                                            parent_event->cpu,
13031                                            child,
13032                                            group_leader, parent_event,
13033                                            NULL, NULL, -1);
13034         if (IS_ERR(child_event))
13035                 return child_event;
13036
13037
13038         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
13039             !child_ctx->task_ctx_data) {
13040                 struct pmu *pmu = child_event->pmu;
13041
13042                 child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
13043                 if (!child_ctx->task_ctx_data) {
13044                         free_event(child_event);
13045                         return ERR_PTR(-ENOMEM);
13046                 }
13047         }
13048
13049         /*
13050          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
13051          * must be under the same lock in order to serialize against
13052          * perf_event_release_kernel(), such that either we must observe
13053          * is_orphaned_event() or they will observe us on the child_list.
13054          */
13055         mutex_lock(&parent_event->child_mutex);
13056         if (is_orphaned_event(parent_event) ||
13057             !atomic_long_inc_not_zero(&parent_event->refcount)) {
13058                 mutex_unlock(&parent_event->child_mutex);
13059                 /* task_ctx_data is freed with child_ctx */
13060                 free_event(child_event);
13061                 return NULL;
13062         }
13063
13064         get_ctx(child_ctx);
13065
13066         /*
13067          * Make the child state follow the state of the parent event,
13068          * not its attr.disabled bit.  We hold the parent's mutex,
13069          * so we won't race with perf_event_{en, dis}able_family.
13070          */
13071         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
13072                 child_event->state = PERF_EVENT_STATE_INACTIVE;
13073         else
13074                 child_event->state = PERF_EVENT_STATE_OFF;
13075
13076         if (parent_event->attr.freq) {
13077                 u64 sample_period = parent_event->hw.sample_period;
13078                 struct hw_perf_event *hwc = &child_event->hw;
13079
13080                 hwc->sample_period = sample_period;
13081                 hwc->last_period   = sample_period;
13082
13083                 local64_set(&hwc->period_left, sample_period);
13084         }
13085
13086         child_event->ctx = child_ctx;
13087         child_event->overflow_handler = parent_event->overflow_handler;
13088         child_event->overflow_handler_context
13089                 = parent_event->overflow_handler_context;
13090
13091         /*
13092          * Precalculate sample_data sizes
13093          */
13094         perf_event__header_size(child_event);
13095         perf_event__id_header_size(child_event);
13096
13097         /*
13098          * Link it up in the child's context:
13099          */
13100         raw_spin_lock_irqsave(&child_ctx->lock, flags);
13101         add_event_to_ctx(child_event, child_ctx);
13102         child_event->attach_state |= PERF_ATTACH_CHILD;
13103         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
13104
13105         /*
13106          * Link this into the parent event's child list
13107          */
13108         list_add_tail(&child_event->child_list, &parent_event->child_list);
13109         mutex_unlock(&parent_event->child_mutex);
13110
13111         return child_event;
13112 }
13113
13114 /*
13115  * Inherits an event group.
13116  *
13117  * This will quietly suppress orphaned events; !inherit_event() is not an error.
13118  * This matches with perf_event_release_kernel() removing all child events.
13119  *
13120  * Returns:
13121  *  - 0 on success
13122  *  - <0 on error
13123  */
13124 static int inherit_group(struct perf_event *parent_event,
13125               struct task_struct *parent,
13126               struct perf_event_context *parent_ctx,
13127               struct task_struct *child,
13128               struct perf_event_context *child_ctx)
13129 {
13130         struct perf_event *leader;
13131         struct perf_event *sub;
13132         struct perf_event *child_ctr;
13133
13134         leader = inherit_event(parent_event, parent, parent_ctx,
13135                                  child, NULL, child_ctx);
13136         if (IS_ERR(leader))
13137                 return PTR_ERR(leader);
13138         /*
13139          * @leader can be NULL here because of is_orphaned_event(). In this
13140          * case inherit_event() will create individual events, similar to what
13141          * perf_group_detach() would do anyway.
13142          */
13143         for_each_sibling_event(sub, parent_event) {
13144                 child_ctr = inherit_event(sub, parent, parent_ctx,
13145                                             child, leader, child_ctx);
13146                 if (IS_ERR(child_ctr))
13147                         return PTR_ERR(child_ctr);
13148
13149                 if (sub->aux_event == parent_event && child_ctr &&
13150                     !perf_get_aux_event(child_ctr, leader))
13151                         return -EINVAL;
13152         }
13153         return 0;
13154 }
13155
13156 /*
13157  * Creates the child task context and tries to inherit the event-group.
13158  *
13159  * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
13160  * inherited_all set when we 'fail' to inherit an orphaned event; this is
13161  * consistent with perf_event_release_kernel() removing all child events.
13162  *
13163  * Returns:
13164  *  - 0 on success
13165  *  - <0 on error
13166  */
13167 static int
13168 inherit_task_group(struct perf_event *event, struct task_struct *parent,
13169                    struct perf_event_context *parent_ctx,
13170                    struct task_struct *child, int ctxn,
13171                    u64 clone_flags, int *inherited_all)
13172 {
13173         int ret;
13174         struct perf_event_context *child_ctx;
13175
13176         if (!event->attr.inherit ||
13177             (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
13178             /* Do not inherit if sigtrap and signal handlers were cleared. */
13179             (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
13180                 *inherited_all = 0;
13181                 return 0;
13182         }
13183
13184         child_ctx = child->perf_event_ctxp[ctxn];
13185         if (!child_ctx) {
13186                 /*
13187                  * This is executed from the parent task context, so
13188                  * inherit events that have been marked for cloning.
13189                  * First allocate and initialize a context for the
13190                  * child.
13191                  */
13192                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
13193                 if (!child_ctx)
13194                         return -ENOMEM;
13195
13196                 child->perf_event_ctxp[ctxn] = child_ctx;
13197         }
13198
13199         ret = inherit_group(event, parent, parent_ctx,
13200                             child, child_ctx);
13201
13202         if (ret)
13203                 *inherited_all = 0;
13204
13205         return ret;
13206 }
13207
13208 /*
13209  * Initialize the perf_event context in task_struct
13210  */
13211 static int perf_event_init_context(struct task_struct *child, int ctxn,
13212                                    u64 clone_flags)
13213 {
13214         struct perf_event_context *child_ctx, *parent_ctx;
13215         struct perf_event_context *cloned_ctx;
13216         struct perf_event *event;
13217         struct task_struct *parent = current;
13218         int inherited_all = 1;
13219         unsigned long flags;
13220         int ret = 0;
13221
13222         if (likely(!parent->perf_event_ctxp[ctxn]))
13223                 return 0;
13224
13225         /*
13226          * If the parent's context is a clone, pin it so it won't get
13227          * swapped under us.
13228          */
13229         parent_ctx = perf_pin_task_context(parent, ctxn);
13230         if (!parent_ctx)
13231                 return 0;
13232
13233         /*
13234          * No need to check if parent_ctx != NULL here; since we saw
13235          * it non-NULL earlier, the only reason for it to become NULL
13236          * is if we exit, and since we're currently in the middle of
13237          * a fork we can't be exiting at the same time.
13238          */
13239
13240         /*
13241          * Lock the parent list. No need to lock the child - not PID
13242          * hashed yet and not running, so nobody can access it.
13243          */
13244         mutex_lock(&parent_ctx->mutex);
13245
13246         /*
13247          * We dont have to disable NMIs - we are only looking at
13248          * the list, not manipulating it:
13249          */
13250         perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
13251                 ret = inherit_task_group(event, parent, parent_ctx,
13252                                          child, ctxn, clone_flags,
13253                                          &inherited_all);
13254                 if (ret)
13255                         goto out_unlock;
13256         }
13257
13258         /*
13259          * We can't hold ctx->lock when iterating the ->flexible_group list due
13260          * to allocations, but we need to prevent rotation because
13261          * rotate_ctx() will change the list from interrupt context.
13262          */
13263         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
13264         parent_ctx->rotate_disable = 1;
13265         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
13266
13267         perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
13268                 ret = inherit_task_group(event, parent, parent_ctx,
13269                                          child, ctxn, clone_flags,
13270                                          &inherited_all);
13271                 if (ret)
13272                         goto out_unlock;
13273         }
13274
13275         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
13276         parent_ctx->rotate_disable = 0;
13277
13278         child_ctx = child->perf_event_ctxp[ctxn];
13279
13280         if (child_ctx && inherited_all) {
13281                 /*
13282                  * Mark the child context as a clone of the parent
13283                  * context, or of whatever the parent is a clone of.
13284                  *
13285                  * Note that if the parent is a clone, the holding of
13286                  * parent_ctx->lock avoids it from being uncloned.
13287                  */
13288                 cloned_ctx = parent_ctx->parent_ctx;
13289                 if (cloned_ctx) {
13290                         child_ctx->parent_ctx = cloned_ctx;
13291                         child_ctx->parent_gen = parent_ctx->parent_gen;
13292                 } else {
13293                         child_ctx->parent_ctx = parent_ctx;
13294                         child_ctx->parent_gen = parent_ctx->generation;
13295                 }
13296                 get_ctx(child_ctx->parent_ctx);
13297         }
13298
13299         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
13300 out_unlock:
13301         mutex_unlock(&parent_ctx->mutex);
13302
13303         perf_unpin_context(parent_ctx);
13304         put_ctx(parent_ctx);
13305
13306         return ret;
13307 }
13308
13309 /*
13310  * Initialize the perf_event context in task_struct
13311  */
13312 int perf_event_init_task(struct task_struct *child, u64 clone_flags)
13313 {
13314         int ctxn, ret;
13315
13316         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
13317         mutex_init(&child->perf_event_mutex);
13318         INIT_LIST_HEAD(&child->perf_event_list);
13319
13320         for_each_task_context_nr(ctxn) {
13321                 ret = perf_event_init_context(child, ctxn, clone_flags);
13322                 if (ret) {
13323                         perf_event_free_task(child);
13324                         return ret;
13325                 }
13326         }
13327
13328         return 0;
13329 }
13330
13331 static void __init perf_event_init_all_cpus(void)
13332 {
13333         struct swevent_htable *swhash;
13334         int cpu;
13335
13336         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
13337
13338         for_each_possible_cpu(cpu) {
13339                 swhash = &per_cpu(swevent_htable, cpu);
13340                 mutex_init(&swhash->hlist_mutex);
13341                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
13342
13343                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
13344                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
13345
13346 #ifdef CONFIG_CGROUP_PERF
13347                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
13348 #endif
13349                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
13350         }
13351 }
13352
13353 static void perf_swevent_init_cpu(unsigned int cpu)
13354 {
13355         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
13356
13357         mutex_lock(&swhash->hlist_mutex);
13358         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
13359                 struct swevent_hlist *hlist;
13360
13361                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
13362                 WARN_ON(!hlist);
13363                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
13364         }
13365         mutex_unlock(&swhash->hlist_mutex);
13366 }
13367
13368 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
13369 static void __perf_event_exit_context(void *__info)
13370 {
13371         struct perf_event_context *ctx = __info;
13372         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
13373         struct perf_event *event;
13374
13375         raw_spin_lock(&ctx->lock);
13376         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
13377         list_for_each_entry(event, &ctx->event_list, event_entry)
13378                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
13379         raw_spin_unlock(&ctx->lock);
13380 }
13381
13382 static void perf_event_exit_cpu_context(int cpu)
13383 {
13384         struct perf_cpu_context *cpuctx;
13385         struct perf_event_context *ctx;
13386         struct pmu *pmu;
13387
13388         mutex_lock(&pmus_lock);
13389         list_for_each_entry(pmu, &pmus, entry) {
13390                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
13391                 ctx = &cpuctx->ctx;
13392
13393                 mutex_lock(&ctx->mutex);
13394                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
13395                 cpuctx->online = 0;
13396                 mutex_unlock(&ctx->mutex);
13397         }
13398         cpumask_clear_cpu(cpu, perf_online_mask);
13399         mutex_unlock(&pmus_lock);
13400 }
13401 #else
13402
13403 static void perf_event_exit_cpu_context(int cpu) { }
13404
13405 #endif
13406
13407 int perf_event_init_cpu(unsigned int cpu)
13408 {
13409         struct perf_cpu_context *cpuctx;
13410         struct perf_event_context *ctx;
13411         struct pmu *pmu;
13412
13413         perf_swevent_init_cpu(cpu);
13414
13415         mutex_lock(&pmus_lock);
13416         cpumask_set_cpu(cpu, perf_online_mask);
13417         list_for_each_entry(pmu, &pmus, entry) {
13418                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
13419                 ctx = &cpuctx->ctx;
13420
13421                 mutex_lock(&ctx->mutex);
13422                 cpuctx->online = 1;
13423                 mutex_unlock(&ctx->mutex);
13424         }
13425         mutex_unlock(&pmus_lock);
13426
13427         return 0;
13428 }
13429
13430 int perf_event_exit_cpu(unsigned int cpu)
13431 {
13432         perf_event_exit_cpu_context(cpu);
13433         return 0;
13434 }
13435
13436 static int
13437 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
13438 {
13439         int cpu;
13440
13441         for_each_online_cpu(cpu)
13442                 perf_event_exit_cpu(cpu);
13443
13444         return NOTIFY_OK;
13445 }
13446
13447 /*
13448  * Run the perf reboot notifier at the very last possible moment so that
13449  * the generic watchdog code runs as long as possible.
13450  */
13451 static struct notifier_block perf_reboot_notifier = {
13452         .notifier_call = perf_reboot,
13453         .priority = INT_MIN,
13454 };
13455
13456 void __init perf_event_init(void)
13457 {
13458         int ret;
13459
13460         idr_init(&pmu_idr);
13461
13462         perf_event_init_all_cpus();
13463         init_srcu_struct(&pmus_srcu);
13464         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
13465         perf_pmu_register(&perf_cpu_clock, NULL, -1);
13466         perf_pmu_register(&perf_task_clock, NULL, -1);
13467         perf_tp_register();
13468         perf_event_init_cpu(smp_processor_id());
13469         register_reboot_notifier(&perf_reboot_notifier);
13470
13471         ret = init_hw_breakpoint();
13472         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
13473
13474         perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
13475
13476         /*
13477          * Build time assertion that we keep the data_head at the intended
13478          * location.  IOW, validation we got the __reserved[] size right.
13479          */
13480         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
13481                      != 1024);
13482 }
13483
13484 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
13485                               char *page)
13486 {
13487         struct perf_pmu_events_attr *pmu_attr =
13488                 container_of(attr, struct perf_pmu_events_attr, attr);
13489
13490         if (pmu_attr->event_str)
13491                 return sprintf(page, "%s\n", pmu_attr->event_str);
13492
13493         return 0;
13494 }
13495 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
13496
13497 static int __init perf_event_sysfs_init(void)
13498 {
13499         struct pmu *pmu;
13500         int ret;
13501
13502         mutex_lock(&pmus_lock);
13503
13504         ret = bus_register(&pmu_bus);
13505         if (ret)
13506                 goto unlock;
13507
13508         list_for_each_entry(pmu, &pmus, entry) {
13509                 if (!pmu->name || pmu->type < 0)
13510                         continue;
13511
13512                 ret = pmu_dev_alloc(pmu);
13513                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
13514         }
13515         pmu_bus_running = 1;
13516         ret = 0;
13517
13518 unlock:
13519         mutex_unlock(&pmus_lock);
13520
13521         return ret;
13522 }
13523 device_initcall(perf_event_sysfs_init);
13524
13525 #ifdef CONFIG_CGROUP_PERF
13526 static struct cgroup_subsys_state *
13527 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
13528 {
13529         struct perf_cgroup *jc;
13530
13531         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
13532         if (!jc)
13533                 return ERR_PTR(-ENOMEM);
13534
13535         jc->info = alloc_percpu(struct perf_cgroup_info);
13536         if (!jc->info) {
13537                 kfree(jc);
13538                 return ERR_PTR(-ENOMEM);
13539         }
13540
13541         return &jc->css;
13542 }
13543
13544 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
13545 {
13546         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
13547
13548         free_percpu(jc->info);
13549         kfree(jc);
13550 }
13551
13552 static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13553 {
13554         perf_event_cgroup(css->cgroup);
13555         return 0;
13556 }
13557
13558 static int __perf_cgroup_move(void *info)
13559 {
13560         struct task_struct *task = info;
13561         rcu_read_lock();
13562         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
13563         rcu_read_unlock();
13564         return 0;
13565 }
13566
13567 static void perf_cgroup_attach(struct cgroup_taskset *tset)
13568 {
13569         struct task_struct *task;
13570         struct cgroup_subsys_state *css;
13571
13572         cgroup_taskset_for_each(task, css, tset)
13573                 task_function_call(task, __perf_cgroup_move, task);
13574 }
13575
13576 struct cgroup_subsys perf_event_cgrp_subsys = {
13577         .css_alloc      = perf_cgroup_css_alloc,
13578         .css_free       = perf_cgroup_css_free,
13579         .css_online     = perf_cgroup_css_online,
13580         .attach         = perf_cgroup_attach,
13581         /*
13582          * Implicitly enable on dfl hierarchy so that perf events can
13583          * always be filtered by cgroup2 path as long as perf_event
13584          * controller is not mounted on a legacy hierarchy.
13585          */
13586         .implicit_on_dfl = true,
13587         .threaded       = true,
13588 };
13589 #endif /* CONFIG_CGROUP_PERF */
13590
13591 DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);