arch/x86/kernel/cpu/perf_event_intel_rapl.c

   1 /*
   2  * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
   3  * Copyright (C) 2013 Google, Inc., Stephane Eranian
   4  *
   5  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
   6  * section 14.7.1 (September 2013)
   7  *
   8  * RAPL provides more controls than just reporting energy consumption
    9  * however here we only expose the 4 energy consumption free running
   10  * counters (pp0, pkg, dram, pp1/gpu).
  11  *
  12  * Each of those counters increments in a power unit defined by the
  13  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
  14  * but it can vary.
  15  *
  16  * Counter to rapl events mappings:
  17  *
  18  *  pp0 counter: consumption of all physical cores (power plane 0)
  19  *        event: rapl_energy_cores
  20  *    perf code: 0x1
  21  *
  22  *  pkg counter: consumption of the whole processor package
  23  *        event: rapl_energy_pkg
  24  *    perf code: 0x2
  25  *
  26  * dram counter: consumption of the dram domain (servers only)
  27  *        event: rapl_energy_dram
  28  *    perf code: 0x3
  29  *
   30  *  pp1 counter: consumption of the builtin-gpu domain (client only)
  31  *        event: rapl_energy_gpu
  32  *    perf code: 0x4
  33  *
   34  * We manage those counters as free running (read-only). They may be
   35  * used simultaneously by other tools, such as turbostat.
  36  *
  37  * The events only support system-wide mode counting. There is no
  38  * sampling support because it does not make sense and is not
  39  * supported by the RAPL hardware.
  40  *
  41  * Because we want to avoid floating-point operations in the kernel,
  42  * the events are all reported in fixed point arithmetic (32.32).
  43  * Tools must adjust the counts to convert them to Watts using
  44  * the duration of the measurement. Tools may use a function such as
  45  * ldexp(raw_count, -32);
  46  */
  47 #include <linux/module.h>
  48 #include <linux/slab.h>
  49 #include <linux/perf_event.h>
  50 #include <asm/cpu_device_id.h>
  51 #include "perf_event.h"
  52
  53 /*
  54  * RAPL energy status counters
  55  */
  56 #define RAPL_IDX_PP0_NRG_STAT   0       /* all cores */
  57 #define INTEL_RAPL_PP0          0x1     /* pseudo-encoding */
  58 #define RAPL_IDX_PKG_NRG_STAT   1       /* entire package */
  59 #define INTEL_RAPL_PKG          0x2     /* pseudo-encoding */
  60 #define RAPL_IDX_RAM_NRG_STAT   2       /* DRAM */
  61 #define INTEL_RAPL_RAM          0x3     /* pseudo-encoding */
  62 #define RAPL_IDX_PP1_NRG_STAT   3       /* gpu */
  63 #define INTEL_RAPL_PP1          0x4     /* pseudo-encoding */
  64
  65 /* Clients have PP0, PKG */
  66 #define RAPL_IDX_CLN    (1<<RAPL_IDX_PP0_NRG_STAT|\
  67                          1<<RAPL_IDX_PKG_NRG_STAT|\
  68                          1<<RAPL_IDX_PP1_NRG_STAT)
  69
  70 /* Servers have PP0, PKG, RAM */
  71 #define RAPL_IDX_SRV    (1<<RAPL_IDX_PP0_NRG_STAT|\
  72                          1<<RAPL_IDX_PKG_NRG_STAT|\
  73                          1<<RAPL_IDX_RAM_NRG_STAT)
  74
  75 /* Servers have PP0, PKG, RAM, PP1 */
  76 #define RAPL_IDX_HSW    (1<<RAPL_IDX_PP0_NRG_STAT|\
  77                          1<<RAPL_IDX_PKG_NRG_STAT|\
  78                          1<<RAPL_IDX_RAM_NRG_STAT|\
  79                          1<<RAPL_IDX_PP1_NRG_STAT)
  80
  81 /*
  82  * event code: LSB 8 bits, passed in attr->config
  83  * any other bit is reserved
  84  */
  85 #define RAPL_EVENT_MASK 0xFFULL
  86
  87 #define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)           \
  88 static ssize_t __rapl_##_var##_show(struct kobject *kobj,       \
  89                                 struct kobj_attribute *attr,    \
  90                                 char *page)                     \
  91 {                                                               \
  92         BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
  93         return sprintf(page, _format "\n");                     \
  94 }                                                               \
  95 static struct kobj_attribute format_attr_##_var =               \
  96         __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
  97
  98 #define RAPL_EVENT_DESC(_name, _config)                         \
  99 {                                                               \
 100         .attr   = __ATTR(_name, 0444, rapl_event_show, NULL),   \
 101         .config = _config,                                      \
 102 }
 103
 104 #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
 105
 106 struct rapl_pmu {
 107         spinlock_t       lock;
 108         int              hw_unit;  /* 1/2^hw_unit Joule */
 109         int              n_active; /* number of active events */
 110         struct list_head active_list;
 111         struct pmu       *pmu; /* pointer to rapl_pmu_class */
 112         ktime_t          timer_interval; /* in ktime_t unit */
 113         struct hrtimer   hrtimer;
 114 };
 115
 116 static struct pmu rapl_pmu_class;
 117 static cpumask_t rapl_cpu_mask;
 118 static int rapl_cntr_mask;
 119
 120 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
 121 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
 122
 123 static inline u64 rapl_read_counter(struct perf_event *event)
 124 {
 125         u64 raw;
 126         rdmsrl(event->hw.event_base, raw);
 127         return raw;
 128 }
 129
 130 static inline u64 rapl_scale(u64 v)
 131 {
 132         /*
 133          * scale delta to smallest unit (1/2^32)
 134          * users must then scale back: count * 1/(1e9*2^32) to get Joules
 135          * or use ldexp(count, -32).
 136          * Watts = Joules/Time delta
 137          */
 138         return v << (32 - __this_cpu_read(rapl_pmu->hw_unit));
 139 }
 140
 141 static u64 rapl_event_update(struct perf_event *event)
 142 {
 143         struct hw_perf_event *hwc = &event->hw;
 144         u64 prev_raw_count, new_raw_count;
 145         s64 delta, sdelta;
 146         int shift = RAPL_CNTR_WIDTH;
 147
 148 again:
 149         prev_raw_count = local64_read(&hwc->prev_count);
 150         rdmsrl(event->hw.event_base, new_raw_count);
 151
 152         if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 153                             new_raw_count) != prev_raw_count) {
 154                 cpu_relax();
 155                 goto again;
 156         }
 157
 158         /*
 159          * Now we have the new raw value and have updated the prev
 160          * timestamp already. We can now calculate the elapsed delta
 161          * (event-)time and add that to the generic event.
 162          *
 163          * Careful, not all hw sign-extends above the physical width
 164          * of the count.
 165          */
 166         delta = (new_raw_count << shift) - (prev_raw_count << shift);
 167         delta >>= shift;
 168
 169         sdelta = rapl_scale(delta);
 170
 171         local64_add(sdelta, &event->count);
 172
 173         return new_raw_count;
 174 }
 175
 176 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 177 {
 178         __hrtimer_start_range_ns(&pmu->hrtimer,
 179                         pmu->timer_interval, 0,
 180                         HRTIMER_MODE_REL_PINNED, 0);
 181 }
 182
 183 static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
 184 {
 185         hrtimer_cancel(&pmu->hrtimer);
 186 }
 187
 188 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
 189 {
 190         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
 191         struct perf_event *event;
 192         unsigned long flags;
 193
 194         if (!pmu->n_active)
 195                 return HRTIMER_NORESTART;
 196
 197         spin_lock_irqsave(&pmu->lock, flags);
 198
 199         list_for_each_entry(event, &pmu->active_list, active_entry) {
 200                 rapl_event_update(event);
 201         }
 202
 203         spin_unlock_irqrestore(&pmu->lock, flags);
 204
 205         hrtimer_forward_now(hrtimer, pmu->timer_interval);
 206
 207         return HRTIMER_RESTART;
 208 }
 209
 210 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
 211 {
 212         struct hrtimer *hr = &pmu->hrtimer;
 213
 214         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 215         hr->function = rapl_hrtimer_handle;
 216 }
 217
 218 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 219                                    struct perf_event *event)
 220 {
 221         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
 222                 return;
 223
 224         event->hw.state = 0;
 225
 226         list_add_tail(&event->active_entry, &pmu->active_list);
 227
 228         local64_set(&event->hw.prev_count, rapl_read_counter(event));
 229
 230         pmu->n_active++;
 231         if (pmu->n_active == 1)
 232                 rapl_start_hrtimer(pmu);
 233 }
 234
 235 static void rapl_pmu_event_start(struct perf_event *event, int mode)
 236 {
 237         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
 238         unsigned long flags;
 239
 240         spin_lock_irqsave(&pmu->lock, flags);
 241         __rapl_pmu_event_start(pmu, event);
 242         spin_unlock_irqrestore(&pmu->lock, flags);
 243 }
 244
 245 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 246 {
 247         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
 248         struct hw_perf_event *hwc = &event->hw;
 249         unsigned long flags;
 250
 251         spin_lock_irqsave(&pmu->lock, flags);
 252
 253         /* mark event as deactivated and stopped */
 254         if (!(hwc->state & PERF_HES_STOPPED)) {
 255                 WARN_ON_ONCE(pmu->n_active <= 0);
 256                 pmu->n_active--;
 257                 if (pmu->n_active == 0)
 258                         rapl_stop_hrtimer(pmu);
 259
 260                 list_del(&event->active_entry);
 261
 262                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 263                 hwc->state |= PERF_HES_STOPPED;
 264         }
 265
 266         /* check if update of sw counter is necessary */
 267         if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
 268                 /*
 269                  * Drain the remaining delta count out of a event
 270                  * that we are disabling:
 271                  */
 272                 rapl_event_update(event);
 273                 hwc->state |= PERF_HES_UPTODATE;
 274         }
 275
 276         spin_unlock_irqrestore(&pmu->lock, flags);
 277 }
 278
 279 static int rapl_pmu_event_add(struct perf_event *event, int mode)
 280 {
 281         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
 282         struct hw_perf_event *hwc = &event->hw;
 283         unsigned long flags;
 284
 285         spin_lock_irqsave(&pmu->lock, flags);
 286
 287         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 288
 289         if (mode & PERF_EF_START)
 290                 __rapl_pmu_event_start(pmu, event);
 291
 292         spin_unlock_irqrestore(&pmu->lock, flags);
 293
 294         return 0;
 295 }
 296
 297 static void rapl_pmu_event_del(struct perf_event *event, int flags)
 298 {
 299         rapl_pmu_event_stop(event, PERF_EF_UPDATE);
 300 }
 301
 302 static int rapl_pmu_event_init(struct perf_event *event)
 303 {
 304         u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 305         int bit, msr, ret = 0;
 306
 307         /* only look at RAPL events */
 308         if (event->attr.type != rapl_pmu_class.type)
 309                 return -ENOENT;
 310
 311         /* check only supported bits are set */
 312         if (event->attr.config & ~RAPL_EVENT_MASK)
 313                 return -EINVAL;
 314
 315         /*
 316          * check event is known (determines counter)
 317          */
 318         switch (cfg) {
 319         case INTEL_RAPL_PP0:
 320                 bit = RAPL_IDX_PP0_NRG_STAT;
 321                 msr = MSR_PP0_ENERGY_STATUS;
 322                 break;
 323         case INTEL_RAPL_PKG:
 324                 bit = RAPL_IDX_PKG_NRG_STAT;
 325                 msr = MSR_PKG_ENERGY_STATUS;
 326                 break;
 327         case INTEL_RAPL_RAM:
 328                 bit = RAPL_IDX_RAM_NRG_STAT;
 329                 msr = MSR_DRAM_ENERGY_STATUS;
 330                 break;
 331         case INTEL_RAPL_PP1:
 332                 bit = RAPL_IDX_PP1_NRG_STAT;
 333                 msr = MSR_PP1_ENERGY_STATUS;
 334                 break;
 335         default:
 336                 return -EINVAL;
 337         }
 338         /* check event supported */
 339         if (!(rapl_cntr_mask & (1 << bit)))
 340                 return -EINVAL;
 341
 342         /* unsupported modes and filters */
 343         if (event->attr.exclude_user   ||
 344             event->attr.exclude_kernel ||
 345             event->attr.exclude_hv     ||
 346             event->attr.exclude_idle   ||
 347             event->attr.exclude_host   ||
 348             event->attr.exclude_guest  ||
 349             event->attr.sample_period) /* no sampling */
 350                 return -EINVAL;
 351
 352         /* must be done before validate_group */
 353         event->hw.event_base = msr;
 354         event->hw.config = cfg;
 355         event->hw.idx = bit;
 356
 357         return ret;
 358 }
 359
 360 static void rapl_pmu_event_read(struct perf_event *event)
 361 {
 362         rapl_event_update(event);
 363 }
 364
 365 static ssize_t rapl_get_attr_cpumask(struct device *dev,
 366                                 struct device_attribute *attr, char *buf)
 367 {
 368         return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
 369 }
 370
 371 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
 372
 373 static struct attribute *rapl_pmu_attrs[] = {
 374         &dev_attr_cpumask.attr,
 375         NULL,
 376 };
 377
 378 static struct attribute_group rapl_pmu_attr_group = {
 379         .attrs = rapl_pmu_attrs,
 380 };
 381
 382 EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 383 EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 384 EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
 385 EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 386
 387 EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
 388 EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
 389 EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
 390 EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 391
 392 /*
 393  * we compute in 0.23 nJ increments regardless of MSR
 394  */
 395 EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
 396 EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
 397 EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
 398 EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 399
 400 static struct attribute *rapl_events_srv_attr[] = {
 401         EVENT_PTR(rapl_cores),
 402         EVENT_PTR(rapl_pkg),
 403         EVENT_PTR(rapl_ram),
 404
 405         EVENT_PTR(rapl_cores_unit),
 406         EVENT_PTR(rapl_pkg_unit),
 407         EVENT_PTR(rapl_ram_unit),
 408
 409         EVENT_PTR(rapl_cores_scale),
 410         EVENT_PTR(rapl_pkg_scale),
 411         EVENT_PTR(rapl_ram_scale),
 412         NULL,
 413 };
 414
 415 static struct attribute *rapl_events_cln_attr[] = {
 416         EVENT_PTR(rapl_cores),
 417         EVENT_PTR(rapl_pkg),
 418         EVENT_PTR(rapl_gpu),
 419
 420         EVENT_PTR(rapl_cores_unit),
 421         EVENT_PTR(rapl_pkg_unit),
 422         EVENT_PTR(rapl_gpu_unit),
 423
 424         EVENT_PTR(rapl_cores_scale),
 425         EVENT_PTR(rapl_pkg_scale),
 426         EVENT_PTR(rapl_gpu_scale),
 427         NULL,
 428 };
 429
 430 static struct attribute *rapl_events_hsw_attr[] = {
 431         EVENT_PTR(rapl_cores),
 432         EVENT_PTR(rapl_pkg),
 433         EVENT_PTR(rapl_gpu),
 434         EVENT_PTR(rapl_ram),
 435
 436         EVENT_PTR(rapl_cores_unit),
 437         EVENT_PTR(rapl_pkg_unit),
 438         EVENT_PTR(rapl_gpu_unit),
 439         EVENT_PTR(rapl_ram_unit),
 440
 441         EVENT_PTR(rapl_cores_scale),
 442         EVENT_PTR(rapl_pkg_scale),
 443         EVENT_PTR(rapl_gpu_scale),
 444         EVENT_PTR(rapl_ram_scale),
 445         NULL,
 446 };
 447
 448 static struct attribute_group rapl_pmu_events_group = {
 449         .name = "events",
 450         .attrs = NULL, /* patched at runtime */
 451 };
 452
 453 DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
 454 static struct attribute *rapl_formats_attr[] = {
 455         &format_attr_event.attr,
 456         NULL,
 457 };
 458
 459 static struct attribute_group rapl_pmu_format_group = {
 460         .name = "format",
 461         .attrs = rapl_formats_attr,
 462 };
 463
 464 const struct attribute_group *rapl_attr_groups[] = {
 465         &rapl_pmu_attr_group,
 466         &rapl_pmu_format_group,
 467         &rapl_pmu_events_group,
 468         NULL,
 469 };
 470
 471 static struct pmu rapl_pmu_class = {
 472         .attr_groups    = rapl_attr_groups,
 473         .task_ctx_nr    = perf_invalid_context, /* system-wide only */
 474         .event_init     = rapl_pmu_event_init,
 475         .add            = rapl_pmu_event_add, /* must have */
 476         .del            = rapl_pmu_event_del, /* must have */
 477         .start          = rapl_pmu_event_start,
 478         .stop           = rapl_pmu_event_stop,
 479         .read           = rapl_pmu_event_read,
 480 };
 481
 482 static void rapl_cpu_exit(int cpu)
 483 {
 484         struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
 485         int i, phys_id = topology_physical_package_id(cpu);
 486         int target = -1;
 487
 488         /* find a new cpu on same package */
 489         for_each_online_cpu(i) {
 490                 if (i == cpu)
 491                         continue;
 492                 if (phys_id == topology_physical_package_id(i)) {
 493                         target = i;
 494                         break;
 495                 }
 496         }
 497         /*
 498          * clear cpu from cpumask
 499          * if was set in cpumask and still some cpu on package,
 500          * then move to new cpu
 501          */
 502         if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
 503                 cpumask_set_cpu(target, &rapl_cpu_mask);
 504
 505         WARN_ON(cpumask_empty(&rapl_cpu_mask));
 506         /*
 507          * migrate events and context to new cpu
 508          */
 509         if (target >= 0)
 510                 perf_pmu_migrate_context(pmu->pmu, cpu, target);
 511
 512         /* cancel overflow polling timer for CPU */
 513         rapl_stop_hrtimer(pmu);
 514 }
 515
 516 static void rapl_cpu_init(int cpu)
 517 {
 518         int i, phys_id = topology_physical_package_id(cpu);
 519
 520         /* check if phys_is is already covered */
 521         for_each_cpu(i, &rapl_cpu_mask) {
 522                 if (phys_id == topology_physical_package_id(i))
 523                         return;
 524         }
 525         /* was not found, so add it */
 526         cpumask_set_cpu(cpu, &rapl_cpu_mask);
 527 }
 528
 529 static int rapl_cpu_prepare(int cpu)
 530 {
 531         struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
 532         int phys_id = topology_physical_package_id(cpu);
 533         u64 ms;
 534         u64 msr_rapl_power_unit_bits;
 535
 536         if (pmu)
 537                 return 0;
 538
 539         if (phys_id < 0)
 540                 return -1;
 541
 542         /* protect rdmsrl() to handle virtualization */
 543         if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
 544                 return -1;
 545
 546         pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 547         if (!pmu)
 548                 return -1;
 549
 550         spin_lock_init(&pmu->lock);
 551
 552         INIT_LIST_HEAD(&pmu->active_list);
 553
 554         /*
 555          * grab power unit as: 1/2^unit Joules
 556          *
 557          * we cache in local PMU instance
 558          */
 559         pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 560         pmu->pmu = &rapl_pmu_class;
 561
 562         /*
 563          * use reference of 200W for scaling the timeout
 564          * to avoid missing counter overflows.
 565          * 200W = 200 Joules/sec
 566          * divide interval by 2 to avoid lockstep (2 * 100)
 567          * if hw unit is 32, then we use 2 ms 1/200/2
 568          */
 569         if (pmu->hw_unit < 32)
 570                 ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
 571         else
 572                 ms = 2;
 573
 574         pmu->timer_interval = ms_to_ktime(ms);
 575
 576         rapl_hrtimer_init(pmu);
 577
 578         /* set RAPL pmu for this cpu for now */
 579         per_cpu(rapl_pmu, cpu) = pmu;
 580         per_cpu(rapl_pmu_to_free, cpu) = NULL;
 581
 582         return 0;
 583 }
 584
 585 static void rapl_cpu_kfree(int cpu)
 586 {
 587         struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
 588
 589         kfree(pmu);
 590
 591         per_cpu(rapl_pmu_to_free, cpu) = NULL;
 592 }
 593
 594 static int rapl_cpu_dying(int cpu)
 595 {
 596         struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
 597
 598         if (!pmu)
 599                 return 0;
 600
 601         per_cpu(rapl_pmu, cpu) = NULL;
 602
 603         per_cpu(rapl_pmu_to_free, cpu) = pmu;
 604
 605         return 0;
 606 }
 607
 608 static int rapl_cpu_notifier(struct notifier_block *self,
 609                              unsigned long action, void *hcpu)
 610 {
 611         unsigned int cpu = (long)hcpu;
 612
 613         switch (action & ~CPU_TASKS_FROZEN) {
 614         case CPU_UP_PREPARE:
 615                 rapl_cpu_prepare(cpu);
 616                 break;
 617         case CPU_STARTING:
 618                 rapl_cpu_init(cpu);
 619                 break;
 620         case CPU_UP_CANCELED:
 621         case CPU_DYING:
 622                 rapl_cpu_dying(cpu);
 623                 break;
 624         case CPU_ONLINE:
 625         case CPU_DEAD:
 626                 rapl_cpu_kfree(cpu);
 627                 break;
 628         case CPU_DOWN_PREPARE:
 629                 rapl_cpu_exit(cpu);
 630                 break;
 631         default:
 632                 break;
 633         }
 634
 635         return NOTIFY_OK;
 636 }
 637
 638 static const struct x86_cpu_id rapl_cpu_match[] = {
 639         [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
 640         [1] = {},
 641 };
 642
 643 static int __init rapl_pmu_init(void)
 644 {
 645         struct rapl_pmu *pmu;
 646         int cpu, ret;
 647
 648         /*
 649          * check for Intel processor family 6
 650          */
 651         if (!x86_match_cpu(rapl_cpu_match))
 652                 return 0;
 653
 654         /* check supported CPU */
 655         switch (boot_cpu_data.x86_model) {
 656         case 42: /* Sandy Bridge */
 657         case 58: /* Ivy Bridge */
 658                 rapl_cntr_mask = RAPL_IDX_CLN;
 659                 rapl_pmu_events_group.attrs = rapl_events_cln_attr;
 660                 break;
 661         case 60: /* Haswell */
 662         case 69: /* Haswell-Celeron */
 663                 rapl_cntr_mask = RAPL_IDX_HSW;
 664                 rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
 665                 break;
 666         case 45: /* Sandy Bridge-EP */
 667         case 62: /* IvyTown */
 668                 rapl_cntr_mask = RAPL_IDX_SRV;
 669                 rapl_pmu_events_group.attrs = rapl_events_srv_attr;
 670                 break;
 671
 672         default:
 673                 /* unsupported */
 674                 return 0;
 675         }
 676
 677         cpu_notifier_register_begin();
 678
 679         for_each_online_cpu(cpu) {
 680                 ret = rapl_cpu_prepare(cpu);
 681                 if (ret)
 682                         goto out;
 683                 rapl_cpu_init(cpu);
 684         }
 685
 686         __perf_cpu_notifier(rapl_cpu_notifier);
 687
 688         ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
 689         if (WARN_ON(ret)) {
 690                 pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
 691                 cpu_notifier_register_done();
 692                 return -1;
 693         }
 694
 695         pmu = __this_cpu_read(rapl_pmu);
 696
 697         pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
 698                 " API unit is 2^-32 Joules,"
 699                 " %d fixed counters"
 700                 " %llu ms ovfl timer\n",
 701                 pmu->hw_unit,
 702                 hweight32(rapl_cntr_mask),
 703                 ktime_to_ms(pmu->timer_interval));
 704
 705 out:
 706         cpu_notifier_register_done();
 707
 708         return 0;
 709 }
 710 device_initcall(rapl_pmu_init);