arch/x86/kvm/pmu.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine -- Performance Monitoring Unit support
   4  *
   5  * Copyright 2015 Red Hat, Inc. and/or its affiliates.
   6  *
   7  * Authors:
   8  *   Avi Kivity   <avi@redhat.com>
   9  *   Gleb Natapov <gleb@redhat.com>
  10  *   Wei Huang    <wei@redhat.com>
  11  */
  12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14 #include <linux/types.h>
  15 #include <linux/kvm_host.h>
  16 #include <linux/perf_event.h>
  17 #include <linux/bsearch.h>
  18 #include <linux/sort.h>
  19 #include <asm/perf_event.h>
  20 #include <asm/cpu_device_id.h>
  21 #include "x86.h"
  22 #include "cpuid.h"
  23 #include "lapic.h"
  24 #include "pmu.h"
  25
  26 /*
      * Upper bound on the number of entries accepted in a userspace-supplied
      * PMU event filter; kvm_vm_ioctl_set_pmu_event_filter() rejects larger
      * requests with -E2BIG.  This is enough to filter the vast majority of
      * currently defined events.
      */
  27 #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
  28
     /*
      * Instance of struct x86_pmu_capability, exported to other GPL modules.
      * Nothing in this part of the file writes it.
      */
  29 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
  30 EXPORT_SYMBOL_GPL(kvm_pmu_cap);
  31
  32 /*
      * CPU models matched for Precise Distribution of Instructions Retired
      * (PDIR).  pmc_get_pebs_precise_level() consults this table for the
      * counter whose idx is 32.
      */
  33 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
  34         X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
  35         X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
  36         /* Instruction-Accurate PDIR (PDIR++) */
  37         X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
  38         {}
  39 };
  40
  41 /*
      * CPU models matched for Precise Distribution (PDist).
      * pmc_get_pebs_precise_level() consults this table for the counter
      * whose idx is 0.
      */
  42 static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
  43         X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
  44         {}
  45 };
  46
  47 /* NOTE:
  48  * - Each perf counter is defined as "struct kvm_pmc";
  49  * - There are two types of perf counters: general purpose (gp) and fixed.
  50  *   gp counters are stored in gp_counters[] and fixed counters are stored
  51  *   in fixed_counters[] respectively. Both of them are part of "struct
  52  *   kvm_pmu";
  53  * - pmu.c understands the difference between gp counters and fixed counters.
  54  *   However AMD doesn't support fixed-counters;
  55  * - There are three types of index to access perf counters (PMC):
  56  *     1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
  57  *        has MSR_K7_PERFCTRn and, for families 15H and later,
  58  *        MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
  59  *        aliased to MSR_K7_PERFCTRn.
  60  *     2. MSR Index (named idx): This is normally used by the RDPMC instruction.
  61  *        For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to access
  62  *        C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
  63  *        that it also supports fixed counters. idx can be used as an index to
  64  *        gp and fixed counters.
  65  *     3. Global PMC Index (named pmc): pmc is an index specific to PMU
  66  *        code. Each pmc, stored in kvm_pmc.idx field, is unique across
  67  *        all perf counters (both gp and fixed). The mapping relationship
  68  *        between pmc and perf counters is as the following:
  69  *        * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
  70  *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
  71  *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
  72  *          and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
  73  */
  74
  75 static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; /* file-local copy of the vendor ops, filled in by kvm_pmu_ops_update() */
  76
     /*
      * Define one static call, kvm_x86_pmu_<func>, per use of KVM_X86_PMU_OP()
      * or KVM_X86_PMU_OP_OPTIONAL() in <asm/kvm-x86-pmu-ops.h>.  Every call
      * starts out NULL and takes its type from the matching member of
      * struct kvm_pmu_ops; kvm_pmu_ops_update() installs the real targets.
      * NOTE(review): the header is not visible here; it presumably invokes
      * these macros once per op and #undefs them afterwards -- confirm.
      */
  77 #define KVM_X86_PMU_OP(func)                                         \
  78         DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,                          \
  79                                 *(((struct kvm_pmu_ops *)0)->func));
  80 #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
  81 #include <asm/kvm-x86-pmu-ops.h>
  82
  83 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
  84 {
             /*
              * Install a vendor PMU implementation: snapshot @pmu_ops into the
              * file-local kvm_pmu_ops, then point every kvm_x86_pmu_<func>
              * static call at the corresponding member of that copy.
              */
  85         memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
  86
             /*
              * Re-include the op list with the macros redefined to update the
              * static calls.  Mandatory ops additionally WARN if the vendor
              * left them NULL; optional ops are updated without the check.
              */
  87 #define __KVM_X86_PMU_OP(func) \
  88         static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
  89 #define KVM_X86_PMU_OP(func) \
  90         WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
  91 #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
  92 #include <asm/kvm-x86-pmu-ops.h>
             /* Only the local helper is dropped here. */
  93 #undef __KVM_X86_PMU_OP
  94 }
  95
  96 static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
  97 {
  98         return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
  99 }
 100
 101 static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
 102 {
 103         struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
 104         struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
 105
 106         kvm_pmu_deliver_pmi(vcpu);
 107 }
 108
 109 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 110 {
 111         struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 112         bool skip_pmi = false;
 113
 114         if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
 115                 if (!in_pmi) {
 116                         /*
 117                          * TODO: KVM is currently _choosing_ to not generate records
 118                          * for emulated instructions, avoiding BUFFER_OVF PMI when
 119                          * there are no records. Strictly speaking, it should be done
 120                          * as well in the right context to improve sampling accuracy.
 121                          */
 122                         skip_pmi = true;
 123                 } else {
 124                         /* Indicate PEBS overflow PMI to guest. */
 125                         skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
 126                                                       (unsigned long *)&pmu->global_status);
 127                 }
 128         } else {
 129                 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
 130         }
 131
 132         if (!pmc->intr || skip_pmi)
 133                 return;
 134
 135         /*
 136          * Inject PMI. If vcpu was in a guest mode during NMI PMI
 137          * can be ejected on a guest mode re-entry. Otherwise we can't
 138          * be sure that vcpu wasn't executing hlt instruction at the
 139          * time of vmexit and is not going to re-enter guest mode until
 140          * woken up. So we should wake it, but this is impossible from
 141          * NMI context. Do it from irq work instead.
 142          */
 143         if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
 144                 irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
 145         else
 146                 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
 147 }
 148
 149 static void kvm_perf_overflow(struct perf_event *perf_event,
 150                               struct perf_sample_data *data,
 151                               struct pt_regs *regs)
 152 {
 153         struct kvm_pmc *pmc = perf_event->overflow_handler_context;
 154
 155         /*
 156          * Ignore overflow events for counters that are scheduled to be
 157          * reprogrammed, e.g. if a PMI for the previous event races with KVM's
 158          * handling of a related guest WRMSR.
 159          */
 160         if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
 161                 return;
 162
 163         __kvm_perf_overflow(pmc, true);
 164
 165         kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 166 }
 167
 168 static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
 169 {
 170         /*
 171          * For some model specific pebs counters with special capabilities
 172          * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise
 173          * level to the maximum value (currently 3, backwards compatible)
 174          * so that the perf subsystem would assign specific hardware counter
 175          * with that capability for vPMC.
 176          */
 177         if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
 178             (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
 179                 return 3;
 180
 181         /*
 182          * The non-zero precision level of guest event makes the ordinary
 183          * guest event becomes a guest PEBS event and triggers the host
 184          * PEBS PMI handler to determine whether the PEBS overflow PMI
 185          * comes from the host counters or the guest.
 186          */
 187         return 1;
 188 }
 189
 190 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
 191                                  bool exclude_user, bool exclude_kernel,
 192                                  bool intr)
 193 {
 194         struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 195         struct perf_event *event;
 196         struct perf_event_attr attr = {
 197                 .type = type,
 198                 .size = sizeof(attr),
 199                 .pinned = true,
 200                 .exclude_idle = true,
 201                 .exclude_host = 1,
 202                 .exclude_user = exclude_user,
 203                 .exclude_kernel = exclude_kernel,
 204                 .config = config,
 205         };
 206         bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
 207
 208         attr.sample_period = get_sample_period(pmc, pmc->counter);
 209
 210         if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
 211             guest_cpuid_is_intel(pmc->vcpu)) {
 212                 /*
 213                  * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
 214                  * period. Just clear the sample period so at least
 215                  * allocating the counter doesn't fail.
 216                  */
 217                 attr.sample_period = 0;
 218         }
 219         if (pebs) {
 220                 /*
 221                  * For most PEBS hardware events, the difference in the software
 222                  * precision levels of guest and host PEBS events will not affect
 223                  * the accuracy of the PEBS profiling result, because the "event IP"
 224                  * in the PEBS record is calibrated on the guest side.
 225                  */
 226                 attr.precise_ip = pmc_get_pebs_precise_level(pmc);
 227         }
 228
 229         event = perf_event_create_kernel_counter(&attr, -1, current,
 230                                                  kvm_perf_overflow, pmc);
 231         if (IS_ERR(event)) {
 232                 pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
 233                             PTR_ERR(event), pmc->idx);
 234                 return PTR_ERR(event);
 235         }
 236
 237         pmc->perf_event = event;
 238         pmc_to_pmu(pmc)->event_count++;
 239         pmc->is_paused = false;
 240         pmc->intr = intr || pebs;
 241         return 0;
 242 }
 243
 244 static void pmc_pause_counter(struct kvm_pmc *pmc)
 245 {
 246         u64 counter = pmc->counter;
 247
 248         if (!pmc->perf_event || pmc->is_paused)
 249                 return;
 250
 251         /* update counter, reset event value to avoid redundant accumulation */
 252         counter += perf_event_pause(pmc->perf_event, true);
 253         pmc->counter = counter & pmc_bitmask(pmc);
 254         pmc->is_paused = true;
 255 }
 256
 257 static bool pmc_resume_counter(struct kvm_pmc *pmc)
 258 {
 259         if (!pmc->perf_event)
 260                 return false;
 261
 262         /* recalibrate sample period and check if it's accepted by perf core */
 263         if (is_sampling_event(pmc->perf_event) &&
 264             perf_event_period(pmc->perf_event,
 265                               get_sample_period(pmc, pmc->counter)))
 266                 return false;
 267
 268         if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
 269             (!!pmc->perf_event->attr.precise_ip))
 270                 return false;
 271
 272         /* reuse perf_event to serve as pmc_reprogram_counter() does*/
 273         perf_event_enable(pmc->perf_event);
 274         pmc->is_paused = false;
 275
 276         return true;
 277 }
 278
 279 static int filter_cmp(const void *pa, const void *pb, u64 mask)
 280 {
 281         u64 a = *(u64 *)pa & mask;
 282         u64 b = *(u64 *)pb & mask;
 283
 284         return (a > b) - (a < b);
 285 }
 286
 287
 288 static int filter_sort_cmp(const void *pa, const void *pb)
 289 {
 290         return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
 291                                    KVM_PMU_MASKED_ENTRY_EXCLUDE));
 292 }
 293
 294 /*
 295  * For the event filter, searching is done on the 'includes' list and
 296  * 'excludes' list separately rather than on the 'events' list (which
 297  * has both).  As a result the exclude bit can be ignored.
 298  */
 299 static int filter_event_cmp(const void *pa, const void *pb)
 300 {
 301         return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
 302 }
 303
 304 static int find_filter_index(u64 *events, u64 nevents, u64 key)
 305 {
 306         u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
 307                           filter_event_cmp);
 308
 309         if (!fe)
 310                 return -1;
 311
 312         return fe - events;
 313 }
 314
 315 static bool is_filter_entry_match(u64 filter_event, u64 umask)
 316 {
 317         u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
 318         u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
 319
 320         BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
 321                      (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
 322                      ARCH_PERFMON_EVENTSEL_UMASK);
 323
 324         return (umask & mask) == match;
 325 }
 326
 327 static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
 328 {
 329         u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
 330         u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
 331         int i, index;
 332
 333         index = find_filter_index(events, nevents, event_select);
 334         if (index < 0)
 335                 return false;
 336
 337         /*
 338          * Entries are sorted by the event select.  Walk the list in both
 339          * directions to process all entries with the targeted event select.
 340          */
 341         for (i = index; i < nevents; i++) {
 342                 if (filter_event_cmp(&events[i], &event_select))
 343                         break;
 344
 345                 if (is_filter_entry_match(events[i], umask))
 346                         return true;
 347         }
 348
 349         for (i = index - 1; i >= 0; i--) {
 350                 if (filter_event_cmp(&events[i], &event_select))
 351                         break;
 352
 353                 if (is_filter_entry_match(events[i], umask))
 354                         return true;
 355         }
 356
 357         return false;
 358 }
 359
 360 static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
 361                                 u64 eventsel)
 362 {
 363         if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
 364             !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
 365                 return f->action == KVM_PMU_EVENT_ALLOW;
 366
 367         return f->action == KVM_PMU_EVENT_DENY;
 368 }
 369
 370 static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
 371                                    int idx)
 372 {
 373         int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
 374
 375         if (filter->action == KVM_PMU_EVENT_DENY &&
 376             test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
 377                 return false;
 378         if (filter->action == KVM_PMU_EVENT_ALLOW &&
 379             !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
 380                 return false;
 381
 382         return true;
 383 }
 384
 385 static bool check_pmu_event_filter(struct kvm_pmc *pmc)
 386 {
 387         struct kvm_x86_pmu_event_filter *filter;
 388         struct kvm *kvm = pmc->vcpu->kvm;
 389
 390         if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
 391                 return false;
 392
 393         filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
 394         if (!filter)
 395                 return true;
 396
 397         if (pmc_is_gp(pmc))
 398                 return is_gp_event_allowed(filter, pmc->eventsel);
 399
 400         return is_fixed_event_allowed(filter, pmc->idx);
 401 }
 402
 403 static void reprogram_counter(struct kvm_pmc *pmc)
 404 {
 405         struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 406         u64 eventsel = pmc->eventsel;
 407         u64 new_config = eventsel;
 408         u8 fixed_ctr_ctrl;
 409
 410         pmc_pause_counter(pmc);
 411
 412         if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
 413                 goto reprogram_complete;
 414
 415         if (!check_pmu_event_filter(pmc))
 416                 goto reprogram_complete;
 417
 418         if (pmc->counter < pmc->prev_counter)
 419                 __kvm_perf_overflow(pmc, false);
 420
 421         if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
 422                 printk_once("kvm pmu: pin control bit is ignored\n");
 423
 424         if (pmc_is_fixed(pmc)) {
 425                 fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
 426                                                   pmc->idx - INTEL_PMC_IDX_FIXED);
 427                 if (fixed_ctr_ctrl & 0x1)
 428                         eventsel |= ARCH_PERFMON_EVENTSEL_OS;
 429                 if (fixed_ctr_ctrl & 0x2)
 430                         eventsel |= ARCH_PERFMON_EVENTSEL_USR;
 431                 if (fixed_ctr_ctrl & 0x8)
 432                         eventsel |= ARCH_PERFMON_EVENTSEL_INT;
 433                 new_config = (u64)fixed_ctr_ctrl;
 434         }
 435
 436         if (pmc->current_config == new_config && pmc_resume_counter(pmc))
 437                 goto reprogram_complete;
 438
 439         pmc_release_perf_event(pmc);
 440
 441         pmc->current_config = new_config;
 442
 443         /*
 444          * If reprogramming fails, e.g. due to contention, leave the counter's
 445          * regprogram bit set, i.e. opportunistically try again on the next PMU
 446          * refresh.  Don't make a new request as doing so can stall the guest
 447          * if reprogramming repeatedly fails.
 448          */
 449         if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
 450                                   (eventsel & pmu->raw_event_mask),
 451                                   !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
 452                                   !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
 453                                   eventsel & ARCH_PERFMON_EVENTSEL_INT))
 454                 return;
 455
 456 reprogram_complete:
 457         clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
 458         pmc->prev_counter = 0;
 459 }
 460
 461 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 462 {
 463         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 464         int bit;
 465
 466         for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
 467                 struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
 468
 469                 if (unlikely(!pmc)) {
 470                         clear_bit(bit, pmu->reprogram_pmi);
 471                         continue;
 472                 }
 473
 474                 reprogram_counter(pmc);
 475         }
 476
 477         /*
 478          * Unused perf_events are only released if the corresponding MSRs
 479          * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
 480          * triggers KVM_REQ_PMU if cleanup is needed.
 481          */
 482         if (unlikely(pmu->need_cleanup))
 483                 kvm_pmu_cleanup(vcpu);
 484 }
 485
 486 /* check if idx is a valid index to access PMU */
 487 bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
 488 {
 489         return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
 490 }
 491
 492 bool is_vmware_backdoor_pmc(u32 pmc_idx)
 493 {
 494         switch (pmc_idx) {
 495         case VMWARE_BACKDOOR_PMC_HOST_TSC:
 496         case VMWARE_BACKDOOR_PMC_REAL_TIME:
 497         case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
 498                 return true;
 499         }
 500         return false;
 501 }
 502
 503 static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 504 {
 505         u64 ctr_val;
 506
 507         switch (idx) {
 508         case VMWARE_BACKDOOR_PMC_HOST_TSC:
 509                 ctr_val = rdtsc();
 510                 break;
 511         case VMWARE_BACKDOOR_PMC_REAL_TIME:
 512                 ctr_val = ktime_get_boottime_ns();
 513                 break;
 514         case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
 515                 ctr_val = ktime_get_boottime_ns() +
 516                         vcpu->kvm->arch.kvmclock_offset;
 517                 break;
 518         default:
 519                 return 1;
 520         }
 521
 522         *data = ctr_val;
 523         return 0;
 524 }
 525
 526 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 527 {
 528         bool fast_mode = idx & (1u << 31);
 529         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 530         struct kvm_pmc *pmc;
 531         u64 mask = fast_mode ? ~0u : ~0ull;
 532
 533         if (!pmu->version)
 534                 return 1;
 535
 536         if (is_vmware_backdoor_pmc(idx))
 537                 return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
 538
 539         pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
 540         if (!pmc)
 541                 return 1;
 542
 543         if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
 544             (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
 545             (kvm_read_cr0(vcpu) & X86_CR0_PE))
 546                 return 1;
 547
 548         *data = pmc_read_counter(pmc) & mask;
 549         return 0;
 550 }
 551
 552 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
 553 {
 554         if (lapic_in_kernel(vcpu)) {
 555                 static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
 556                 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
 557         }
 558 }
 559
 560 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 561 {
 562         return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
 563                 static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
 564 }
 565
 566 static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
 567 {
 568         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 569         struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
 570
 571         if (pmc)
 572                 __set_bit(pmc->idx, pmu->pmc_in_use);
 573 }
 574
 575 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 576 {
 577         return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
 578 }
 579
 580 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 581 {
 582         kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
 583         return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
 584 }
 585
 586 /* refresh PMU settings. This function generally is called when underlying
 587  * settings are changed (such as changes of PMU CPUID by guest VMs), which
 588  * should rarely happen.
 589  */
 590 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 591 {
 592         static_call(kvm_x86_pmu_refresh)(vcpu);
 593 }
 594
 595 void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 596 {
 597         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 598
 599         irq_work_sync(&pmu->irq_work);
 600         static_call(kvm_x86_pmu_reset)(vcpu);
 601 }
 602
 603 void kvm_pmu_init(struct kvm_vcpu *vcpu)
 604 {
 605         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 606
 607         memset(pmu, 0, sizeof(*pmu));
 608         static_call(kvm_x86_pmu_init)(vcpu);
 609         init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
 610         pmu->event_count = 0;
 611         pmu->need_cleanup = false;
 612         kvm_pmu_refresh(vcpu);
 613 }
 614
 615 /* Release perf_events for vPMCs that have been unused for a full time slice.  */
 616 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
 617 {
 618         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 619         struct kvm_pmc *pmc = NULL;
 620         DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
 621         int i;
 622
 623         pmu->need_cleanup = false;
 624
 625         bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
 626                       pmu->pmc_in_use, X86_PMC_IDX_MAX);
 627
 628         for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
 629                 pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
 630
 631                 if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
 632                         pmc_stop_counter(pmc);
 633         }
 634
 635         static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
 636
 637         bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
 638 }
 639
 640 void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
 641 {
 642         kvm_pmu_reset(vcpu);
 643 }
 644
 645 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 646 {
 647         pmc->prev_counter = pmc->counter;
 648         pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
 649         kvm_pmu_request_counter_reprogam(pmc);
 650 }
 651
 652 static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
 653         unsigned int perf_hw_id)
 654 {
 655         return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
 656                 AMD64_RAW_EVENT_MASK_NB);
 657 }
 658
 659 static inline bool cpl_is_matched(struct kvm_pmc *pmc)
 660 {
 661         bool select_os, select_user;
 662         u64 config;
 663
 664         if (pmc_is_gp(pmc)) {
 665                 config = pmc->eventsel;
 666                 select_os = config & ARCH_PERFMON_EVENTSEL_OS;
 667                 select_user = config & ARCH_PERFMON_EVENTSEL_USR;
 668         } else {
 669                 config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
 670                                           pmc->idx - INTEL_PMC_IDX_FIXED);
 671                 select_os = config & 0x1;
 672                 select_user = config & 0x2;
 673         }
 674
 675         return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
 676 }
 677
 678 void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
 679 {
 680         struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 681         struct kvm_pmc *pmc;
 682         int i;
 683
 684         for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
 685                 pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
 686
 687                 if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
 688                         continue;
 689
 690                 /* Ignore checks for edge detect, pin control, invert and CMASK bits */
 691                 if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
 692                         kvm_pmu_incr_counter(pmc);
 693         }
 694 }
 695 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
 696
 697 static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
 698 {
 699         u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
 700                    KVM_PMU_MASKED_ENTRY_UMASK_MASK |
 701                    KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
 702                    KVM_PMU_MASKED_ENTRY_EXCLUDE;
 703         int i;
 704
 705         for (i = 0; i < filter->nevents; i++) {
 706                 if (filter->events[i] & ~mask)
 707                         return false;
 708         }
 709
 710         return true;
 711 }
 712
 713 static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
 714 {
 715         int i, j;
 716
 717         for (i = 0, j = 0; i < filter->nevents; i++) {
 718                 /*
 719                  * Skip events that are impossible to match against a guest
 720                  * event.  When filtering, only the event select + unit mask
 721                  * of the guest event is used.  To maintain backwards
 722                  * compatibility, impossible filters can't be rejected :-(
 723                  */
 724                 if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
 725                                           ARCH_PERFMON_EVENTSEL_UMASK))
 726                         continue;
 727                 /*
 728                  * Convert userspace events to a common in-kernel event so
 729                  * only one code path is needed to support both events.  For
 730                  * the in-kernel events use masked events because they are
 731                  * flexible enough to handle both cases.  To convert to masked
 732                  * events all that's needed is to add an "all ones" umask_mask,
 733                  * (unmasked filter events don't support EXCLUDE).
 734                  */
 735                 filter->events[j++] = filter->events[i] |
 736                                       (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
 737         }
 738
 739         filter->nevents = j;
 740 }
 741
 742 static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
 743 {
 744         int i;
 745
 746         if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
 747                 convert_to_masked_filter(filter);
 748         else if (!is_masked_filter_valid(filter))
 749                 return -EINVAL;
 750
 751         /*
 752          * Sort entries by event select and includes vs. excludes so that all
 753          * entries for a given event select can be processed efficiently during
 754          * filtering.  The EXCLUDE flag uses a more significant bit than the
 755          * event select, and so the sorted list is also effectively split into
 756          * includes and excludes sub-lists.
 757          */
 758         sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
 759              filter_sort_cmp, NULL);
 760
 761         i = filter->nevents;
 762         /* Find the first EXCLUDE event (only supported for masked events). */
 763         if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
 764                 for (i = 0; i < filter->nevents; i++) {
 765                         if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
 766                                 break;
 767                 }
 768         }
 769
 770         filter->nr_includes = i;
 771         filter->nr_excludes = filter->nevents - filter->nr_includes;
 772         filter->includes = filter->events;
 773         filter->excludes = filter->events + filter->nr_includes;
 774
 775         return 0;
 776 }
 777
 778 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 779 {
 780         struct kvm_pmu_event_filter __user *user_filter = argp;
 781         struct kvm_x86_pmu_event_filter *filter;
 782         struct kvm_pmu_event_filter tmp;
 783         struct kvm_vcpu *vcpu;
 784         unsigned long i;
 785         size_t size;
 786         int r;
 787
 788         if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
 789                 return -EFAULT;
 790
 791         if (tmp.action != KVM_PMU_EVENT_ALLOW &&
 792             tmp.action != KVM_PMU_EVENT_DENY)
 793                 return -EINVAL;
 794
 795         if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
 796                 return -EINVAL;
 797
 798         if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
 799                 return -E2BIG;
 800
 801         size = struct_size(filter, events, tmp.nevents);
 802         filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
 803         if (!filter)
 804                 return -ENOMEM;
 805
 806         filter->action = tmp.action;
 807         filter->nevents = tmp.nevents;
 808         filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
 809         filter->flags = tmp.flags;
 810
 811         r = -EFAULT;
 812         if (copy_from_user(filter->events, user_filter->events,
 813                            sizeof(filter->events[0]) * filter->nevents))
 814                 goto cleanup;
 815
 816         r = prepare_filter_lists(filter);
 817         if (r)
 818                 goto cleanup;
 819
 820         mutex_lock(&kvm->lock);
 821         filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
 822                                      mutex_is_locked(&kvm->lock));
 823         mutex_unlock(&kvm->lock);
 824         synchronize_srcu_expedited(&kvm->srcu);
 825
 826         BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
 827                      sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
 828
 829         kvm_for_each_vcpu(i, vcpu, kvm)
 830                 atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
 831
 832         kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
 833
 834         r = 0;
 835 cleanup:
 836         kfree(filter);
 837         return r;
 838 }