KVM: x86/pmu: Defer reprogram_counter() to kvm_pmu_handle_event()
arch/x86/kvm/pmu.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
        {}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three types of index used to access perf counters (PMC):
 *   1. MSR (named msr): For example, Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc
 *      and perf counters is as follows:
 *      * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
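
/*
 * A concrete illustration of the index types above: on Intel, gp counter 0
 * is programmed via MSR_IA32_PERFCTR0 (msr), read via RDPMC with ECX = 0
 * (idx), and has kvm_pmc.idx == 0 (pmc), whereas fixed counter 0 lives in
 * MSR_CORE_PERF_FIXED_CTR0, is read via RDPMC with ECX bit 30 set
 * (ECX = 4000_0000h), and has kvm_pmc.idx == INTEL_PMC_IDX_FIXED (32).
 */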

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
        DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
                                *(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

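/*
 * Install the vendor-specific PMU ops (Intel or AMD) and patch the
 * kvm_x86_pmu_* static calls accordingly; mandatory ops trigger a WARN if
 * left NULL, optional ops may legitimately be NULL.
 */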
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
        memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
        static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
        WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
{
        return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
}

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
        struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
        struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

        kvm_pmu_deliver_pmi(vcpu);
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        bool skip_pmi = false;

        /*
         * Ignore overflow events for counters that are scheduled to be
         * reprogrammed, e.g. if a PMI for the previous event races with KVM's
         * handling of a related guest WRMSR.
         */
        if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
                return;

        if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
                if (!in_pmi) {
                        /*
                         * TODO: KVM currently _chooses_ not to generate PEBS
                         * records for emulated instructions, which avoids a
                         * BUFFER_OVF PMI when there are no records. Strictly
                         * speaking, records should be generated in the right
                         * context as well to improve sampling accuracy.
                         */
                        skip_pmi = true;
                } else {
                        /* Indicate PEBS overflow PMI to guest. */
                        skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
                                                      (unsigned long *)&pmu->global_status);
                }
        } else {
                __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
        }

        kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

        if (!pmc->intr || skip_pmi)
                return;

        /*
         * Inject the PMI. If the vCPU was in guest mode when the NMI arrived,
         * the PMI can be injected on the next guest-mode entry. Otherwise we
         * can't be sure that the vCPU wasn't executing a HLT instruction at
         * the time of the VM-exit and won't re-enter guest mode until it is
         * woken up. So we should wake it, but that is impossible from NMI
         * context. Do it from irq work instead.
         */
        if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
                irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
        else
                kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

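/*
 * perf_event overflow callback for counters created by
 * pmc_reprogram_counter(); invoked from the host PMI, hence in_pmi == true.
 */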
static void kvm_perf_overflow(struct perf_event *perf_event,
                              struct perf_sample_data *data,
                              struct pt_regs *regs)
{
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;

        __kvm_perf_overflow(pmc, true);
}

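/*
 * Create the host perf_event that backs this vPMC, with the requested type,
 * config, privilege-level exclusions and interrupt setting. Returns 0 on
 * success or the PTR_ERR() from perf on failure, in which case the caller
 * leaves the counter's reprogram bit set and retries later.
 */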
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
                                 bool exclude_user, bool exclude_kernel,
                                 bool intr)
{
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        struct perf_event *event;
        struct perf_event_attr attr = {
                .type = type,
                .size = sizeof(attr),
                .pinned = true,
                .exclude_idle = true,
                .exclude_host = 1,
                .exclude_user = exclude_user,
                .exclude_kernel = exclude_kernel,
                .config = config,
        };
        bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

        attr.sample_period = get_sample_period(pmc, pmc->counter);

        if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
            guest_cpuid_is_intel(pmc->vcpu)) {
                /*
                 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
                 * period. Just clear the sample period so at least
                 * allocating the counter doesn't fail.
                 */
                attr.sample_period = 0;
        }
        if (pebs) {
                /*
                 * A non-zero precision level turns an ordinary guest event
                 * into a guest PEBS event and triggers the host PEBS PMI
                 * handler to determine whether the PEBS overflow PMI comes
                 * from the host counters or the guest.
                 *
                 * For most PEBS hardware events, a difference between the
                 * software precision levels of the guest and host PEBS events
                 * does not affect the accuracy of the PEBS profiling result,
                 * because the "event IP" in the PEBS record is calibrated on
                 * the guest side.
                 *
                 * On Icelake everything is fine. Other hardware (GLC+, TNT+)
                 * that could possibly care here is unsupported and needs
                 * changes.
                 */
                attr.precise_ip = 1;
                if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
                        attr.precise_ip = 3;
        }

        event = perf_event_create_kernel_counter(&attr, -1, current,
                                                 kvm_perf_overflow, pmc);
        if (IS_ERR(event)) {
                pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
                                     PTR_ERR(event), pmc->idx);
                return PTR_ERR(event);
        }

        pmc->perf_event = event;
        pmc_to_pmu(pmc)->event_count++;
        pmc->is_paused = false;
        pmc->intr = intr || pebs;
        return 0;
}

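/*
 * Pausing a counter folds the accumulated count from the perf_event into
 * pmc->counter; resuming reuses the existing perf_event only if the sample
 * period can be re-applied and the PEBS configuration hasn't changed, which
 * lets reprogram_counter() skip the destroy/recreate cycle for the common
 * case of an unchanged event selector.
 */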
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
        u64 counter = pmc->counter;

        if (!pmc->perf_event || pmc->is_paused)
                return;

        /* update counter, reset event value to avoid redundant accumulation */
        counter += perf_event_pause(pmc->perf_event, true);
        pmc->counter = counter & pmc_bitmask(pmc);
        pmc->is_paused = true;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
        if (!pmc->perf_event)
                return false;

        /* recalibrate sample period and check if it's accepted by perf core */
        if (perf_event_period(pmc->perf_event,
                              get_sample_period(pmc, pmc->counter)))
                return false;

        if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
            (!!pmc->perf_event->attr.precise_ip))
                return false;

        /* reuse the perf_event, as pmc_reprogram_counter() would otherwise recreate it */
        perf_event_enable(pmc->perf_event);
        pmc->is_paused = false;

        return true;
}

static int cmp_u64(const void *pa, const void *pb)
{
        u64 a = *(u64 *)pa;
        u64 b = *(u64 *)pb;

        return (a > b) - (a < b);
}

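/*
 * Returns true if programming the event is allowed: the event must be
 * available on the underlying hardware and, if userspace installed a PMU
 * event filter, a gp counter's event select / unit mask must pass the
 * allow/deny list while a fixed counter is checked against
 * fixed_counter_bitmap.
 */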
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
        struct kvm_pmu_event_filter *filter;
        struct kvm *kvm = pmc->vcpu->kvm;
        bool allow_event = true;
        __u64 key;
        int idx;

        if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
                return false;

        filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
        if (!filter)
                goto out;

        if (pmc_is_gp(pmc)) {
                key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
                if (bsearch(&key, filter->events, filter->nevents,
                            sizeof(__u64), cmp_u64))
                        allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
                else
                        allow_event = filter->action == KVM_PMU_EVENT_DENY;
        } else {
                idx = pmc->idx - INTEL_PMC_IDX_FIXED;
                if (filter->action == KVM_PMU_EVENT_DENY &&
                    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
                        allow_event = false;
                if (filter->action == KVM_PMU_EVENT_ALLOW &&
                    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
                        allow_event = false;
        }

out:
        return allow_event;
}

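/*
 * (Re)program the backing perf_event for a vPMC. Per the containing commit,
 * reprogramming triggered by guest writes is deferred to
 * kvm_pmu_handle_event(), which walks the pending bits in reprogram_pmi.
 * The bit is cleared here once the counter is either programmed or found to
 * be disabled/filtered, and is deliberately left set if creating the
 * perf_event fails so the next PMU refresh retries it.
 */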
static void reprogram_counter(struct kvm_pmc *pmc)
{
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
        u64 eventsel = pmc->eventsel;
        u64 new_config = eventsel;
        u8 fixed_ctr_ctrl;

        pmc_pause_counter(pmc);

        if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
                goto reprogram_complete;

        if (!check_pmu_event_filter(pmc))
                goto reprogram_complete;

        if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
                printk_once("kvm pmu: pin control bit is ignored\n");

        if (pmc_is_fixed(pmc)) {
                fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
                                                  pmc->idx - INTEL_PMC_IDX_FIXED);
                if (fixed_ctr_ctrl & 0x1)
                        eventsel |= ARCH_PERFMON_EVENTSEL_OS;
                if (fixed_ctr_ctrl & 0x2)
                        eventsel |= ARCH_PERFMON_EVENTSEL_USR;
                if (fixed_ctr_ctrl & 0x8)
                        eventsel |= ARCH_PERFMON_EVENTSEL_INT;
                new_config = (u64)fixed_ctr_ctrl;
        }

        if (pmc->current_config == new_config && pmc_resume_counter(pmc))
                goto reprogram_complete;

        pmc_release_perf_event(pmc);

        pmc->current_config = new_config;

        /*
         * If reprogramming fails, e.g. due to contention, leave the counter's
         * reprogram bit set, i.e. opportunistically try again on the next PMU
         * refresh. Don't make a new request as doing so can stall the guest
         * if reprogramming repeatedly fails.
         */
        if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
                                  (eventsel & pmu->raw_event_mask),
                                  !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
                                  !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
                                  eventsel & ARCH_PERFMON_EVENTSEL_INT))
                return;

reprogram_complete:
        clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
}

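/*
 * Called in response to KVM_REQ_PMU: reprogram every counter whose bit is
 * set in reprogram_pmi, then release unused perf_events if a cleanup pass
 * was requested at the last vCPU sched-in.
 */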
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        int bit;

        for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
                struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);

                if (unlikely(!pmc)) {
                        clear_bit(bit, pmu->reprogram_pmi);
                        continue;
                }

                reprogram_counter(pmc);
        }

        /*
         * Unused perf_events are only released if the corresponding MSRs
         * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
         * triggers KVM_REQ_PMU if cleanup is needed.
         */
        if (unlikely(pmu->need_cleanup))
                kvm_pmu_cleanup(vcpu);
}

/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
        return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
        switch (pmc_idx) {
        case VMWARE_BACKDOOR_PMC_HOST_TSC:
        case VMWARE_BACKDOOR_PMC_REAL_TIME:
        case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
                return true;
        }
        return false;
}

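/*
 * The VMware backdoor exposes pseudo-PMCs that return time sources (host
 * TSC, boot time, and boot time plus the kvmclock offset) instead of real
 * counter values.
 */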
static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
        u64 ctr_val;

        switch (idx) {
        case VMWARE_BACKDOOR_PMC_HOST_TSC:
                ctr_val = rdtsc();
                break;
        case VMWARE_BACKDOOR_PMC_REAL_TIME:
                ctr_val = ktime_get_boottime_ns();
                break;
        case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
                ctr_val = ktime_get_boottime_ns() +
                        vcpu->kvm->arch.kvmclock_offset;
                break;
        default:
                return 1;
        }

        *data = ctr_val;
        return 0;
}

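/*
 * Emulate RDPMC: fail if the vCPU has no PMU, handle VMware backdoor
 * indices, translate ECX to a vPMC, enforce the CR4.PCE/CPL/CR0.PE
 * permission check, and return the counter value (truncated to 32 bits in
 * "fast" mode, i.e. when ECX bit 31 is set).
 */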
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
        bool fast_mode = idx & (1u << 31);
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct kvm_pmc *pmc;
        u64 mask = fast_mode ? ~0u : ~0ull;

        if (!pmu->version)
                return 1;

        if (is_vmware_backdoor_pmc(idx))
                return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

        pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
        if (!pmc)
                return 1;

        if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
            (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
            (kvm_read_cr0(vcpu) & X86_CR0_PE))
                return 1;

        *data = pmc_read_counter(pmc) & mask;
        return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
        if (lapic_in_kernel(vcpu)) {
                static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
                kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
        }
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
        return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
                static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

        if (pmc)
                __set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
        return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}

/*
 * Refresh the PMU configuration. This is generally called when the
 * underlying settings change (such as the guest's PMU CPUID being updated),
 * which should happen rarely.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
        static_call(kvm_x86_pmu_refresh)(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        irq_work_sync(&pmu->irq_work);
        static_call(kvm_x86_pmu_reset)(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        memset(pmu, 0, sizeof(*pmu));
        static_call(kvm_x86_pmu_init)(vcpu);
        init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
        pmu->event_count = 0;
        pmu->need_cleanup = false;
        kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct kvm_pmc *pmc = NULL;
        DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
        int i;

        pmu->need_cleanup = false;

        bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
                      pmu->pmc_in_use, X86_PMC_IDX_MAX);

        for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
                pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

                if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
                        pmc_stop_counter(pmc);
        }

        static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

        bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_pmu_reset(vcpu);
}

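/*
 * Software-increment a counter for an emulated event; if the increment wraps
 * the counter, synthesize an overflow (in_pmi == false because this runs
 * from the emulation path, not from a host PMI).
 */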
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
        u64 prev_count;

        prev_count = pmc->counter;
        pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);

        reprogram_counter(pmc);
        if (pmc->counter < prev_count)
                __kvm_perf_overflow(pmc, false);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
                                             unsigned int perf_hw_id)
{
        return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
                 AMD64_RAW_EVENT_MASK_NB);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
        bool select_os, select_user;
        u64 config;

        if (pmc_is_gp(pmc)) {
                config = pmc->eventsel;
                select_os = config & ARCH_PERFMON_EVENTSEL_OS;
                select_user = config & ARCH_PERFMON_EVENTSEL_USR;
        } else {
                config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
                                          pmc->idx - INTEL_PMC_IDX_FIXED);
                select_os = config & 0x1;
                select_user = config & 0x2;
        }

        return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

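/*
 * Count an emulated event (identified by its generic perf hardware ID) on
 * every live counter whose event selector matches the event and whose
 * OS/USR filtering matches the current CPL; edge detect, pin control,
 * invert and CMASK bits are deliberately ignored.
 */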
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct kvm_pmc *pmc;
        int i;

        for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
                pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

                if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
                        continue;

                /* Ignore checks for edge detect, pin control, invert and CMASK bits */
                if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
                        kvm_pmu_incr_counter(pmc);
        }
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

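/*
 * Handler for the KVM_SET_PMU_EVENT_FILTER ioctl: validate and copy the
 * filter from userspace, sort the event list for bsearch(), publish it under
 * kvm->lock with an expedited SRCU sync, and force every vCPU to reprogram
 * all of its counters so the new filter takes effect immediately.
 */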
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
        struct kvm_pmu_event_filter tmp, *filter;
        struct kvm_vcpu *vcpu;
        unsigned long i;
        size_t size;
        int r;

        if (copy_from_user(&tmp, argp, sizeof(tmp)))
                return -EFAULT;

        if (tmp.action != KVM_PMU_EVENT_ALLOW &&
            tmp.action != KVM_PMU_EVENT_DENY)
                return -EINVAL;

        if (tmp.flags != 0)
                return -EINVAL;

        if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
                return -E2BIG;

        size = struct_size(filter, events, tmp.nevents);
        filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
        if (!filter)
                return -ENOMEM;

        r = -EFAULT;
        if (copy_from_user(filter, argp, size))
                goto cleanup;

        /* Ensure nevents can't be changed between the user copies. */
        *filter = tmp;

        /*
         * Sort the in-kernel list so that we can search it with bsearch.
         */
        sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);

        mutex_lock(&kvm->lock);
        filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
                                     mutex_is_locked(&kvm->lock));
        synchronize_srcu_expedited(&kvm->srcu);

        BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
                     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

        kvm_for_each_vcpu(i, vcpu, kvm)
                atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

        kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

        mutex_unlock(&kvm->lock);

        r = 0;
cleanup:
        kfree(filter);
        return r;
}