// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping relationship
 *      between pmc and perf counters is as follows:
 *      * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
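
/*
 * Example of the global pmc index (assuming the usual INTEL_PMC_IDX_FIXED == 32
 * layout, which the Ice Lake PEBS check in pmc_reprogram_counter() relies on):
 * on Intel, gp counter 1 maps to pmc->idx == 1, while fixed counter 0 maps to
 * pmc->idx == INTEL_PMC_IDX_FIXED.
 */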

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

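/*
 * Including <asm/kvm-x86-pmu-ops.h> expands KVM_X86_PMU_OP() once per PMU
 * hook, defining a NULL static call for each one.  kvm_pmu_ops_update()
 * below patches those calls to the vendor (Intel/AMD) implementation at
 * runtime.
 */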
#define KVM_X86_PMU_OP(func) \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
{
	return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
}

static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

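/*
 * Common overflow handling, called from the host PMI handler (in_pmi == true)
 * or when an emulated counter wraps (in_pmi == false): latch the overflow in
 * global_status and, if the counter has interrupts enabled, arrange for a PMI
 * to be injected into the guest.
 */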
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	/*
	 * Ignore overflow events for counters that are scheduled to be
	 * reprogrammed, e.g. if a PMI for the previous event races with KVM's
	 * handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
		return;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate records
			 * for emulated instructions, avoiding BUFFER_OVF PMI when
			 * there are no records. Strictly speaking, it should be done
			 * as well in the right context to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}
	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

	if (!pmc->intr || skip_pmi)
		return;

	/*
	 * Inject PMI. If the vcpu was in guest mode when the NMI arrived, the
	 * PMI can be injected on guest mode re-entry. Otherwise we can't be
	 * sure that the vcpu wasn't executing a hlt instruction at the time
	 * of the vmexit and is not going to re-enter guest mode until it is
	 * woken up. So we should wake it, but this is impossible from NMI
	 * context. Do it from irq work instead.
	 */
	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
	else
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	__kvm_perf_overflow(pmc, true);
}

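/*
 * Create the host perf_event that backs a vPMC.  The event counts only while
 * the guest is running (exclude_host) and its sample period is sized so that
 * the host event overflows when the guest counter would.
 */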
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * The non-zero precision level of the guest event turns an
		 * ordinary guest event into a guest PEBS event and triggers
		 * the host PEBS PMI handler to determine whether the PEBS
		 * overflow PMI comes from the host counters or the guest.
		 *
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 *
		 * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
		 * could possibly care here is unsupported and needs changes.
		 */
		attr.precise_ip = 1;
		if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
			attr.precise_ip = 3;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				    PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

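/* Pause the backing perf_event and fold its count into pmc->counter. */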
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event || pmc->is_paused)
		return;

	/* update counter, reset event value to avoid redundant accumulation */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
	pmc->is_paused = true;
}

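/*
 * Try to reuse the existing, paused perf_event: refresh the sample period and
 * bail if perf rejects it or if the PEBS configuration no longer matches.
 */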
static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static int cmp_u64(const void *pa, const void *pb)
{
	u64 a = *(u64 *)pa;
	u64 b = *(u64 *)pb;

	return (a > b) - (a < b);
}

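/*
 * Returns false if the event is not available on the hardware/vendor side or
 * is disallowed by the VM's event filter: gp counters are matched against the
 * filter's (sorted) event list by event select and unit mask, fixed counters
 * by the fixed_counter_bitmap.
 */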
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;
	bool allow_event = true;
	__u64 key;
	int idx;

	if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
		return false;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		goto out;

	if (pmc_is_gp(pmc)) {
		key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
		if (bsearch(&key, filter->events, filter->nevents,
			    sizeof(__u64), cmp_u64))
			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
		else
			allow_event = filter->action == KVM_PMU_EVENT_DENY;
	} else {
		idx = pmc->idx - INTEL_PMC_IDX_FIXED;
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			allow_event = false;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			allow_event = false;
	}

out:
	return allow_event;
}

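/*
 * (Re)program the backing perf_event for a counter whose configuration may
 * have changed: pause the current event, re-check enablement and the event
 * filter, then either resume the existing event or create a new one.
 */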
static void reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	u8 fixed_ctr_ctrl;

	pmc_pause_counter(pmc);

	if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
		goto reprogram_complete;

	if (!check_pmu_event_filter(pmc))
		goto reprogram_complete;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - INTEL_PMC_IDX_FIXED);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		goto reprogram_complete;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	/*
	 * If reprogramming fails, e.g. due to contention, leave the counter's
	 * reprogram bit set, i.e. opportunistically try again on the next PMU
	 * refresh. Don't make a new request as doing so can stall the guest
	 * if reprogramming repeatedly fails.
	 */
	if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				  (eventsel & pmu->raw_event_mask),
				  !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				  !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				  eventsel & ARCH_PERFMON_EVENTSEL_INT))
		return;

reprogram_complete:
	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
}

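/*
 * Process KVM_REQ_PMU: reprogram every counter flagged in reprogram_pmi
 * (set when a counter's configuration changes or when it overflows) and,
 * if flagged, release perf_events for vPMCs that are no longer in use.
 */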
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);

		if (unlikely(!pmc)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmc);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

/* Check whether idx is a valid index for accessing the PMU via RDPMC. */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
}

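/*
 * VMware's backdoor interface exposes three pseudo-PMCs via RDPMC; they
 * return the host TSC, boot-based real time, and apparent time (adjusted by
 * the kvmclock offset) rather than real counter values.
 */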
bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    (kvm_read_cr0(vcpu) & X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
		static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}

/*
 * Refresh PMU settings. This function is generally called when the underlying
 * settings are changed (such as changes of the PMU CPUID for guest VMs),
 * which should rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	static_call(kvm_x86_pmu_refresh)(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	static_call(kvm_x86_pmu_reset)(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	static_call(kvm_x86_pmu_init)(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

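/* Increment an emulated counter by one and handle the overflow if it wraps. */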
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	u64 prev_count;

	prev_count = pmc->counter;
	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);

	reprogram_counter(pmc);
	if (pmc->counter < prev_count)
		__kvm_perf_overflow(pmc, false);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
					     unsigned int perf_hw_id)
{
	return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
		 AMD64_RAW_EVENT_MASK_NB);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - INTEL_PMC_IDX_FIXED);
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

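/*
 * Increment every enabled, in-use counter that is programmed to count the
 * given generic perf event at the current privilege level; typically used
 * when KVM emulates an event (e.g. a retired instruction) that the hardware
 * counters cannot observe.
 */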
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);

		if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
			continue;

		/* Ignore checks for edge detect, pin control, invert and CMASK bits */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
			kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

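/*
 * Handler for the KVM_SET_PMU_EVENT_FILTER ioctl: copy and validate the
 * filter from userspace, sort its event list for bsearch(), publish it under
 * kvm->lock, and force every vCPU to reprogram its counters against the new
 * filter.
 */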
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Ensure nevents can't be changed between the user copies. */
	*filter = tmp;

	/*
	 * Sort the in-kernel list so that we can search it with bsearch.
	 */
	sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	mutex_unlock(&kvm->lock);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}