// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three types of index used to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc and
 *      perf counters is as follows:
 *      * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */

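/*
 * Concrete example of the three views (Intel, illustrative): fixed counter
 * 0 is MSR_CORE_PERF_FIXED_CTR0 (0x309) when accessed by MSR, is selected
 * by RDPMC with ECX = (1u << 30) | 0 when accessed by idx, and carries the
 * global index pmc->idx == KVM_FIXED_PMC_BASE_IDX (32) in KVM's bitmaps.
 */
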
static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

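/*
 * Illustrative flow: the vendor module (Intel or AMD) hands its
 * kvm_pmu_ops to kvm_pmu_ops_update() during hardware setup; afterwards
 * every static_call(kvm_x86_pmu_xxx)() site dispatches directly into the
 * vendor implementation, and static_call_cond() sites degrade to a NOP
 * for optional ops the vendor left NULL.
 */
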
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate records
			 * for emulated instructions, avoiding BUFFER_OVF PMI when
			 * there are no records. Strictly speaking, it should be done
			 * as well in the right context to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}

static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For model-specific PEBS counters with special capabilities
	 * (PDIR, PDIR++, PDist), KVM raises the event's precise level to
	 * the maximum value (currently 3, for backwards compatibility) so
	 * that the perf subsystem assigns a hardware counter with that
	 * capability to the vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * A non-zero precision level turns an ordinary guest event into a
	 * guest PEBS event, which triggers the host PEBS PMI handler to
	 * determine whether a PEBS overflow PMI comes from the host
	 * counters or the guest.
	 */
	return 1;
}

static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

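/*
 * E.g. with a 48-bit counter, a guest counter value of 0xfffffffffffd
 * yields a sample period of 3: the host event fires after three more
 * increments, exactly when the guest's counter would wrap.  A value of 0
 * yields the full period of 2^48.
 */
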
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);

static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}

static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		      (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

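/*
 * E.g. (illustrative) a masked filter entry with umask_mask == 0xf0 and
 * umask_match == 0x10 matches any guest unit mask whose high nibble is 1
 * (0x10..0x1f), regardless of the low nibble.
 */
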
static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       check_pmu_event_filter(pmc);
}

static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

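/*
 * For reference, each 4-bit IA32_FIXED_CTR_CTRL field decoded above is:
 * bit 0 = enable in ring 0 (OS), bit 1 = enable in rings > 0 (USR),
 * bit 2 = AnyThread (not handled here), bit 3 = PMI on overflow, which
 * reprogram_counter() maps onto the equivalent ARCH_PERFMON_EVENTSEL_*
 * bits.
 */
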
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the next
		 * PMU refresh.  Don't make a new request as doing so can stall
		 * the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. no additional checks
	 * are needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

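/*
 * The VMware backdoor "counters" are pseudo-PMCs with well-known indices
 * (0x10000..0x10002 in KVM's headers); a guest reads them with a plain
 * RDPMC, e.g. ECX = 0x10000 returns the raw host TSC below.
 */
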
static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

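/*
 * The CR4.PCE check above mirrors hardware behavior: RDPMC at CPL > 0 in
 * protected mode takes a #GP unless the guest has set CR4.PCE, while ring
 * 0 and real mode can always read counters.
 */
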
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}
	return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
		static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
	}

	return 0;
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_mask)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_mask)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;
}

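/*
 * E.g. if the guest flips GLOBAL_CTRL from 0x3 to 0x5, diff == 0x6 and
 * reprogram_counters() queues counters 1 (now disabled) and 2 (now
 * enabled) for reprogramming, while counter 0 is left untouched.
 */
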
static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	static_call_cond(kvm_x86_pmu_reset)(vcpu);
}

/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_mask = ~0ull;
	pmu->global_status_mask = ~0ull;
	pmu->fixed_ctr_ctrl_mask = ~0ull;
	pmu->pebs_enable_mask = ~0ull;
	pmu->pebs_data_cfg_mask = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (vcpu->kvm->arch.enable_pmu)
		static_call(kvm_x86_pmu_refresh)(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	static_call(kvm_x86_pmu_init)(vcpu);
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
					     unsigned int perf_hw_id)
{
	return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
		 AMD64_RAW_EVENT_MASK_NB);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		if (!pmc_event_is_allowed(pmc))
			continue;

		/* Ignore checks for edge detect, pin control, invert and CMASK bits */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
			kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

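/*
 * kvm_pmu_trigger_event() is the hook used when KVM emulates an event in
 * software, e.g. kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS)
 * from the emulator bumps every allowed counter whose event select/unit
 * mask encodes "instructions retired" for the current CPL.
 */
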
static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask,
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

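/*
 * E.g. (illustrative) after sorting, a filter holding include events {A, B}
 * plus a masked EXCLUDE entry for C is laid out as [A, B, C] with
 * nr_includes == 2 and excludes pointing at &events[2], so the include and
 * exclude sub-lists can be binary-searched independently.
 */
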
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}