Commit | Line | Data |
---|---|---|
eec688e1 RB |
1 | /* |
2 | * Copyright © 2015-2016 Intel Corporation | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice (including the next | |
12 | * paragraph) shall be included in all copies or substantial portions of the | |
13 | * Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
21 | * IN THE SOFTWARE. | |
22 | * | |
23 | * Authors: | |
24 | * Robert Bragg <robert@sixbynine.org> | |
25 | */ | |
26 | ||
7abbd8d6 RB |
27 | |
28 | /** | |
16d98b31 | 29 | * DOC: i915 Perf Overview |
7abbd8d6 RB |
30 | * |
31 | * Gen graphics supports a large number of performance counters that can help | |
32 | * driver and application developers understand and optimize their use of the | |
33 | * GPU. | |
34 | * | |
35 | * This i915 perf interface enables userspace to configure and open a file | |
36 | * descriptor representing a stream of GPU metrics which can then be read() as | |
37 | * a stream of sample records. | |
38 | * | |
39 | * The interface is particularly suited to exposing buffered metrics that are | |
40 | * captured by DMA from the GPU, unsynchronized with and unrelated to the CPU. | |
41 | * | |
42 | * Streams representing a single context are accessible to applications with a | |
43 | * corresponding drm file descriptor, such that OpenGL can use the interface | |
44 | * without special privileges. Access to system-wide metrics requires root | |
45 | * privileges by default, unless changed via the dev.i915.perf_event_paranoid | |
46 | * sysctl option. | |
47 | * | |
16d98b31 RB |
48 | */ |
49 | ||
50 | /** | |
51 | * DOC: i915 Perf History and Comparison with Core Perf | |
7abbd8d6 RB |
52 | * |
53 | * The interface was initially inspired by the core Perf infrastructure but | |
54 | * some notable differences are: | |
55 | * | |
56 | * i915 perf file descriptors represent a "stream" instead of an "event"; where | |
57 | * a perf event primarily corresponds to a single 64bit value, while a stream | |
58 | * might sample sets of tightly-coupled counters, depending on the | |
59 | * configuration. For example the Gen OA unit isn't designed to support | |
60 | * orthogonal configurations of individual counters; it's configured for a set | |
61 | * of related counters. Samples for an i915 perf stream capturing OA metrics | |
62 | * will include a set of counter values packed in a compact HW specific format. | |
63 | * The OA unit supports a number of different packing formats which can be | |
64 | * selected by the user opening the stream. Perf has support for grouping | |
65 | * events, but each event in the group is configured, validated and | |
66 | * authenticated individually with separate system calls. | |
67 | * | |
68 | * i915 perf stream configurations are provided as an array of u64 (key,value) | |
69 | * pairs, instead of a fixed struct with multiple miscellaneous config members, | |
70 | * interleaved with event-type specific members. | |
71 | * | |
72 | * i915 perf doesn't support exposing metrics via an mmap'd circular buffer. | |
73 | * The supported metrics are being written to memory by the GPU unsynchronized | |
74 | * with the CPU, using HW specific packing formats for counter sets. Sometimes | |
75 | * the constraints on HW configuration require reports to be filtered before it | |
76 | * would be acceptable to expose them to unprivileged applications - to hide | |
77 | * the metrics of other processes/contexts. For these use cases a read() based | |
78 | * interface is a good fit, and provides an opportunity to filter data as it | |
79 | * gets copied from the GPU mapped buffers to userspace buffers. | |
80 | * | |
81 | * | |
16d98b31 RB |
82 | * Issues hit with first prototype based on Core Perf |
83 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
7abbd8d6 RB |
84 | * |
85 | * The first prototype of this driver was based on the core perf | |
86 | * infrastructure, and while we did make that mostly work, with some changes to | |
87 | * perf, we found we were breaking or working around too many assumptions baked | |
88 | * into perf's currently cpu centric design. | |
89 | * | |
90 | * In the end we didn't see a clear benefit to making perf's implementation and | |
91 | * interface more complex by changing design assumptions while we knew we still | |
92 | * wouldn't be able to use any existing perf based userspace tools. | |
93 | * | |
94 | * Also considering the Gen specific nature of the Observability hardware and | |
95 | * how userspace will sometimes need to combine i915 perf OA metrics with | |
96 | * side-band OA data captured via MI_REPORT_PERF_COUNT commands; we're | |
97 | * expecting the interface to be used by a platform specific userspace such as | |
98 | * OpenGL or tools. This is to say; we aren't inherently missing out on having | |
99 | * a standard vendor/architecture agnostic interface by not using perf. | |
100 | * | |
101 | * | |
102 | * For posterity, in case we might re-visit trying to adapt core perf to be | |
103 | * better suited to exposing i915 metrics these were the main pain points we | |
104 | * hit: | |
105 | * | |
106 | * - The perf based OA PMU driver broke some significant design assumptions: | |
107 | * | |
108 | * Existing perf pmus are used for profiling work on a cpu and we were | |
109 | * introducing the idea of _IS_DEVICE pmus with different security | |
110 | * implications, the need to fake cpu-related data (such as user/kernel | |
111 | * registers) to fit with perf's current design, and adding _DEVICE records | |
112 | * as a way to forward device-specific status records. | |
113 | * | |
114 | * The OA unit writes reports of counters into a circular buffer, without | |
115 | * involvement from the CPU, making our PMU driver the first of a kind. | |
116 | * | |
117 | * Given the way we were periodically forwarding data from the GPU-mapped, OA |
118 | * buffer to perf's buffer, those bursts of sample writes looked to perf like | |
119 | * we were sampling too fast and so we had to subvert its throttling checks. | |
120 | * | |
121 | * Perf supports groups of counters and allows those to be read via | |
122 | * transactions internally but transactions currently seem designed to be | |
123 | * explicitly initiated from the cpu (say in response to a userspace read()) | |
124 | * and while we could pull a report out of the OA buffer we can't | |
125 | * trigger a report from the cpu on demand. | |
126 | * | |
127 | * Related to being report based; the OA counters are configured in HW as a | |
128 | * set while perf generally expects counter configurations to be orthogonal. | |
129 | * Although counters can be associated with a group leader as they are | |
130 | * opened, there's no clear precedent for being able to provide group-wide | |
131 | * configuration attributes (for example we want to let userspace choose the | |
132 | * OA unit report format used to capture all counters in a set, or specify a | |
133 | * GPU context to filter metrics on). We avoided using perf's grouping | |
134 | * feature and forwarded OA reports to userspace via perf's 'raw' sample | |
135 | * field. This suited our userspace well considering how coupled the counters | |
136 | * are when dealing with normalizing. It would be inconvenient to split | |
137 | * counters up into separate events, only to require userspace to recombine | |
138 | * them. For Mesa it's also convenient to be forwarded raw, periodic reports | |
139 | * for combining with the side-band raw reports it captures using | |
140 | * MI_REPORT_PERF_COUNT commands. | |
141 | * | |
16d98b31 | 142 | * - As a side note on perf's grouping feature; there was also some concern |
7abbd8d6 RB |
143 | * that using PERF_FORMAT_GROUP as a way to pack together counter values |
144 | * would quite drastically inflate our sample sizes, which would likely | |
145 | * lower the effective sampling resolutions we could use when the available | |
146 | * memory bandwidth is limited. | |
147 | * | |
148 | * With the OA unit's report formats, counters are packed together as 32 | |
149 | * or 40bit values, with the largest report size being 256 bytes. | |
150 | * | |
151 | * PERF_FORMAT_GROUP values are 64bit, but there doesn't appear to be a | |
152 | * documented ordering to the values, implying PERF_FORMAT_ID must also be | |
153 | * used to add a 64bit ID before each value; giving 16 bytes per counter. | |
154 | * | |
155 | * Related to counter orthogonality; we can't time share the OA unit, while | |
156 | * event scheduling is a central design idea within perf for allowing | |
157 | * userspace to open + enable more events than can be configured in HW at any | |
158 | * one time. The OA unit is not designed to allow re-configuration while in | |
159 | * use. We can't reconfigure the OA unit without losing internal OA unit | |
160 | * state which we can't access explicitly to save and restore. Reconfiguring | |
161 | * the OA unit is also relatively slow, involving ~100 register writes. From | |
162 | * userspace Mesa also depends on a stable OA configuration when emitting | |
163 | * MI_REPORT_PERF_COUNT commands and importantly the OA unit can't be | |
164 | * disabled while there are outstanding MI_RPC commands lest we hang the | |
165 | * command streamer. | |
166 | * | |
167 | * The contents of sample records aren't extensible by device drivers (i.e. | |
168 | * the sample_type bits). As an example; Sourab Gupta had been looking to | |
169 | * attach GPU timestamps to our OA samples. We were shoehorning OA reports | |
170 | * into sample records by using the 'raw' field, but it's tricky to pack more | |
171 | * than one thing into this field because events/core.c currently only lets a | |
172 | * pmu give a single raw data pointer plus len which will be copied into the | |
173 | * ring buffer. To include more than the OA report we'd have to copy the | |
174 | * report into an intermediate larger buffer. I'd been considering allowing a | |
175 | * vector of data+len values to be specified for copying the raw data, but | |
176 | * it felt like a kludge to be using the raw field for this purpose. | |
177 | * | |
178 | * - It felt like our perf based PMU was making some technical compromises | |
179 | * just for the sake of using perf: | |
180 | * | |
181 | * perf_event_open() requires events to either relate to a pid or a specific | |
182 | * cpu core, while our device pmu related to neither. Events opened with a | |
183 | * pid will be automatically enabled/disabled according to the scheduling of | |
184 | * that process - so not appropriate for us. When an event is related to a | |
185 | * cpu id, perf ensures pmu methods will be invoked via an inter process | |
186 | * interrupt on that core. To avoid invasive changes our userspace opened OA | |
187 | * perf events for a specific cpu. This was workable but it meant the | |
188 | * majority of the OA driver ran in atomic context, including all OA report | |
189 | * forwarding, which wasn't really necessary in our case and seems to make | |
190 | * our locking requirements somewhat complex as we handled the interaction | |
191 | * with the rest of the i915 driver. | |
192 | */ | |
193 | ||
eec688e1 | 194 | #include <linux/anon_inodes.h> |
d7965152 | 195 | #include <linux/sizes.h> |
f89823c2 | 196 | #include <linux/uuid.h> |
eec688e1 | 197 | |
10be98a7 | 198 | #include "gem/i915_gem_context.h" |
b508d01f | 199 | #include "gem/i915_gem_internal.h" |
a5efcde6 | 200 | #include "gt/intel_engine_pm.h" |
202b1f4c | 201 | #include "gt/intel_engine_regs.h" |
9a61363a | 202 | #include "gt/intel_engine_user.h" |
70a2b431 | 203 | #include "gt/intel_execlists_submission.h" |
45233ab2 | 204 | #include "gt/intel_gpu_commands.h" |
daed3e44 | 205 | #include "gt/intel_gt.h" |
f170523a | 206 | #include "gt/intel_gt_clock_utils.h" |
ed6b25aa | 207 | #include "gt/intel_gt_mcr.h" |
0d6419e9 | 208 | #include "gt/intel_gt_regs.h" |
a0d3fdb6 | 209 | #include "gt/intel_lrc.h" |
dd4821ba | 210 | #include "gt/intel_lrc_reg.h" |
2871ea85 | 211 | #include "gt/intel_ring.h" |
01e74274 | 212 | #include "gt/uc/intel_guc_slpc.h" |
112ed2d3 | 213 | |
eec688e1 | 214 | #include "i915_drv.h" |
5472b3f2 | 215 | #include "i915_file_private.h" |
db94e9f1 | 216 | #include "i915_perf.h" |
2ef6d3bf | 217 | #include "i915_perf_oa_regs.h" |
801543b2 | 218 | #include "i915_reg.h" |
d7965152 | 219 | |
fe841686 JL |
220 | /* HW requires this to be a power of two, between 128k and 16M, though driver |
221 | * is currently generally designed assuming the largest 16M size is used such | |
222 | * that the overflow cases are unlikely in normal operation. | |
223 | */ | |
224 | #define OA_BUFFER_SIZE SZ_16M | |
225 | ||
226 | #define OA_TAKEN(tail, head) ((tail - head) & (OA_BUFFER_SIZE - 1)) | |
d7965152 | 227 | |
0dd860cf RB |
228 | /** |
229 | * DOC: OA Tail Pointer Race | |
230 | * | |
231 | * There's a HW race condition between OA unit tail pointer register updates and | |
d7965152 | 232 | * writes to memory whereby the tail pointer can sometimes get ahead of what's |
0dd860cf RB |
233 | * been written out to the OA buffer so far (in terms of what's visible to the |
234 | * CPU). | |
235 | * | |
236 | * Although this can be observed explicitly while copying reports to userspace | |
237 | * by checking for a zeroed report-id field in tail reports, we want to account | |
d1df41eb LL |
238 | * for this earlier, as part of the oa_buffer_check_unlocked to avoid lots of |
239 | * redundant read() attempts. | |
240 | * | |
241 | * We workaround this issue in oa_buffer_check_unlocked() by reading the reports | |
242 | * in the OA buffer, starting from the tail reported by the HW until we find a | |
243 | * report with its first 2 dwords not 0 meaning its previous report is | |
244 | * completely in memory and ready to be read. Those dwords are also set to 0 | |
245 | * once read and the whole buffer is cleared upon OA buffer initialization. The | |
246 | * first dword is the reason for this report while the second is the timestamp, | |
247 | * making the chances of having those 2 fields at 0 fairly unlikely. A more | |
248 | * detailed explanation is available in oa_buffer_check_unlocked(). | |
0dd860cf RB |
249 | * |
250 | * Most of the implementation details for this workaround are in | |
19f81df2 | 251 | * oa_buffer_check_unlocked() and _append_oa_reports() |
0dd860cf RB |
252 | * |
253 | * Note for posterity: previously the driver used to define an effective tail | |
254 | * pointer that lagged the real pointer by a 'tail margin' measured in bytes | |
255 | * derived from %OA_TAIL_MARGIN_NSEC and the configured sampling frequency. | |
256 | * This was flawed considering that the OA unit may also automatically generate | |
257 | * non-periodic reports (such as on context switch) or the OA unit may be | |
258 | * enabled without any periodic sampling. | |
d7965152 RB |
259 | */ |
260 | #define OA_TAIL_MARGIN_NSEC 100000ULL | |
0dd860cf | 261 | #define INVALID_TAIL_PTR 0xffffffff |
d7965152 | 262 | |
4ef10fe0 LL |
263 | /* The default frequency for checking whether the OA unit has written new |
264 | * reports to the circular OA buffer... | |
d7965152 | 265 | */ |
4ef10fe0 LL |
266 | #define DEFAULT_POLL_FREQUENCY_HZ 200 |
267 | #define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) | |
d7965152 | 268 | |
ccdf6341 | 269 | /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ |
ccdf6341 RB |
270 | static u32 i915_perf_stream_paranoid = true; |
271 | ||
d7965152 RB |
272 | /* The maximum exponent the hardware accepts is 63 (essentially it selects one |
273 | * of the 64bit timestamp bits to trigger reports from) but there's currently | |
274 | * no known use case for sampling as infrequently as once per 47 thousand years. | |
275 | * | |
276 | * Since the timestamps included in OA reports are only 32bits it seems | |
277 | * reasonable to limit the OA exponent where it's still possible to account for | |
278 | * overflow in OA report timestamps. | |
279 | */ | |
280 | #define OA_EXPONENT_MAX 31 | |
281 | ||
282 | #define INVALID_CTX_ID 0xffffffff | |
283 | ||
19f81df2 RB |
284 | /* On Gen8+ automatically triggered OA reports include a 'reason' field... */ |
285 | #define OAREPORT_REASON_MASK 0x3f | |
00a7f0d7 | 286 | #define OAREPORT_REASON_MASK_EXTENDED 0x7f |
19f81df2 RB |
287 | #define OAREPORT_REASON_SHIFT 19 |
288 | #define OAREPORT_REASON_TIMER (1<<0) | |
289 | #define OAREPORT_REASON_CTX_SWITCH (1<<3) | |
290 | #define OAREPORT_REASON_CLK_RATIO (1<<5) | |
291 | ||
2d9da585 | 292 | #define HAS_MI_SET_PREDICATE(i915) (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) |
d7965152 | 293 | |
00319ba0 RB |
294 | /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate |
295 | * | |
155e941f RB |
296 | * The highest sampling frequency we can theoretically program the OA unit |
297 | * with is always half the timestamp frequency: E.g. 6.25Mhz for Haswell. | |
298 | * | |
299 | * Initialized just before we register the sysctl parameter. | |
00319ba0 | 300 | */ |
155e941f | 301 | static int oa_sample_rate_hard_limit; |
00319ba0 RB |
302 | |
303 | /* Theoretically we can program the OA unit to sample every 160ns but don't | |
304 | * allow that by default unless root... | |
305 | * | |
306 | * The default threshold of 100000Hz is based on perf's similar | |
307 | * kernel.perf_event_max_sample_rate sysctl parameter. | |
308 | */ | |
309 | static u32 i915_oa_max_sample_rate = 100000; | |
310 | ||
d7965152 RB |
/* XXX: beware if future OA HW adds new report formats that the current
 * code assumes all reports have a power-of-two size and ~(size - 1) can
 * be used as a mask to align the OA tail pointer.
 */
/* Per-format { HW format field value, report size in bytes } table,
 * indexed by the uAPI enum drm_i915_oa_format values.
 */
static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
	[I915_OA_FORMAT_A13]	    = { 0, 64 },
	[I915_OA_FORMAT_A29]	    = { 1, 128 },
	[I915_OA_FORMAT_A13_B8_C8]  = { 2, 128 },
	/* A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size */
	[I915_OA_FORMAT_B4_C8]	    = { 4, 64 },
	[I915_OA_FORMAT_A45_B8_C8]  = { 5, 256 },
	[I915_OA_FORMAT_B4_C8_A16]  = { 6, 128 },
	[I915_OA_FORMAT_C4_B8]	    = { 7, 64 },
	[I915_OA_FORMAT_A12]		    = { 0, 64 },
	[I915_OA_FORMAT_A12_B8_C8]	    = { 2, 128 },
	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
	[I915_OA_FORMAT_A24u40_A14u32_B8_C8] = { 5, 256 },
};
330 | ||
d7965152 | 331 | #define SAMPLE_OA_REPORT (1<<0) |
eec688e1 | 332 | |
16d98b31 RB |
/**
 * struct perf_open_properties - for validated properties given to open a stream
 * @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
 * @single_context: Whether a single or all gpu contexts should be monitored
 * @hold_preemption: Whether the preemption is disabled for the filtered
 *                   context
 * @ctx_handle: A gem ctx handle for use with @single_context
 * @metrics_set: An ID for an OA unit metric set advertised via sysfs
 * @oa_format: An OA unit HW report format
 * @oa_periodic: Whether to enable periodic OA unit sampling
 * @oa_period_exponent: The OA unit sampling period is derived from this
 * @engine: The engine (typically rcs0) being monitored by the OA unit
 * @has_sseu: Whether @sseu was specified by userspace
 * @sseu: internal SSEU configuration computed either from the userspace
 *        specified configuration in the opening parameters or a default value
 *        (see get_default_sseu_config())
 * @poll_oa_period: The period in nanoseconds at which the CPU will check for OA
 *                  data availability
 *
 * As read_properties_unlocked() enumerates and validates the properties given
 * to open a stream of metrics the configuration is built up in the structure
 * which starts out zero initialized.
 */
struct perf_open_properties {
	u32 sample_flags;

	/* Single-bit flags packed into one u64 to keep the struct compact. */
	u64 single_context:1;
	u64 hold_preemption:1;
	u64 ctx_handle;

	/* OA sampling state */
	int metrics_set;
	int oa_format;
	bool oa_periodic;
	int oa_period_exponent;

	struct intel_engine_cs *engine;

	bool has_sseu;
	struct intel_sseu sseu;

	u64 poll_oa_period;
};
376 | ||
6a45008a LL |
/* Pairs an OA config with the GPU buffer object that encodes its register
 * programming.
 */
struct i915_oa_config_bo {
	/* llist linkage; the owning list is managed by callers (see
	 * free_oa_config_bo()) — NOTE(review): confirm list owner.
	 */
	struct llist_node node;

	/* The config this BO was baked from; a reference is held and
	 * dropped in free_oa_config_bo().
	 */
	struct i915_oa_config *oa_config;
	/* VMA backing the baked register writes; put on free. */
	struct i915_vma *vma;
};
383 | ||
3dc716fd VSD |
384 | static struct ctl_table_header *sysctl_header; |
385 | ||
a37f08a8 UNR |
386 | static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer); |
387 | ||
6a45008a | 388 | void i915_oa_config_release(struct kref *ref) |
f89823c2 | 389 | { |
6a45008a LL |
390 | struct i915_oa_config *oa_config = |
391 | container_of(ref, typeof(*oa_config), ref); | |
392 | ||
c2fba936 CW |
393 | kfree(oa_config->flex_regs); |
394 | kfree(oa_config->b_counter_regs); | |
395 | kfree(oa_config->mux_regs); | |
f89823c2 | 396 | |
6a45008a | 397 | kfree_rcu(oa_config, rcu); |
f89823c2 LL |
398 | } |
399 | ||
6a45008a LL |
/*
 * Look up the OA config registered under @metrics_set and take a
 * reference on it. Returns NULL when no config exists for that ID.
 *
 * The idr lookup and the reference grab happen inside one RCU read-side
 * critical section: since configs are released via kfree_rcu() (see
 * i915_oa_config_release()), a concurrently removed config cannot be
 * freed from under us before we take our reference.
 */
struct i915_oa_config *
i915_perf_get_oa_config(struct i915_perf *perf, int metrics_set)
{
	struct i915_oa_config *oa_config;

	rcu_read_lock();
	oa_config = idr_find(&perf->metrics_idr, metrics_set);
	if (oa_config)
		oa_config = i915_oa_config_get(oa_config);
	rcu_read_unlock();

	return oa_config;
}
f89823c2 | 413 | |
6a45008a LL |
414 | static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo) |
415 | { | |
416 | i915_oa_config_put(oa_bo->oa_config); | |
417 | i915_vma_put(oa_bo->vma); | |
418 | kfree(oa_bo); | |
f89823c2 LL |
419 | } |
420 | ||
00a7f0d7 LL |
421 | static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream) |
422 | { | |
423 | struct intel_uncore *uncore = stream->uncore; | |
424 | ||
425 | return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) & | |
426 | GEN12_OAG_OATAILPTR_MASK; | |
427 | } | |
428 | ||
a37f08a8 | 429 | static u32 gen8_oa_hw_tail_read(struct i915_perf_stream *stream) |
19f81df2 | 430 | { |
52111c46 | 431 | struct intel_uncore *uncore = stream->uncore; |
a37f08a8 | 432 | |
8f8b1171 | 433 | return intel_uncore_read(uncore, GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK; |
19f81df2 RB |
434 | } |
435 | ||
a37f08a8 | 436 | static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream) |
19f81df2 | 437 | { |
52111c46 | 438 | struct intel_uncore *uncore = stream->uncore; |
8f8b1171 | 439 | u32 oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1); |
19f81df2 RB |
440 | |
441 | return oastatus1 & GEN7_OASTATUS1_TAIL_MASK; | |
442 | } | |
443 | ||
dbc9a5fb UNR |
/* True when the stream's OA report format carries 64 bit report-id and
 * timestamp header fields rather than 32 bit ones.
 */
#define oa_report_header_64bit(__s) \
	((__s)->oa_buffer.format->header == HDR_64_BIT)
446 | ||
447 | static u64 oa_report_id(struct i915_perf_stream *stream, void *report) | |
448 | { | |
449 | return oa_report_header_64bit(stream) ? *(u64 *)report : *(u32 *)report; | |
450 | } | |
451 | ||
452 | static u64 oa_report_reason(struct i915_perf_stream *stream, void *report) | |
453 | { | |
454 | return (oa_report_id(stream, report) >> OAREPORT_REASON_SHIFT) & | |
455 | (GRAPHICS_VER(stream->perf->i915) == 12 ? | |
456 | OAREPORT_REASON_MASK_EXTENDED : | |
457 | OAREPORT_REASON_MASK); | |
458 | } | |
459 | ||
460 | static void oa_report_id_clear(struct i915_perf_stream *stream, u32 *report) | |
461 | { | |
462 | if (oa_report_header_64bit(stream)) | |
463 | *(u64 *)report = 0; | |
464 | else | |
465 | *report = 0; | |
466 | } | |
467 | ||
468 | static bool oa_report_ctx_invalid(struct i915_perf_stream *stream, void *report) | |
469 | { | |
470 | return !(oa_report_id(stream, report) & | |
471 | stream->perf->gen8_valid_ctx_bit) && | |
472 | GRAPHICS_VER(stream->perf->i915) <= 11; | |
473 | } | |
474 | ||
475 | static u64 oa_timestamp(struct i915_perf_stream *stream, void *report) | |
476 | { | |
477 | return oa_report_header_64bit(stream) ? | |
478 | *((u64 *)report + 1) : | |
479 | *((u32 *)report + 1); | |
480 | } | |
481 | ||
482 | static void oa_timestamp_clear(struct i915_perf_stream *stream, u32 *report) | |
483 | { | |
484 | if (oa_report_header_64bit(stream)) | |
485 | *(u64 *)&report[2] = 0; | |
486 | else | |
487 | report[1] = 0; | |
488 | } | |
489 | ||
490 | static u32 oa_context_id(struct i915_perf_stream *stream, u32 *report) | |
491 | { | |
492 | u32 ctx_id = oa_report_header_64bit(stream) ? report[4] : report[2]; | |
493 | ||
494 | return ctx_id & stream->specific_ctx_id_mask; | |
495 | } | |
496 | ||
497 | static void oa_context_id_squash(struct i915_perf_stream *stream, u32 *report) | |
498 | { | |
499 | if (oa_report_header_64bit(stream)) | |
500 | report[4] = INVALID_CTX_ID; | |
501 | else | |
502 | report[2] = INVALID_CTX_ID; | |
503 | } | |
504 | ||
0dd860cf | 505 | /** |
19f81df2 | 506 | * oa_buffer_check_unlocked - check for data and update tail ptr state |
a37f08a8 | 507 | * @stream: i915 stream instance |
d7965152 | 508 | * |
0dd860cf RB |
509 | * This is either called via fops (for blocking reads in user ctx) or the poll |
510 | * check hrtimer (atomic ctx) to check the OA buffer tail pointer and check | |
511 | * if there is data available for userspace to read. | |
d7965152 | 512 | * |
0dd860cf RB |
513 | * This function is central to providing a workaround for the OA unit tail |
514 | * pointer having a race with respect to what data is visible to the CPU. | |
515 | * It is responsible for reading tail pointers from the hardware and giving | |
516 | * the pointers time to 'age' before they are made available for reading. | |
517 | * (See description of OA_TAIL_MARGIN_NSEC above for further details.) | |
518 | * | |
519 | * Besides returning true when there is data available to read() this function | |
d1df41eb LL |
520 | * also updates the tail, aging_tail and aging_timestamp in the oa_buffer |
521 | * object. | |
0dd860cf RB |
522 | * |
523 | * Note: It's safe to read OA config state here unlocked, assuming that this is | |
524 | * only called while the stream is enabled, while the global OA configuration | |
525 | * can't be modified. | |
526 | * | |
527 | * Returns: %true if the OA buffer contains data, else %false | |
d7965152 | 528 | */ |
a37f08a8 | 529 | static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) |
d7965152 | 530 | { |
d1df41eb | 531 | u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); |
90981da6 | 532 | int report_size = stream->oa_buffer.format->size; |
0dd860cf | 533 | unsigned long flags; |
d16e137e | 534 | bool pollin; |
d1df41eb | 535 | u32 hw_tail; |
0dd860cf | 536 | u64 now; |
3c67ce06 | 537 | u32 partial_report_size; |
0dd860cf RB |
538 | |
539 | /* We have to consider the (unlikely) possibility that read() errors | |
d1df41eb LL |
540 | * could result in an OA buffer reset which might reset the head and |
541 | * tail state. | |
0dd860cf | 542 | */ |
a37f08a8 | 543 | spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); |
0dd860cf | 544 | |
8f8b1171 | 545 | hw_tail = stream->perf->ops.oa_hw_tail_read(stream); |
0dd860cf | 546 | |
3c67ce06 UNR |
547 | /* The tail pointer increases in 64 byte increments, not in report_size |
548 | * steps. Also the report size may not be a power of 2. Compute | |
549 | * potentially partially landed report in the OA buffer | |
0dd860cf | 550 | */ |
3c67ce06 UNR |
551 | partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail); |
552 | partial_report_size %= report_size; | |
553 | ||
554 | /* Subtract partial amount off the tail */ | |
555 | hw_tail = gtt_offset + OA_TAKEN(hw_tail, partial_report_size); | |
0dd860cf RB |
556 | |
557 | now = ktime_get_mono_fast_ns(); | |
558 | ||
d1df41eb LL |
559 | if (hw_tail == stream->oa_buffer.aging_tail && |
560 | (now - stream->oa_buffer.aging_timestamp) > OA_TAIL_MARGIN_NSEC) { | |
561 | /* If the HW tail hasn't move since the last check and the HW | |
562 | * tail has been aging for long enough, declare it the new | |
563 | * tail. | |
564 | */ | |
565 | stream->oa_buffer.tail = stream->oa_buffer.aging_tail; | |
566 | } else { | |
567 | u32 head, tail, aged_tail; | |
4117ebc7 | 568 | |
d1df41eb LL |
569 | /* NB: The head we observe here might effectively be a little |
570 | * out of date. If a read() is in progress, the head could be | |
571 | * anywhere between this head and stream->oa_buffer.tail. | |
572 | */ | |
573 | head = stream->oa_buffer.head - gtt_offset; | |
574 | aged_tail = stream->oa_buffer.tail - gtt_offset; | |
575 | ||
576 | hw_tail -= gtt_offset; | |
577 | tail = hw_tail; | |
578 | ||
dbc9a5fb UNR |
579 | /* Walk the stream backward until we find a report with report |
580 | * id and timestmap not at 0. Since the circular buffer pointers | |
581 | * progress by increments of 64 bytes and that reports can be up | |
582 | * to 256 bytes long, we can't tell whether a report has fully | |
583 | * landed in memory before the report id and timestamp of the | |
584 | * following report have effectively landed. | |
d1df41eb LL |
585 | * |
586 | * This is assuming that the writes of the OA unit land in | |
587 | * memory in the order they were written to. | |
588 | * If not : (╯°□°)╯︵ ┻━┻ | |
589 | */ | |
590 | while (OA_TAKEN(tail, aged_tail) >= report_size) { | |
dbc9a5fb | 591 | void *report = stream->oa_buffer.vaddr + tail; |
4117ebc7 | 592 | |
dbc9a5fb UNR |
593 | if (oa_report_id(stream, report) || |
594 | oa_timestamp(stream, report)) | |
d1df41eb | 595 | break; |
4117ebc7 | 596 | |
d1df41eb | 597 | tail = (tail - report_size) & (OA_BUFFER_SIZE - 1); |
0dd860cf | 598 | } |
d1df41eb LL |
599 | |
600 | if (OA_TAKEN(hw_tail, tail) > report_size && | |
601 | __ratelimit(&stream->perf->tail_pointer_race)) | |
a10234fd TU |
602 | drm_notice(&stream->uncore->i915->drm, |
603 | "unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n", | |
604 | head, tail, hw_tail); | |
d1df41eb LL |
605 | |
606 | stream->oa_buffer.tail = gtt_offset + tail; | |
607 | stream->oa_buffer.aging_tail = gtt_offset + hw_tail; | |
608 | stream->oa_buffer.aging_timestamp = now; | |
0dd860cf RB |
609 | } |
610 | ||
d16e137e LL |
611 | pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset, |
612 | stream->oa_buffer.head - gtt_offset) >= report_size; | |
613 | ||
a37f08a8 | 614 | spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); |
0dd860cf | 615 | |
d16e137e | 616 | return pollin; |
d7965152 RB |
617 | } |
618 | ||
619 | /** | |
16d98b31 RB |
620 | * append_oa_status - Appends a status record to a userspace read() buffer. |
621 | * @stream: An i915-perf stream opened for OA metrics | |
622 | * @buf: destination buffer given by userspace | |
623 | * @count: the number of bytes userspace wants to read | |
624 | * @offset: (inout): the current position for writing into @buf | |
625 | * @type: The kind of status to report to userspace | |
626 | * | |
627 | * Writes a status record (such as `DRM_I915_PERF_RECORD_OA_REPORT_LOST`) | |
628 | * into the userspace read() buffer. | |
629 | * | |
630 | * The @buf @offset will only be updated on success. | |
631 | * | |
632 | * Returns: 0 on success, negative error code on failure. | |
d7965152 RB |
633 | */ |
634 | static int append_oa_status(struct i915_perf_stream *stream, | |
635 | char __user *buf, | |
636 | size_t count, | |
637 | size_t *offset, | |
638 | enum drm_i915_perf_record_type type) | |
639 | { | |
640 | struct drm_i915_perf_record_header header = { type, 0, sizeof(header) }; | |
641 | ||
642 | if ((count - *offset) < header.size) | |
643 | return -ENOSPC; | |
644 | ||
645 | if (copy_to_user(buf + *offset, &header, sizeof(header))) | |
646 | return -EFAULT; | |
647 | ||
648 | (*offset) += header.size; | |
649 | ||
650 | return 0; | |
651 | } | |
652 | ||
/**
 * append_oa_sample - Copies single OA report into userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 * @report: A single OA report to (optionally) include as part of the sample
 *
 * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
 * properties when opening a stream, tracked as `stream->sample_flags`. This
 * function copies the requested components of a single sample to the given
 * read() @buf.
 *
 * The @buf @offset will only be updated on success.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int append_oa_sample(struct i915_perf_stream *stream,
			    char __user *buf,
			    size_t count,
			    size_t *offset,
			    const u8 *report)
{
	int report_size = stream->oa_buffer.format->size;
	struct drm_i915_perf_record_header header;
	int report_size_partial;
	u8 *oa_buf_end;

	/* Each sample is framed with a record header ahead of the payload. */
	header.type = DRM_I915_PERF_RECORD_SAMPLE;
	header.pad = 0;
	header.size = stream->sample_size;

	/* Reject up front if header + payload can't fit in the remaining space. */
	if ((count - *offset) < header.size)
		return -ENOSPC;

	buf += *offset;
	if (copy_to_user(buf, &header, sizeof(header)))
		return -EFAULT;
	buf += sizeof(header);

	/*
	 * @report points into the circular OA buffer; a report that starts
	 * near the end of the buffer may wrap, in which case it has to be
	 * copied out in two pieces: the tail end of the buffer followed by
	 * the remainder from the start of the buffer.
	 */
	oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
	report_size_partial = oa_buf_end - report;

	if (report_size_partial < report_size) {
		/* Wrapped report: copy the first fragment up to the buffer end... */
		if (copy_to_user(buf, report, report_size_partial))
			return -EFAULT;
		buf += report_size_partial;

		/* ...then the rest from the beginning of the buffer. */
		if (copy_to_user(buf, stream->oa_buffer.vaddr,
				 report_size - report_size_partial))
			return -EFAULT;
	} else if (copy_to_user(buf, report, report_size)) {
		return -EFAULT;
	}

	/* Only advance the caller's offset once the whole sample has landed. */
	(*offset) += header.size;

	return 0;
}
712 | ||
/**
 * gen8_append_oa_reports - Copies all buffered OA reports into
 *			    userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Notably any error condition resulting in a short read (-%ENOSPC or
 * -%EFAULT) will be returned even though one or more records may
 * have been successfully copied. In this case it's up to the caller
 * to decide if the error should be squashed before returning to
 * userspace.
 *
 * Note: reports are consumed from the head, and appended to the
 * tail, so the tail chases the head?... If you think that's mad
 * and back-to-front you're not alone, but this follows the
 * Gen PRM naming convention.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int gen8_append_oa_reports(struct i915_perf_stream *stream,
				  char __user *buf,
				  size_t count,
				  size_t *offset)
{
	struct intel_uncore *uncore = stream->uncore;
	int report_size = stream->oa_buffer.format->size;
	u8 *oa_buf_base = stream->oa_buffer.vaddr;
	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
	u32 mask = (OA_BUFFER_SIZE - 1);
	size_t start_offset = *offset;
	unsigned long flags;
	u32 head, tail;
	int ret = 0;

	if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled))
		return -EIO;

	/* Snapshot the head/tail pointers under the pointer lock; the copy
	 * loop itself then runs without the lock held.
	 */
	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

	head = stream->oa_buffer.head;
	tail = stream->oa_buffer.tail;

	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);

	/*
	 * NB: oa_buffer.head/tail include the gtt_offset which we don't want
	 * while indexing relative to oa_buf_base.
	 */
	head -= gtt_offset;
	tail -= gtt_offset;

	/*
	 * An out of bounds or misaligned head or tail pointer implies a driver
	 * bug since we validate + align the tail pointers we read from the
	 * hardware and we are in full control of the head pointer which should
	 * only be incremented by multiples of the report size.
	 */
	if (drm_WARN_ONCE(&uncore->i915->drm,
			  head > OA_BUFFER_SIZE ||
			  tail > OA_BUFFER_SIZE,
			  "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
			  head, tail))
		return -EIO;


	for (/* none */;
	     OA_TAKEN(tail, head);
	     head = (head + report_size) & mask) {
		u8 *report = oa_buf_base + head;
		u32 *report32 = (void *)report;
		u32 ctx_id;
		u64 reason;

		/*
		 * The reason field includes flags identifying what
		 * triggered this specific report (mostly timer
		 * triggered or e.g. due to a context switch).
		 *
		 * In MMIO triggered reports, some platforms do not set the
		 * reason bit in this field and it is valid to have a reason
		 * field of zero.
		 */
		reason = oa_report_reason(stream, report);
		ctx_id = oa_context_id(stream, report32);

		/*
		 * Squash whatever is in the CTX_ID field if it's marked as
		 * invalid to be sure we avoid false-positive, single-context
		 * filtering below...
		 *
		 * Note: that we don't clear the valid_ctx_bit so userspace can
		 * understand that the ID has been squashed by the kernel.
		 */
		if (oa_report_ctx_invalid(stream, report)) {
			ctx_id = INVALID_CTX_ID;
			oa_context_id_squash(stream, report32);
		}

		/*
		 * NB: For Gen 8 the OA unit no longer supports clock gating
		 * off for a specific context and the kernel can't securely
		 * stop the counters from updating as system-wide / global
		 * values.
		 *
		 * Automatic reports now include a context ID so reports can be
		 * filtered on the cpu but it's not worth trying to
		 * automatically subtract/hide counter progress for other
		 * contexts while filtering since we can't stop userspace
		 * issuing MI_REPORT_PERF_COUNT commands which would still
		 * provide a side-band view of the real values.
		 *
		 * To allow userspace (such as Mesa/GL_INTEL_performance_query)
		 * to normalize counters for a single filtered context then it
		 * needs to be forwarded bookend context-switch reports so that
		 * it can track switches in between MI_REPORT_PERF_COUNT
		 * commands and can itself subtract/ignore the progress of
		 * counters associated with other contexts. Note that the
		 * hardware automatically triggers reports when switching to a
		 * new context which are tagged with the ID of the newly active
		 * context. To avoid the complexity (and likely fragility) of
		 * reading ahead while parsing reports to try and minimize
		 * forwarding redundant context switch reports (i.e. between
		 * other, unrelated contexts) we simply elect to forward them
		 * all.
		 *
		 * We don't rely solely on the reason field to identify context
		 * switches since it's not-uncommon for periodic samples to
		 * identify a switch before any 'context switch' report.
		 */
		if (!stream->ctx ||
		    stream->specific_ctx_id == ctx_id ||
		    stream->oa_buffer.last_ctx_id == stream->specific_ctx_id ||
		    reason & OAREPORT_REASON_CTX_SWITCH) {

			/*
			 * While filtering for a single context we avoid
			 * leaking the IDs of other contexts.
			 */
			if (stream->ctx &&
			    stream->specific_ctx_id != ctx_id) {
				oa_context_id_squash(stream, report32);
			}

			ret = append_oa_sample(stream, buf, count, offset,
					       report);
			if (ret)
				break;

			stream->oa_buffer.last_ctx_id = ctx_id;
		}

		/*
		 * Clear out the report id and timestamp as a means to detect
		 * unlanded reports.
		 */
		oa_report_id_clear(stream, report32);
		oa_timestamp_clear(stream, report32);
	}

	if (start_offset != *offset) {
		i915_reg_t oaheadptr;

		/* Gen12 moved the OAG head pointer register. */
		oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ?
			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;

		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

		/*
		 * We removed the gtt_offset for the copy loop above, indexing
		 * relative to oa_buf_base so put back here...
		 */
		head += gtt_offset;

		/* Publish the new head both to the HW and to our own state. */
		intel_uncore_write(uncore, oaheadptr,
				   head & GEN12_OAG_OAHEADPTR_MASK);
		stream->oa_buffer.head = head;

		spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
	}

	return ret;
}
896 | ||
/**
 * gen8_oa_read - copy status records then buffered OA reports
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Checks OA unit status registers and if necessary appends corresponding
 * status records for userspace (such as for a buffer full condition) and then
 * initiate appending any buffered OA reports.
 *
 * Updates @offset according to the number of bytes successfully copied into
 * the userspace buffer.
 *
 * NB: some data may be successfully copied to the userspace buffer
 * even if an error is returned, and this is reflected in the
 * updated @offset.
 *
 * Returns: zero on success or a negative error code
 */
static int gen8_oa_read(struct i915_perf_stream *stream,
			char __user *buf,
			size_t count,
			size_t *offset)
{
	struct intel_uncore *uncore = stream->uncore;
	u32 oastatus;
	i915_reg_t oastatus_reg;
	int ret;

	if (drm_WARN_ON(&uncore->i915->drm, !stream->oa_buffer.vaddr))
		return -EIO;

	/* Gen12 moved the OAG status register; earlier gens use GEN8_OASTATUS. */
	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ?
		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;

	oastatus = intel_uncore_read(uncore, oastatus_reg);

	/*
	 * We treat OABUFFER_OVERFLOW as a significant error:
	 *
	 * Although theoretically we could handle this more gracefully
	 * sometimes, some Gens don't correctly suppress certain
	 * automatically triggered reports in this condition and so we
	 * have to assume that old reports are now being trampled
	 * over.
	 *
	 * Considering how we don't currently give userspace control
	 * over the OA buffer size and always configure a large 16MB
	 * buffer, then a buffer overflow does anyway likely indicate
	 * that something has gone quite badly wrong.
	 */
	if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
		/* Tell userspace reports were dropped before restarting. */
		ret = append_oa_status(stream, buf, count, offset,
				       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
		if (ret)
			return ret;

		drm_dbg(&stream->perf->i915->drm,
			"OA buffer overflow (exponent = %d): force restart\n",
			stream->period_exponent);

		stream->perf->ops.oa_disable(stream);
		stream->perf->ops.oa_enable(stream);

		/*
		 * Note: .oa_enable() is expected to re-init the oabuffer and
		 * reset GEN8_OASTATUS for us
		 */
		oastatus = intel_uncore_read(uncore, oastatus_reg);
	}

	if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
		ret = append_oa_status(stream, buf, count, offset,
				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
		if (ret)
			return ret;

		/*
		 * rmw(reg, clear, set): ack the counter-overflow and
		 * report-lost bits; on Gen8-11 the head/tail pointer-wrap
		 * bits are additionally set in the same write.
		 */
		intel_uncore_rmw(uncore, oastatus_reg,
				 GEN8_OASTATUS_COUNTER_OVERFLOW |
				 GEN8_OASTATUS_REPORT_LOST,
				 IS_GRAPHICS_VER(uncore->i915, 8, 11) ?
				 (GEN8_OASTATUS_HEAD_POINTER_WRAP |
				  GEN8_OASTATUS_TAIL_POINTER_WRAP) : 0);
	}

	return gen8_append_oa_reports(stream, buf, count, offset);
}
985 | ||
/**
 * gen7_append_oa_reports - Copies all buffered OA reports into
 *			    userspace read() buffer.
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Notably any error condition resulting in a short read (-%ENOSPC or
 * -%EFAULT) will be returned even though one or more records may
 * have been successfully copied. In this case it's up to the caller
 * to decide if the error should be squashed before returning to
 * userspace.
 *
 * Note: reports are consumed from the head, and appended to the
 * tail, so the tail chases the head?... If you think that's mad
 * and back-to-front you're not alone, but this follows the
 * Gen PRM naming convention.
 *
 * Returns: 0 on success, negative error code on failure.
 */
static int gen7_append_oa_reports(struct i915_perf_stream *stream,
				  char __user *buf,
				  size_t count,
				  size_t *offset)
{
	struct intel_uncore *uncore = stream->uncore;
	int report_size = stream->oa_buffer.format->size;
	u8 *oa_buf_base = stream->oa_buffer.vaddr;
	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
	u32 mask = (OA_BUFFER_SIZE - 1);
	size_t start_offset = *offset;
	unsigned long flags;
	u32 head, tail;
	int ret = 0;

	if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled))
		return -EIO;

	/* Snapshot the head/tail pointers under the pointer lock; the copy
	 * loop itself then runs without the lock held.
	 */
	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

	head = stream->oa_buffer.head;
	tail = stream->oa_buffer.tail;

	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);

	/* NB: oa_buffer.head/tail include the gtt_offset which we don't want
	 * while indexing relative to oa_buf_base.
	 */
	head -= gtt_offset;
	tail -= gtt_offset;

	/* An out of bounds or misaligned head or tail pointer implies a driver
	 * bug since we validate + align the tail pointers we read from the
	 * hardware and we are in full control of the head pointer which should
	 * only be incremented by multiples of the report size (notably also
	 * all a power of two).
	 */
	if (drm_WARN_ONCE(&uncore->i915->drm,
			  head > OA_BUFFER_SIZE || head % report_size ||
			  tail > OA_BUFFER_SIZE || tail % report_size,
			  "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
			  head, tail))
		return -EIO;


	for (/* none */;
	     OA_TAKEN(tail, head);
	     head = (head + report_size) & mask) {
		u8 *report = oa_buf_base + head;
		u32 *report32 = (void *)report;

		/* All the report sizes factor neatly into the buffer
		 * size so we never expect to see a report split
		 * between the beginning and end of the buffer.
		 *
		 * Given the initial alignment check a misalignment
		 * here would imply a driver bug that would result
		 * in an overrun.
		 */
		if (drm_WARN_ON(&uncore->i915->drm,
				(OA_BUFFER_SIZE - head) < report_size)) {
			drm_err(&uncore->i915->drm,
				"Spurious OA head ptr: non-integral report offset\n");
			break;
		}

		/* The report-ID field for periodic samples includes
		 * some undocumented flags related to what triggered
		 * the report and is never expected to be zero so we
		 * can check that the report isn't invalid before
		 * copying it to userspace...
		 */
		if (report32[0] == 0) {
			/* Rate-limited: a flood of these would spam the log. */
			if (__ratelimit(&stream->perf->spurious_report_rs))
				drm_notice(&uncore->i915->drm,
					   "Skipping spurious, invalid OA report\n");
			continue;
		}

		ret = append_oa_sample(stream, buf, count, offset, report);
		if (ret)
			break;

		/* Clear out the first 2 dwords as a mean to detect unlanded
		 * reports.
		 */
		report32[0] = 0;
		report32[1] = 0;
	}

	if (start_offset != *offset) {
		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

		/* We removed the gtt_offset for the copy loop above, indexing
		 * relative to oa_buf_base so put back here...
		 */
		head += gtt_offset;

		/* Publish the new head both to the HW and to our own state. */
		intel_uncore_write(uncore, GEN7_OASTATUS2,
				   (head & GEN7_OASTATUS2_HEAD_MASK) |
				   GEN7_OASTATUS2_MEM_SELECT_GGTT);
		stream->oa_buffer.head = head;

		spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
	}

	return ret;
}
1115 | ||
16d98b31 RB |
/**
 * gen7_oa_read - copy status records then buffered OA reports
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Checks Gen 7 specific OA unit status registers and if necessary appends
 * corresponding status records for userspace (such as for a buffer full
 * condition) and then initiate appending any buffered OA reports.
 *
 * Updates @offset according to the number of bytes successfully copied into
 * the userspace buffer.
 *
 * Returns: zero on success or a negative error code
 */
static int gen7_oa_read(struct i915_perf_stream *stream,
			char __user *buf,
			size_t count,
			size_t *offset)
{
	struct intel_uncore *uncore = stream->uncore;
	u32 oastatus1;
	int ret;

	if (drm_WARN_ON(&uncore->i915->drm, !stream->oa_buffer.vaddr))
		return -EIO;

	oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);

	/* XXX: On Haswell we don't have a safe way to clear oastatus1
	 * bits while the OA unit is enabled (while the tail pointer
	 * may be updated asynchronously) so we ignore status bits
	 * that have already been reported to userspace.
	 */
	oastatus1 &= ~stream->perf->gen7_latched_oastatus1;

	/* We treat OABUFFER_OVERFLOW as a significant error:
	 *
	 * - The status can be interpreted to mean that the buffer is
	 *   currently full (with a higher precedence than OA_TAKEN()
	 *   which will start to report a near-empty buffer after an
	 *   overflow) but it's awkward that we can't clear the status
	 *   on Haswell, so without a reset we won't be able to catch
	 *   the state again.
	 *
	 * - Since it also implies the HW has started overwriting old
	 *   reports it may also affect our sanity checks for invalid
	 *   reports when copying to userspace that assume new reports
	 *   are being written to cleared memory.
	 *
	 * - In the future we may want to introduce a flight recorder
	 *   mode where the driver will automatically maintain a safe
	 *   guard band between head/tail, avoiding this overflow
	 *   condition, but we avoid the added driver complexity for
	 *   now.
	 */
	if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
		/* Tell userspace reports were dropped before restarting. */
		ret = append_oa_status(stream, buf, count, offset,
				       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
		if (ret)
			return ret;

		drm_dbg(&stream->perf->i915->drm,
			"OA buffer overflow (exponent = %d): force restart\n",
			stream->period_exponent);

		/* The disable/enable cycle re-initializes the OA buffer. */
		stream->perf->ops.oa_disable(stream);
		stream->perf->ops.oa_enable(stream);

		oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
	}

	if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
		ret = append_oa_status(stream, buf, count, offset,
				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
		if (ret)
			return ret;

		/* Latch the bit so we don't report the same loss twice. */
		stream->perf->gen7_latched_oastatus1 |=
			GEN7_OASTATUS1_REPORT_LOST;
	}

	return gen7_append_oa_reports(stream, buf, count, offset);
}
1200 | ||
16d98b31 RB |
1201 | /** |
1202 | * i915_oa_wait_unlocked - handles blocking IO until OA data available | |
1203 | * @stream: An i915-perf stream opened for OA metrics | |
1204 | * | |
1205 | * Called when userspace tries to read() from a blocking stream FD opened | |
1206 | * for OA metrics. It waits until the hrtimer callback finds a non-empty | |
1207 | * OA buffer and wakes us. | |
1208 | * | |
1209 | * Note: it's acceptable to have this return with some false positives | |
1210 | * since any subsequent read handling will return -EAGAIN if there isn't | |
1211 | * really data ready for userspace yet. | |
1212 | * | |
1213 | * Returns: zero on success or a negative error code | |
1214 | */ | |
d7965152 RB |
1215 | static int i915_oa_wait_unlocked(struct i915_perf_stream *stream) |
1216 | { | |
d7965152 | 1217 | /* We would wait indefinitely if periodic sampling is not enabled */ |
a37f08a8 | 1218 | if (!stream->periodic) |
d7965152 RB |
1219 | return -EIO; |
1220 | ||
a37f08a8 UNR |
1221 | return wait_event_interruptible(stream->poll_wq, |
1222 | oa_buffer_check_unlocked(stream)); | |
d7965152 RB |
1223 | } |
1224 | ||
16d98b31 RB |
/**
 * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
 * @stream: An i915-perf stream opened for OA metrics
 * @file: An i915 perf stream file
 * @wait: poll() state table
 *
 * For handling userspace polling on an i915 perf stream opened for OA metrics,
 * this starts a poll_wait with the wait queue that our hrtimer callback wakes
 * when it sees data ready to read in the circular OA buffer.
 */
static void i915_oa_poll_wait(struct i915_perf_stream *stream,
			      struct file *file,
			      poll_table *wait)
{
	/* Wakeups on poll_wq come from the periodic OA buffer check. */
	poll_wait(file, &stream->poll_wq, wait);
}
1241 | ||
16d98b31 RB |
/**
 * i915_oa_read - just calls through to &i915_oa_ops->read
 * @stream: An i915-perf stream opened for OA metrics
 * @buf: destination buffer given by userspace
 * @count: the number of bytes userspace wants to read
 * @offset: (inout): the current position for writing into @buf
 *
 * Updates @offset according to the number of bytes successfully copied into
 * the userspace buffer.
 *
 * Returns: zero on success or a negative error code
 */
static int i915_oa_read(struct i915_perf_stream *stream,
			char __user *buf,
			size_t count,
			size_t *offset)
{
	/*
	 * Dispatch to the platform-specific read hook (presumably one of
	 * gen7_oa_read()/gen8_oa_read() above — assignment not visible here).
	 */
	return stream->perf->ops.read(stream, buf, count, offset);
}
1261 | ||
a37f08a8 | 1262 | static struct intel_context *oa_pin_context(struct i915_perf_stream *stream) |
61d5676b | 1263 | { |
5e2a0419 | 1264 | struct i915_gem_engines_iter it; |
a37f08a8 | 1265 | struct i915_gem_context *ctx = stream->ctx; |
61d5676b | 1266 | struct intel_context *ce; |
f00ecc2e ML |
1267 | struct i915_gem_ww_ctx ww; |
1268 | int err = -ENODEV; | |
61d5676b | 1269 | |
5e2a0419 | 1270 | for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) { |
9a61363a | 1271 | if (ce->engine != stream->engine) /* first match! */ |
5e2a0419 CW |
1272 | continue; |
1273 | ||
f00ecc2e ML |
1274 | err = 0; |
1275 | break; | |
fa9f6681 | 1276 | } |
5e2a0419 | 1277 | i915_gem_context_unlock_engines(ctx); |
61d5676b | 1278 | |
f00ecc2e ML |
1279 | if (err) |
1280 | return ERR_PTR(err); | |
1281 | ||
1282 | i915_gem_ww_ctx_init(&ww, true); | |
1283 | retry: | |
1284 | /* | |
1285 | * As the ID is the gtt offset of the context's vma we | |
1286 | * pin the vma to ensure the ID remains fixed. | |
1287 | */ | |
1288 | err = intel_context_pin_ww(ce, &ww); | |
1289 | if (err == -EDEADLK) { | |
1290 | err = i915_gem_ww_ctx_backoff(&ww); | |
1291 | if (!err) | |
1292 | goto retry; | |
1293 | } | |
1294 | i915_gem_ww_ctx_fini(&ww); | |
1295 | ||
1296 | if (err) | |
1297 | return ERR_PTR(err); | |
1298 | ||
1299 | stream->pinned_ctx = ce; | |
a37f08a8 | 1300 | return stream->pinned_ctx; |
61d5676b LL |
1301 | } |
1302 | ||
682aa437 UNR |
1303 | static int |
1304 | __store_reg_to_mem(struct i915_request *rq, i915_reg_t reg, u32 ggtt_offset) | |
1305 | { | |
1306 | u32 *cs, cmd; | |
1307 | ||
1308 | cmd = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; | |
1309 | if (GRAPHICS_VER(rq->engine->i915) >= 8) | |
1310 | cmd++; | |
1311 | ||
1312 | cs = intel_ring_begin(rq, 4); | |
1313 | if (IS_ERR(cs)) | |
1314 | return PTR_ERR(cs); | |
1315 | ||
1316 | *cs++ = cmd; | |
1317 | *cs++ = i915_mmio_reg_offset(reg); | |
1318 | *cs++ = ggtt_offset; | |
1319 | *cs++ = 0; | |
1320 | ||
1321 | intel_ring_advance(rq, cs); | |
1322 | ||
1323 | return 0; | |
1324 | } | |
1325 | ||
1326 | static int | |
1327 | __read_reg(struct intel_context *ce, i915_reg_t reg, u32 ggtt_offset) | |
1328 | { | |
1329 | struct i915_request *rq; | |
1330 | int err; | |
1331 | ||
1332 | rq = i915_request_create(ce); | |
1333 | if (IS_ERR(rq)) | |
1334 | return PTR_ERR(rq); | |
1335 | ||
1336 | i915_request_get(rq); | |
1337 | ||
1338 | err = __store_reg_to_mem(rq, reg, ggtt_offset); | |
1339 | ||
1340 | i915_request_add(rq); | |
1341 | if (!err && i915_request_wait(rq, 0, HZ / 2) < 0) | |
1342 | err = -ETIME; | |
1343 | ||
1344 | i915_request_put(rq); | |
1345 | ||
1346 | return err; | |
1347 | } | |
1348 | ||
/*
 * gen12_guc_sw_ctx_id - read the GuC-assigned sw context id for a context
 * @ce: pinned context whose id to query
 * @ctx_id: out parameter receiving the upper dword of EXECLIST_STATUS
 *
 * Allocates a small pinned GGTT scratch buffer, has the GPU store
 * RING_EXECLIST_STATUS_HI there (see __read_reg()), then reads the value
 * back through a CPU mapping.
 *
 * Returns: 0 on success or a negative error code.
 */
static int
gen12_guc_sw_ctx_id(struct intel_context *ce, u32 *ctx_id)
{
	struct i915_vma *scratch;
	u32 *val;
	int err;

	scratch = __vm_create_scratch_for_read_pinned(&ce->engine->gt->ggtt->vm, 4);
	if (IS_ERR(scratch))
		return PTR_ERR(scratch);

	/* Make sure the scratch binding is complete before the GPU writes it */
	err = i915_vma_sync(scratch);
	if (err)
		goto err_scratch;

	err = __read_reg(ce, RING_EXECLIST_STATUS_HI(ce->engine->mmio_base),
			 i915_ggtt_offset(scratch));
	if (err)
		goto err_scratch;

	val = i915_gem_object_pin_map_unlocked(scratch->obj, I915_MAP_WB);
	if (IS_ERR(val)) {
		err = PTR_ERR(val);
		goto err_scratch;
	}

	*ctx_id = *val;
	i915_gem_object_unpin_map(scratch->obj);

err_scratch:
	i915_vma_unpin_and_release(&scratch, 0);
	return err;
}
1382 | ||
/*
 * For execlist mode of submission, pick an unused context id
 * 0 - (NUM_CONTEXT_TAG - 1) are used by other contexts
 * XXX_MAX_CONTEXT_HW_ID is used by idle context
 *
 * For GuC mode of submission read context id from the upper dword of the
 * EXECLIST_STATUS register. Note that we read this value only once and expect
 * that the value stays fixed for the entire OA use case. There are cases where
 * GuC KMD implementation may deregister a context to reuse it's context id, but
 * we prevent that from happening to the OA context by pinning it.
 */
static int gen12_get_render_context_id(struct i915_perf_stream *stream)
{
	u32 ctx_id, mask;
	int ret;

	if (intel_engine_uses_guc(stream->engine)) {
		ret = gen12_guc_sw_ctx_id(stream->pinned_ctx, &ctx_id);
		if (ret)
			return ret;

		/*
		 * The sw ctx id lives in the upper dword of the lrc
		 * descriptor, hence the "- 32" on the shift below.
		 */
		mask = ((1U << GEN12_GUC_SW_CTX_ID_WIDTH) - 1) <<
			(GEN12_GUC_SW_CTX_ID_SHIFT - 32);
	} else if (GRAPHICS_VER_FULL(stream->engine->i915) >= IP_VER(12, 50)) {
		/* XeHP execlists: claim the highest-but-one sw ctx id */
		ctx_id = (XEHP_MAX_CONTEXT_HW_ID - 1) <<
			(XEHP_SW_CTX_ID_SHIFT - 32);

		mask = ((1U << XEHP_SW_CTX_ID_WIDTH) - 1) <<
			(XEHP_SW_CTX_ID_SHIFT - 32);
	} else {
		/* Gen11/Gen12 execlists */
		ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) <<
			 (GEN11_SW_CTX_ID_SHIFT - 32);

		mask = ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) <<
			(GEN11_SW_CTX_ID_SHIFT - 32);
	}
	stream->specific_ctx_id = ctx_id & mask;
	stream->specific_ctx_id_mask = mask;

	return 0;
}
1424 | ||
a5c3a3cb UNR |
1425 | static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) |
1426 | { | |
1427 | u32 idx = *offset; | |
1428 | u32 len = min(MI_LRI_LEN(state[idx]) + idx, end); | |
1429 | bool found = false; | |
1430 | ||
1431 | idx++; | |
1432 | for (; idx < len; idx += 2) { | |
1433 | if (state[idx] == reg) { | |
1434 | found = true; | |
1435 | break; | |
1436 | } | |
1437 | } | |
1438 | ||
1439 | *offset = idx; | |
1440 | return found; | |
1441 | } | |
1442 | ||
/*
 * oa_context_image_offset - locate a register's dword offset in a context image
 * @ce: context whose lrc_reg_state image to search
 * @reg: register offset to locate
 *
 * Walks the HW context image dword by dword, descending into each MI_LRI
 * packet (via oa_find_reg_in_lri()) to look for @reg among the programmed
 * (address, value) pairs.
 *
 * Returns: the dword offset of @reg inside the image, or U32_MAX if the
 * register was not found or the image is not mapped.
 */
static u32 oa_context_image_offset(struct intel_context *ce, u32 reg)
{
	u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / 4;
	u32 *state = ce->lrc_reg_state;

	if (drm_WARN_ON(&ce->engine->i915->drm, !state))
		return U32_MAX;

	for (offset = 0; offset < len; ) {
		if (IS_MI_LRI_CMD(state[offset])) {
			/*
			 * We expect reg-value pairs in MI_LRI command, so
			 * MI_LRI_LEN() should be even, if not, issue a warning.
			 */
			drm_WARN_ON(&ce->engine->i915->drm,
				    MI_LRI_LEN(state[offset]) & 0x1);

			if (oa_find_reg_in_lri(state, reg, &offset, len))
				break;
		} else {
			offset++;
		}
	}

	return offset < len ? offset : U32_MAX;
}
1469 | ||
1470 | static int set_oa_ctx_ctrl_offset(struct intel_context *ce) | |
1471 | { | |
1472 | i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base); | |
1473 | struct i915_perf *perf = &ce->engine->i915->perf; | |
1474 | u32 offset = perf->ctx_oactxctrl_offset; | |
1475 | ||
1476 | /* Do this only once. Failure is stored as offset of U32_MAX */ | |
1477 | if (offset) | |
1478 | goto exit; | |
1479 | ||
1480 | offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg)); | |
1481 | perf->ctx_oactxctrl_offset = offset; | |
1482 | ||
1483 | drm_dbg(&ce->engine->i915->drm, | |
1484 | "%s oa ctx control at 0x%08x dword offset\n", | |
1485 | ce->engine->name, offset); | |
1486 | ||
1487 | exit: | |
1488 | return offset && offset != U32_MAX ? 0 : -ENODEV; | |
1489 | } | |
1490 | ||
/* Only the render engine supports the MI-query based OA context programming. */
static bool engine_supports_mi_query(struct intel_engine_cs *engine)
{
	return engine->class == RENDER_CLASS;
}
1495 | ||
16d98b31 RB |
/**
 * oa_get_render_ctx_id - determine and hold ctx hw id
 * @stream: An i915-perf stream opened for OA metrics
 *
 * Determine the render context hw id, and ensure it remains fixed for the
 * lifetime of the stream. This ensures that we don't have to worry about
 * updating the context ID in OACONTROL on the fly.
 *
 * Returns: zero on success or a negative error code
 */
static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
{
	struct intel_context *ce;
	int ret = 0;

	/* Pinning keeps the context (and thus its id) stable for the stream */
	ce = oa_pin_context(stream);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	if (engine_supports_mi_query(stream->engine) &&
	    HAS_LOGICAL_RING_CONTEXTS(stream->perf->i915)) {
		/*
		 * We are enabling perf query here. If we don't find the context
		 * offset here, just return an error.
		 */
		ret = set_oa_ctx_ctrl_offset(ce);
		if (ret) {
			intel_context_unpin(ce);
			drm_err(&stream->perf->i915->drm,
				"Enabling perf query failed for %s\n",
				stream->engine->name);
			return ret;
		}
	}

	/* The ctx id layout in OA reports differs per graphics generation */
	switch (GRAPHICS_VER(ce->engine->i915)) {
	case 7: {
		/*
		 * On Haswell we don't do any post processing of the reports
		 * and don't need to use the mask.
		 */
		stream->specific_ctx_id = i915_ggtt_offset(ce->state);
		stream->specific_ctx_id_mask = 0;
		break;
	}

	case 8:
	case 9:
		if (intel_engine_uses_guc(ce->engine)) {
			/*
			 * When using GuC, the context descriptor we write in
			 * i915 is read by GuC and rewritten before it's
			 * actually written into the hardware. The LRCA is
			 * what is put into the context id field of the
			 * context descriptor by GuC. Because it's aligned to
			 * a page, the lower 12bits are always at 0 and
			 * dropped by GuC. They won't be part of the context
			 * ID in the OA reports, so squash those lower bits.
			 */
			stream->specific_ctx_id = ce->lrc.lrca >> 12;

			/*
			 * GuC uses the top bit to signal proxy submission, so
			 * ignore that bit.
			 */
			stream->specific_ctx_id_mask =
				(1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
		} else {
			stream->specific_ctx_id_mask =
				(1U << GEN8_CTX_ID_WIDTH) - 1;
			stream->specific_ctx_id = stream->specific_ctx_id_mask;
		}
		break;

	case 11:
	case 12:
		ret = gen12_get_render_context_id(stream);
		break;

	default:
		MISSING_CASE(GRAPHICS_VER(ce->engine->i915));
	}

	/* Tag the context so OA reports carry the id we just chose */
	ce->tag = stream->specific_ctx_id;

	drm_dbg(&stream->perf->i915->drm,
		"filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
		stream->specific_ctx_id,
		stream->specific_ctx_id_mask);

	return ret;
}
1588 | ||
16d98b31 RB |
1589 | /** |
1590 | * oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id releases hold | |
1591 | * @stream: An i915-perf stream opened for OA metrics | |
1592 | * | |
1593 | * In case anything needed doing to ensure the context HW ID would remain valid | |
1594 | * for the lifetime of the stream, then that can be undone here. | |
1595 | */ | |
d7965152 RB |
1596 | static void oa_put_render_ctx_id(struct i915_perf_stream *stream) |
1597 | { | |
1fc44d9b | 1598 | struct intel_context *ce; |
d7965152 | 1599 | |
a37f08a8 | 1600 | ce = fetch_and_zero(&stream->pinned_ctx); |
2935ed53 CW |
1601 | if (ce) { |
1602 | ce->tag = 0; /* recomputed on next submission after parking */ | |
1fc44d9b | 1603 | intel_context_unpin(ce); |
2935ed53 CW |
1604 | } |
1605 | ||
1606 | stream->specific_ctx_id = INVALID_CTX_ID; | |
1607 | stream->specific_ctx_id_mask = 0; | |
d7965152 RB |
1608 | } |
1609 | ||
/*
 * Release the OA buffer: unpin the GGTT binding and, via
 * I915_VMA_RELEASE_MAP, drop the CPU mapping along with the vma.
 */
static void
free_oa_buffer(struct i915_perf_stream *stream)
{
	i915_vma_unpin_and_release(&stream->oa_buffer.vma,
				   I915_VMA_RELEASE_MAP);

	stream->oa_buffer.vaddr = NULL;
}
1618 | ||
6a45008a LL |
/*
 * Drop the stream's reference on its OA config and free every config
 * batch-buffer object queued on the stream's lock-free list.
 */
static void
free_oa_configs(struct i915_perf_stream *stream)
{
	struct i915_oa_config_bo *oa_bo, *tmp;

	i915_oa_config_put(stream->oa_config);
	llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
		free_oa_config_bo(oa_bo);
}
1628 | ||
daed3e44 LL |
/* Release the NOA wait batch buffer created by alloc_noa_wait(). */
static void
free_noa_wait(struct i915_perf_stream *stream)
{
	i915_vma_unpin_and_release(&stream->noa_wait, 0);
}
1634 | ||
5f284e9c UNR |
1635 | static bool engine_supports_oa(const struct intel_engine_cs *engine) |
1636 | { | |
1637 | return engine->oa_group; | |
1638 | } | |
1639 | ||
d7965152 RB |
/*
 * i915_oa_stream_destroy - tear down an OA stream
 * @stream: the stream being destroyed
 *
 * Releases everything i915_oa_stream_init() set up. The ordering below is
 * deliberate: the exclusive_stream pointer is cleared before disabling the
 * metric set, and forcewake/engine-pm are dropped only after the HW is quiesced.
 */
static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
{
	struct i915_perf *perf = stream->perf;
	struct intel_gt *gt = stream->engine->gt;
	struct i915_perf_group *g = stream->engine->oa_group;

	if (WARN_ON(stream != g->exclusive_stream))
		return;

	/*
	 * Unset exclusive_stream first, it will be checked while disabling
	 * the metric set on gen8+.
	 *
	 * See i915_oa_init_reg_state() and lrc_configure_all_contexts()
	 */
	WRITE_ONCE(g->exclusive_stream, NULL);
	perf->ops.disable_metric_set(stream);

	free_oa_buffer(stream);

	/*
	 * Wa_16011777198:dg2: Unset the override of GUCRC mode to enable rc6.
	 */
	if (stream->override_gucrc)
		drm_WARN_ON(&gt->i915->drm,
			    intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc));

	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
	intel_engine_pm_put(stream->engine);

	if (stream->ctx)
		oa_put_render_ctx_id(stream);

	free_oa_configs(stream);
	free_noa_wait(stream);

	/* Report (once, at teardown) how many spurious-report logs we dropped */
	if (perf->spurious_report_rs.missed) {
		drm_notice(&gt->i915->drm,
			   "%d spurious OA report notices suppressed due to ratelimiting\n",
			   perf->spurious_report_rs.missed);
	}
}
1682 | ||
/*
 * gen7_init_oa_buffer - program the Haswell/gen7 OA buffer registers
 * @stream: stream owning the already-allocated OA buffer
 *
 * Points OASTATUS2 (head), OABUFFER and OASTATUS1 (tail) at the buffer's
 * GGTT address and resets the software head/tail tracking. Called under the
 * oa_buffer.ptr_lock so readers never observe a half-updated state.
 */
static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
{
	struct intel_uncore *uncore = stream->uncore;
	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
	unsigned long flags;

	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

	/* Pre-DevBDW: OABUFFER must be set with counters off,
	 * before OASTATUS1, but after OASTATUS2
	 */
	intel_uncore_write(uncore, GEN7_OASTATUS2, /* head */
			   gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT);
	stream->oa_buffer.head = gtt_offset;

	intel_uncore_write(uncore, GEN7_OABUFFER, gtt_offset);

	intel_uncore_write(uncore, GEN7_OASTATUS1, /* tail */
			   gtt_offset | OABUFFER_SIZE_16M);

	/* Mark that we need updated tail pointers to read from... */
	stream->oa_buffer.aging_tail = INVALID_TAIL_PTR;
	stream->oa_buffer.tail = gtt_offset;

	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);

	/* On Haswell we have to track which OASTATUS1 flags we've
	 * already seen since they can't be cleared while periodic
	 * sampling is enabled.
	 */
	stream->perf->gen7_latched_oastatus1 = 0;

	/* NB: although the OA buffer will initially be allocated
	 * zeroed via shmfs (and so this memset is redundant when
	 * first allocating), we may re-init the OA buffer, either
	 * when re-enabling a stream or in error/reset paths.
	 *
	 * The reason we clear the buffer for each re-init is for the
	 * sanity check in gen7_append_oa_reports() that looks at the
	 * report-id field to make sure it's non-zero which relies on
	 * the assumption that new reports are being written to zeroed
	 * memory...
	 */
	memset(stream->oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
}
1728 | ||
/*
 * gen8_init_oa_buffer - program the gen8+ OA buffer registers
 * @stream: stream owning the already-allocated OA buffer
 *
 * Writes OAHEADPTR, OABUFFER and OATAILPTR (in the order the PRM requires)
 * and resets the software head/tail tracking under oa_buffer.ptr_lock.
 */
static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
{
	struct intel_uncore *uncore = stream->uncore;
	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
	unsigned long flags;

	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);

	intel_uncore_write(uncore, GEN8_OASTATUS, 0);
	intel_uncore_write(uncore, GEN8_OAHEADPTR, gtt_offset);
	stream->oa_buffer.head = gtt_offset;

	intel_uncore_write(uncore, GEN8_OABUFFER_UDW, 0);

	/*
	 * PRM says:
	 *
	 *  "This MMIO must be set before the OATAILPTR
	 *  register and after the OAHEADPTR register. This is
	 *  to enable proper functionality of the overflow
	 *  bit."
	 */
	intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
		   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
	intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);

	/* Mark that we need updated tail pointers to read from... */
	stream->oa_buffer.aging_tail = INVALID_TAIL_PTR;
	stream->oa_buffer.tail = gtt_offset;

	/*
	 * Reset state used to recognise context switches, affecting which
	 * reports we will forward to userspace while filtering for a single
	 * context.
	 */
	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;

	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);

	/*
	 * NB: although the OA buffer will initially be allocated
	 * zeroed via shmfs (and so this memset is redundant when
	 * first allocating), we may re-init the OA buffer, either
	 * when re-enabling a stream or in error/reset paths.
	 *
	 * The reason we clear the buffer for each re-init is for the
	 * sanity check in gen8_append_oa_reports() that looks at the
	 * reason field to make sure it's non-zero which relies on
	 * the assumption that new reports are being written to zeroed
	 * memory...
	 */
	memset(stream->oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
}
1782 | ||
00a7f0d7 LL |
1783 | static void gen12_init_oa_buffer(struct i915_perf_stream *stream) |
1784 | { | |
1785 | struct intel_uncore *uncore = stream->uncore; | |
1786 | u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); | |
1787 | unsigned long flags; | |
1788 | ||
1789 | spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); | |
1790 | ||
1791 | intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0); | |
1792 | intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR, | |
1793 | gtt_offset & GEN12_OAG_OAHEADPTR_MASK); | |
1794 | stream->oa_buffer.head = gtt_offset; | |
1795 | ||
1796 | /* | |
1797 | * PRM says: | |
1798 | * | |
1799 | * "This MMIO must be set before the OATAILPTR | |
1800 | * register and after the OAHEADPTR register. This is | |
1801 | * to enable proper functionality of the overflow | |
1802 | * bit." | |
1803 | */ | |
1804 | intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset | | |
1805 | OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT); | |
1806 | intel_uncore_write(uncore, GEN12_OAG_OATAILPTR, | |
1807 | gtt_offset & GEN12_OAG_OATAILPTR_MASK); | |
1808 | ||
1809 | /* Mark that we need updated tail pointers to read from... */ | |
d1df41eb LL |
1810 | stream->oa_buffer.aging_tail = INVALID_TAIL_PTR; |
1811 | stream->oa_buffer.tail = gtt_offset; | |
00a7f0d7 LL |
1812 | |
1813 | /* | |
1814 | * Reset state used to recognise context switches, affecting which | |
1815 | * reports we will forward to userspace while filtering for a single | |
1816 | * context. | |
1817 | */ | |
1818 | stream->oa_buffer.last_ctx_id = INVALID_CTX_ID; | |
1819 | ||
1820 | spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); | |
1821 | ||
1822 | /* | |
1823 | * NB: although the OA buffer will initially be allocated | |
1824 | * zeroed via shmfs (and so this memset is redundant when | |
1825 | * first allocating), we may re-init the OA buffer, either | |
1826 | * when re-enabling a stream or in error/reset paths. | |
1827 | * | |
1828 | * The reason we clear the buffer for each re-init is for the | |
1829 | * sanity check in gen8_append_oa_reports() that looks at the | |
1830 | * reason field to make sure it's non-zero which relies on | |
1831 | * the assumption that new reports are being written to zeroed | |
1832 | * memory... | |
1833 | */ | |
1834 | memset(stream->oa_buffer.vaddr, 0, | |
1835 | stream->oa_buffer.vma->size); | |
00a7f0d7 LL |
1836 | } |
1837 | ||
/*
 * alloc_oa_buffer - allocate, pin and map the OA report buffer
 * @stream: stream the buffer belongs to
 *
 * Creates an OA_BUFFER_SIZE shmem object, binds it high in the GGTT with
 * 16M alignment and keeps a CPU WB mapping in stream->oa_buffer.vaddr.
 * On failure all partial state is unwound and the stream fields are cleared.
 *
 * Returns: 0 on success or a negative error code.
 */
static int alloc_oa_buffer(struct i915_perf_stream *stream)
{
	struct drm_i915_private *i915 = stream->perf->i915;
	struct intel_gt *gt = stream->engine->gt;
	struct drm_i915_gem_object *bo;
	struct i915_vma *vma;
	int ret;

	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.vma))
		return -ENODEV;

	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);

	bo = i915_gem_object_create_shmem(stream->perf->i915, OA_BUFFER_SIZE);
	if (IS_ERR(bo)) {
		drm_err(&i915->drm, "Failed to allocate OA buffer\n");
		return PTR_ERR(bo);
	}

	i915_gem_object_set_cache_coherency(bo, I915_CACHE_LLC);

	/* PreHSW required 512K alignment, HSW requires 16M */
	vma = i915_vma_instance(bo, &gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto err_unref;
	}

	/*
	 * PreHSW required 512K alignment.
	 * HSW and onwards, align to requested size of OA buffer.
	 */
	ret = i915_vma_pin(vma, 0, SZ_16M, PIN_GLOBAL | PIN_HIGH);
	if (ret) {
		drm_err(&gt->i915->drm, "Failed to pin OA buffer %d\n", ret);
		goto err_unref;
	}

	stream->oa_buffer.vma = vma;

	stream->oa_buffer.vaddr =
		i915_gem_object_pin_map_unlocked(bo, I915_MAP_WB);
	if (IS_ERR(stream->oa_buffer.vaddr)) {
		ret = PTR_ERR(stream->oa_buffer.vaddr);
		goto err_unpin;
	}

	return 0;

err_unpin:
	__i915_vma_unpin(vma);

err_unref:
	i915_gem_object_put(bo);

	stream->oa_buffer.vaddr = NULL;
	stream->oa_buffer.vma = NULL;

	return ret;
}
1899 | ||
daed3e44 LL |
1900 | static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs, |
1901 | bool save, i915_reg_t reg, u32 offset, | |
1902 | u32 dword_count) | |
1903 | { | |
1904 | u32 cmd; | |
1905 | u32 d; | |
1906 | ||
1907 | cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM; | |
e43ff99c | 1908 | cmd |= MI_SRM_LRM_GLOBAL_GTT; |
651e7d48 | 1909 | if (GRAPHICS_VER(stream->perf->i915) >= 8) |
daed3e44 LL |
1910 | cmd++; |
1911 | ||
1912 | for (d = 0; d < dword_count; d++) { | |
1913 | *cs++ = cmd; | |
1914 | *cs++ = i915_mmio_reg_offset(reg) + 4 * d; | |
a4b6e74c | 1915 | *cs++ = i915_ggtt_offset(stream->noa_wait) + offset + 4 * d; |
daed3e44 LL |
1916 | *cs++ = 0; |
1917 | } | |
1918 | ||
1919 | return cs; | |
1920 | } | |
1921 | ||
/*
 * alloc_noa_wait - build the NOA-wait batch buffer
 * @stream: stream the batch belongs to
 *
 * Constructs a self-contained batch that busy-waits on the engine's
 * timestamp register for the configured NOA programming delay, using the
 * CS GPRs and the MI_MATH/MI_PREDICATE machinery, then pins it in the GGTT
 * (OA config batches jump into it, so its address must stay fixed for the
 * stream's lifetime). The batch saves and restores every register it
 * clobbers.
 *
 * Returns: 0 on success or a negative error code.
 */
static int alloc_noa_wait(struct i915_perf_stream *stream)
{
	struct drm_i915_private *i915 = stream->perf->i915;
	struct intel_gt *gt = stream->engine->gt;
	struct drm_i915_gem_object *bo;
	struct i915_vma *vma;
	const u64 delay_ticks = 0xffffffffffffffff -
		intel_gt_ns_to_clock_interval(to_gt(stream->perf->i915),
		atomic64_read(&stream->perf->noa_programming_delay));
	const u32 base = stream->engine->mmio_base;
#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
	u32 *batch, *ts0, *cs, *jump;
	struct i915_gem_ww_ctx ww;
	int ret, i;
	enum {
		START_TS,
		NOW_TS,
		DELTA_TS,
		JUMP_PREDICATE,
		DELTA_TARGET,
		N_CS_GPR
	};
	i915_reg_t mi_predicate_result = HAS_MI_SET_PREDICATE(i915) ?
					  MI_PREDICATE_RESULT_2_ENGINE(base) :
					  MI_PREDICATE_RESULT_1(RENDER_RING_BASE);

	/*
	 * gt->scratch was being used to save/restore the GPR registers, but on
	 * MTL the scratch uses stolen lmem. An MI_SRM to this memory region
	 * causes an engine hang. Instead allocate an additional page here to
	 * save/restore GPR registers
	 */
	bo = i915_gem_object_create_internal(i915, 8192);
	if (IS_ERR(bo)) {
		drm_err(&i915->drm,
			"Failed to allocate NOA wait batchbuffer\n");
		return PTR_ERR(bo);
	}

	i915_gem_ww_ctx_init(&ww, true);
retry:
	ret = i915_gem_object_lock(bo, &ww);
	if (ret)
		goto out_ww;

	/*
	 * We pin in GGTT because we jump into this buffer now because
	 * multiple OA config BOs will have a jump to this address and it
	 * needs to be fixed during the lifetime of the i915/perf stream.
	 */
	vma = i915_vma_instance(bo, &gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out_ww;
	}

	ret = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_GLOBAL | PIN_HIGH);
	if (ret)
		goto out_ww;

	batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
	if (IS_ERR(batch)) {
		ret = PTR_ERR(batch);
		goto err_unpin;
	}

	stream->noa_wait = vma;

	/* Save area in the second page of the 8K object (see comment above) */
#define GPR_SAVE_OFFSET 4096
#define PREDICATE_SAVE_OFFSET 4160

	/* Save registers. */
	for (i = 0; i < N_CS_GPR; i++)
		cs = save_restore_register(
			stream, cs, true /* save */, CS_GPR(i),
			GPR_SAVE_OFFSET + 8 * i, 2);
	cs = save_restore_register(
		stream, cs, true /* save */, mi_predicate_result,
		PREDICATE_SAVE_OFFSET, 1);

	/* First timestamp snapshot location. */
	ts0 = cs;

	/*
	 * Initial snapshot of the timestamp register to implement the wait.
	 * We work with 32b values, so clear out the top 32b bits of the
	 * register because the ALU works 64bits.
	 */
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + 4;
	*cs++ = 0;
	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
	*cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));

	/*
	 * This is the location we're going to jump back into until the
	 * required amount of time has passed.
	 */
	jump = cs;

	/*
	 * Take another snapshot of the timestamp register. Take care to clear
	 * up the top 32bits of CS_GPR(1) as we're using it for other
	 * operations below.
	 */
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + 4;
	*cs++ = 0;
	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
	*cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));

	/*
	 * Do a diff between the 2 timestamps and store the result back into
	 * CS_GPR(1).
	 */
	*cs++ = MI_MATH(5);
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
	*cs++ = MI_MATH_SUB;
	*cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
	*cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);

	/*
	 * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
	 * timestamp have rolled over the 32bits) into the predicate register
	 * to be used for the predicated jump.
	 */
	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
	*cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
	*cs++ = i915_mmio_reg_offset(mi_predicate_result);

	if (HAS_MI_SET_PREDICATE(i915))
		*cs++ = MI_SET_PREDICATE | 1;

	/* Restart from the beginning if we had timestamps roll over. */
	*cs++ = (GRAPHICS_VER(i915) < 8 ?
		 MI_BATCH_BUFFER_START :
		 MI_BATCH_BUFFER_START_GEN8) |
		MI_BATCH_PREDICATE;
	*cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
	*cs++ = 0;

	if (HAS_MI_SET_PREDICATE(i915))
		*cs++ = MI_SET_PREDICATE;

	/*
	 * Now add the diff between to previous timestamps and add it to :
	 *      (((1 * << 64) - 1) - delay_ns)
	 *
	 * When the Carry Flag contains 1 this means the elapsed time is
	 * longer than the expected delay, and we can exit the wait loop.
	 */
	*cs++ = MI_LOAD_REGISTER_IMM(2);
	*cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
	*cs++ = lower_32_bits(delay_ticks);
	*cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + 4;
	*cs++ = upper_32_bits(delay_ticks);

	*cs++ = MI_MATH(4);
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
	*cs++ = MI_MATH_ADD;
	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);

	/* Allow preemption between loop iterations of the busy wait */
	*cs++ = MI_ARB_CHECK;

	/*
	 * Transfer the result into the predicate register to be used for the
	 * predicated jump.
	 */
	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
	*cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
	*cs++ = i915_mmio_reg_offset(mi_predicate_result);

	if (HAS_MI_SET_PREDICATE(i915))
		*cs++ = MI_SET_PREDICATE | 1;

	/* Predicate the jump. */
	*cs++ = (GRAPHICS_VER(i915) < 8 ?
		 MI_BATCH_BUFFER_START :
		 MI_BATCH_BUFFER_START_GEN8) |
		MI_BATCH_PREDICATE;
	*cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
	*cs++ = 0;

	if (HAS_MI_SET_PREDICATE(i915))
		*cs++ = MI_SET_PREDICATE;

	/* Restore registers. */
	for (i = 0; i < N_CS_GPR; i++)
		cs = save_restore_register(
			stream, cs, false /* restore */, CS_GPR(i),
			GPR_SAVE_OFFSET + 8 * i, 2);
	cs = save_restore_register(
		stream, cs, false /* restore */, mi_predicate_result,
		PREDICATE_SAVE_OFFSET, 1);

	/* And return to the ring. */
	*cs++ = MI_BATCH_BUFFER_END;

	GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));

	i915_gem_object_flush_map(bo);
	__i915_gem_object_release_map(bo);

	goto out_ww;

err_unpin:
	i915_vma_unpin_and_release(&vma, 0);
out_ww:
	if (ret == -EDEADLK) {
		ret = i915_gem_ww_ctx_backoff(&ww);
		if (!ret)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);
	if (ret)
		i915_gem_object_put(bo);
	return ret;
}
2144 | ||
15d0ace1 LL |
2145 | static u32 *write_cs_mi_lri(u32 *cs, |
2146 | const struct i915_oa_reg *reg_data, | |
2147 | u32 n_regs) | |
d7965152 | 2148 | { |
701f8231 | 2149 | u32 i; |
d7965152 RB |
2150 | |
2151 | for (i = 0; i < n_regs; i++) { | |
15d0ace1 LL |
2152 | if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) { |
2153 | u32 n_lri = min_t(u32, | |
2154 | n_regs - i, | |
2155 | MI_LOAD_REGISTER_IMM_MAX_REGS); | |
d7965152 | 2156 | |
15d0ace1 LL |
2157 | *cs++ = MI_LOAD_REGISTER_IMM(n_lri); |
2158 | } | |
2159 | *cs++ = i915_mmio_reg_offset(reg_data[i].addr); | |
2160 | *cs++ = reg_data[i].value; | |
d7965152 | 2161 | } |
15d0ace1 LL |
2162 | |
2163 | return cs; | |
d7965152 RB |
2164 | } |
2165 | ||
15d0ace1 | 2166 | static int num_lri_dwords(int num_regs) |
d7965152 | 2167 | { |
15d0ace1 LL |
2168 | int count = 0; |
2169 | ||
2170 | if (num_regs > 0) { | |
2171 | count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS); | |
2172 | count += num_regs * 2; | |
2173 | } | |
2174 | ||
2175 | return count; | |
2176 | } | |
2177 | ||
/*
 * Allocate and fill a batch buffer object containing the LRI commands for
 * the given OA configuration (mux, boolean-counter and flex-EU registers),
 * terminated by a jump into the stream's NOA wait batch.
 *
 * On success the new i915_oa_config_bo is added to stream->oa_config_bos
 * and returned; on failure an ERR_PTR is returned and nothing is leaked.
 */
static struct i915_oa_config_bo *
alloc_oa_config_buffer(struct i915_perf_stream *stream,
		       struct i915_oa_config *oa_config)
{
	struct drm_i915_gem_object *obj;
	struct i915_oa_config_bo *oa_bo;
	struct i915_gem_ww_ctx ww;
	size_t config_length = 0;
	u32 *cs;
	int err;

	oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
	if (!oa_bo)
		return ERR_PTR(-ENOMEM);

	/* Size the BO for all three LRI sequences + the final jump. */
	config_length += num_lri_dwords(oa_config->mux_regs_len);
	config_length += num_lri_dwords(oa_config->b_counter_regs_len);
	config_length += num_lri_dwords(oa_config->flex_regs_len);
	config_length += 3; /* MI_BATCH_BUFFER_START */
	config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);

	obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto err_free;
	}

	/* ww transaction: lock the object, retry on -EDEADLK backoff. */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(obj, &ww);
	if (err)
		goto out_ww;

	cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(cs)) {
		err = PTR_ERR(cs);
		goto out_ww;
	}

	cs = write_cs_mi_lri(cs,
			     oa_config->mux_regs,
			     oa_config->mux_regs_len);
	cs = write_cs_mi_lri(cs,
			     oa_config->b_counter_regs,
			     oa_config->b_counter_regs_len);
	cs = write_cs_mi_lri(cs,
			     oa_config->flex_regs,
			     oa_config->flex_regs_len);

	/* Jump into the active wait. */
	*cs++ = (GRAPHICS_VER(stream->perf->i915) < 8 ?
		 MI_BATCH_BUFFER_START :
		 MI_BATCH_BUFFER_START_GEN8);
	*cs++ = i915_ggtt_offset(stream->noa_wait);
	*cs++ = 0;

	i915_gem_object_flush_map(obj);
	__i915_gem_object_release_map(obj);

	oa_bo->vma = i915_vma_instance(obj,
				       &stream->engine->gt->ggtt->vm,
				       NULL);
	if (IS_ERR(oa_bo->vma)) {
		err = PTR_ERR(oa_bo->vma);
		goto out_ww;
	}

	/* Success: publish on the stream's lock-free list. */
	oa_bo->oa_config = i915_oa_config_get(oa_config);
	llist_add(&oa_bo->node, &stream->oa_config_bos);

out_ww:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err)
		i915_gem_object_put(obj);
err_free:
	if (err) {
		kfree(oa_bo);
		return ERR_PTR(err);
	}
	return oa_bo;
}
2265 | ||
/*
 * Return a reference on the vma holding the batch for @oa_config, reusing
 * a previously built BO from the stream when one matches, otherwise
 * allocating a fresh one via alloc_oa_config_buffer().
 */
static struct i915_vma *
get_oa_vma(struct i915_perf_stream *stream, struct i915_oa_config *oa_config)
{
	struct i915_oa_config_bo *oa_bo;

	/*
	 * Look for the buffer in the already allocated BOs attached
	 * to the stream.
	 */
	llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
		/* Match on both pointer identity and UUID contents. */
		if (oa_bo->oa_config == oa_config &&
		    memcmp(oa_bo->oa_config->uuid,
			   oa_config->uuid,
			   sizeof(oa_config->uuid)) == 0)
			goto out;
	}

	oa_bo = alloc_oa_config_buffer(stream, oa_config);
	if (IS_ERR(oa_bo))
		return ERR_CAST(oa_bo);

out:
	return i915_vma_get(oa_bo->vma);
}
2290 | ||
/*
 * Submit a request on @ce that executes the OA config batch for @oa_config
 * as a secure batch. If @active is a valid i915_active, the request first
 * waits for all pending activity tracked there and is then added to it so
 * callers can track completion.
 *
 * Returns 0 on success or a negative error code.
 */
static int
emit_oa_config(struct i915_perf_stream *stream,
	       struct i915_oa_config *oa_config,
	       struct intel_context *ce,
	       struct i915_active *active)
{
	struct i915_request *rq;
	struct i915_vma *vma;
	struct i915_gem_ww_ctx ww;
	int err;

	vma = get_oa_vma(stream, oa_config);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/* ww transaction covering the object lock + GGTT pin. */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(vma->obj, &ww);
	if (err)
		goto err;

	err = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_GLOBAL | PIN_HIGH);
	if (err)
		goto err;

	/* Hold engine pm only across request creation. */
	intel_engine_pm_get(ce->engine);
	rq = i915_request_create(ce);
	intel_engine_pm_put(ce->engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_vma_unpin;
	}

	if (!IS_ERR_OR_NULL(active)) {
		/* After all individual context modifications */
		err = i915_request_await_active(rq, active,
						I915_ACTIVE_AWAIT_ACTIVE);
		if (err)
			goto err_add_request;

		err = i915_active_add_request(active, rq);
		if (err)
			goto err_add_request;
	}

	err = i915_vma_move_to_active(vma, rq, 0);
	if (err)
		goto err_add_request;

	err = rq->engine->emit_bb_start(rq,
					i915_vma_offset(vma), 0,
					I915_DISPATCH_SECURE);
	if (err)
		goto err_add_request;

err_add_request:
	/* The request must be added (and thus retired) even on error. */
	i915_request_add(rq);
err_vma_unpin:
	i915_vma_unpin(vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}

	i915_gem_ww_ctx_fini(&ww);
	i915_vma_put(vma);
	return err;
}
2361 | ||
5f5c382e CW |
2362 | static struct intel_context *oa_context(struct i915_perf_stream *stream) |
2363 | { | |
2364 | return stream->pinned_ctx ?: stream->engine->kernel_context; | |
2365 | } | |
2366 | ||
d7d50f80 CW |
/*
 * Haswell: disable the clock gating that would starve the OA unit, then
 * emit the stream's OA register configuration.
 */
static int
hsw_enable_metric_set(struct i915_perf_stream *stream,
		      struct i915_active *active)
{
	struct intel_uncore *uncore = stream->uncore;

	/*
	 * PRM:
	 *
	 * OA unit is using “crclk” for its functionality. When trunk
	 * level clock gating takes place, OA clock would be gated,
	 * unable to count the events from non-render clock domain.
	 * Render clock gating must be disabled when OA is enabled to
	 * count the events from non-render domain. Unit level clock
	 * gating for RCS should also be disabled.
	 */
	intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
			 GEN7_DOP_CLOCK_GATE_ENABLE, 0);
	intel_uncore_rmw(uncore, GEN6_UCGCTL1,
			 0, GEN6_CSUNIT_CLOCK_GATE_DISABLE);

	return emit_oa_config(stream,
			      stream->oa_config, oa_context(stream),
			      active);
}
2392 | ||
/*
 * Haswell: undo hsw_enable_metric_set() — restore the clock gating bits
 * and disable NOA to save power.
 */
static void hsw_disable_metric_set(struct i915_perf_stream *stream)
{
	struct intel_uncore *uncore = stream->uncore;

	intel_uncore_rmw(uncore, GEN6_UCGCTL1,
			 GEN6_CSUNIT_CLOCK_GATE_DISABLE, 0);
	intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
			 0, GEN7_DOP_CLOCK_GATE_ENABLE);

	intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
}
2404 | ||
a9877da2 CW |
2405 | static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config, |
2406 | i915_reg_t reg) | |
2407 | { | |
2408 | u32 mmio = i915_mmio_reg_offset(reg); | |
2409 | int i; | |
2410 | ||
2411 | /* | |
2412 | * This arbitrary default will select the 'EU FPU0 Pipeline | |
2413 | * Active' event. In the future it's anticipated that there | |
2414 | * will be an explicit 'No Event' we can select, but not yet... | |
2415 | */ | |
2416 | if (!oa_config) | |
2417 | return 0; | |
2418 | ||
2419 | for (i = 0; i < oa_config->flex_regs_len; i++) { | |
2420 | if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio) | |
2421 | return oa_config->flex_regs[i].value; | |
2422 | } | |
2423 | ||
2424 | return 0; | |
2425 | } | |
19f81df2 RB |
2426 | /* |
2427 | * NB: It must always remain pointer safe to run this even if the OA unit | |
2428 | * has been disabled. | |
2429 | * | |
2430 | * It's fine to put out-of-date values into these per-context registers | |
2431 | * in the case that the OA unit has been disabled. | |
2432 | */ | |
b146e5ef | 2433 | static void |
7dc56af5 CW |
2434 | gen8_update_reg_state_unlocked(const struct intel_context *ce, |
2435 | const struct i915_perf_stream *stream) | |
19f81df2 | 2436 | { |
8f8b1171 CW |
2437 | u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset; |
2438 | u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset; | |
19f81df2 | 2439 | /* The MMIO offsets for Flex EU registers aren't contiguous */ |
3a5d604f | 2440 | static const i915_reg_t flex_regs[] = { |
35ab4fd2 LL |
2441 | EU_PERF_CNTL0, |
2442 | EU_PERF_CNTL1, | |
2443 | EU_PERF_CNTL2, | |
2444 | EU_PERF_CNTL3, | |
2445 | EU_PERF_CNTL4, | |
2446 | EU_PERF_CNTL5, | |
2447 | EU_PERF_CNTL6, | |
19f81df2 | 2448 | }; |
7dc56af5 | 2449 | u32 *reg_state = ce->lrc_reg_state; |
19f81df2 RB |
2450 | int i; |
2451 | ||
ccdeed49 UNR |
2452 | reg_state[ctx_oactxctrl + 1] = |
2453 | (stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) | | |
2454 | (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) | | |
2455 | GEN8_OA_COUNTER_RESUME; | |
19f81df2 | 2456 | |
ccdeed49 | 2457 | for (i = 0; i < ARRAY_SIZE(flex_regs); i++) |
7dc56af5 CW |
2458 | reg_state[ctx_flexeu0 + i * 2 + 1] = |
2459 | oa_config_flex_reg(stream->oa_config, flex_regs[i]); | |
19f81df2 RB |
2460 | } |
2461 | ||
a9877da2 CW |
/*
 * A single register update to apply to a context image and/or via LRI:
 * @reg is the MMIO register, @offset the dword offset in the context
 * image, @value the value to program.
 */
struct flex {
	i915_reg_t reg;
	u32 offset;
	u32 value;
};
2467 | ||
/*
 * Emit MI_STORE_DWORD_IMM commands on @rq that write each flex value
 * directly into @ce's saved context image (via its GGTT address).
 * @count must be non-zero.
 */
static int
gen8_store_flex(struct i915_request *rq,
		struct intel_context *ce,
		const struct flex *flex, unsigned int count)
{
	u32 offset;
	u32 *cs;

	cs = intel_ring_begin(rq, 4 * count);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Base of the register state within the context object. */
	offset = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET;
	do {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = offset + flex->offset * sizeof(u32);
		*cs++ = 0;
		*cs++ = flex->value;
	} while (flex++, --count);

	intel_ring_advance(rq, cs);

	return 0;
}
2492 | ||
/*
 * Emit a single MI_LOAD_REGISTER_IMM on @rq programming all @count flex
 * registers with their values. @count must be in (0, 63].
 */
static int
gen8_load_flex(struct i915_request *rq,
	       struct intel_context *ce,
	       const struct flex *flex, unsigned int count)
{
	u32 *cs;

	GEM_BUG_ON(!count || count > 63);

	cs = intel_ring_begin(rq, 2 * count + 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*cs++ = i915_mmio_reg_offset(flex->reg);
		*cs++ = flex->value;
	} while (flex++, --count);
	*cs++ = MI_NOOP; /* pad to even dword count */

	intel_ring_advance(rq, cs);

	return 0;
}
2517 | ||
/*
 * Update @ce's saved context image with @flex register values by running
 * a kernel request on the same engine, serialised against the remote
 * context so the image is not concurrently in use.
 */
static int gen8_modify_context(struct intel_context *ce,
			       const struct flex *flex, unsigned int count)
{
	struct i915_request *rq;
	int err;

	rq = intel_engine_create_kernel_request(ce->engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* Serialise with the remote context */
	err = intel_context_prepare_remote_request(ce, rq);
	if (err == 0)
		err = gen8_store_flex(rq, ce, flex, count);

	i915_request_add(rq);
	return err;
}
2536 | ||
d7d50f80 CW |
/*
 * Program @flex registers via LRI from a request submitted on @ce itself,
 * optionally tracking the request in @active (when valid and non-NULL).
 */
static int
gen8_modify_self(struct intel_context *ce,
		 const struct flex *flex, unsigned int count,
		 struct i915_active *active)
{
	struct i915_request *rq;
	int err;

	/* Hold engine pm only across request creation. */
	intel_engine_pm_get(ce->engine);
	rq = i915_request_create(ce);
	intel_engine_pm_put(ce->engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	if (!IS_ERR_OR_NULL(active)) {
		err = i915_active_add_request(active, rq);
		if (err)
			goto err_add_request;
	}

	err = gen8_load_flex(rq, ce, flex, count);
	if (err)
		goto err_add_request;

err_add_request:
	/* The request must always be added, even on error. */
	i915_request_add(rq);
	return err;
}
2565 | ||
5cca5038 CW |
/*
 * Apply the @flex updates to every active render-class context of @ctx.
 * flex[0].value is rewritten per engine with the RPCS value before the
 * image update; contexts that are not currently pinned are skipped as
 * they will pick up the OA state on first use.
 */
static int gen8_configure_context(struct i915_gem_context *ctx,
				  struct flex *flex, unsigned int count)
{
	struct i915_gem_engines_iter it;
	struct intel_context *ce;
	int err = 0;

	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
		GEM_BUG_ON(ce == ce->engine->kernel_context);

		if (ce->engine->class != RENDER_CLASS)
			continue;

		/* Otherwise OA settings will be set upon first use */
		if (!intel_context_pin_if_active(ce))
			continue;

		flex->value = intel_sseu_make_rpcs(ce->engine->gt, &ce->sseu);
		err = gen8_modify_context(ce, flex, count);

		intel_context_unpin(ce);
		if (err)
			break;
	}
	i915_gem_context_unlock_engines(ctx);

	return err;
}
2594 | ||
d7d50f80 CW |
/*
 * Gen12: enable/disable the OAR (per-context OA) counters for the stream's
 * pinned context. @active non-NULL means enable; NULL means disable. The
 * context-image fields are patched via gen8_modify_context() while the
 * live registers are programmed via LRI through gen8_modify_self().
 */
static int gen12_configure_oar_context(struct i915_perf_stream *stream,
				       struct i915_active *active)
{
	int err;
	struct intel_context *ce = stream->pinned_ctx;
	u32 format = stream->oa_buffer.format->format;
	u32 offset = stream->perf->ctx_oactxctrl_offset;
	struct flex regs_context[] = {
		{
			GEN8_OACTXCONTROL,
			offset + 1,
			active ? GEN8_OA_COUNTER_RESUME : 0,
		},
	};
	/* Offsets in regs_lri are not used since this configuration is only
	 * applied using LRI. Initialize the correct offsets for posterity.
	 */
#define GEN12_OAR_OACONTROL_OFFSET 0x5B0
	struct flex regs_lri[] = {
		{
			GEN12_OAR_OACONTROL,
			GEN12_OAR_OACONTROL_OFFSET + 1,
			(format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
			(active ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0)
		},
		{
			RING_CONTEXT_CONTROL(ce->engine->mmio_base),
			CTX_CONTEXT_CONTROL,
			_MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE,
				      active ?
				      GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE :
				      0)
		},
	};

	/* Modify the context image of pinned context with regs_context */
	err = intel_context_lock_pinned(ce);
	if (err)
		return err;

	err = gen8_modify_context(ce, regs_context,
				  ARRAY_SIZE(regs_context));
	intel_context_unlock_pinned(ce);
	if (err)
		return err;

	/* Apply regs_lri using LRI with pinned context */
	return gen8_modify_self(ce, regs_lri, ARRAY_SIZE(regs_lri), active);
}
2644 | ||
19f81df2 RB |
2645 | /* |
2646 | * Manages updating the per-context aspects of the OA stream | |
2647 | * configuration across all contexts. | |
2648 | * | |
2649 | * The awkward consideration here is that OACTXCONTROL controls the | |
2650 | * exponent for periodic sampling which is primarily used for system | |
2651 | * wide profiling where we'd like a consistent sampling period even in | |
2652 | * the face of context switches. | |
2653 | * | |
2654 | * Our approach of updating the register state context (as opposed to | |
2655 | * say using a workaround batch buffer) ensures that the hardware | |
2656 | * won't automatically reload an out-of-date timer exponent even | |
2657 | * transiently before a WA BB could be parsed. | |
2658 | * | |
2659 | * This function needs to: | |
2660 | * - Ensure the currently running context's per-context OA state is | |
2661 | * updated | |
2662 | * - Ensure that all existing contexts will have the correct per-context | |
2663 | * OA state if they are scheduled for use. | |
2664 | * - Ensure any new contexts will be initialized with the correct | |
2665 | * per-context OA state. | |
2666 | * | |
2667 | * Note: it's only the RCS/Render context that has any OA state. | |
ccdeed49 | 2668 | * Note: the first flex register passed must always be R_PWR_CLK_STATE |
19f81df2 | 2669 | */ |
d7d50f80 CW |
2670 | static int |
2671 | oa_configure_all_contexts(struct i915_perf_stream *stream, | |
2672 | struct flex *regs, | |
2673 | size_t num_regs, | |
2674 | struct i915_active *active) | |
19f81df2 | 2675 | { |
8f8b1171 | 2676 | struct drm_i915_private *i915 = stream->perf->i915; |
a9877da2 | 2677 | struct intel_engine_cs *engine; |
9677a9f3 | 2678 | struct intel_gt *gt = stream->engine->gt; |
a4e7ccda | 2679 | struct i915_gem_context *ctx, *cn; |
ccdeed49 | 2680 | int err; |
a9877da2 | 2681 | |
9677a9f3 | 2682 | lockdep_assert_held(>->perf.lock); |
19f81df2 | 2683 | |
19f81df2 RB |
2684 | /* |
2685 | * The OA register config is setup through the context image. This image | |
2686 | * might be written to by the GPU on context switch (in particular on | |
2687 | * lite-restore). This means we can't safely update a context's image, | |
2688 | * if this context is scheduled/submitted to run on the GPU. | |
2689 | * | |
2690 | * We could emit the OA register config through the batch buffer but | |
2691 | * this might leave small interval of time where the OA unit is | |
2692 | * configured at an invalid sampling period. | |
2693 | * | |
a9877da2 CW |
2694 | * Note that since we emit all requests from a single ring, there |
2695 | * is still an implicit global barrier here that may cause a high | |
2696 | * priority context to wait for an otherwise independent low priority | |
2697 | * context. Contexts idle at the time of reconfiguration are not | |
2698 | * trapped behind the barrier. | |
19f81df2 | 2699 | */ |
a4e7ccda CW |
2700 | spin_lock(&i915->gem.contexts.lock); |
2701 | list_for_each_entry_safe(ctx, cn, &i915->gem.contexts.list, link) { | |
a4e7ccda CW |
2702 | if (!kref_get_unless_zero(&ctx->ref)) |
2703 | continue; | |
2704 | ||
2705 | spin_unlock(&i915->gem.contexts.lock); | |
2706 | ||
ccdeed49 | 2707 | err = gen8_configure_context(ctx, regs, num_regs); |
a4e7ccda CW |
2708 | if (err) { |
2709 | i915_gem_context_put(ctx); | |
a9877da2 | 2710 | return err; |
a4e7ccda CW |
2711 | } |
2712 | ||
2713 | spin_lock(&i915->gem.contexts.lock); | |
2714 | list_safe_reset_next(ctx, cn, link); | |
2715 | i915_gem_context_put(ctx); | |
19f81df2 | 2716 | } |
a4e7ccda | 2717 | spin_unlock(&i915->gem.contexts.lock); |
19f81df2 | 2718 | |
722f3de3 | 2719 | /* |
a9877da2 CW |
2720 | * After updating all other contexts, we need to modify ourselves. |
2721 | * If we don't modify the kernel_context, we do not get events while | |
2722 | * idle. | |
722f3de3 | 2723 | */ |
750e76b4 | 2724 | for_each_uabi_engine(engine, i915) { |
a9877da2 | 2725 | struct intel_context *ce = engine->kernel_context; |
722f3de3 | 2726 | |
a9877da2 CW |
2727 | if (engine->class != RENDER_CLASS) |
2728 | continue; | |
2729 | ||
0b6613c6 | 2730 | regs[0].value = intel_sseu_make_rpcs(engine->gt, &ce->sseu); |
a9877da2 | 2731 | |
d7d50f80 | 2732 | err = gen8_modify_self(ce, regs, num_regs, active); |
a9877da2 CW |
2733 | if (err) |
2734 | return err; | |
2735 | } | |
722f3de3 TU |
2736 | |
2737 | return 0; | |
19f81df2 RB |
2738 | } |
2739 | ||
d7d50f80 CW |
/*
 * Gen12: only R_PWR_CLK_STATE needs per-context programming; the rest of
 * the OA configuration is global. @oa_config is unused here but kept for
 * signature parity with lrc_configure_all_contexts().
 */
static int
gen12_configure_all_contexts(struct i915_perf_stream *stream,
			     const struct i915_oa_config *oa_config,
			     struct i915_active *active)
{
	struct flex regs[] = {
		{
			GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE),
			CTX_R_PWR_CLK_STATE,
		},
	};

	return oa_configure_all_contexts(stream,
					 regs, ARRAY_SIZE(regs),
					 active);
}
2756 | ||
d7d50f80 CW |
/*
 * Pre-gen12: build the full per-context register set (R_PWR_CLK_STATE,
 * OACTXCONTROL and the seven flex-EU counters) and apply it to all
 * contexts. regs[0] must stay R_PWR_CLK_STATE (see
 * oa_configure_all_contexts()).
 */
static int
lrc_configure_all_contexts(struct i915_perf_stream *stream,
			   const struct i915_oa_config *oa_config,
			   struct i915_active *active)
{
	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
	/* The MMIO offsets for Flex EU registers aren't contiguous */
	const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
	struct flex regs[] = {
		{
			GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE),
			CTX_R_PWR_CLK_STATE,
		},
		{
			GEN8_OACTXCONTROL,
			ctx_oactxctrl + 1,
		},
		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
		{ EU_PERF_CNTL2, ctx_flexeuN(2) },
		{ EU_PERF_CNTL3, ctx_flexeuN(3) },
		{ EU_PERF_CNTL4, ctx_flexeuN(4) },
		{ EU_PERF_CNTL5, ctx_flexeuN(5) },
		{ EU_PERF_CNTL6, ctx_flexeuN(6) },
	};
#undef ctx_flexeuN
	int i;

	/* OACTXCONTROL: timer exponent, periodic enable, counter resume. */
	regs[1].value =
		(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
		(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
		GEN8_OA_COUNTER_RESUME;

	for (i = 2; i < ARRAY_SIZE(regs); i++)
		regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);

	return oa_configure_all_contexts(stream,
					 regs, ARRAY_SIZE(regs),
					 active);
}
2798 | ||
d7d50f80 CW |
/*
 * Gen8+ (pre-gen12): program OA debug bits, update all context images,
 * then emit the stream's OA register configuration.
 */
static int
gen8_enable_metric_set(struct i915_perf_stream *stream,
		       struct i915_active *active)
{
	struct intel_uncore *uncore = stream->uncore;
	struct i915_oa_config *oa_config = stream->oa_config;
	int ret;

	/*
	 * We disable slice/unslice clock ratio change reports on SKL since
	 * they are too noisy. The HW generates a lot of redundant reports
	 * where the ratio hasn't really changed causing a lot of redundant
	 * work to processes and increasing the chances we'll hit buffer
	 * overruns.
	 *
	 * Although we don't currently use the 'disable overrun' OABUFFER
	 * feature it's worth noting that clock ratio reports have to be
	 * disabled before considering to use that feature since the HW doesn't
	 * correctly block these reports.
	 *
	 * Currently none of the high-level metrics we have depend on knowing
	 * this ratio to normalize.
	 *
	 * Note: This register is not power context saved and restored, but
	 * that's OK considering that we disable RC6 while the OA unit is
	 * enabled.
	 *
	 * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
	 * be read back from automatically triggered reports, as part of the
	 * RPT_ID field.
	 */
	if (IS_GRAPHICS_VER(stream->perf->i915, 9, 11)) {
		intel_uncore_write(uncore, GEN8_OA_DEBUG,
				   _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
						      GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
	}

	/*
	 * Update all contexts prior writing the mux configurations as we need
	 * to make sure all slices/subslices are ON before writing to NOA
	 * registers.
	 */
	ret = lrc_configure_all_contexts(stream, oa_config, active);
	if (ret)
		return ret;

	return emit_oa_config(stream,
			      stream->oa_config, oa_context(stream),
			      active);
}
2849 | ||
9278bbb6 CW |
2850 | static u32 oag_report_ctx_switches(const struct i915_perf_stream *stream) |
2851 | { | |
2852 | return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS, | |
2853 | (stream->sample_flags & SAMPLE_OA_REPORT) ? | |
2854 | 0 : GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS); | |
2855 | } | |
2856 | ||
d7d50f80 CW |
/*
 * Gen12: apply workarounds, program the global OA debug/timer registers,
 * update context images, optionally enable OAR for the stream's context,
 * then emit the OA register configuration.
 */
static int
gen12_enable_metric_set(struct i915_perf_stream *stream,
			struct i915_active *active)
{
	struct drm_i915_private *i915 = stream->perf->i915;
	struct intel_uncore *uncore = stream->uncore;
	struct i915_oa_config *oa_config = stream->oa_config;
	bool periodic = stream->periodic;
	u32 period_exponent = stream->period_exponent;
	u32 sqcnt1;
	int ret;

	/*
	 * Wa_1508761755:xehpsdv, dg2
	 * EU NOA signals behave incorrectly if EU clock gating is enabled.
	 * Disable thread stall DOP gating and EU DOP gating.
	 */
	if (IS_XEHPSDV(i915) || IS_DG2(i915)) {
		intel_gt_mcr_multicast_write(uncore->gt, GEN8_ROW_CHICKEN,
					     _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
		intel_uncore_write(uncore, GEN7_ROW_CHICKEN2,
				   _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
	}

	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
			   /* Disable clk ratio reports, like previous Gens. */
			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
			   /*
			    * If the user didn't require OA reports, instruct
			    * the hardware not to emit ctx switch reports.
			    */
			   oag_report_ctx_switches(stream));

	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
			    : 0);

	/*
	 * Initialize Super Queue Internal Cnt Register
	 * Set PMON Enable in order to collect valid metrics.
	 * Enable bytes per clock reporting in OA for XEHPSDV onward.
	 */
	sqcnt1 = GEN12_SQCNT1_PMON_ENABLE |
		 (HAS_OA_BPC_REPORTING(i915) ? GEN12_SQCNT1_OABPC : 0);

	intel_uncore_rmw(uncore, GEN12_SQCNT1, 0, sqcnt1);

	/*
	 * Update all contexts prior writing the mux configurations as we need
	 * to make sure all slices/subslices are ON before writing to NOA
	 * registers.
	 */
	ret = gen12_configure_all_contexts(stream, oa_config, active);
	if (ret)
		return ret;

	/*
	 * For Gen12, performance counters are context
	 * saved/restored. Only enable it for the context that
	 * requested this.
	 */
	if (stream->ctx) {
		ret = gen12_configure_oar_context(stream, active);
		if (ret)
			return ret;
	}

	return emit_oa_config(stream,
			      stream->oa_config, oa_context(stream),
			      active);
}
2931 | ||
/*
 * Gen8: undo gen8_enable_metric_set() — reset per-context OA state and
 * disable NOA to save power.
 */
static void gen8_disable_metric_set(struct i915_perf_stream *stream)
{
	struct intel_uncore *uncore = stream->uncore;

	/* Reset all contexts' slices/subslices configurations. */
	lrc_configure_all_contexts(stream, NULL, NULL);

	intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
}
2941 | ||
/*
 * Gen11: like gen8_disable_metric_set() but NOA lives in RPM_CONFIG1.
 */
static void gen11_disable_metric_set(struct i915_perf_stream *stream)
{
	struct intel_uncore *uncore = stream->uncore;

	/* Reset all contexts' slices/subslices configurations. */
	lrc_configure_all_contexts(stream, NULL, NULL);

	/* Make sure we disable noa to save power. */
	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
}
2952 | ||
/*
 * Gen12: undo gen12_enable_metric_set() — re-enable DOP gating on affected
 * platforms, reset context state and OAR, disable NOA and PMON.
 */
static void gen12_disable_metric_set(struct i915_perf_stream *stream)
{
	struct intel_uncore *uncore = stream->uncore;
	struct drm_i915_private *i915 = stream->perf->i915;
	u32 sqcnt1;

	/*
	 * Wa_1508761755:xehpsdv, dg2
	 * Enable thread stall DOP gating and EU DOP gating.
	 */
	if (IS_XEHPSDV(i915) || IS_DG2(i915)) {
		intel_gt_mcr_multicast_write(uncore->gt, GEN8_ROW_CHICKEN,
					     _MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE));
		intel_uncore_write(uncore, GEN7_ROW_CHICKEN2,
				   _MASKED_BIT_DISABLE(GEN12_DISABLE_DOP_GATING));
	}

	/* Reset all contexts' slices/subslices configurations. */
	gen12_configure_all_contexts(stream, NULL, NULL);

	/* disable the context save/restore or OAR counters */
	if (stream->ctx)
		gen12_configure_oar_context(stream, NULL);

	/* Make sure we disable noa to save power. */
	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);

	sqcnt1 = GEN12_SQCNT1_PMON_ENABLE |
		 (HAS_OA_BPC_REPORTING(i915) ? GEN12_SQCNT1_OABPC : 0);

	/* Reset PMON Enable to save power. */
	intel_uncore_rmw(uncore, GEN12_SQCNT1, sqcnt1, 0);
}
2986 | ||
5728de2f | 2987 | static void gen7_oa_enable(struct i915_perf_stream *stream) |
d7965152 | 2988 | { |
52111c46 | 2989 | struct intel_uncore *uncore = stream->uncore; |
5728de2f | 2990 | struct i915_gem_context *ctx = stream->ctx; |
a37f08a8 UNR |
2991 | u32 ctx_id = stream->specific_ctx_id; |
2992 | bool periodic = stream->periodic; | |
2993 | u32 period_exponent = stream->period_exponent; | |
90981da6 | 2994 | u32 report_format = stream->oa_buffer.format->format; |
11051303 | 2995 | |
1bef3409 RB |
2996 | /* |
2997 | * Reset buf pointers so we don't forward reports from before now. | |
2998 | * | |
2999 | * Think carefully if considering trying to avoid this, since it | |
3000 | * also ensures status flags and the buffer itself are cleared | |
3001 | * in error paths, and we have checks for invalid reports based | |
3002 | * on the assumption that certain fields are written to zeroed | |
3003 | * memory which this helps maintains. | |
3004 | */ | |
a37f08a8 | 3005 | gen7_init_oa_buffer(stream); |
d7965152 | 3006 | |
8f8b1171 CW |
3007 | intel_uncore_write(uncore, GEN7_OACONTROL, |
3008 | (ctx_id & GEN7_OACONTROL_CTX_MASK) | | |
3009 | (period_exponent << | |
3010 | GEN7_OACONTROL_TIMER_PERIOD_SHIFT) | | |
3011 | (periodic ? GEN7_OACONTROL_TIMER_ENABLE : 0) | | |
3012 | (report_format << GEN7_OACONTROL_FORMAT_SHIFT) | | |
3013 | (ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : 0) | | |
3014 | GEN7_OACONTROL_ENABLE); | |
d7965152 RB |
3015 | } |
3016 | ||
5728de2f | 3017 | static void gen8_oa_enable(struct i915_perf_stream *stream) |
19f81df2 | 3018 | { |
52111c46 | 3019 | struct intel_uncore *uncore = stream->uncore; |
90981da6 | 3020 | u32 report_format = stream->oa_buffer.format->format; |
19f81df2 RB |
3021 | |
3022 | /* | |
3023 | * Reset buf pointers so we don't forward reports from before now. | |
3024 | * | |
3025 | * Think carefully if considering trying to avoid this, since it | |
3026 | * also ensures status flags and the buffer itself are cleared | |
3027 | * in error paths, and we have checks for invalid reports based | |
3028 | * on the assumption that certain fields are written to zeroed | |
3029 | * memory which this helps maintains. | |
3030 | */ | |
a37f08a8 | 3031 | gen8_init_oa_buffer(stream); |
19f81df2 RB |
3032 | |
3033 | /* | |
3034 | * Note: we don't rely on the hardware to perform single context | |
3035 | * filtering and instead filter on the cpu based on the context-id | |
3036 | * field of reports | |
3037 | */ | |
8f8b1171 CW |
3038 | intel_uncore_write(uncore, GEN8_OACONTROL, |
3039 | (report_format << GEN8_OA_REPORT_FORMAT_SHIFT) | | |
3040 | GEN8_OA_COUNTER_ENABLE); | |
19f81df2 RB |
3041 | } |
3042 | ||
00a7f0d7 LL |
3043 | static void gen12_oa_enable(struct i915_perf_stream *stream) |
3044 | { | |
3045 | struct intel_uncore *uncore = stream->uncore; | |
90981da6 | 3046 | u32 report_format = stream->oa_buffer.format->format; |
00a7f0d7 LL |
3047 | |
3048 | /* | |
3049 | * If we don't want OA reports from the OA buffer, then we don't even | |
3050 | * need to program the OAG unit. | |
3051 | */ | |
3052 | if (!(stream->sample_flags & SAMPLE_OA_REPORT)) | |
3053 | return; | |
3054 | ||
3055 | gen12_init_oa_buffer(stream); | |
3056 | ||
3057 | intel_uncore_write(uncore, GEN12_OAG_OACONTROL, | |
3058 | (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) | | |
3059 | GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE); | |
3060 | } | |
3061 | ||
16d98b31 RB |
3062 | /** |
3063 | * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream | |
3064 | * @stream: An i915 perf stream opened for OA metrics | |
3065 | * | |
3066 | * [Re]enables hardware periodic sampling according to the period configured | |
3067 | * when opening the stream. This also starts a hrtimer that will periodically | |
3068 | * check for data in the circular OA buffer for notifying userspace (e.g. | |
3069 | * during a read() or poll()). | |
3070 | */ | |
d7965152 RB |
static void i915_oa_stream_enable(struct i915_perf_stream *stream)
{
	/* Discard any stale readiness left over from before this enable. */
	stream->pollin = false;

	/* HW-generation-specific enable hook (gen7/gen8/gen12 above). */
	stream->perf->ops.oa_enable(stream);

	/*
	 * Only start the poll hrtimer when userspace actually samples OA
	 * reports; otherwise there is nothing to notify readers about.
	 */
	if (stream->sample_flags & SAMPLE_OA_REPORT)
		hrtimer_start(&stream->poll_check_timer,
			      ns_to_ktime(stream->poll_oa_period),
			      HRTIMER_MODE_REL_PINNED);
}
3082 | ||
5728de2f | 3083 | static void gen7_oa_disable(struct i915_perf_stream *stream) |
d7965152 | 3084 | { |
52111c46 | 3085 | struct intel_uncore *uncore = stream->uncore; |
5728de2f | 3086 | |
97a04e0d DCS |
3087 | intel_uncore_write(uncore, GEN7_OACONTROL, 0); |
3088 | if (intel_wait_for_register(uncore, | |
e896d29a CW |
3089 | GEN7_OACONTROL, GEN7_OACONTROL_ENABLE, 0, |
3090 | 50)) | |
0bf85735 WK |
3091 | drm_err(&stream->perf->i915->drm, |
3092 | "wait for OA to be disabled timed out\n"); | |
d7965152 RB |
3093 | } |
3094 | ||
5728de2f | 3095 | static void gen8_oa_disable(struct i915_perf_stream *stream) |
19f81df2 | 3096 | { |
52111c46 | 3097 | struct intel_uncore *uncore = stream->uncore; |
5728de2f | 3098 | |
97a04e0d DCS |
3099 | intel_uncore_write(uncore, GEN8_OACONTROL, 0); |
3100 | if (intel_wait_for_register(uncore, | |
e896d29a CW |
3101 | GEN8_OACONTROL, GEN8_OA_COUNTER_ENABLE, 0, |
3102 | 50)) | |
0bf85735 WK |
3103 | drm_err(&stream->perf->i915->drm, |
3104 | "wait for OA to be disabled timed out\n"); | |
19f81df2 RB |
3105 | } |
3106 | ||
00a7f0d7 LL |
3107 | static void gen12_oa_disable(struct i915_perf_stream *stream) |
3108 | { | |
3109 | struct intel_uncore *uncore = stream->uncore; | |
3110 | ||
3111 | intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0); | |
3112 | if (intel_wait_for_register(uncore, | |
3113 | GEN12_OAG_OACONTROL, | |
3114 | GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0, | |
3115 | 50)) | |
0bf85735 WK |
3116 | drm_err(&stream->perf->i915->drm, |
3117 | "wait for OA to be disabled timed out\n"); | |
c06aa1b4 UNR |
3118 | |
3119 | intel_uncore_write(uncore, GEN12_OA_TLB_INV_CR, 1); | |
3120 | if (intel_wait_for_register(uncore, | |
3121 | GEN12_OA_TLB_INV_CR, | |
3122 | 1, 0, | |
3123 | 50)) | |
3124 | drm_err(&stream->perf->i915->drm, | |
3125 | "wait for OA tlb invalidate timed out\n"); | |
00a7f0d7 LL |
3126 | } |
3127 | ||
16d98b31 RB |
3128 | /** |
3129 | * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream | |
3130 | * @stream: An i915 perf stream opened for OA metrics | |
3131 | * | |
3132 | * Stops the OA unit from periodically writing counter reports into the | |
3133 | * circular OA buffer. This also stops the hrtimer that periodically checks for | |
3134 | * data in the circular OA buffer, for notifying userspace. | |
3135 | */ | |
d7965152 RB |
static void i915_oa_stream_disable(struct i915_perf_stream *stream)
{
	/* Stop the OA unit via the HW-generation-specific hook. */
	stream->perf->ops.oa_disable(stream);

	/* The poll hrtimer only runs when OA reports are being sampled. */
	if (stream->sample_flags & SAMPLE_OA_REPORT)
		hrtimer_cancel(&stream->poll_check_timer);
}
3143 | ||
d7965152 RB |
/* Stream callbacks implementing an OA-metrics perf stream. */
static const struct i915_perf_stream_ops i915_oa_stream_ops = {
	.destroy = i915_oa_stream_destroy,
	.enable = i915_oa_stream_enable,
	.disable = i915_oa_stream_disable,
	.wait_unlocked = i915_oa_wait_unlocked,
	.poll_wait = i915_oa_poll_wait,
	.read = i915_oa_read,
};
3152 | ||
4b4e973d CW |
3153 | static int i915_perf_stream_enable_sync(struct i915_perf_stream *stream) |
3154 | { | |
d7d50f80 CW |
3155 | struct i915_active *active; |
3156 | int err; | |
4b4e973d | 3157 | |
d7d50f80 CW |
3158 | active = i915_active_create(); |
3159 | if (!active) | |
3160 | return -ENOMEM; | |
4b4e973d | 3161 | |
d7d50f80 CW |
3162 | err = stream->perf->ops.enable_metric_set(stream, active); |
3163 | if (err == 0) | |
3164 | __i915_active_wait(active, TASK_UNINTERRUPTIBLE); | |
4b4e973d | 3165 | |
d7d50f80 CW |
3166 | i915_active_put(active); |
3167 | return err; | |
4b4e973d CW |
3168 | } |
3169 | ||
11ecbddd LL |
3170 | static void |
3171 | get_default_sseu_config(struct intel_sseu *out_sseu, | |
3172 | struct intel_engine_cs *engine) | |
3173 | { | |
0b6613c6 | 3174 | const struct sseu_dev_info *devinfo_sseu = &engine->gt->info.sseu; |
11ecbddd LL |
3175 | |
3176 | *out_sseu = intel_sseu_from_device_info(devinfo_sseu); | |
3177 | ||
651e7d48 | 3178 | if (GRAPHICS_VER(engine->i915) == 11) { |
11ecbddd LL |
3179 | /* |
3180 | * We only need subslice count so it doesn't matter which ones | |
3181 | * we select - just turn off low bits in the amount of half of | |
3182 | * all available subslices per slice. | |
3183 | */ | |
3184 | out_sseu->subslice_mask = | |
3185 | ~(~0 << (hweight8(out_sseu->subslice_mask) / 2)); | |
3186 | out_sseu->slice_mask = 0x1; | |
3187 | } | |
3188 | } | |
3189 | ||
3190 | static int | |
3191 | get_sseu_config(struct intel_sseu *out_sseu, | |
3192 | struct intel_engine_cs *engine, | |
3193 | const struct drm_i915_gem_context_param_sseu *drm_sseu) | |
3194 | { | |
3195 | if (drm_sseu->engine.engine_class != engine->uabi_class || | |
3196 | drm_sseu->engine.engine_instance != engine->uabi_instance) | |
3197 | return -EINVAL; | |
3198 | ||
0b6613c6 | 3199 | return i915_gem_user_to_context_sseu(engine->gt, drm_sseu, out_sseu); |
4b4e973d CW |
3200 | } |
3201 | ||
bc7ed4d3 UNR |
3202 | /* |
3203 | * OA timestamp frequency = CS timestamp frequency in most platforms. On some | |
3204 | * platforms OA unit ignores the CTC_SHIFT and the 2 timestamps differ. In such | |
3205 | * cases, return the adjusted CS timestamp frequency to the user. | |
3206 | */ | |
3207 | u32 i915_perf_oa_timestamp_frequency(struct drm_i915_private *i915) | |
3208 | { | |
a6b44302 UNR |
3209 | /* |
3210 | * Wa_18013179988:dg2 | |
3211 | * Wa_14015846243:mtl | |
3212 | */ | |
3213 | if (IS_DG2(i915) || IS_METEORLAKE(i915)) { | |
bc7ed4d3 UNR |
3214 | intel_wakeref_t wakeref; |
3215 | u32 reg, shift; | |
3216 | ||
3217 | with_intel_runtime_pm(to_gt(i915)->uncore->rpm, wakeref) | |
3218 | reg = intel_uncore_read(to_gt(i915)->uncore, RPM_CONFIG0); | |
3219 | ||
3220 | shift = REG_FIELD_GET(GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, | |
3221 | reg); | |
3222 | ||
3223 | return to_gt(i915)->clock_frequency << (3 - shift); | |
3224 | } | |
3225 | ||
3226 | return to_gt(i915)->clock_frequency; | |
3227 | } | |
3228 | ||
16d98b31 RB |
3229 | /** |
3230 | * i915_oa_stream_init - validate combined props for OA stream and init | |
3231 | * @stream: An i915 perf stream | |
3232 | * @param: The open parameters passed to `DRM_I915_PERF_OPEN` | |
3233 | * @props: The property state that configures stream (individually validated) | |
3234 | * | |
3235 | * While read_properties_unlocked() validates properties in isolation it | |
3236 | * doesn't ensure that the combination necessarily makes sense. | |
3237 | * | |
3238 | * At this point it has been determined that userspace wants a stream of | |
3239 | * OA metrics, but still we need to further validate the combined | |
3240 | * properties are OK. | |
3241 | * | |
3242 | * If the configuration makes sense then we can allocate memory for | |
3243 | * a circular OA buffer and apply the requested metric set configuration. | |
3244 | * | |
3245 | * Returns: zero on success or a negative error code. | |
3246 | */ | |
d7965152 RB |
static int i915_oa_stream_init(struct i915_perf_stream *stream,
			       struct drm_i915_perf_open_param *param,
			       struct perf_open_properties *props)
{
	struct drm_i915_private *i915 = stream->perf->i915;
	struct i915_perf *perf = stream->perf;
	struct i915_perf_group *g;
	struct intel_gt *gt;
	int ret;

	/* --- Validation phase: no resources acquired yet. --- */

	if (!props->engine) {
		drm_dbg(&stream->perf->i915->drm,
			"OA engine not specified\n");
		return -EINVAL;
	}
	gt = props->engine->gt;
	g = props->engine->oa_group;

	/*
	 * If the sysfs metrics/ directory wasn't registered for some
	 * reason then don't let userspace try their luck with config
	 * IDs
	 */
	if (!perf->metrics_kobj) {
		drm_dbg(&stream->perf->i915->drm,
			"OA metrics weren't advertised via sysfs\n");
		return -EINVAL;
	}

	/* gen12 context-filtered streams may omit SAMPLE_OA_REPORT. */
	if (!(props->sample_flags & SAMPLE_OA_REPORT) &&
	    (GRAPHICS_VER(perf->i915) < 12 || !stream->ctx)) {
		drm_dbg(&stream->perf->i915->drm,
			"Only OA report sampling supported\n");
		return -EINVAL;
	}

	if (!perf->ops.enable_metric_set) {
		drm_dbg(&stream->perf->i915->drm,
			"OA unit not supported\n");
		return -ENODEV;
	}

	/*
	 * To avoid the complexity of having to accurately filter
	 * counter reports and marshal to the appropriate client
	 * we currently only allow exclusive access
	 */
	if (g->exclusive_stream) {
		drm_dbg(&stream->perf->i915->drm,
			"OA unit already in use\n");
		return -EBUSY;
	}

	if (!props->oa_format) {
		drm_dbg(&stream->perf->i915->drm,
			"OA report format not specified\n");
		return -EINVAL;
	}

	/* --- Fill in stream state from the validated properties. --- */

	stream->engine = props->engine;
	stream->uncore = stream->engine->gt->uncore;

	stream->sample_size = sizeof(struct drm_i915_perf_record_header);

	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0))
		return -EINVAL;

	stream->sample_flags = props->sample_flags;
	stream->sample_size += stream->oa_buffer.format->size;

	stream->hold_preemption = props->hold_preemption;

	stream->periodic = props->oa_periodic;
	if (stream->periodic)
		stream->period_exponent = props->oa_period_exponent;

	/* --- Resource acquisition; unwound via the goto chain below. --- */

	if (stream->ctx) {
		ret = oa_get_render_ctx_id(stream);
		if (ret) {
			drm_dbg(&stream->perf->i915->drm,
				"Invalid context id to filter with\n");
			return ret;
		}
	}

	ret = alloc_noa_wait(stream);
	if (ret) {
		drm_dbg(&stream->perf->i915->drm,
			"Unable to allocate NOA wait batch buffer\n");
		goto err_noa_wait_alloc;
	}

	stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
	if (!stream->oa_config) {
		drm_dbg(&stream->perf->i915->drm,
			"Invalid OA config id=%i\n", props->metrics_set);
		ret = -EINVAL;
		goto err_config;
	}

	/* PRM - observability performance counters:
	 *
	 * OACONTROL, performance counter enable, note:
	 *
	 * "When this bit is set, in order to have coherent counts,
	 * RC6 power state and trunk clock gating must be disabled.
	 * This can be achieved by programming MMIO registers as
	 * 0xA094=0 and 0xA090[31]=1"
	 *
	 * In our case we are expecting that taking pm + FORCEWAKE
	 * references will effectively disable RC6.
	 */
	intel_engine_pm_get(stream->engine);
	intel_uncore_forcewake_get(stream->uncore, FORCEWAKE_ALL);

	/*
	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
	 * OA to lose the configuration state. Prevent this by overriding GUCRC
	 * mode.
	 */
	if (intel_uc_uses_guc_rc(&gt->uc) &&
	    (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_C0) ||
	     IS_DG2_GRAPHICS_STEP(gt->i915, G11, STEP_A0, STEP_B0))) {
		ret = intel_guc_slpc_override_gucrc_mode(&gt->uc.guc.slpc,
							 SLPC_GUCRC_MODE_GUCRC_NO_RC6);
		if (ret) {
			drm_dbg(&stream->perf->i915->drm,
				"Unable to override gucrc mode\n");
			goto err_gucrc;
		}

		/* Remembered so the error paths / destroy can undo it. */
		stream->override_gucrc = true;
	}

	ret = alloc_oa_buffer(stream);
	if (ret)
		goto err_oa_buf_alloc;

	stream->ops = &i915_oa_stream_ops;

	stream->engine->gt->perf.sseu = props->sseu;
	/* Publish ourselves as the group's one exclusive stream. */
	WRITE_ONCE(g->exclusive_stream, stream);

	ret = i915_perf_stream_enable_sync(stream);
	if (ret) {
		drm_dbg(&stream->perf->i915->drm,
			"Unable to enable metric set\n");
		goto err_enable;
	}

	drm_dbg(&stream->perf->i915->drm,
		"opening stream oa config uuid=%s\n",
		stream->oa_config->uuid);

	hrtimer_init(&stream->poll_check_timer,
		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	stream->poll_check_timer.function = oa_poll_check_timer_cb;
	init_waitqueue_head(&stream->poll_wq);
	spin_lock_init(&stream->oa_buffer.ptr_lock);
	mutex_init(&stream->lock);

	return 0;

	/* Unwind in (mostly) reverse order of acquisition. */
err_enable:
	WRITE_ONCE(g->exclusive_stream, NULL);
	perf->ops.disable_metric_set(stream);

	free_oa_buffer(stream);

err_oa_buf_alloc:
	if (stream->override_gucrc)
		intel_guc_slpc_unset_gucrc_mode(&gt->uc.guc.slpc);

err_gucrc:
	intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL);
	intel_engine_pm_put(stream->engine);

	free_oa_configs(stream);

err_config:
	free_noa_wait(stream);

err_noa_wait_alloc:
	if (stream->ctx)
		oa_put_render_ctx_id(stream);

	return ret;
}
3436 | ||
7dc56af5 CW |
3437 | void i915_oa_init_reg_state(const struct intel_context *ce, |
3438 | const struct intel_engine_cs *engine) | |
19f81df2 | 3439 | { |
28b6cb08 | 3440 | struct i915_perf_stream *stream; |
19f81df2 | 3441 | |
8a68d464 | 3442 | if (engine->class != RENDER_CLASS) |
19f81df2 RB |
3443 | return; |
3444 | ||
a5af081d | 3445 | /* perf.exclusive_stream serialised by lrc_configure_all_contexts() */ |
5f284e9c | 3446 | stream = READ_ONCE(engine->oa_group->exclusive_stream); |
651e7d48 | 3447 | if (stream && GRAPHICS_VER(stream->perf->i915) < 12) |
7dc56af5 | 3448 | gen8_update_reg_state_unlocked(ce, stream); |
19f81df2 RB |
3449 | } |
3450 | ||
16d98b31 RB |
3451 | /** |
3452 | * i915_perf_read - handles read() FOP for i915 perf stream FDs | |
3453 | * @file: An i915 perf stream file | |
3454 | * @buf: destination buffer given by userspace | |
3455 | * @count: the number of bytes userspace wants to read | |
3456 | * @ppos: (inout) file seek position (unused) | |
3457 | * | |
3458 | * The entry point for handling a read() on a stream file descriptor from | |
3459 | * userspace. Most of the work is left to the i915_perf_read_locked() and | |
3460 | * &i915_perf_stream_ops->read but to save having stream implementations (of | |
3461 | * which we might have multiple later) we handle blocking read here. | |
3462 | * | |
3463 | * We can also consistently treat trying to read from a disabled stream | |
3464 | * as an IO error so implementations can assume the stream is enabled | |
3465 | * while reading. | |
3466 | * | |
3467 | * Returns: The number of bytes copied or a negative error code on failure. | |
3468 | */ | |
eec688e1 RB |
3469 | static ssize_t i915_perf_read(struct file *file, |
3470 | char __user *buf, | |
3471 | size_t count, | |
3472 | loff_t *ppos) | |
3473 | { | |
3474 | struct i915_perf_stream *stream = file->private_data; | |
bcad588d AD |
3475 | size_t offset = 0; |
3476 | int ret; | |
eec688e1 | 3477 | |
d7965152 RB |
3478 | /* To ensure it's handled consistently we simply treat all reads of a |
3479 | * disabled stream as an error. In particular it might otherwise lead | |
3480 | * to a deadlock for blocking file descriptors... | |
3481 | */ | |
be0bdd67 | 3482 | if (!stream->enabled || !(stream->sample_flags & SAMPLE_OA_REPORT)) |
d7965152 RB |
3483 | return -EIO; |
3484 | ||
eec688e1 | 3485 | if (!(file->f_flags & O_NONBLOCK)) { |
d7965152 RB |
3486 | /* There's the small chance of false positives from |
3487 | * stream->ops->wait_unlocked. | |
3488 | * | |
3489 | * E.g. with single context filtering since we only wait until | |
3490 | * oabuffer has >= 1 report we don't immediately know whether | |
3491 | * any reports really belong to the current context | |
eec688e1 RB |
3492 | */ |
3493 | do { | |
3494 | ret = stream->ops->wait_unlocked(stream); | |
3495 | if (ret) | |
3496 | return ret; | |
3497 | ||
2db609c0 | 3498 | mutex_lock(&stream->lock); |
bcad588d | 3499 | ret = stream->ops->read(stream, buf, count, &offset); |
2db609c0 | 3500 | mutex_unlock(&stream->lock); |
bcad588d | 3501 | } while (!offset && !ret); |
eec688e1 | 3502 | } else { |
2db609c0 | 3503 | mutex_lock(&stream->lock); |
bcad588d | 3504 | ret = stream->ops->read(stream, buf, count, &offset); |
2db609c0 | 3505 | mutex_unlock(&stream->lock); |
eec688e1 RB |
3506 | } |
3507 | ||
a9a08845 | 3508 | /* We allow the poll checking to sometimes report false positive EPOLLIN |
26ebd9c7 RB |
3509 | * events where we might actually report EAGAIN on read() if there's |
3510 | * not really any data available. In this situation though we don't | |
a9a08845 | 3511 | * want to enter a busy loop between poll() reporting a EPOLLIN event |
26ebd9c7 RB |
3512 | * and read() returning -EAGAIN. Clearing the oa.pollin state here |
3513 | * effectively ensures we back off until the next hrtimer callback | |
a9a08845 | 3514 | * before reporting another EPOLLIN event. |
bcad588d AD |
3515 | * The exception to this is if ops->read() returned -ENOSPC which means |
3516 | * that more OA data is available than could fit in the user provided | |
3517 | * buffer. In this case we want the next poll() call to not block. | |
26ebd9c7 | 3518 | */ |
bcad588d | 3519 | if (ret != -ENOSPC) |
a37f08a8 | 3520 | stream->pollin = false; |
d7965152 | 3521 | |
bcad588d AD |
3522 | /* Possible values for ret are 0, -EFAULT, -ENOSPC, -EIO, ... */ |
3523 | return offset ?: (ret ?: -EAGAIN); | |
eec688e1 RB |
3524 | } |
3525 | ||
d7965152 RB |
/* Periodic hrtimer callback: flag readiness and wake poll()/read() waiters. */
static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
{
	struct i915_perf_stream *stream =
		container_of(hrtimer, typeof(*stream), poll_check_timer);

	/* Only wake waiters once the OA buffer actually has data. */
	if (oa_buffer_check_unlocked(stream)) {
		stream->pollin = true;
		wake_up(&stream->poll_wq);
	}

	/* Re-arm for the next check, one poll period from now. */
	hrtimer_forward_now(hrtimer,
			    ns_to_ktime(stream->poll_oa_period));

	return HRTIMER_RESTART;
}
3541 | ||
16d98b31 RB |
3542 | /** |
3543 | * i915_perf_poll_locked - poll_wait() with a suitable wait queue for stream | |
16d98b31 RB |
3544 | * @stream: An i915 perf stream |
3545 | * @file: An i915 perf stream file | |
3546 | * @wait: poll() state table | |
3547 | * | |
3548 | * For handling userspace polling on an i915 perf stream, this calls through to | |
3549 | * &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that | |
3550 | * will be woken for new stream data. | |
3551 | * | |
16d98b31 RB |
3552 | * Returns: any poll events that are ready without sleeping |
3553 | */ | |
8f8b1171 CW |
3554 | static __poll_t i915_perf_poll_locked(struct i915_perf_stream *stream, |
3555 | struct file *file, | |
3556 | poll_table *wait) | |
eec688e1 | 3557 | { |
afc9a42b | 3558 | __poll_t events = 0; |
eec688e1 RB |
3559 | |
3560 | stream->ops->poll_wait(stream, file, wait); | |
3561 | ||
d7965152 RB |
3562 | /* Note: we don't explicitly check whether there's something to read |
3563 | * here since this path may be very hot depending on what else | |
3564 | * userspace is polling, or on the timeout in use. We rely solely on | |
3565 | * the hrtimer/oa_poll_check_timer_cb to notify us when there are | |
3566 | * samples to read. | |
3567 | */ | |
a37f08a8 | 3568 | if (stream->pollin) |
a9a08845 | 3569 | events |= EPOLLIN; |
eec688e1 | 3570 | |
d7965152 | 3571 | return events; |
eec688e1 RB |
3572 | } |
3573 | ||
16d98b31 RB |
3574 | /** |
3575 | * i915_perf_poll - call poll_wait() with a suitable wait queue for stream | |
3576 | * @file: An i915 perf stream file | |
3577 | * @wait: poll() state table | |
3578 | * | |
3579 | * For handling userspace polling on an i915 perf stream, this ensures | |
3580 | * poll_wait() gets called with a wait queue that will be woken for new stream | |
3581 | * data. | |
3582 | * | |
3583 | * Note: Implementation deferred to i915_perf_poll_locked() | |
3584 | * | |
3585 | * Returns: any poll events that are ready without sleeping | |
3586 | */ | |
afc9a42b | 3587 | static __poll_t i915_perf_poll(struct file *file, poll_table *wait) |
eec688e1 RB |
3588 | { |
3589 | struct i915_perf_stream *stream = file->private_data; | |
afc9a42b | 3590 | __poll_t ret; |
eec688e1 | 3591 | |
2db609c0 | 3592 | mutex_lock(&stream->lock); |
8f8b1171 | 3593 | ret = i915_perf_poll_locked(stream, file, wait); |
2db609c0 | 3594 | mutex_unlock(&stream->lock); |
eec688e1 RB |
3595 | |
3596 | return ret; | |
3597 | } | |
3598 | ||
16d98b31 RB |
3599 | /** |
3600 | * i915_perf_enable_locked - handle `I915_PERF_IOCTL_ENABLE` ioctl | |
3601 | * @stream: A disabled i915 perf stream | |
3602 | * | |
3603 | * [Re]enables the associated capture of data for this stream. | |
3604 | * | |
3605 | * If a stream was previously enabled then there's currently no intention | |
3606 | * to provide userspace any guarantee about the preservation of previously | |
3607 | * buffered data. | |
3608 | */ | |
eec688e1 RB |
static void i915_perf_enable_locked(struct i915_perf_stream *stream)
{
	/* Enabling twice is a no-op. */
	if (stream->enabled)
		return;

	/* Allow stream->ops->enable() to refer to this */
	stream->enabled = true;

	if (stream->ops->enable)
		stream->ops->enable(stream);

	/* Mark the pinned context non-preemptible while we are enabled. */
	if (stream->hold_preemption)
		intel_context_set_nopreempt(stream->pinned_ctx);
}
3623 | ||
16d98b31 RB |
3624 | /** |
3625 | * i915_perf_disable_locked - handle `I915_PERF_IOCTL_DISABLE` ioctl | |
3626 | * @stream: An enabled i915 perf stream | |
3627 | * | |
3628 | * Disables the associated capture of data for this stream. | |
3629 | * | |
3630 | * The intention is that disabling an re-enabling a stream will ideally be | |
3631 | * cheaper than destroying and re-opening a stream with the same configuration, | |
3632 | * though there are no formal guarantees about what state or buffered data | |
3633 | * must be retained between disabling and re-enabling a stream. | |
3634 | * | |
3635 | * Note: while a stream is disabled it's considered an error for userspace | |
3636 | * to attempt to read from the stream (-EIO). | |
3637 | */ | |
eec688e1 RB |
static void i915_perf_disable_locked(struct i915_perf_stream *stream)
{
	/* Disabling twice is a no-op. */
	if (!stream->enabled)
		return;

	/* Allow stream->ops->disable() to refer to this */
	stream->enabled = false;

	/* Undo the enable-time steps in reverse order. */
	if (stream->hold_preemption)
		intel_context_clear_nopreempt(stream->pinned_ctx);

	if (stream->ops->disable)
		stream->ops->disable(stream);
}
3652 | ||
7831e9a9 CW |
/*
 * Handle `I915_PERF_IOCTL_CONFIG`: switch the stream to the OA config
 * identified by @metrics_set. Returns the id of the previously active
 * config on success, or a negative error code.
 */
static long i915_perf_config_locked(struct i915_perf_stream *stream,
				    unsigned long metrics_set)
{
	struct i915_oa_config *config;
	/* Capture the current config's id before any swap takes place. */
	long ret = stream->oa_config->id;

	config = i915_perf_get_oa_config(stream->perf, metrics_set);
	if (!config)
		return -EINVAL;

	if (config != stream->oa_config) {
		int err;

		/*
		 * If OA is bound to a specific context, emit the
		 * reconfiguration inline from that context. The update
		 * will then be ordered with respect to submission on that
		 * context.
		 *
		 * When set globally, we use a low priority kernel context,
		 * so it will effectively take effect when idle.
		 */
		err = emit_oa_config(stream, config, oa_context(stream), NULL);
		if (!err)
			/* xchg hands back the old config for the put below. */
			config = xchg(&stream->oa_config, config);
		else
			ret = err;
	}

	/* Drops either the unused new config or the replaced old one. */
	i915_oa_config_put(config);

	return ret;
}
3686 | ||
16d98b31 | 3687 | /** |
e9d2871f | 3688 | * i915_perf_ioctl_locked - support ioctl() usage with i915 perf stream FDs |
16d98b31 RB |
3689 | * @stream: An i915 perf stream |
3690 | * @cmd: the ioctl request | |
3691 | * @arg: the ioctl data | |
3692 | * | |
16d98b31 RB |
3693 | * Returns: zero on success or a negative error code. Returns -EINVAL for |
3694 | * an unknown ioctl request. | |
3695 | */ | |
eec688e1 RB |
3696 | static long i915_perf_ioctl_locked(struct i915_perf_stream *stream, |
3697 | unsigned int cmd, | |
3698 | unsigned long arg) | |
3699 | { | |
3700 | switch (cmd) { | |
3701 | case I915_PERF_IOCTL_ENABLE: | |
3702 | i915_perf_enable_locked(stream); | |
3703 | return 0; | |
3704 | case I915_PERF_IOCTL_DISABLE: | |
3705 | i915_perf_disable_locked(stream); | |
3706 | return 0; | |
7831e9a9 CW |
3707 | case I915_PERF_IOCTL_CONFIG: |
3708 | return i915_perf_config_locked(stream, arg); | |
eec688e1 RB |
3709 | } |
3710 | ||
3711 | return -EINVAL; | |
3712 | } | |
3713 | ||
16d98b31 RB |
3714 | /** |
3715 | * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs | |
3716 | * @file: An i915 perf stream file | |
3717 | * @cmd: the ioctl request | |
3718 | * @arg: the ioctl data | |
3719 | * | |
3720 | * Implementation deferred to i915_perf_ioctl_locked(). | |
3721 | * | |
3722 | * Returns: zero on success or a negative error code. Returns -EINVAL for | |
3723 | * an unknown ioctl request. | |
3724 | */ | |
eec688e1 RB |
3725 | static long i915_perf_ioctl(struct file *file, |
3726 | unsigned int cmd, | |
3727 | unsigned long arg) | |
3728 | { | |
3729 | struct i915_perf_stream *stream = file->private_data; | |
eec688e1 RB |
3730 | long ret; |
3731 | ||
2db609c0 | 3732 | mutex_lock(&stream->lock); |
eec688e1 | 3733 | ret = i915_perf_ioctl_locked(stream, cmd, arg); |
2db609c0 | 3734 | mutex_unlock(&stream->lock); |
eec688e1 RB |
3735 | |
3736 | return ret; | |
3737 | } | |
3738 | ||
16d98b31 RB |
/**
 * i915_perf_destroy_locked - destroy an i915 perf stream
 * @stream: An i915 perf stream
 *
 * Frees all resources associated with the given i915 perf @stream, disabling
 * any associated data capture in the process.
 *
 * Note: The gt->perf.lock mutex has been taken to serialize
 * with any non-file-operation driver hooks.
 */
static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
{
	/* Stop capture first so backend teardown sees a quiescent stream. */
	if (stream->enabled)
		i915_perf_disable_locked(stream);

	/* Backend-specific teardown. */
	if (stream->ops->destroy)
		stream->ops->destroy(stream);

	/* Drop the context reference taken for single-context streams. */
	if (stream->ctx)
		i915_gem_context_put(stream->ctx);

	kfree(stream);
}
3762 | ||
16d98b31 RB |
/**
 * i915_perf_release - handles userspace close() of a stream file
 * @inode: anonymous inode associated with file
 * @file: An i915 perf stream file
 *
 * Cleans up any resources associated with an open i915 perf stream file.
 *
 * NB: close() can't really fail from the userspace point of view.
 *
 * Returns: zero on success or a negative error code.
 */
static int i915_perf_release(struct inode *inode, struct file *file)
{
	struct i915_perf_stream *stream = file->private_data;
	struct i915_perf *perf = stream->perf;
	/* Snapshot the gt before the stream (and its engine pointer) is freed. */
	struct intel_gt *gt = stream->engine->gt;

	/*
	 * Within this call, we know that the fd is being closed and we have no
	 * other user of stream->lock. Use the perf lock to destroy the stream
	 * here.
	 */
	mutex_lock(&gt->perf.lock);
	i915_perf_destroy_locked(stream);
	mutex_unlock(&gt->perf.lock);

	/* Release the reference the perf stream kept on the driver. */
	drm_dev_put(&perf->i915->drm);

	return 0;
}
3794 | ||
3795 | ||
/* File operations backing an open i915 perf stream fd. */
static const struct file_operations fops = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,
	.release	= i915_perf_release,
	.poll		= i915_perf_poll,
	.read		= i915_perf_read,
	.unlocked_ioctl	= i915_perf_ioctl,
	/* Our ioctls have no arguments, so it's safe to use the same function
	 * to handle 32bits compatibility.
	 */
	.compat_ioctl   = i915_perf_ioctl,
};
3808 | ||
3809 | ||
16d98b31 RB |
/**
 * i915_perf_open_ioctl_locked - DRM ioctl() for userspace to open a stream FD
 * @perf: i915 perf instance
 * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
 * @props: individually validated u64 property value pairs
 * @file: drm file
 *
 * See i915_perf_open_ioctl() for interface details.
 *
 * Implements further stream config validation and stream initialization on
 * behalf of i915_perf_open_ioctl() with the gt->perf.lock mutex
 * taken to serialize with any non-file-operation driver hooks.
 *
 * Note: at this point the @props have only been validated in isolation and
 * it's still necessary to validate that the combination of properties makes
 * sense.
 *
 * In the case where userspace is interested in OA unit metrics then further
 * config validation and stream initialization details will be handled by
 * i915_oa_stream_init(). The code here should only validate config state that
 * will be relevant to all stream types / backends.
 *
 * Returns: a new stream fd on success or a negative error code.
 */
static int
i915_perf_open_ioctl_locked(struct i915_perf *perf,
			    struct drm_i915_perf_open_param *param,
			    struct perf_open_properties *props,
			    struct drm_file *file)
{
	struct i915_gem_context *specific_ctx = NULL;
	struct i915_perf_stream *stream = NULL;
	unsigned long f_flags = 0;
	bool privileged_op = true;
	int stream_fd;
	int ret;

	if (props->single_context) {
		u32 ctx_handle = props->ctx_handle;
		struct drm_i915_file_private *file_priv = file->driver_priv;

		specific_ctx = i915_gem_context_lookup(file_priv, ctx_handle);
		if (IS_ERR(specific_ctx)) {
			drm_dbg(&perf->i915->drm,
				"Failed to look up context with ID %u for opening perf stream\n",
				ctx_handle);
			ret = PTR_ERR(specific_ctx);
			goto err;
		}
	}

	/*
	 * On Haswell the OA unit supports clock gating off for a specific
	 * context and in this mode there's no visibility of metrics for the
	 * rest of the system, which we consider acceptable for a
	 * non-privileged client.
	 *
	 * For Gen8->11 the OA unit no longer supports clock gating off for a
	 * specific context and the kernel can't securely stop the counters
	 * from updating as system-wide / global values. Even though we can
	 * filter reports based on the included context ID we can't block
	 * clients from seeing the raw / global counter values via
	 * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
	 * enable the OA unit by default.
	 *
	 * For Gen12+ we gain a new OAR unit that only monitors the RCS on a
	 * per context basis. So we can relax requirements there if the user
	 * doesn't request global stream access (i.e. query based sampling
	 * using MI_REPORT_PERF_COUNT).
	 */
	if (IS_HASWELL(perf->i915) && specific_ctx)
		privileged_op = false;
	else if (GRAPHICS_VER(perf->i915) == 12 && specific_ctx &&
		 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
		privileged_op = false;

	/* Disabling preemption system-wide always needs privilege. */
	if (props->hold_preemption) {
		if (!props->single_context) {
			drm_dbg(&perf->i915->drm,
				"preemption disable with no context\n");
			ret = -EINVAL;
			goto err;
		}
		privileged_op = true;
	}

	/*
	 * Asking for SSEU configuration is a privileged operation.
	 */
	if (props->has_sseu)
		privileged_op = true;
	else
		get_default_sseu_config(&props->sseu, props->engine);

	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
	 * we check a dev.i915.perf_stream_paranoid sysctl option
	 * to determine if it's ok to access system wide OA counters
	 * without CAP_PERFMON or CAP_SYS_ADMIN privileges.
	 */
	if (privileged_op &&
	    i915_perf_stream_paranoid && !perfmon_capable()) {
		drm_dbg(&perf->i915->drm,
			"Insufficient privileges to open i915 perf stream\n");
		ret = -EACCES;
		goto err_ctx;
	}

	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
	if (!stream) {
		ret = -ENOMEM;
		goto err_ctx;
	}

	stream->perf = perf;
	stream->ctx = specific_ctx;
	stream->poll_oa_period = props->poll_oa_period;

	ret = i915_oa_stream_init(stream, param, props);
	if (ret)
		goto err_alloc;

	/* we avoid simply assigning stream->sample_flags = props->sample_flags
	 * to have _stream_init check the combination of sample flags more
	 * thoroughly, but still this is the expected result at this point.
	 */
	if (WARN_ON(stream->sample_flags != props->sample_flags)) {
		ret = -ENODEV;
		goto err_flags;
	}

	if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
		f_flags |= O_NONBLOCK;

	stream_fd = anon_inode_getfd("[i915_perf]", &fops, stream, f_flags);
	if (stream_fd < 0) {
		ret = stream_fd;
		goto err_flags;
	}

	if (!(param->flags & I915_PERF_FLAG_DISABLED))
		i915_perf_enable_locked(stream);

	/* Take a reference on the driver that will be kept with stream_fd
	 * until its release.
	 */
	drm_dev_get(&perf->i915->drm);

	return stream_fd;

err_flags:
	if (stream->ops->destroy)
		stream->ops->destroy(stream);
err_alloc:
	kfree(stream);
err_ctx:
	if (specific_ctx)
		i915_gem_context_put(specific_ctx);
err:
	return ret;
}
3972 | ||
8f8b1171 | 3973 | static u64 oa_exponent_to_ns(struct i915_perf *perf, int exponent) |
155e941f | 3974 | { |
bc7ed4d3 UNR |
3975 | u64 nom = (2ULL << exponent) * NSEC_PER_SEC; |
3976 | u32 den = i915_perf_oa_timestamp_frequency(perf->i915); | |
3977 | ||
3978 | return div_u64(nom + den - 1, den); | |
155e941f RB |
3979 | } |
3980 | ||
77892f4f UNR |
/* Test whether @format is flagged as supported for this device. */
static __always_inline bool
oa_format_valid(struct i915_perf *perf, enum drm_i915_oa_format format)
{
	return test_bit(format, perf->format_mask);
}
3986 | ||
/* Mark @format as supported in the per-device format bitmap. */
static __always_inline void
oa_format_add(struct i915_perf *perf, enum drm_i915_oa_format format)
{
	__set_bit(format, perf->format_mask);
}
3992 | ||
16d98b31 RB |
/**
 * read_properties_unlocked - validate + copy userspace stream open properties
 * @perf: i915 perf instance
 * @uprops: The array of u64 key value pairs given by userspace
 * @n_props: The number of key value pairs expected in @uprops
 * @props: The stream configuration built up while validating properties
 *
 * Note this function only validates properties in isolation; it doesn't
 * validate that the combination of properties makes sense or that all
 * properties necessary for a particular kind of stream have been set.
 *
 * Note that there currently aren't any ordering requirements for properties so
 * we shouldn't validate or assume anything about ordering here. This doesn't
 * rule out defining new properties with ordering requirements in the future.
 *
 * Returns: zero on success or a negative error code.
 */
static int read_properties_unlocked(struct i915_perf *perf,
				    u64 __user *uprops,
				    u32 n_props,
				    struct perf_open_properties *props)
{
	struct drm_i915_gem_context_param_sseu user_sseu;
	u64 __user *uprop = uprops;
	bool config_instance = false;
	bool config_class = false;
	bool config_sseu = false;
	u8 class, instance;
	u32 i;
	int ret;

	memset(props, 0, sizeof(struct perf_open_properties));
	props->poll_oa_period = DEFAULT_POLL_PERIOD_NS;

	/* Considering that ID = 0 is reserved and assuming that we don't
	 * (currently) expect any configurations to ever specify duplicate
	 * values for a particular property ID then the last _PROP_MAX value is
	 * one greater than the maximum number of properties we expect to get
	 * from userspace.
	 */
	if (!n_props || n_props >= DRM_I915_PERF_PROP_MAX) {
		drm_dbg(&perf->i915->drm,
			"Invalid number of i915 perf properties given\n");
		return -EINVAL;
	}

	/* Defaults when class:instance is not passed */
	class = I915_ENGINE_CLASS_RENDER;
	instance = 0;

	for (i = 0; i < n_props; i++) {
		u64 oa_period, oa_freq_hz;
		u64 id, value;

		ret = get_user(id, uprop);
		if (ret)
			return ret;

		ret = get_user(value, uprop + 1);
		if (ret)
			return ret;

		if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
			drm_dbg(&perf->i915->drm,
				"Unknown i915 perf property ID\n");
			return -EINVAL;
		}

		switch ((enum drm_i915_perf_property_id)id) {
		case DRM_I915_PERF_PROP_CTX_HANDLE:
			props->single_context = 1;
			props->ctx_handle = value;
			break;
		case DRM_I915_PERF_PROP_SAMPLE_OA:
			if (value)
				props->sample_flags |= SAMPLE_OA_REPORT;
			break;
		case DRM_I915_PERF_PROP_OA_METRICS_SET:
			if (value == 0) {
				drm_dbg(&perf->i915->drm,
					"Unknown OA metric set ID\n");
				return -EINVAL;
			}
			props->metrics_set = value;
			break;
		case DRM_I915_PERF_PROP_OA_FORMAT:
			if (value == 0 || value >= I915_OA_FORMAT_MAX) {
				drm_dbg(&perf->i915->drm,
					"Out-of-range OA report format %llu\n",
					value);
				return -EINVAL;
			}
			if (!oa_format_valid(perf, value)) {
				drm_dbg(&perf->i915->drm,
					"Unsupported OA report format %llu\n",
					value);
				return -EINVAL;
			}
			props->oa_format = value;
			break;
		case DRM_I915_PERF_PROP_OA_EXPONENT:
			if (value > OA_EXPONENT_MAX) {
				drm_dbg(&perf->i915->drm,
					"OA timer exponent too high (> %u)\n",
					OA_EXPONENT_MAX);
				return -EINVAL;
			}

			/* Theoretically we can program the OA unit to sample
			 * e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns
			 * for BXT. We don't allow such high sampling
			 * frequencies by default unless root.
			 */

			BUILD_BUG_ON(sizeof(oa_period) != 8);
			oa_period = oa_exponent_to_ns(perf, value);

			/* This check is primarily to ensure that oa_period <=
			 * UINT32_MAX (before passing to do_div which only
			 * accepts a u32 denominator), but we can also skip
			 * checking anything < 1Hz which implicitly can't be
			 * limited via an integer oa_max_sample_rate.
			 */
			if (oa_period <= NSEC_PER_SEC) {
				u64 tmp = NSEC_PER_SEC;
				do_div(tmp, oa_period);
				oa_freq_hz = tmp;
			} else
				oa_freq_hz = 0;

			if (oa_freq_hz > i915_oa_max_sample_rate && !perfmon_capable()) {
				drm_dbg(&perf->i915->drm,
					"OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without CAP_PERFMON or CAP_SYS_ADMIN privileges\n",
					i915_oa_max_sample_rate);
				return -EACCES;
			}

			props->oa_periodic = true;
			props->oa_period_exponent = value;
			break;
		case DRM_I915_PERF_PROP_HOLD_PREEMPTION:
			props->hold_preemption = !!value;
			break;
		case DRM_I915_PERF_PROP_GLOBAL_SSEU: {
			if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 50)) {
				drm_dbg(&perf->i915->drm,
					"SSEU config not supported on gfx %x\n",
					GRAPHICS_VER_FULL(perf->i915));
				return -ENODEV;
			}

			if (copy_from_user(&user_sseu,
					   u64_to_user_ptr(value),
					   sizeof(user_sseu))) {
				drm_dbg(&perf->i915->drm,
					"Unable to copy global sseu parameter\n");
				return -EFAULT;
			}
			/* Deferred: checked against the engine once it's known. */
			config_sseu = true;
			break;
		}
		case DRM_I915_PERF_PROP_POLL_OA_PERIOD:
			if (value < 100000 /* 100us */) {
				drm_dbg(&perf->i915->drm,
					"OA availability timer too small (%lluns < 100us)\n",
					value);
				return -EINVAL;
			}
			props->poll_oa_period = value;
			break;
		case DRM_I915_PERF_PROP_OA_ENGINE_CLASS:
			class = (u8)value;
			config_class = true;
			break;
		case DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE:
			instance = (u8)value;
			config_instance = true;
			break;
		default:
			MISSING_CASE(id);
			return -EINVAL;
		}

		uprop += 2;
	}

	if ((config_class && !config_instance) ||
	    (config_instance && !config_class)) {
		drm_dbg(&perf->i915->drm,
			"OA engine-class and engine-instance parameters must be passed together\n");
		return -EINVAL;
	}

	props->engine = intel_engine_lookup_user(perf->i915, class, instance);
	if (!props->engine) {
		drm_dbg(&perf->i915->drm,
			"OA engine class and instance invalid %d:%d\n",
			class, instance);
		return -EINVAL;
	}

	if (!engine_supports_oa(props->engine)) {
		drm_dbg(&perf->i915->drm,
			"Engine not supported by OA %d:%d\n",
			class, instance);
		return -EINVAL;
	}

	if (config_sseu) {
		ret = get_sseu_config(&props->sseu, props->engine, &user_sseu);
		if (ret) {
			drm_dbg(&perf->i915->drm,
				"Invalid SSEU configuration\n");
			return ret;
		}
		props->has_sseu = true;
	}

	return 0;
}
4211 | ||
16d98b31 RB |
4212 | /** |
4213 | * i915_perf_open_ioctl - DRM ioctl() for userspace to open a stream FD | |
4214 | * @dev: drm device | |
4215 | * @data: ioctl data copied from userspace (unvalidated) | |
4216 | * @file: drm file | |
4217 | * | |
4218 | * Validates the stream open parameters given by userspace including flags | |
4219 | * and an array of u64 key, value pair properties. | |
4220 | * | |
4221 | * Very little is assumed up front about the nature of the stream being | |
4222 | * opened (for instance we don't assume it's for periodic OA unit metrics). An | |
4223 | * i915-perf stream is expected to be a suitable interface for other forms of | |
4224 | * buffered data written by the GPU besides periodic OA metrics. | |
4225 | * | |
4226 | * Note we copy the properties from userspace outside of the i915 perf | |
c1e8d7c6 | 4227 | * mutex to avoid an awkward lockdep with mmap_lock. |
16d98b31 RB |
4228 | * |
4229 | * Most of the implementation details are handled by | |
9677a9f3 | 4230 | * i915_perf_open_ioctl_locked() after taking the >->perf.lock |
16d98b31 RB |
4231 | * mutex for serializing with any non-file-operation driver hooks. |
4232 | * | |
4233 | * Return: A newly opened i915 Perf stream file descriptor or negative | |
4234 | * error code on failure. | |
4235 | */ | |
eec688e1 RB |
4236 | int i915_perf_open_ioctl(struct drm_device *dev, void *data, |
4237 | struct drm_file *file) | |
4238 | { | |
8f8b1171 | 4239 | struct i915_perf *perf = &to_i915(dev)->perf; |
eec688e1 | 4240 | struct drm_i915_perf_open_param *param = data; |
9677a9f3 | 4241 | struct intel_gt *gt; |
eec688e1 RB |
4242 | struct perf_open_properties props; |
4243 | u32 known_open_flags; | |
4244 | int ret; | |
4245 | ||
8f8b1171 | 4246 | if (!perf->i915) { |
2fec5391 UNR |
4247 | drm_dbg(&perf->i915->drm, |
4248 | "i915 perf interface not available for this system\n"); | |
eec688e1 RB |
4249 | return -ENOTSUPP; |
4250 | } | |
4251 | ||
4252 | known_open_flags = I915_PERF_FLAG_FD_CLOEXEC | | |
4253 | I915_PERF_FLAG_FD_NONBLOCK | | |
4254 | I915_PERF_FLAG_DISABLED; | |
4255 | if (param->flags & ~known_open_flags) { | |
2fec5391 UNR |
4256 | drm_dbg(&perf->i915->drm, |
4257 | "Unknown drm_i915_perf_open_param flag\n"); | |
eec688e1 RB |
4258 | return -EINVAL; |
4259 | } | |
4260 | ||
8f8b1171 | 4261 | ret = read_properties_unlocked(perf, |
eec688e1 RB |
4262 | u64_to_user_ptr(param->properties_ptr), |
4263 | param->num_properties, | |
4264 | &props); | |
4265 | if (ret) | |
4266 | return ret; | |
4267 | ||
9677a9f3 UNR |
4268 | gt = props.engine->gt; |
4269 | ||
4270 | mutex_lock(>->perf.lock); | |
8f8b1171 | 4271 | ret = i915_perf_open_ioctl_locked(perf, param, &props, file); |
9677a9f3 | 4272 | mutex_unlock(>->perf.lock); |
eec688e1 RB |
4273 | |
4274 | return ret; | |
4275 | } | |
4276 | ||
16d98b31 RB |
/**
 * i915_perf_register - exposes i915-perf to userspace
 * @i915: i915 device instance
 *
 * In particular OA metric sets are advertised under a sysfs metrics/
 * directory allowing userspace to enumerate valid IDs that can be
 * used to open an i915-perf stream.
 */
void i915_perf_register(struct drm_i915_private *i915)
{
	struct i915_perf *perf = &i915->perf;
	struct intel_gt *gt = to_gt(i915);

	/* Nothing to expose if perf was never initialized for this device. */
	if (!perf->i915)
		return;

	/* To be sure we're synchronized with an attempted
	 * i915_perf_open_ioctl(); considering that we register after
	 * being exposed to userspace.
	 */
	mutex_lock(&gt->perf.lock);

	/* NOTE(review): result is not checked; a NULL metrics_kobj simply
	 * leaves the sysfs metrics/ interface unexposed.
	 */
	perf->metrics_kobj =
		kobject_create_and_add("metrics",
				       &i915->drm.primary->kdev->kobj);

	mutex_unlock(&gt->perf.lock);
}
4305 | ||
16d98b31 RB |
/**
 * i915_perf_unregister - hide i915-perf from userspace
 * @i915: i915 device instance
 *
 * i915-perf state cleanup is split up into an 'unregister' and
 * 'deinit' phase where the interface is first hidden from
 * userspace by i915_perf_unregister() before cleaning up
 * remaining state in i915_perf_fini().
 */
void i915_perf_unregister(struct drm_i915_private *i915)
{
	struct i915_perf *perf = &i915->perf;

	/* Registration may have failed or never happened. */
	if (!perf->metrics_kobj)
		return;

	kobject_put(perf->metrics_kobj);
	perf->metrics_kobj = NULL;
}
4325 | ||
8f8b1171 | 4326 | static bool gen8_is_valid_flex_addr(struct i915_perf *perf, u32 addr) |
f89823c2 LL |
4327 | { |
4328 | static const i915_reg_t flex_eu_regs[] = { | |
4329 | EU_PERF_CNTL0, | |
4330 | EU_PERF_CNTL1, | |
4331 | EU_PERF_CNTL2, | |
4332 | EU_PERF_CNTL3, | |
4333 | EU_PERF_CNTL4, | |
4334 | EU_PERF_CNTL5, | |
4335 | EU_PERF_CNTL6, | |
4336 | }; | |
4337 | int i; | |
4338 | ||
4339 | for (i = 0; i < ARRAY_SIZE(flex_eu_regs); i++) { | |
7c52a221 | 4340 | if (i915_mmio_reg_offset(flex_eu_regs[i]) == addr) |
f89823c2 LL |
4341 | return true; |
4342 | } | |
4343 | return false; | |
4344 | } | |
4345 | ||
66a19a3a MR |
4346 | static bool reg_in_range_table(u32 addr, const struct i915_range *table) |
4347 | { | |
4348 | while (table->start || table->end) { | |
4349 | if (addr >= table->start && addr <= table->end) | |
4350 | return true; | |
fc215230 | 4351 | |
66a19a3a MR |
4352 | table++; |
4353 | } | |
4354 | ||
4355 | return false; | |
4356 | } | |
fc215230 UNR |
4357 | |
4358 | #define REG_EQUAL(addr, mmio) \ | |
4359 | ((addr) == i915_mmio_reg_offset(mmio)) | |
4360 | ||
66a19a3a MR |
/* MMIO ranges accepted as OA B-counter config on gen7-style OA units. */
static const struct i915_range gen7_oa_b_counters[] = {
	{ .start = 0x2710, .end = 0x272c },	/* OASTARTTRIG[1-8] */
	{ .start = 0x2740, .end = 0x275c },	/* OAREPORTTRIG[1-8] */
	{ .start = 0x2770, .end = 0x27ac },	/* OACEC[0-7][0-1] */
	{}	/* sentinel: reg_in_range_table() stops at a zeroed entry */
};
f89823c2 | 4367 | |
66a19a3a MR |
/* MMIO ranges accepted as OA B-counter config on gen12 (OAG unit). */
static const struct i915_range gen12_oa_b_counters[] = {
	{ .start = 0x2b2c, .end = 0x2b2c },	/* GEN12_OAG_OA_PESS */
	{ .start = 0xd900, .end = 0xd91c },	/* GEN12_OAG_OASTARTTRIG[1-8] */
	{ .start = 0xd920, .end = 0xd93c },	/* GEN12_OAG_OAREPORTTRIG1[1-8] */
	{ .start = 0xd940, .end = 0xd97c },	/* GEN12_OAG_CEC[0-7][0-1] */
	{ .start = 0xdc00, .end = 0xdc3c },	/* GEN12_OAG_SCEC[0-7][0-1] */
	{ .start = 0xdc40, .end = 0xdc40 },	/* GEN12_OAG_SPCTR_CNF */
	{ .start = 0xdc44, .end = 0xdc44 },	/* GEN12_OAA_DBG_REG */
	{}	/* sentinel: reg_in_range_table() stops at a zeroed entry */
};
4378 | ||
0fa9349d LL |
4379 | static const struct i915_range xehp_oa_b_counters[] = { |
4380 | { .start = 0xdc48, .end = 0xdc48 }, /* OAA_ENABLE_REG */ | |
4381 | { .start = 0xdd00, .end = 0xdd48 }, /* OAG_LCE0_0 - OAA_LENABLE_REG */ | |
4382 | }; | |
4383 | ||
66a19a3a MR |
/* NOA mux programming ranges common since gen7. */
static const struct i915_range gen7_oa_mux_regs[] = {
	{ .start = 0x91b8, .end = 0x91cc },	/* OA_PERFCNT[1-2], OA_PERFMATRIX */
	{ .start = 0x9800, .end = 0x9888 },	/* MICRO_BP0_0 - NOA_WRITE */
	{ .start = 0xe180, .end = 0xe180 },	/* HALF_SLICE_CHICKEN2 */
	{}	/* sentinel */
};
4390 | ||
/* Haswell-specific NOA mux ranges, checked in addition to gen7's. */
static const struct i915_range hsw_oa_mux_regs[] = {
	{ .start = 0x09e80, .end = 0x09ea4 }, /* HSW_MBVID2_NOA[0-9] */
	{ .start = 0x09ec0, .end = 0x09ec0 }, /* HSW_MBVID2_MISR0 */
	{ .start = 0x25100, .end = 0x2ff90 },
	{}	/* sentinel */
};
4397 | ||
/* Cherryview-specific NOA mux range, checked in addition to gen7's. */
static const struct i915_range chv_oa_mux_regs[] = {
	{ .start = 0x182300, .end = 0x1823a4 },
	{}	/* sentinel */
};
4402 | ||
/* NOA mux ranges added with gen8. */
static const struct i915_range gen8_oa_mux_regs[] = {
	{ .start = 0x0d00, .end = 0x0d2c },	/* RPM_CONFIG[0-1], NOA_CONFIG[0-8] */
	{ .start = 0x20cc, .end = 0x20cc },	/* WAIT_FOR_RC6_EXIT */
	{}	/* sentinel */
};
4408 | ||
/* NOA mux range added with gen11. */
static const struct i915_range gen11_oa_mux_regs[] = {
	{ .start = 0x91c8, .end = 0x91dc },	/* OA_PERFCNT[3-4] */
	{}	/* sentinel */
};
4413 | ||
/* Full NOA mux register set for gen12 (not combined with older tables). */
static const struct i915_range gen12_oa_mux_regs[] = {
	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
	{ .start = 0x20cc, .end = 0x20cc },	/* WAIT_FOR_RC6_EXIT */
	{}	/* sentinel */
};
4422 | ||
d654ae8b UNR |
4423 | /* |
4424 | * Ref: 14010536224: | |
4425 | * 0x20cc is repurposed on MTL, so use a separate array for MTL. | |
4426 | */ | |
4427 | static const struct i915_range mtl_oa_mux_regs[] = { | |
4428 | { .start = 0x0d00, .end = 0x0d04 }, /* RPM_CONFIG[0-1] */ | |
4429 | { .start = 0x0d0c, .end = 0x0d2c }, /* NOA_CONFIG[0-8] */ | |
4430 | { .start = 0x9840, .end = 0x9840 }, /* GDT_CHICKEN_BITS */ | |
4431 | { .start = 0x9884, .end = 0x9888 }, /* NOA_WRITE */ | |
4432 | }; | |
4433 | ||
/* Accept writes to the gen7 B-counter config (OASTARTTRIG/OAREPORTTRIG/OACEC). */
static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, gen7_oa_b_counters);
}
4438 | ||
/* Mux config on gen8: gen7 ranges plus the gen8 additions. */
static bool gen8_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
	       reg_in_range_table(addr, gen8_oa_mux_regs);
}
4444 | ||
/* Mux config on gen11: gen7 + gen8 ranges plus the gen11 additions. */
static bool gen11_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
	       reg_in_range_table(addr, gen8_oa_mux_regs) ||
	       reg_in_range_table(addr, gen11_oa_mux_regs);
}
4451 | ||
/* Mux config on Haswell: gen7 ranges plus the HSW-specific ones. */
static bool hsw_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
	       reg_in_range_table(addr, hsw_oa_mux_regs);
}
4457 | ||
/* Mux config on Cherryview: gen7 ranges plus the CHV-specific one. */
static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
	       reg_in_range_table(addr, chv_oa_mux_regs);
}
4463 | ||
00a7f0d7 LL |
/* Accept writes to the gen12 OAG B-counter config registers. */
static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, gen12_oa_b_counters);
}
4468 | ||
0fa9349d LL |
/* XeHP accepts its extra B-counter ranges on top of gen12's. */
static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
{
	return reg_in_range_table(addr, xehp_oa_b_counters) ||
	       reg_in_range_table(addr, gen12_oa_b_counters);
}
4474 | ||
00a7f0d7 LL |
/* MTL repurposes 0x20cc (see mtl_oa_mux_regs), so it uses its own table. */
static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
{
	if (IS_METEORLAKE(perf->i915))
		return reg_in_range_table(addr, mtl_oa_mux_regs);
	else
		return reg_in_range_table(addr, gen12_oa_mux_regs);
}
4482 | ||
/* Sanitize a userspace-supplied register value before it is programmed. */
static u32 mask_reg_value(u32 reg, u32 val)
{
	/* HALF_SLICE_CHICKEN2 is programmed with the
	 * WaDisableSTUnitPowerOptimization workaround. Make sure the value
	 * programmed by userspace doesn't change this.
	 */
	if (REG_EQUAL(reg, HALF_SLICE_CHICKEN2))
		val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);

	/* WAIT_FOR_RC6_EXIT has only one bit fulfilling the function
	 * indicated by its name and a bunch of selection fields used by OA
	 * configs.
	 */
	if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT))
		val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);

	return val;
}
4501 | ||
/*
 * Copy an array of (address, value) u32 pairs from userspace, checking
 * each address against @is_valid and sanitizing each value with
 * mask_reg_value().
 *
 * Returns a kmalloc'ed array of @n_regs entries owned by the caller,
 * NULL when @n_regs is zero, or an ERR_PTR on fault/validation failure.
 */
static struct i915_oa_reg *alloc_oa_regs(struct i915_perf *perf,
					 bool (*is_valid)(struct i915_perf *perf, u32 addr),
					 u32 __user *regs,
					 u32 n_regs)
{
	struct i915_oa_reg *oa_regs;
	int err;
	u32 i;

	if (!n_regs)
		return NULL;

	/* No is_valid function means we're not allowing any register to be programmed. */
	GEM_BUG_ON(!is_valid);
	if (!is_valid)
		return ERR_PTR(-EINVAL);

	oa_regs = kmalloc_array(n_regs, sizeof(*oa_regs), GFP_KERNEL);
	if (!oa_regs)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < n_regs; i++) {
		u32 addr, value;

		err = get_user(addr, regs);
		if (err)
			goto addr_err;

		if (!is_valid(perf, addr)) {
			drm_dbg(&perf->i915->drm,
				"Invalid oa_reg address: %X\n", addr);
			err = -EINVAL;
			goto addr_err;
		}

		err = get_user(value, regs + 1);
		if (err)
			goto addr_err;

		oa_regs[i].addr = _MMIO(addr);
		oa_regs[i].value = mask_reg_value(addr, value);

		/* Userspace hands us a flat stream of addr/value pairs. */
		regs += 2;
	}

	return oa_regs;

addr_err:
	kfree(oa_regs);
	return ERR_PTR(err);
}
4553 | ||
18fb42db NC |
4554 | static ssize_t show_dynamic_id(struct kobject *kobj, |
4555 | struct kobj_attribute *attr, | |
f89823c2 LL |
4556 | char *buf) |
4557 | { | |
4558 | struct i915_oa_config *oa_config = | |
4559 | container_of(attr, typeof(*oa_config), sysfs_metric_id); | |
4560 | ||
4561 | return sprintf(buf, "%d\n", oa_config->id); | |
4562 | } | |
4563 | ||
/*
 * Expose a dynamically-added OA config under the metrics kobject: a
 * sysfs group named after the config's uuid containing one read-only
 * "id" attribute served by show_dynamic_id().
 */
static int create_dynamic_oa_sysfs_entry(struct i915_perf *perf,
					 struct i915_oa_config *oa_config)
{
	sysfs_attr_init(&oa_config->sysfs_metric_id.attr);
	oa_config->sysfs_metric_id.attr.name = "id";
	oa_config->sysfs_metric_id.attr.mode = S_IRUGO;
	oa_config->sysfs_metric_id.show = show_dynamic_id;
	oa_config->sysfs_metric_id.store = NULL;

	/* NULL-terminated attribute list: just the "id" attribute. */
	oa_config->attrs[0] = &oa_config->sysfs_metric_id.attr;
	oa_config->attrs[1] = NULL;

	oa_config->sysfs_metric.name = oa_config->uuid;
	oa_config->sysfs_metric.attrs = oa_config->attrs;

	return sysfs_create_group(perf->metrics_kobj,
				  &oa_config->sysfs_metric);
}
4582 | ||
/**
 * i915_perf_add_config_ioctl - DRM ioctl() for userspace to add a new OA config
 * @dev: drm device
 * @data: ioctl data (pointer to struct drm_i915_perf_oa_config) copied from
 *        userspace (unvalidated)
 * @file: drm file
 *
 * Validates the submitted OA register to be saved into a new OA config that
 * can then be used for programming the OA unit and its NOA network.
 *
 * Returns: A new allocated config number to be used with the perf open ioctl
 * or a negative error code on failure.
 */
int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
			       struct drm_file *file)
{
	struct i915_perf *perf = &to_i915(dev)->perf;
	struct drm_i915_perf_oa_config *args = data;
	struct i915_oa_config *oa_config, *tmp;
	struct i915_oa_reg *regs;
	int err, id;

	if (!perf->i915) {
		drm_dbg(&perf->i915->drm,
			"i915 perf interface not available for this system\n");
		return -ENOTSUPP;
	}

	if (!perf->metrics_kobj) {
		drm_dbg(&perf->i915->drm,
			"OA metrics weren't advertised via sysfs\n");
		return -EINVAL;
	}

	if (i915_perf_stream_paranoid && !perfmon_capable()) {
		drm_dbg(&perf->i915->drm,
			"Insufficient privileges to add i915 OA config\n");
		return -EACCES;
	}

	/* At least one of the three register lists must be non-empty. */
	if ((!args->mux_regs_ptr || !args->n_mux_regs) &&
	    (!args->boolean_regs_ptr || !args->n_boolean_regs) &&
	    (!args->flex_regs_ptr || !args->n_flex_regs)) {
		drm_dbg(&perf->i915->drm,
			"No OA registers given\n");
		return -EINVAL;
	}

	oa_config = kzalloc(sizeof(*oa_config), GFP_KERNEL);
	if (!oa_config) {
		drm_dbg(&perf->i915->drm,
			"Failed to allocate memory for the OA config\n");
		return -ENOMEM;
	}

	oa_config->perf = perf;
	kref_init(&oa_config->ref);

	if (!uuid_is_valid(args->uuid)) {
		drm_dbg(&perf->i915->drm,
			"Invalid uuid format for OA config\n");
		err = -EINVAL;
		goto reg_err;
	}

	/* Last character in oa_config->uuid will be 0 because oa_config is
	 * kzalloc.
	 */
	memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid));

	oa_config->mux_regs_len = args->n_mux_regs;
	regs = alloc_oa_regs(perf,
			     perf->ops.is_valid_mux_reg,
			     u64_to_user_ptr(args->mux_regs_ptr),
			     args->n_mux_regs);

	if (IS_ERR(regs)) {
		drm_dbg(&perf->i915->drm,
			"Failed to create OA config for mux_regs\n");
		err = PTR_ERR(regs);
		goto reg_err;
	}
	oa_config->mux_regs = regs;

	oa_config->b_counter_regs_len = args->n_boolean_regs;
	regs = alloc_oa_regs(perf,
			     perf->ops.is_valid_b_counter_reg,
			     u64_to_user_ptr(args->boolean_regs_ptr),
			     args->n_boolean_regs);

	if (IS_ERR(regs)) {
		drm_dbg(&perf->i915->drm,
			"Failed to create OA config for b_counter_regs\n");
		err = PTR_ERR(regs);
		goto reg_err;
	}
	oa_config->b_counter_regs = regs;

	/* Flex EU registers only exist on gen8+. */
	if (GRAPHICS_VER(perf->i915) < 8) {
		if (args->n_flex_regs != 0) {
			err = -EINVAL;
			goto reg_err;
		}
	} else {
		oa_config->flex_regs_len = args->n_flex_regs;
		regs = alloc_oa_regs(perf,
				     perf->ops.is_valid_flex_reg,
				     u64_to_user_ptr(args->flex_regs_ptr),
				     args->n_flex_regs);

		if (IS_ERR(regs)) {
			drm_dbg(&perf->i915->drm,
				"Failed to create OA config for flex_regs\n");
			err = PTR_ERR(regs);
			goto reg_err;
		}
		oa_config->flex_regs = regs;
	}

	err = mutex_lock_interruptible(&perf->metrics_lock);
	if (err)
		goto reg_err;

	/* We shouldn't have too many configs, so this iteration shouldn't be
	 * too costly.
	 */
	idr_for_each_entry(&perf->metrics_idr, tmp, id) {
		if (!strcmp(tmp->uuid, oa_config->uuid)) {
			drm_dbg(&perf->i915->drm,
				"OA config already exists with this uuid\n");
			err = -EADDRINUSE;
			goto sysfs_err;
		}
	}

	err = create_dynamic_oa_sysfs_entry(perf, oa_config);
	if (err) {
		drm_dbg(&perf->i915->drm,
			"Failed to create sysfs entry for OA config\n");
		goto sysfs_err;
	}

	/* Config id 0 is invalid, id 1 for kernel stored test config. */
	oa_config->id = idr_alloc(&perf->metrics_idr,
				  oa_config, 2,
				  0, GFP_KERNEL);
	if (oa_config->id < 0) {
		drm_dbg(&perf->i915->drm,
			"Failed to create sysfs entry for OA config\n");
		err = oa_config->id;
		goto sysfs_err;
	}

	mutex_unlock(&perf->metrics_lock);

	drm_dbg(&perf->i915->drm,
		"Added config %s id=%i\n", oa_config->uuid, oa_config->id);

	return oa_config->id;

sysfs_err:
	mutex_unlock(&perf->metrics_lock);
reg_err:
	/* Dropping the initial kref frees the config and its reg arrays. */
	i915_oa_config_put(oa_config);
	drm_dbg(&perf->i915->drm,
		"Failed to add new OA config\n");
	return err;
}
4751 | ||
/**
 * i915_perf_remove_config_ioctl - DRM ioctl() for userspace to remove an OA config
 * @dev: drm device
 * @data: ioctl data (pointer to u64 integer) copied from userspace
 * @file: drm file
 *
 * Configs can be removed while being used, they will stop appearing in sysfs
 * and their content will be freed when the stream using the config is closed.
 *
 * Returns: 0 on success or a negative error code on failure.
 */
int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
				  struct drm_file *file)
{
	struct i915_perf *perf = &to_i915(dev)->perf;
	u64 *arg = data;
	struct i915_oa_config *oa_config;
	int ret;

	if (!perf->i915) {
		drm_dbg(&perf->i915->drm,
			"i915 perf interface not available for this system\n");
		return -ENOTSUPP;
	}

	if (i915_perf_stream_paranoid && !perfmon_capable()) {
		drm_dbg(&perf->i915->drm,
			"Insufficient privileges to remove i915 OA config\n");
		return -EACCES;
	}

	ret = mutex_lock_interruptible(&perf->metrics_lock);
	if (ret)
		return ret;

	oa_config = idr_find(&perf->metrics_idr, *arg);
	if (!oa_config) {
		drm_dbg(&perf->i915->drm,
			"Failed to remove unknown OA config\n");
		ret = -ENOENT;
		goto err_unlock;
	}

	GEM_BUG_ON(*arg != oa_config->id);

	sysfs_remove_group(perf->metrics_kobj, &oa_config->sysfs_metric);

	idr_remove(&perf->metrics_idr, *arg);

	mutex_unlock(&perf->metrics_lock);

	drm_dbg(&perf->i915->drm,
		"Removed config %s id=%i\n", oa_config->uuid, oa_config->id);

	/* Drop the idr's reference; any stream still using the config keeps
	 * its own reference until it is closed.
	 */
	i915_oa_config_put(oa_config);

	return 0;

err_unlock:
	mutex_unlock(&perf->metrics_lock);
	return ret;
}
4814 | ||
/* sysctl knobs exposed under dev/i915 (see i915_perf_sysctl_register()):
 * perf_stream_paranoid is a 0/1 toggle, oa_max_sample_rate is clamped to
 * oa_sample_rate_hard_limit.
 */
static struct ctl_table oa_table[] = {
	{
	 .procname = "perf_stream_paranoid",
	 .data = &i915_perf_stream_paranoid,
	 .maxlen = sizeof(i915_perf_stream_paranoid),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = SYSCTL_ZERO,
	 .extra2 = SYSCTL_ONE,
	 },
	{
	 .procname = "oa_max_sample_rate",
	 .data = &i915_oa_max_sample_rate,
	 .maxlen = sizeof(i915_oa_max_sample_rate),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = SYSCTL_ZERO,
	 .extra2 = &oa_sample_rate_hard_limit,
	 },
	{}
};
4836 | ||
/*
 * Number of OA perf groups per GT. Currently always one (the OAG group);
 * @gt is unused for now.
 */
static u32 num_perf_groups_per_gt(struct intel_gt *gt)
{
	return 1;
}
4841 | ||
4842 | static u32 __oa_engine_group(struct intel_engine_cs *engine) | |
4843 | { | |
4844 | if (engine->class == RENDER_CLASS) | |
4845 | return PERF_GROUP_OAG; | |
4846 | else | |
4847 | return PERF_GROUP_INVALID; | |
4848 | } | |
4849 | ||
/*
 * Allocate the per-GT array of perf groups and attach each eligible
 * engine to its group. Engines whose group index is out of range (e.g.
 * PERF_GROUP_INVALID) are left with oa_group == NULL.
 *
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int oa_init_gt(struct intel_gt *gt)
{
	u32 num_groups = num_perf_groups_per_gt(gt);
	struct intel_engine_cs *engine;
	struct i915_perf_group *g;
	intel_engine_mask_t tmp;

	g = kcalloc(num_groups, sizeof(*g), GFP_KERNEL);
	if (!g)
		return -ENOMEM;

	for_each_engine_masked(engine, gt, ALL_ENGINES, tmp) {
		u32 index = __oa_engine_group(engine);

		engine->oa_group = NULL;
		/* Only indices below num_groups are attached to a group. */
		if (index < num_groups) {
			g[index].num_engines++;
			engine->oa_group = &g[index];
		}
	}

	/* gt->perf.group is freed in i915_perf_fini(). */
	gt->perf.num_perf_groups = num_groups;
	gt->perf.group = g;

	return 0;
}
4876 | ||
/*
 * Initialize perf groups on every GT of the device; stops and returns
 * the error of the first GT that fails.
 */
static int oa_init_engine_groups(struct i915_perf *perf)
{
	struct intel_gt *gt;
	int i, ret;

	for_each_gt(gt, perf->i915, i) {
		ret = oa_init_gt(gt);
		if (ret)
			return ret;
	}

	return 0;
}
4890 | ||
77892f4f UNR |
4891 | static void oa_init_supported_formats(struct i915_perf *perf) |
4892 | { | |
4893 | struct drm_i915_private *i915 = perf->i915; | |
4894 | enum intel_platform platform = INTEL_INFO(i915)->platform; | |
4895 | ||
4896 | switch (platform) { | |
4897 | case INTEL_HASWELL: | |
4898 | oa_format_add(perf, I915_OA_FORMAT_A13); | |
4899 | oa_format_add(perf, I915_OA_FORMAT_A13); | |
4900 | oa_format_add(perf, I915_OA_FORMAT_A29); | |
4901 | oa_format_add(perf, I915_OA_FORMAT_A13_B8_C8); | |
4902 | oa_format_add(perf, I915_OA_FORMAT_B4_C8); | |
4903 | oa_format_add(perf, I915_OA_FORMAT_A45_B8_C8); | |
4904 | oa_format_add(perf, I915_OA_FORMAT_B4_C8_A16); | |
4905 | oa_format_add(perf, I915_OA_FORMAT_C4_B8); | |
4906 | break; | |
4907 | ||
4908 | case INTEL_BROADWELL: | |
4909 | case INTEL_CHERRYVIEW: | |
4910 | case INTEL_SKYLAKE: | |
4911 | case INTEL_BROXTON: | |
4912 | case INTEL_KABYLAKE: | |
4913 | case INTEL_GEMINILAKE: | |
4914 | case INTEL_COFFEELAKE: | |
4915 | case INTEL_COMETLAKE: | |
77892f4f UNR |
4916 | case INTEL_ICELAKE: |
4917 | case INTEL_ELKHARTLAKE: | |
4918 | case INTEL_JASPERLAKE: | |
77892f4f UNR |
4919 | case INTEL_TIGERLAKE: |
4920 | case INTEL_ROCKETLAKE: | |
4921 | case INTEL_DG1: | |
4922 | case INTEL_ALDERLAKE_S: | |
73c1bf0f | 4923 | case INTEL_ALDERLAKE_P: |
5e4b7385 UNR |
4924 | oa_format_add(perf, I915_OA_FORMAT_A12); |
4925 | oa_format_add(perf, I915_OA_FORMAT_A12_B8_C8); | |
77892f4f | 4926 | oa_format_add(perf, I915_OA_FORMAT_A32u40_A4u32_B8_C8); |
5e4b7385 | 4927 | oa_format_add(perf, I915_OA_FORMAT_C4_B8); |
77892f4f UNR |
4928 | break; |
4929 | ||
81d5f7d9 | 4930 | case INTEL_DG2: |
d0fa30be | 4931 | case INTEL_METEORLAKE: |
81d5f7d9 UNR |
4932 | oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8); |
4933 | oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8); | |
4934 | break; | |
4935 | ||
77892f4f UNR |
4936 | default: |
4937 | MISSING_CASE(platform); | |
4938 | } | |
4939 | } | |
4940 | ||
/*
 * Record per-generation context-image offsets used when rewriting a
 * context's OA control and flex-EU registers.
 */
static void i915_perf_init_info(struct drm_i915_private *i915)
{
	struct i915_perf *perf = &i915->perf;

	switch (GRAPHICS_VER(i915)) {
	case 8:
		perf->ctx_oactxctrl_offset = 0x120;
		perf->ctx_flexeu0_offset = 0x2ce;
		perf->gen8_valid_ctx_bit = BIT(25);
		break;
	case 9:
		perf->ctx_oactxctrl_offset = 0x128;
		perf->ctx_flexeu0_offset = 0x3de;
		perf->gen8_valid_ctx_bit = BIT(16);
		break;
	case 11:
		perf->ctx_oactxctrl_offset = 0x124;
		perf->ctx_flexeu0_offset = 0x78e;
		perf->gen8_valid_ctx_bit = BIT(16);
		break;
	case 12:
		/*
		 * Calculate offset at runtime in oa_pin_context for gen12 and
		 * cache the value in perf->ctx_oactxctrl_offset.
		 */
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(i915));
	}
}
4971 | ||
/**
 * i915_perf_init - initialize i915-perf state on module bind
 * @i915: i915 device instance
 *
 * Initializes i915-perf state without exposing anything to userspace.
 *
 * Note: i915-perf initialization is split into an 'init' and 'register'
 * phase with the i915_perf_register() exposing state to userspace.
 */
int i915_perf_init(struct drm_i915_private *i915)
{
	struct i915_perf *perf = &i915->perf;

	perf->oa_formats = oa_formats;
	if (IS_HASWELL(i915)) {
		perf->ops.is_valid_b_counter_reg = gen7_is_valid_b_counter_addr;
		perf->ops.is_valid_mux_reg = hsw_is_valid_mux_addr;
		perf->ops.is_valid_flex_reg = NULL;
		perf->ops.enable_metric_set = hsw_enable_metric_set;
		perf->ops.disable_metric_set = hsw_disable_metric_set;
		perf->ops.oa_enable = gen7_oa_enable;
		perf->ops.oa_disable = gen7_oa_disable;
		perf->ops.read = gen7_oa_read;
		perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
	} else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
		/* Note: that although we could theoretically also support the
		 * legacy ringbuffer mode on BDW (and earlier iterations of
		 * this driver, before upstreaming did this) it didn't seem
		 * worth the complexity to maintain now that BDW+ enable
		 * execlist mode by default.
		 */
		perf->ops.read = gen8_oa_read;
		i915_perf_init_info(i915);

		if (IS_GRAPHICS_VER(i915, 8, 9)) {
			perf->ops.is_valid_b_counter_reg =
				gen7_is_valid_b_counter_addr;
			perf->ops.is_valid_mux_reg =
				gen8_is_valid_mux_addr;
			perf->ops.is_valid_flex_reg =
				gen8_is_valid_flex_addr;

			/* CHV has its own MUX address ranges. */
			if (IS_CHERRYVIEW(i915)) {
				perf->ops.is_valid_mux_reg =
					chv_is_valid_mux_addr;
			}

			perf->ops.oa_enable = gen8_oa_enable;
			perf->ops.oa_disable = gen8_oa_disable;
			perf->ops.enable_metric_set = gen8_enable_metric_set;
			perf->ops.disable_metric_set = gen8_disable_metric_set;
			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
		} else if (GRAPHICS_VER(i915) == 11) {
			perf->ops.is_valid_b_counter_reg =
				gen7_is_valid_b_counter_addr;
			perf->ops.is_valid_mux_reg =
				gen11_is_valid_mux_addr;
			perf->ops.is_valid_flex_reg =
				gen8_is_valid_flex_addr;

			perf->ops.oa_enable = gen8_oa_enable;
			perf->ops.oa_disable = gen8_oa_disable;
			perf->ops.enable_metric_set = gen8_enable_metric_set;
			perf->ops.disable_metric_set = gen11_disable_metric_set;
			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
		} else if (GRAPHICS_VER(i915) == 12) {
			perf->ops.is_valid_b_counter_reg =
				HAS_OA_SLICE_CONTRIB_LIMITS(i915) ?
				xehp_is_valid_b_counter_addr :
				gen12_is_valid_b_counter_addr;
			perf->ops.is_valid_mux_reg =
				gen12_is_valid_mux_addr;
			perf->ops.is_valid_flex_reg =
				gen8_is_valid_flex_addr;

			perf->ops.oa_enable = gen12_oa_enable;
			perf->ops.oa_disable = gen12_oa_disable;
			perf->ops.enable_metric_set = gen12_enable_metric_set;
			perf->ops.disable_metric_set = gen12_disable_metric_set;
			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
		}
	}

	/* enable_metric_set unset means the platform is unsupported; skip
	 * all further setup and leave perf->i915 NULL so the ioctls bail.
	 */
	if (perf->ops.enable_metric_set) {
		struct intel_gt *gt;
		int i, ret;

		for_each_gt(gt, i915, i)
			mutex_init(&gt->perf.lock);

		/* Choose a representative limit */
		oa_sample_rate_hard_limit = to_gt(i915)->clock_frequency / 2;

		mutex_init(&perf->metrics_lock);
		idr_init_base(&perf->metrics_idr, 1);

		/* We set up some ratelimit state to potentially throttle any
		 * _NOTES about spurious, invalid OA reports which we don't
		 * forward to userspace.
		 *
		 * We print a _NOTE about any throttling when closing the
		 * stream instead of waiting until driver _fini which no one
		 * would ever see.
		 *
		 * Using the same limiting factors as printk_ratelimit()
		 */
		ratelimit_state_init(&perf->spurious_report_rs, 5 * HZ, 10);
		/* Since we use a DRM_NOTE for spurious reports it would be
		 * inconsistent to let __ratelimit() automatically print a
		 * warning for throttling.
		 */
		ratelimit_set_flags(&perf->spurious_report_rs,
				    RATELIMIT_MSG_ON_RELEASE);

		ratelimit_state_init(&perf->tail_pointer_race,
				     5 * HZ, 10);
		ratelimit_set_flags(&perf->tail_pointer_race,
				    RATELIMIT_MSG_ON_RELEASE);

		atomic64_set(&perf->noa_programming_delay,
			     500 * 1000 /* 500us */);

		perf->i915 = i915;

		ret = oa_init_engine_groups(perf);
		if (ret) {
			drm_err(&i915->drm,
				"OA initialization failed %d\n", ret);
			return ret;
		}

		oa_init_supported_formats(perf);
	}

	return 0;
}
5108 | ||
/*
 * idr_for_each() callback used at fini time: release one OA config
 * reference per idr entry.
 */
static int destroy_config(int id, void *p, void *data)
{
	struct i915_oa_config *oa_config = p;

	i915_oa_config_put(oa_config);

	return 0;
}
5114 | ||
/*
 * Register the "dev/i915" sysctl directory (oa_table entries); the
 * handle is saved in sysctl_header for i915_perf_sysctl_unregister().
 * Always returns 0.
 */
int i915_perf_sysctl_register(void)
{
	sysctl_header = register_sysctl("dev/i915", oa_table);
	return 0;
}
5120 | ||
/* Remove the "dev/i915" sysctl directory registered at module load. */
void i915_perf_sysctl_unregister(void)
{
	unregister_sysctl_table(sysctl_header);
}
5125 | ||
/**
 * i915_perf_fini - Counter part to i915_perf_init()
 * @i915: i915 device instance
 */
void i915_perf_fini(struct drm_i915_private *i915)
{
	struct i915_perf *perf = &i915->perf;
	struct intel_gt *gt;
	int i;

	/* perf->i915 is only set when init succeeded; nothing to undo. */
	if (!perf->i915)
		return;

	for_each_gt(gt, perf->i915, i)
		kfree(gt->perf.group);

	/* Drop the reference held by the idr on each remaining config. */
	idr_for_each(&perf->metrics_idr, destroy_config, perf);
	idr_destroy(&perf->metrics_idr);

	memset(&perf->ops, 0, sizeof(perf->ops));
	perf->i915 = NULL;
}
daed3e44 | 5148 | |
/**
 * i915_perf_ioctl_version - Version of the i915-perf subsystem
 *
 * This version number is used by userspace to detect available features.
 */
int i915_perf_ioctl_version(void)
{
	/*
	 * 1: Initial version
	 *   I915_PERF_IOCTL_ENABLE
	 *   I915_PERF_IOCTL_DISABLE
	 *
	 * 2: Added runtime modification of OA config.
	 *   I915_PERF_IOCTL_CONFIG
	 *
	 * 3: Add DRM_I915_PERF_PROP_HOLD_PREEMPTION parameter to hold
	 *    preemption on a particular context so that performance data is
	 *    accessible from a delta of MI_RPC reports without looking at the
	 *    OA buffer.
	 *
	 * 4: Add DRM_I915_PERF_PROP_ALLOWED_SSEU to limit what contexts can
	 *    be run for the duration of the performance recording based on
	 *    their SSEU configuration.
	 *
	 * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the
	 *    interval for the hrtimer used to check for OA data.
	 *
	 * 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and
	 *    DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE
	 */
	return 6;
}
5181 | ||
daed3e44 LL |
5182 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) |
5183 | #include "selftests/i915_perf.c" | |
5184 | #endif |