Commit | Line | Data |
---|---|---|
6a445edc VS |
1 | /* |
2 | * Resource Director Technology(RDT) | |
3 | * - Monitoring code | |
4 | * | |
5 | * Copyright (C) 2017 Intel Corporation | |
6 | * | |
7 | * Author: | |
8 | * Vikas Shivappa <vikas.shivappa@intel.com> | |
9 | * | |
10 | * This replaces the cqm.c based on perf but we reuse a lot of | |
11 | * code and datastructures originally from Peter Zijlstra and Matt Fleming. | |
12 | * | |
13 | * This program is free software; you can redistribute it and/or modify it | |
14 | * under the terms and conditions of the GNU General Public License, | |
15 | * version 2, as published by the Free Software Foundation. | |
16 | * | |
17 | * This program is distributed in the hope it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
20 | * more details. | |
21 | * | |
22 | * More information about RDT can be found in the Intel (R) x86 Architecture | |
23 | * Software Developer Manual June 2016, volume 3, section 17.17. | |
24 | */ | |
25 | ||
26 | #include <linux/module.h> | |
27 | #include <linux/slab.h> | |
28 | #include <asm/cpu_device_id.h> | |
29 | #include "intel_rdt.h" | |
30 | ||
edf6fa1c VS |
/*
 * MSRs used by __rmid_read(): the event/RMID selector is written first,
 * then the counter value is read back.
 */
#define MSR_IA32_QM_CTR		0x0c8e
#define MSR_IA32_QM_EVTSEL	0x0c8d
33 | ||
6a445edc VS |
34 | struct rmid_entry { |
35 | u32 rmid; | |
24247aee | 36 | int busy; |
6a445edc VS |
37 | struct list_head list; |
38 | }; | |
39 | ||
/*
 * rmid_free_lru - list of free RMIDs.
 *
 * alloc_rmid() takes entries from the head and released RMIDs are appended
 * to the tail, so the RMID that has been free the longest is reused first.
 * When LLC occupancy monitoring is enabled, a released RMID is only moved
 * here after its occupancy has been compared against intel_cqm_threshold
 * (or the limbo check was forced).
 */
static LIST_HEAD(rmid_free_lru);

/*
 * rmid_limbo_count - number of released but (potentially) dirty RMIDs.
 *
 * Counts RMIDs that no one is currently using but that may still have an
 * occupancy value above intel_cqm_threshold in some domain. The user can
 * change the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/*
 * rmid_ptrs - array of rmid_entry structures, indexed by RMID.
 *
 * Allocated by dom_data_init(); the entries are what gets linked onto
 * rmid_free_lru.
 */
static struct rmid_entry *rmid_ptrs;
60 | ||
61 | /* | |
62 | * Global boolean for rdt_monitor which is true if any | |
63 | * resource monitoring is enabled. | |
64 | */ | |
65 | bool rdt_mon_capable; | |
66 | ||
67 | /* | |
68 | * Global to indicate which monitoring events are enabled. | |
69 | */ | |
70 | unsigned int rdt_mon_features; | |
71 | ||
72 | /* | |
73 | * This is the threshold cache occupancy at which we will consider an | |
74 | * RMID available for re-allocation. | |
75 | */ | |
76 | unsigned int intel_cqm_threshold; | |
77 | ||
78 | static inline struct rmid_entry *__rmid_entry(u32 rmid) | |
79 | { | |
80 | struct rmid_entry *entry; | |
81 | ||
82 | entry = &rmid_ptrs[rmid]; | |
83 | WARN_ON(entry->rmid != rmid); | |
84 | ||
85 | return entry; | |
86 | } | |
87 | ||
edf6fa1c VS |
88 | static u64 __rmid_read(u32 rmid, u32 eventid) |
89 | { | |
90 | u64 val; | |
91 | ||
92 | /* | |
93 | * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured | |
94 | * with a valid event code for supported resource type and the bits | |
95 | * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, | |
96 | * IA32_QM_CTR.data (bits 61:0) reports the monitored data. | |
97 | * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) | |
98 | * are error bits. | |
99 | */ | |
100 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | |
101 | rdmsrl(MSR_IA32_QM_CTR, val); | |
102 | ||
103 | return val; | |
104 | } | |
105 | ||
24247aee | 106 | static bool rmid_dirty(struct rmid_entry *entry) |
edf6fa1c | 107 | { |
24247aee | 108 | u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); |
edf6fa1c | 109 | |
24247aee | 110 | return val >= intel_cqm_threshold; |
edf6fa1c VS |
111 | } |
112 | ||
113 | /* | |
24247aee VS |
114 | * Check the RMIDs that are marked as busy for this domain. If the |
115 | * reported LLC occupancy is below the threshold clear the busy bit and | |
116 | * decrement the count. If the busy count gets to zero on an RMID, we | |
117 | * free the RMID | |
edf6fa1c | 118 | */ |
24247aee | 119 | void __check_limbo(struct rdt_domain *d, bool force_free) |
edf6fa1c | 120 | { |
24247aee | 121 | struct rmid_entry *entry; |
edf6fa1c | 122 | struct rdt_resource *r; |
24247aee | 123 | u32 crmid = 1, nrmid; |
edf6fa1c VS |
124 | |
125 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
126 | ||
edf6fa1c | 127 | /* |
24247aee VS |
128 | * Skip RMID 0 and start from RMID 1 and check all the RMIDs that |
129 | * are marked as busy for occupancy < threshold. If the occupancy | |
130 | * is less than the threshold decrement the busy counter of the | |
131 | * RMID and move it to the free list when the counter reaches 0. | |
edf6fa1c | 132 | */ |
24247aee VS |
133 | for (;;) { |
134 | nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); | |
135 | if (nrmid >= r->num_rmid) | |
136 | break; | |
137 | ||
138 | entry = __rmid_entry(nrmid); | |
139 | if (force_free || !rmid_dirty(entry)) { | |
140 | clear_bit(entry->rmid, d->rmid_busy_llc); | |
141 | if (!--entry->busy) { | |
142 | rmid_limbo_count--; | |
edf6fa1c | 143 | list_add_tail(&entry->list, &rmid_free_lru); |
edf6fa1c VS |
144 | } |
145 | } | |
24247aee | 146 | crmid = nrmid + 1; |
edf6fa1c | 147 | } |
24247aee | 148 | } |
edf6fa1c | 149 | |
24247aee VS |
150 | bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) |
151 | { | |
152 | return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; | |
edf6fa1c VS |
153 | } |
154 | ||
155 | /* | |
156 | * As of now the RMIDs allocation is global. | |
157 | * However we keep track of which packages the RMIDs | |
158 | * are used to optimize the limbo list management. | |
159 | */ | |
160 | int alloc_rmid(void) | |
161 | { | |
162 | struct rmid_entry *entry; | |
edf6fa1c VS |
163 | |
164 | lockdep_assert_held(&rdtgroup_mutex); | |
165 | ||
24247aee VS |
166 | if (list_empty(&rmid_free_lru)) |
167 | return rmid_limbo_count ? -EBUSY : -ENOSPC; | |
edf6fa1c VS |
168 | |
169 | entry = list_first_entry(&rmid_free_lru, | |
170 | struct rmid_entry, list); | |
171 | list_del(&entry->list); | |
172 | ||
173 | return entry->rmid; | |
174 | } | |
175 | ||
176 | static void add_rmid_to_limbo(struct rmid_entry *entry) | |
177 | { | |
178 | struct rdt_resource *r; | |
179 | struct rdt_domain *d; | |
24247aee | 180 | int cpu; |
edf6fa1c VS |
181 | u64 val; |
182 | ||
183 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
184 | ||
24247aee | 185 | entry->busy = 0; |
edf6fa1c VS |
186 | cpu = get_cpu(); |
187 | list_for_each_entry(d, &r->domains, list) { | |
188 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) { | |
189 | val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); | |
190 | if (val <= intel_cqm_threshold) | |
191 | continue; | |
192 | } | |
24247aee VS |
193 | |
194 | /* | |
195 | * For the first limbo RMID in the domain, | |
196 | * setup up the limbo worker. | |
197 | */ | |
198 | if (!has_busy_rmid(r, d)) | |
199 | cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); | |
edf6fa1c | 200 | set_bit(entry->rmid, d->rmid_busy_llc); |
24247aee | 201 | entry->busy++; |
edf6fa1c VS |
202 | } |
203 | put_cpu(); | |
204 | ||
24247aee VS |
205 | if (entry->busy) |
206 | rmid_limbo_count++; | |
207 | else | |
edf6fa1c | 208 | list_add_tail(&entry->list, &rmid_free_lru); |
edf6fa1c VS |
209 | } |
210 | ||
211 | void free_rmid(u32 rmid) | |
212 | { | |
213 | struct rmid_entry *entry; | |
214 | ||
215 | if (!rmid) | |
216 | return; | |
217 | ||
218 | lockdep_assert_held(&rdtgroup_mutex); | |
219 | ||
220 | entry = __rmid_entry(rmid); | |
221 | ||
222 | if (is_llc_occupancy_enabled()) | |
223 | add_rmid_to_limbo(entry); | |
224 | else | |
225 | list_add_tail(&entry->list, &rmid_free_lru); | |
226 | } | |
227 | ||
ba0f26d8 VS |
228 | static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) |
229 | { | |
230 | u64 shift = 64 - MBM_CNTR_WIDTH, chunks; | |
231 | ||
232 | chunks = (cur_msr << shift) - (prev_msr << shift); | |
233 | return chunks >>= shift; | |
234 | } | |
235 | ||
d89b7379 VS |
236 | static int __mon_event_count(u32 rmid, struct rmid_read *rr) |
237 | { | |
9f52425b | 238 | struct mbm_state *m; |
ba0f26d8 | 239 | u64 chunks, tval; |
d89b7379 VS |
240 | |
241 | tval = __rmid_read(rmid, rr->evtid); | |
242 | if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { | |
243 | rr->val = tval; | |
244 | return -EINVAL; | |
245 | } | |
246 | switch (rr->evtid) { | |
247 | case QOS_L3_OCCUP_EVENT_ID: | |
248 | rr->val += tval; | |
249 | return 0; | |
9f52425b TL |
250 | case QOS_L3_MBM_TOTAL_EVENT_ID: |
251 | m = &rr->d->mbm_total[rmid]; | |
252 | break; | |
253 | case QOS_L3_MBM_LOCAL_EVENT_ID: | |
254 | m = &rr->d->mbm_local[rmid]; | |
255 | break; | |
d89b7379 VS |
256 | default: |
257 | /* | |
258 | * Code would never reach here because | |
259 | * an invalid event id would fail the __rmid_read. | |
260 | */ | |
261 | return -EINVAL; | |
262 | } | |
a4de1dfd VS |
263 | |
264 | if (rr->first) { | |
ba0f26d8 VS |
265 | memset(m, 0, sizeof(struct mbm_state)); |
266 | m->prev_bw_msr = m->prev_msr = tval; | |
a4de1dfd VS |
267 | return 0; |
268 | } | |
269 | ||
ba0f26d8 | 270 | chunks = mbm_overflow_count(m->prev_msr, tval); |
9f52425b TL |
271 | m->chunks += chunks; |
272 | m->prev_msr = tval; | |
273 | ||
274 | rr->val += m->chunks; | |
275 | return 0; | |
d89b7379 VS |
276 | } |
277 | ||
ba0f26d8 VS |
278 | /* |
279 | * Supporting function to calculate the memory bandwidth | |
280 | * and delta bandwidth in MBps. | |
281 | */ | |
282 | static void mbm_bw_count(u32 rmid, struct rmid_read *rr) | |
283 | { | |
284 | struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
285 | struct mbm_state *m = &rr->d->mbm_local[rmid]; | |
286 | u64 tval, cur_bw, chunks; | |
287 | ||
288 | tval = __rmid_read(rmid, rr->evtid); | |
289 | if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | |
290 | return; | |
291 | ||
292 | chunks = mbm_overflow_count(m->prev_bw_msr, tval); | |
293 | m->chunks_bw += chunks; | |
294 | m->chunks = m->chunks_bw; | |
295 | cur_bw = (chunks * r->mon_scale) >> 20; | |
296 | ||
297 | if (m->delta_comp) | |
298 | m->delta_bw = abs(cur_bw - m->prev_bw); | |
299 | m->delta_comp = false; | |
300 | m->prev_bw = cur_bw; | |
301 | m->prev_bw_msr = tval; | |
302 | } | |
303 | ||
d89b7379 VS |
304 | /* |
305 | * This is called via IPI to read the CQM/MBM counters | |
306 | * on a domain. | |
307 | */ | |
308 | void mon_event_count(void *info) | |
309 | { | |
310 | struct rdtgroup *rdtgrp, *entry; | |
311 | struct rmid_read *rr = info; | |
312 | struct list_head *head; | |
313 | ||
314 | rdtgrp = rr->rgrp; | |
315 | ||
316 | if (__mon_event_count(rdtgrp->mon.rmid, rr)) | |
317 | return; | |
318 | ||
319 | /* | |
320 | * For Ctrl groups read data from child monitor groups. | |
321 | */ | |
322 | head = &rdtgrp->mon.crdtgrp_list; | |
323 | ||
324 | if (rdtgrp->type == RDTCTRL_GROUP) { | |
325 | list_for_each_entry(entry, head, mon.crdtgrp_list) { | |
326 | if (__mon_event_count(entry->mon.rmid, rr)) | |
327 | return; | |
328 | } | |
329 | } | |
330 | } | |
4af4a88e | 331 | |
e3302683 VS |
332 | static void mbm_update(struct rdt_domain *d, int rmid) |
333 | { | |
334 | struct rmid_read rr; | |
335 | ||
336 | rr.first = false; | |
337 | rr.d = d; | |
338 | ||
339 | /* | |
340 | * This is protected from concurrent reads from user | |
341 | * as both the user and we hold the global mutex. | |
342 | */ | |
343 | if (is_mbm_total_enabled()) { | |
344 | rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; | |
345 | __mon_event_count(rmid, &rr); | |
346 | } | |
347 | if (is_mbm_local_enabled()) { | |
348 | rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; | |
349 | __mon_event_count(rmid, &rr); | |
350 | } | |
351 | } | |
352 | ||
24247aee VS |
353 | /* |
354 | * Handler to scan the limbo list and move the RMIDs | |
355 | * to free list whose occupancy < threshold_occupancy. | |
356 | */ | |
357 | void cqm_handle_limbo(struct work_struct *work) | |
358 | { | |
359 | unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); | |
360 | int cpu = smp_processor_id(); | |
361 | struct rdt_resource *r; | |
362 | struct rdt_domain *d; | |
363 | ||
364 | mutex_lock(&rdtgroup_mutex); | |
365 | ||
366 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
367 | d = get_domain_from_cpu(cpu, r); | |
368 | ||
369 | if (!d) { | |
370 | pr_warn_once("Failure to get domain for limbo worker\n"); | |
371 | goto out_unlock; | |
372 | } | |
373 | ||
374 | __check_limbo(d, false); | |
375 | ||
376 | if (has_busy_rmid(r, d)) | |
377 | schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); | |
378 | ||
379 | out_unlock: | |
380 | mutex_unlock(&rdtgroup_mutex); | |
381 | } | |
382 | ||
383 | void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) | |
384 | { | |
385 | unsigned long delay = msecs_to_jiffies(delay_ms); | |
386 | struct rdt_resource *r; | |
387 | int cpu; | |
388 | ||
389 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
390 | ||
391 | cpu = cpumask_any(&dom->cpu_mask); | |
392 | dom->cqm_work_cpu = cpu; | |
393 | ||
394 | schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); | |
395 | } | |
396 | ||
e3302683 VS |
397 | void mbm_handle_overflow(struct work_struct *work) |
398 | { | |
399 | unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); | |
400 | struct rdtgroup *prgrp, *crgrp; | |
401 | int cpu = smp_processor_id(); | |
402 | struct list_head *head; | |
403 | struct rdt_domain *d; | |
404 | ||
405 | mutex_lock(&rdtgroup_mutex); | |
406 | ||
407 | if (!static_branch_likely(&rdt_enable_key)) | |
408 | goto out_unlock; | |
409 | ||
410 | d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); | |
411 | if (!d) | |
412 | goto out_unlock; | |
413 | ||
414 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | |
415 | mbm_update(d, prgrp->mon.rmid); | |
416 | ||
417 | head = &prgrp->mon.crdtgrp_list; | |
418 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) | |
419 | mbm_update(d, crgrp->mon.rmid); | |
420 | } | |
421 | ||
422 | schedule_delayed_work_on(cpu, &d->mbm_over, delay); | |
24247aee | 423 | |
e3302683 VS |
424 | out_unlock: |
425 | mutex_unlock(&rdtgroup_mutex); | |
426 | } | |
427 | ||
bbc4615e | 428 | void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) |
e3302683 | 429 | { |
bbc4615e | 430 | unsigned long delay = msecs_to_jiffies(delay_ms); |
e3302683 VS |
431 | int cpu; |
432 | ||
433 | if (!static_branch_likely(&rdt_enable_key)) | |
434 | return; | |
435 | cpu = cpumask_any(&dom->cpu_mask); | |
436 | dom->mbm_work_cpu = cpu; | |
437 | schedule_delayed_work_on(cpu, &dom->mbm_over, delay); | |
438 | } | |
439 | ||
6a445edc VS |
440 | static int dom_data_init(struct rdt_resource *r) |
441 | { | |
442 | struct rmid_entry *entry = NULL; | |
443 | int i, nr_rmids; | |
444 | ||
445 | nr_rmids = r->num_rmid; | |
446 | rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); | |
447 | if (!rmid_ptrs) | |
448 | return -ENOMEM; | |
449 | ||
450 | for (i = 0; i < nr_rmids; i++) { | |
451 | entry = &rmid_ptrs[i]; | |
452 | INIT_LIST_HEAD(&entry->list); | |
453 | ||
454 | entry->rmid = i; | |
455 | list_add_tail(&entry->list, &rmid_free_lru); | |
456 | } | |
457 | ||
458 | /* | |
459 | * RMID 0 is special and is always allocated. It's used for all | |
460 | * tasks that are not monitored. | |
461 | */ | |
462 | entry = __rmid_entry(0); | |
463 | list_del(&entry->list); | |
464 | ||
465 | return 0; | |
466 | } | |
467 | ||
468 | static struct mon_evt llc_occupancy_event = { | |
469 | .name = "llc_occupancy", | |
470 | .evtid = QOS_L3_OCCUP_EVENT_ID, | |
471 | }; | |
472 | ||
9f52425b TL |
473 | static struct mon_evt mbm_total_event = { |
474 | .name = "mbm_total_bytes", | |
475 | .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, | |
476 | }; | |
477 | ||
478 | static struct mon_evt mbm_local_event = { | |
479 | .name = "mbm_local_bytes", | |
480 | .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, | |
481 | }; | |
482 | ||
6a445edc VS |
483 | /* |
484 | * Initialize the event list for the resource. | |
485 | * | |
486 | * Note that MBM events are also part of RDT_RESOURCE_L3 resource | |
487 | * because as per the SDM the total and local memory bandwidth | |
488 | * are enumerated as part of L3 monitoring. | |
489 | */ | |
490 | static void l3_mon_evt_init(struct rdt_resource *r) | |
491 | { | |
492 | INIT_LIST_HEAD(&r->evt_list); | |
493 | ||
494 | if (is_llc_occupancy_enabled()) | |
495 | list_add_tail(&llc_occupancy_event.list, &r->evt_list); | |
9f52425b TL |
496 | if (is_mbm_total_enabled()) |
497 | list_add_tail(&mbm_total_event.list, &r->evt_list); | |
498 | if (is_mbm_local_enabled()) | |
499 | list_add_tail(&mbm_local_event.list, &r->evt_list); | |
6a445edc VS |
500 | } |
501 | ||
502 | int rdt_get_mon_l3_config(struct rdt_resource *r) | |
503 | { | |
504 | int ret; | |
505 | ||
506 | r->mon_scale = boot_cpu_data.x86_cache_occ_scale; | |
507 | r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; | |
508 | ||
509 | /* | |
510 | * A reasonable upper limit on the max threshold is the number | |
511 | * of lines tagged per RMID if all RMIDs have the same number of | |
512 | * lines tagged in the LLC. | |
513 | * | |
514 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | |
515 | */ | |
516 | intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; | |
517 | ||
518 | /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ | |
519 | intel_cqm_threshold /= r->mon_scale; | |
520 | ||
521 | ret = dom_data_init(r); | |
522 | if (ret) | |
523 | return ret; | |
524 | ||
525 | l3_mon_evt_init(r); | |
526 | ||
527 | r->mon_capable = true; | |
528 | r->mon_enabled = true; | |
529 | ||
530 | return 0; | |
531 | } |