Merge tag 'edac_updates_for_v6.9' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6-block.git] / drivers / ras / amd / fmpm.c
CommitLineData
6f15e617
YG
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * FRU (Field-Replaceable Unit) Memory Poison Manager
4 *
5 * Copyright (c) 2024, Advanced Micro Devices, Inc.
6 * All Rights Reserved.
7 *
8 * Authors:
9 * Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com>
10 * Muralidhara M K <muralidhara.mk@amd.com>
11 * Yazen Ghannam <Yazen.Ghannam@amd.com>
12 *
13 * Implementation notes, assumptions, and limitations:
14 *
15 * - FRU memory poison section and memory poison descriptor definitions are not yet
16 * included in the UEFI specification. So they are defined here. Afterwards, they
17 * may be moved to linux/cper.h, if appropriate.
18 *
19 * - Platforms based on AMD MI300 systems will be the first to use these structures.
20 * There are a number of assumptions made here that will need to be generalized
21 * to support other platforms.
22 *
23 * AMD MI300-based platform(s) assumptions:
24 * - Memory errors are reported through x86 MCA.
25 * - The entire DRAM row containing a memory error should be retired.
26 * - There will be (1) FRU memory poison section per CPER.
27 * - The FRU will be the CPU package (processor socket).
28 * - The default number of memory poison descriptor entries should be (8).
29 * - The platform will use ACPI ERST for persistent storage.
30 * - All FRU records should be saved to persistent storage. Module init will
31 * fail if any FRU record is not successfully written.
32 *
33 * - Boot time memory retirement may occur later than ideal due to dependencies
34 * on other libraries and drivers. This leaves a gap where bad memory may be
35 * accessed during early boot stages.
36 *
37 * - Enough memory should be pre-allocated for each FRU record to be able to hold
38 * the expected number of descriptor entries. This, mostly empty, record is
39 * written to storage during init time. Subsequent writes to the same record
40 * should allow the Platform to update the stored record in-place. Otherwise,
41 * if the record is extended, then the Platform may need to perform costly memory
42 * management operations on the storage. For example, the Platform may spend time
43 * in Firmware copying and invalidating memory on a relatively slow SPI ROM.
44 */
45
/* Prefix every message printed through the pr_*() helpers with the module name. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
47
48#include <linux/cper.h>
49#include <linux/ras.h>
50#include <linux/cpu.h>
51
52#include <acpi/apei.h>
53
54#include <asm/cpu_device_id.h>
55#include <asm/mce.h>
56
7d19eea5
YG
57#include "../debugfs.h"
58
6f15e617
YG
/* Placeholder CPU number used while searching for a CPU that belongs to a FRU. */
#define INVALID_CPU				UINT_MAX

/* Validation Bits (cper_sec_fru_mem_poison.validation_bits) */
#define FMP_VALID_ARCH_TYPE			BIT_ULL(0)
#define FMP_VALID_ARCH				BIT_ULL(1)
#define FMP_VALID_ID_TYPE			BIT_ULL(2)
#define FMP_VALID_ID				BIT_ULL(3)
#define FMP_VALID_LIST_ENTRIES			BIT_ULL(4)
#define FMP_VALID_LIST				BIT_ULL(5)

/* FRU Architecture Types (cper_sec_fru_mem_poison.fru_arch_type) */
#define FMP_ARCH_TYPE_X86_CPUID_1_EAX		0

/* FRU ID Types (cper_sec_fru_mem_poison.fru_id_type) */
#define FMP_ID_TYPE_X86_PPIN			0
74
75/* FRU Memory Poison Section */
76struct cper_sec_fru_mem_poison {
77 u32 checksum;
78 u64 validation_bits;
79 u32 fru_arch_type;
80 u64 fru_arch;
81 u32 fru_id_type;
82 u64 fru_id;
83 u32 nr_entries;
84} __packed;
85
86/* FRU Descriptor ID Types */
/* FRU Descriptor ID Types (cper_fru_poison_desc.hw_id_type) */
#define FPD_HW_ID_TYPE_MCA_IPID			0

/* FRU Descriptor Address Types (cper_fru_poison_desc.addr_type) */
#define FPD_ADDR_TYPE_MCA_ADDR			0
91
92/* Memory Poison Descriptor */
93struct cper_fru_poison_desc {
94 u64 timestamp;
95 u32 hw_id_type;
96 u64 hw_id;
97 u32 addr_type;
98 u64 addr;
99} __packed;
100
101/* Collection of headers and sections for easy pointer use. */
102struct fru_rec {
103 struct cper_record_header hdr;
104 struct cper_section_descriptor sec_desc;
105 struct cper_sec_fru_mem_poison fmp;
106 struct cper_fru_poison_desc entries[];
107} __packed;
108
109/*
110 * Pointers to the complete CPER record of each FRU.
111 *
112 * Memory allocation will include padded space for descriptor entries.
113 */
114static struct fru_rec **fru_records;
115
838850c5
YG
116/* system physical addresses array */
117static u64 *spa_entries;
118
119#define INVALID_SPA ~0ULL
120
7d19eea5
YG
121static struct dentry *fmpm_dfs_dir;
122static struct dentry *fmpm_dfs_entries;
123
6f15e617
YG
/* Creator ID stamped into the header of every record written by this module. */
#define CPER_CREATOR_FMP						\
	GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2,				\
		  0xb5, 0xd4, 0xf9, 0xc3, 0xa0, 0x33, 0x08, 0x75)

/* Section type GUID of the FRU Memory Poison section. */
#define CPER_SECTION_TYPE_FMP						\
	GUID_INIT(0x5e4706c1, 0x5356, 0x48c6,				\
		  0x93, 0x0b, 0x52, 0xf2, 0x12, 0x0a, 0x44, 0x58)
131
132/**
838850c5 133 * DOC: max_nr_entries (byte)
6f15e617
YG
134 * Maximum number of descriptor entries possible for each FRU.
135 *
136 * Values between '1' and '255' are valid.
137 * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES.
138 */
139static u8 max_nr_entries;
140module_param(max_nr_entries, byte, 0644);
141MODULE_PARM_DESC(max_nr_entries,
142 "Maximum number of memory poison descriptor entries per FRU");
143
144#define FMPM_DEFAULT_MAX_NR_ENTRIES 8
145
146/* Maximum number of FRUs in the system. */
147#define FMPM_MAX_NR_FRU 256
148static unsigned int max_nr_fru;
149
150/* Total length of record including headers and list of descriptor entries. */
151static size_t max_rec_len;
152
838850c5
YG
153/* Total number of SPA entries across all FRUs. */
154static unsigned int spa_nr_entries;
155
6f15e617
YG
/*
 * Protect the local records cache in fru_records and prevent concurrent
 * writes to storage. This is only needed after init once notifier block
 * registration is done.
 *
 * The majority of a record is fixed at module init and will not change
 * during run time. The entries within a record will be updated as new
 * errors are reported. The mutex should be held whenever the entries are
 * accessed during run time.
 */
static DEFINE_MUTEX(fmpm_update_mutex);
167
/*
 * Iterate over all FRU records: @i is the running index, @rec the record.
 *
 * The bound is checked before fru_records[] is read, so the slot one past
 * the end of the array is never touched when the loop terminates.
 */
#define for_each_fru(i, rec) \
	for ((i) = 0; (i) < max_nr_fru && ((rec) = fru_records[(i)], 1); (i)++)
170
171static inline u32 get_fmp_len(struct fru_rec *rec)
172{
173 return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor);
174}
175
176static struct fru_rec *get_fru_record(u64 fru_id)
177{
178 struct fru_rec *rec;
179 unsigned int i;
180
181 for_each_fru(i, rec) {
182 if (rec->fmp.fru_id == fru_id)
183 return rec;
184 }
185
186 pr_debug("Record not found for FRU 0x%016llx\n", fru_id);
187
188 return NULL;
189}
190
191/*
192 * Sum up all bytes within the FRU Memory Poison Section including the Memory
193 * Poison Descriptor entries.
194 *
195 * Don't include the old checksum here. It's a u32 value, so summing each of its
196 * bytes will give the wrong total.
197 */
198static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len)
199{
200 u32 checksum = 0;
201 u8 *buf, *end;
202
203 /* Skip old checksum. */
204 buf = (u8 *)fmp + sizeof(u32);
205 end = buf + len;
206
207 while (buf < end)
208 checksum += (u8)(*(buf++));
209
210 return checksum;
211}
212
213static int update_record_on_storage(struct fru_rec *rec)
214{
215 u32 len, checksum;
216 int ret;
217
218 /* Calculate a new checksum. */
219 len = get_fmp_len(rec);
220
221 /* Get the current total. */
222 checksum = do_fmp_checksum(&rec->fmp, len);
223
224 /* Use the complement value. */
225 rec->fmp.checksum = -checksum;
226
227 pr_debug("Writing to storage\n");
228
229 ret = erst_write(&rec->hdr);
230 if (ret) {
231 pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id);
232
233 if (ret == -ENOSPC)
234 pr_warn("Not enough space on storage\n");
235 }
236
237 return ret;
238}
239
240static bool rec_has_valid_entries(struct fru_rec *rec)
241{
242 if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES))
243 return false;
244
245 if (!(rec->fmp.validation_bits & FMP_VALID_LIST))
246 return false;
247
248 return true;
249}
250
251static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
252{
253 /*
254 * Ignore timestamp field.
255 * The same physical error may be reported multiple times due to stuck bits, etc.
256 *
257 * Also, order the checks from most->least likely to fail to shortcut the code.
258 */
259 if (old->addr != new->addr)
260 return false;
261
262 if (old->hw_id != new->hw_id)
263 return false;
264
265 if (old->addr_type != new->addr_type)
266 return false;
267
268 if (old->hw_id_type != new->hw_id_type)
269 return false;
270
271 return true;
272}
273
274static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
275{
276 unsigned int i;
277
278 for (i = 0; i < rec->fmp.nr_entries; i++) {
279 struct cper_fru_poison_desc *fpd_i = &rec->entries[i];
280
281 if (fpds_equal(fpd_i, fpd)) {
282 pr_debug("Found duplicate record\n");
283 return true;
284 }
285 }
286
287 return false;
288}
289
838850c5
YG
290static void save_spa(struct fru_rec *rec, unsigned int entry,
291 u64 addr, u64 id, unsigned int cpu)
292{
293 unsigned int i, fru_idx, spa_entry;
294 struct atl_err a_err;
295 unsigned long spa;
296
297 if (entry >= max_nr_entries) {
298 pr_warn_once("FRU descriptor entry %d out-of-bounds (max: %d)\n",
299 entry, max_nr_entries);
300 return;
301 }
302
303 /* spa_nr_entries is always multiple of max_nr_entries */
304 for (i = 0; i < spa_nr_entries; i += max_nr_entries) {
305 fru_idx = i / max_nr_entries;
306 if (fru_records[fru_idx] == rec)
307 break;
308 }
309
310 if (i >= spa_nr_entries) {
311 pr_warn_once("FRU record %d not found\n", i);
312 return;
313 }
314
315 spa_entry = i + entry;
316 if (spa_entry >= spa_nr_entries) {
317 pr_warn_once("spa_entries[] index out-of-bounds\n");
318 return;
319 }
320
321 memset(&a_err, 0, sizeof(struct atl_err));
322
323 a_err.addr = addr;
324 a_err.ipid = id;
325 a_err.cpu = cpu;
326
327 spa = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
328 if (IS_ERR_VALUE(spa)) {
329 pr_debug("Failed to get system address\n");
330 return;
331 }
332
333 spa_entries[spa_entry] = spa;
334 pr_debug("fru_idx: %u, entry: %u, spa_entry: %u, spa: 0x%016llx\n",
335 fru_idx, entry, spa_entry, spa_entries[spa_entry]);
336}
337
6f15e617
YG
338static void update_fru_record(struct fru_rec *rec, struct mce *m)
339{
340 struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
341 struct cper_fru_poison_desc fpd, *fpd_dest;
342 u32 entry = 0;
343
344 mutex_lock(&fmpm_update_mutex);
345
346 memset(&fpd, 0, sizeof(struct cper_fru_poison_desc));
347
348 fpd.timestamp = m->time;
349 fpd.hw_id_type = FPD_HW_ID_TYPE_MCA_IPID;
350 fpd.hw_id = m->ipid;
351 fpd.addr_type = FPD_ADDR_TYPE_MCA_ADDR;
352 fpd.addr = m->addr;
353
354 /* This is the first entry, so just save it. */
355 if (!rec_has_valid_entries(rec))
356 goto save_fpd;
357
358 /* Ignore already recorded errors. */
359 if (rec_has_fpd(rec, &fpd))
360 goto out_unlock;
361
362 if (rec->fmp.nr_entries >= max_nr_entries) {
363 pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id);
364 goto out_unlock;
365 }
366
367 entry = fmp->nr_entries;
368
369save_fpd:
838850c5 370 save_spa(rec, entry, m->addr, m->ipid, m->extcpu);
6f15e617
YG
371 fpd_dest = &rec->entries[entry];
372 memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));
373
374 fmp->nr_entries = entry + 1;
375 fmp->validation_bits |= FMP_VALID_LIST_ENTRIES;
376 fmp->validation_bits |= FMP_VALID_LIST;
377
378 pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry);
379
380 update_record_on_storage(rec);
381
382out_unlock:
383 mutex_unlock(&fmpm_update_mutex);
384}
385
386static void retire_dram_row(u64 addr, u64 id, u32 cpu)
387{
388 struct atl_err a_err;
389
390 memset(&a_err, 0, sizeof(struct atl_err));
391
392 a_err.addr = addr;
393 a_err.ipid = id;
394 a_err.cpu = cpu;
395
396 amd_retire_dram_row(&a_err);
397}
398
399static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data)
400{
401 struct mce *m = (struct mce *)data;
402 struct fru_rec *rec;
403
404 if (!mce_is_memory_error(m))
405 return NOTIFY_DONE;
406
407 retire_dram_row(m->addr, m->ipid, m->extcpu);
408
409 /*
410 * An invalid FRU ID should not happen on real errors. But it
411 * could happen from software error injection, etc.
412 */
413 rec = get_fru_record(m->ppin);
414 if (!rec)
415 return NOTIFY_DONE;
416
417 update_fru_record(rec, m);
418
419 return NOTIFY_OK;
420}
421
422static struct notifier_block fru_mem_poison_nb = {
423 .notifier_call = fru_handle_mem_poison,
424 .priority = MCE_PRIO_LOWEST,
425};
426
427static void retire_mem_fmp(struct fru_rec *rec)
428{
429 struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
430 unsigned int i, cpu;
431
432 for (i = 0; i < fmp->nr_entries; i++) {
433 struct cper_fru_poison_desc *fpd = &rec->entries[i];
434 unsigned int err_cpu = INVALID_CPU;
435
436 if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID)
437 continue;
438
439 if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR)
440 continue;
441
442 cpus_read_lock();
443 for_each_online_cpu(cpu) {
444 if (topology_ppin(cpu) == fmp->fru_id) {
445 err_cpu = cpu;
446 break;
447 }
448 }
449 cpus_read_unlock();
450
451 if (err_cpu == INVALID_CPU)
452 continue;
453
454 retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
838850c5 455 save_spa(rec, i, fpd->addr, fpd->hw_id, err_cpu);
6f15e617
YG
456 }
457}
458
459static void retire_mem_records(void)
460{
461 struct fru_rec *rec;
462 unsigned int i;
463
464 for_each_fru(i, rec) {
465 if (!rec_has_valid_entries(rec))
466 continue;
467
468 retire_mem_fmp(rec);
469 }
470}
471
472/* Set the CPER Record Header and CPER Section Descriptor fields. */
473static void set_rec_fields(struct fru_rec *rec)
474{
475 struct cper_section_descriptor *sec_desc = &rec->sec_desc;
476 struct cper_record_header *hdr = &rec->hdr;
477
478 memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
479 hdr->revision = CPER_RECORD_REV;
480 hdr->signature_end = CPER_SIG_END;
481
482 /*
483 * Currently, it is assumed that there is one FRU Memory Poison
484 * section per CPER. But this may change for other implementations.
485 */
486 hdr->section_count = 1;
487
488 /* The logged errors are recoverable. Otherwise, they'd never make it here. */
489 hdr->error_severity = CPER_SEV_RECOVERABLE;
490
491 hdr->validation_bits = 0;
492 hdr->record_length = max_rec_len;
493 hdr->creator_id = CPER_CREATOR_FMP;
494 hdr->notification_type = CPER_NOTIFY_MCE;
495 hdr->record_id = cper_next_record_id();
496 hdr->flags = CPER_HW_ERROR_FLAGS_PREVERR;
497
498 sec_desc->section_offset = sizeof(struct cper_record_header);
499 sec_desc->section_length = max_rec_len - sizeof(struct cper_record_header);
500 sec_desc->revision = CPER_SEC_REV;
501 sec_desc->validation_bits = 0;
502 sec_desc->flags = CPER_SEC_PRIMARY;
503 sec_desc->section_type = CPER_SECTION_TYPE_FMP;
504 sec_desc->section_severity = CPER_SEV_RECOVERABLE;
505}
506
507static int save_new_records(void)
508{
509 DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU);
510 struct fru_rec *rec;
511 unsigned int i;
512 int ret = 0;
513
514 for_each_fru(i, rec) {
515 if (rec->hdr.record_length)
516 continue;
517
518 set_rec_fields(rec);
519
520 ret = update_record_on_storage(rec);
521 if (ret)
522 goto out_clear;
523
524 set_bit(i, new_records);
525 }
526
527 return ret;
528
529out_clear:
530 for_each_fru(i, rec) {
531 if (!test_bit(i, new_records))
532 continue;
533
534 erst_clear(rec->hdr.record_id);
535 }
536
537 return ret;
538}
539
540/* Check that the record matches expected types for the current system.*/
541static bool fmp_is_usable(struct fru_rec *rec)
542{
543 struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
544 u64 cpuid;
545
546 pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits);
547
548 if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) {
549 pr_debug("Arch type unknown\n");
550 return false;
551 }
552
553 if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) {
554 pr_debug("Arch type not 'x86 Family/Model/Stepping'\n");
555 return false;
556 }
557
558 if (!(fmp->validation_bits & FMP_VALID_ARCH)) {
559 pr_debug("Arch value unknown\n");
560 return false;
561 }
562
563 cpuid = cpuid_eax(1);
564 if (fmp->fru_arch != cpuid) {
565 pr_debug("Arch value mismatch: record = 0x%016llx, system = 0x%016llx\n",
566 fmp->fru_arch, cpuid);
567 return false;
568 }
569
570 if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) {
571 pr_debug("FRU ID type unknown\n");
572 return false;
573 }
574
575 if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) {
576 pr_debug("FRU ID type is not 'x86 PPIN'\n");
577 return false;
578 }
579
580 if (!(fmp->validation_bits & FMP_VALID_ID)) {
581 pr_debug("FRU ID value unknown\n");
582 return false;
583 }
584
585 return true;
586}
587
588static bool fmp_is_valid(struct fru_rec *rec)
589{
590 struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
591 u32 checksum, len;
592
593 len = get_fmp_len(rec);
594 if (len < sizeof(struct cper_sec_fru_mem_poison)) {
595 pr_debug("fmp length is too small\n");
596 return false;
597 }
598
599 /* Checksum must sum to zero for the entire section. */
600 checksum = do_fmp_checksum(fmp, len) + fmp->checksum;
601 if (checksum) {
602 pr_debug("fmp checksum failed: sum = 0x%x\n", checksum);
603 print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false);
604 return false;
605 }
606
607 if (!fmp_is_usable(rec))
608 return false;
609
610 return true;
611}
612
613static struct fru_rec *get_valid_record(struct fru_rec *old)
614{
615 struct fru_rec *new;
616
617 if (!fmp_is_valid(old)) {
618 pr_debug("Ignoring invalid record\n");
619 return NULL;
620 }
621
622 new = get_fru_record(old->fmp.fru_id);
623 if (!new)
624 pr_debug("Ignoring record for absent FRU\n");
625
626 return new;
627}
628
629/*
630 * Fetch saved records from persistent storage.
631 *
632 * For each found record:
633 * - If it was not created by this module, then ignore it.
634 * - If it is valid, then copy its data to the local cache.
635 * - If it is not valid, then erase it.
636 */
637static int get_saved_records(void)
638{
639 struct fru_rec *old, *new;
640 u64 record_id;
641 int ret, pos;
642 ssize_t len;
643
644 /*
645 * Assume saved records match current max size.
646 *
647 * However, this may not be true depending on module parameters.
648 */
649 old = kmalloc(max_rec_len, GFP_KERNEL);
650 if (!old) {
651 ret = -ENOMEM;
652 goto out;
653 }
654
655 ret = erst_get_record_id_begin(&pos);
656 if (ret < 0)
657 goto out_end;
658
659 while (!erst_get_record_id_next(&pos, &record_id)) {
660 if (record_id == APEI_ERST_INVALID_RECORD_ID)
661 goto out_end;
662 /*
663 * Make sure to clear temporary buffer between reads to avoid
664 * leftover data from records of various sizes.
665 */
666 memset(old, 0, max_rec_len);
667
668 len = erst_read_record(record_id, &old->hdr, max_rec_len,
669 sizeof(struct fru_rec), &CPER_CREATOR_FMP);
670 if (len < 0)
671 continue;
672
673 if (len > max_rec_len) {
674 pr_debug("Found record larger than max_rec_len\n");
675 continue;
676 }
677
678 new = get_valid_record(old);
679 if (!new)
680 erst_clear(record_id);
681
682 /* Restore the record */
683 memcpy(new, old, len);
684 }
685
686out_end:
687 erst_get_record_id_end();
688 kfree(old);
689out:
690 return ret;
691}
692
693static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu)
694{
695 struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
696
697 fmp->fru_arch_type = FMP_ARCH_TYPE_X86_CPUID_1_EAX;
698 fmp->validation_bits |= FMP_VALID_ARCH_TYPE;
699
700 /* Assume all CPUs in the system have the same value for now. */
701 fmp->fru_arch = cpuid_eax(1);
702 fmp->validation_bits |= FMP_VALID_ARCH;
703
704 fmp->fru_id_type = FMP_ID_TYPE_X86_PPIN;
705 fmp->validation_bits |= FMP_VALID_ID_TYPE;
706
707 fmp->fru_id = topology_ppin(cpu);
708 fmp->validation_bits |= FMP_VALID_ID;
709}
710
711static int init_fmps(void)
712{
713 struct fru_rec *rec;
714 unsigned int i, cpu;
715 int ret = 0;
716
717 for_each_fru(i, rec) {
718 unsigned int fru_cpu = INVALID_CPU;
719
720 cpus_read_lock();
721 for_each_online_cpu(cpu) {
722 if (topology_physical_package_id(cpu) == i) {
723 fru_cpu = cpu;
724 break;
725 }
726 }
727 cpus_read_unlock();
728
729 if (fru_cpu == INVALID_CPU) {
730 pr_debug("Failed to find matching CPU for FRU #%u\n", i);
731 ret = -ENODEV;
732 break;
733 }
734
735 set_fmp_fields(rec, fru_cpu);
736 }
737
738 return ret;
739}
740
741static int get_system_info(void)
742{
743 /* Only load on MI300A systems for now. */
744 if (!(boot_cpu_data.x86_model >= 0x90 &&
745 boot_cpu_data.x86_model <= 0x9f))
746 return -ENODEV;
747
748 if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
749 pr_debug("PPIN feature not available\n");
750 return -ENODEV;
751 }
752
753 /* Use CPU socket as FRU for MI300 systems. */
754 max_nr_fru = topology_max_packages();
755 if (!max_nr_fru)
756 return -ENODEV;
757
758 if (max_nr_fru > FMPM_MAX_NR_FRU) {
759 pr_warn("Too many FRUs to manage: found: %u, max: %u\n",
760 max_nr_fru, FMPM_MAX_NR_FRU);
761 return -ENODEV;
762 }
763
764 if (!max_nr_entries)
765 max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
766
838850c5
YG
767 spa_nr_entries = max_nr_fru * max_nr_entries;
768
6f15e617
YG
769 max_rec_len = sizeof(struct fru_rec);
770 max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
771
772 pr_info("max FRUs: %u, max entries: %u, max record length: %lu\n",
773 max_nr_fru, max_nr_entries, max_rec_len);
774
775 return 0;
776}
777
778static void free_records(void)
779{
780 struct fru_rec *rec;
781 int i;
782
783 for_each_fru(i, rec)
784 kfree(rec);
785
786 kfree(fru_records);
838850c5 787 kfree(spa_entries);
6f15e617
YG
788}
789
790static int allocate_records(void)
791{
792 int i, ret = 0;
793
794 fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
795 if (!fru_records) {
796 ret = -ENOMEM;
797 goto out;
798 }
799
800 for (i = 0; i < max_nr_fru; i++) {
801 fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
802 if (!fru_records[i]) {
803 ret = -ENOMEM;
804 goto out_free;
805 }
806 }
807
838850c5
YG
808 spa_entries = kcalloc(spa_nr_entries, sizeof(u64), GFP_KERNEL);
809 if (!spa_entries) {
810 ret = -ENOMEM;
811 goto out_free;
812 }
813
814 for (i = 0; i < spa_nr_entries; i++)
815 spa_entries[i] = INVALID_SPA;
816
6f15e617
YG
817 return ret;
818
819out_free:
bd17b7c3 820 while (--i >= 0)
6f15e617
YG
821 kfree(fru_records[i]);
822
823 kfree(fru_records);
824out:
825 return ret;
826}
827
7d19eea5
YG
828static void *fmpm_start(struct seq_file *f, loff_t *pos)
829{
830 if (*pos >= (spa_nr_entries + 1))
831 return NULL;
832 return pos;
833}
834
835static void *fmpm_next(struct seq_file *f, void *data, loff_t *pos)
836{
837 if (++(*pos) >= (spa_nr_entries + 1))
838 return NULL;
839 return pos;
840}
841
/* seq_file stop: nothing to release. */
static void fmpm_stop(struct seq_file *f, void *data)
{
}
845
/* Column widths and padding used by fmpm_show(). */
#define SHORT_WIDTH		8
#define U64_WIDTH		18
#define TIMESTAMP_WIDTH		19
#define LONG_WIDTH		24
#define U64_PAD			(LONG_WIDTH - U64_WIDTH)
#define TS_PAD			(LONG_WIDTH - TIMESTAMP_WIDTH)
852static int fmpm_show(struct seq_file *f, void *data)
853{
854 unsigned int fru_idx, entry, spa_entry, line;
855 struct cper_fru_poison_desc *fpd;
856 struct fru_rec *rec;
857
858 line = *(loff_t *)data;
859 if (line == 0) {
860 seq_printf(f, "%-*s", SHORT_WIDTH, "fru_idx");
861 seq_printf(f, "%-*s", LONG_WIDTH, "fru_id");
862 seq_printf(f, "%-*s", SHORT_WIDTH, "entry");
863 seq_printf(f, "%-*s", LONG_WIDTH, "timestamp");
864 seq_printf(f, "%-*s", LONG_WIDTH, "hw_id");
865 seq_printf(f, "%-*s", LONG_WIDTH, "addr");
866 seq_printf(f, "%-*s", LONG_WIDTH, "spa");
867 goto out_newline;
868 }
869
870 spa_entry = line - 1;
871 fru_idx = spa_entry / max_nr_entries;
872 entry = spa_entry % max_nr_entries;
873
874 rec = fru_records[fru_idx];
875 if (!rec)
876 goto out;
877
878 seq_printf(f, "%-*u", SHORT_WIDTH, fru_idx);
879 seq_printf(f, "0x%016llx%-*s", rec->fmp.fru_id, U64_PAD, "");
880 seq_printf(f, "%-*u", SHORT_WIDTH, entry);
881
882 mutex_lock(&fmpm_update_mutex);
883
884 if (entry >= rec->fmp.nr_entries) {
885 seq_printf(f, "%-*s", LONG_WIDTH, "*");
886 seq_printf(f, "%-*s", LONG_WIDTH, "*");
887 seq_printf(f, "%-*s", LONG_WIDTH, "*");
888 seq_printf(f, "%-*s", LONG_WIDTH, "*");
889 goto out_unlock;
890 }
891
892 fpd = &rec->entries[entry];
893
894 seq_printf(f, "%ptT%-*s", &fpd->timestamp, TS_PAD, "");
895 seq_printf(f, "0x%016llx%-*s", fpd->hw_id, U64_PAD, "");
896 seq_printf(f, "0x%016llx%-*s", fpd->addr, U64_PAD, "");
897
898 if (spa_entries[spa_entry] == INVALID_SPA)
899 seq_printf(f, "%-*s", LONG_WIDTH, "*");
900 else
901 seq_printf(f, "0x%016llx%-*s", spa_entries[spa_entry], U64_PAD, "");
902
903out_unlock:
904 mutex_unlock(&fmpm_update_mutex);
905out_newline:
906 seq_putc(f, '\n');
907out:
908 return 0;
909}
910
911static const struct seq_operations fmpm_seq_ops = {
912 .start = fmpm_start,
913 .next = fmpm_next,
914 .stop = fmpm_stop,
915 .show = fmpm_show,
916};
917
918static int fmpm_open(struct inode *inode, struct file *file)
919{
920 return seq_open(file, &fmpm_seq_ops);
921}
922
923static const struct file_operations fmpm_fops = {
924 .open = fmpm_open,
925 .release = seq_release,
926 .read = seq_read,
927 .llseek = seq_lseek,
928};
929
930static void setup_debugfs(void)
931{
932 struct dentry *dfs = ras_get_debugfs_root();
933
934 if (!dfs)
935 return;
936
937 fmpm_dfs_dir = debugfs_create_dir("fmpm", dfs);
938 if (!fmpm_dfs_dir)
939 return;
940
941 fmpm_dfs_entries = debugfs_create_file("entries", 0400, fmpm_dfs_dir, NULL, &fmpm_fops);
942 if (!fmpm_dfs_entries)
943 debugfs_remove(fmpm_dfs_dir);
944}
945
6f15e617
YG
946static const struct x86_cpu_id fmpm_cpuids[] = {
947 X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
948 { }
949};
950MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids);
951
952static int __init fru_mem_poison_init(void)
953{
954 int ret;
955
956 if (!x86_match_cpu(fmpm_cpuids)) {
957 ret = -ENODEV;
958 goto out;
959 }
960
961 if (erst_disable) {
962 pr_debug("ERST not available\n");
963 ret = -ENODEV;
964 goto out;
965 }
966
967 ret = get_system_info();
968 if (ret)
969 goto out;
970
971 ret = allocate_records();
972 if (ret)
973 goto out;
974
975 ret = init_fmps();
976 if (ret)
977 goto out_free;
978
979 ret = get_saved_records();
980 if (ret)
981 goto out_free;
982
983 ret = save_new_records();
984 if (ret)
985 goto out_free;
986
7d19eea5
YG
987 setup_debugfs();
988
6f15e617
YG
989 retire_mem_records();
990
991 mce_register_decode_chain(&fru_mem_poison_nb);
992
993 pr_info("FRU Memory Poison Manager initialized\n");
994 return 0;
995
996out_free:
997 free_records();
998out:
999 return ret;
1000}
1001
1002static void __exit fru_mem_poison_exit(void)
1003{
1004 mce_unregister_decode_chain(&fru_mem_poison_nb);
7d19eea5 1005 debugfs_remove(fmpm_dfs_dir);
6f15e617
YG
1006 free_records();
1007}
1008
1009module_init(fru_mem_poison_init);
1010module_exit(fru_mem_poison_exit);
1011
1012MODULE_LICENSE("GPL");
1013MODULE_DESCRIPTION("FRU Memory Poison Manager");