Commit | Line | Data |
---|---|---|
3f317499 YG |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* | |
3 | * AMD Address Translation Library | |
4 | * | |
5 | * umc.c : Unified Memory Controller (UMC) topology helpers | |
6 | * | |
7 | * Copyright (c) 2023, Advanced Micro Devices, Inc. | |
8 | * All Rights Reserved. | |
9 | * | |
10 | * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> | |
11 | */ | |
12 | ||
13 | #include "internal.h" | |
14 | ||
453f0ae7 M |
15 | /* |
16 | * MI300 has a fixed, model-specific mapping between a UMC instance and | |
17 | * its related Data Fabric Coherent Station instance. | |
18 | * | |
19 | * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the | |
20 | * UMC instance within a Node. Use this to find the appropriate Coherent | |
21 | * Station ID. | |
22 | * | |
23 | * Redundant bits were removed from the map below. | |
24 | */ | |
/* Indexed by Coherent Station instance ID; each entry is the matching UMC ID. */
static const u16 umc_coh_st_map[32] = {
	/* Instances 0-15 */
	0x393, 0x293, 0x193, 0x093, 0x392, 0x292, 0x192, 0x092,
	0x391, 0x291, 0x191, 0x091, 0x390, 0x290, 0x190, 0x090,
	/* Instances 16-31 */
	0x793, 0x693, 0x593, 0x493, 0x792, 0x692, 0x592, 0x492,
	0x791, 0x691, 0x591, 0x491, 0x790, 0x690, 0x590, 0x490,
};
35 | ||
36 | #define UMC_ID_MI300 GENMASK(23, 12) | |
37 | static u8 get_coh_st_inst_id_mi300(struct atl_err *err) | |
38 | { | |
39 | u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid); | |
40 | u8 i; | |
41 | ||
42 | for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) { | |
43 | if (umc_id == umc_coh_st_map[i]) | |
44 | break; | |
45 | } | |
46 | ||
47 | WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map)); | |
48 | ||
49 | return i; | |
50 | } | |
51 | ||
87a61237 YG |
52 | /* XOR the bits in @val. */ |
/*
 * Fold all 16 bits of @val together with XOR.
 *
 * Returns 1 if an odd number of bits are set in @val, otherwise 0.
 */
static u16 bitwise_xor_bits(u16 val)
{
	/* Each step XORs the upper half of the remaining bits into the lower half. */
	val ^= val >> 8;
	val ^= val >> 4;
	val ^= val >> 2;
	val ^= val >> 1;

	return val & 0x1;
}
63 | ||
/* Decoded contents of one address hash register. */
struct xor_bits {
	bool	xor_enable;	/* Hash is applied only when set. */
	u16	col_xor;	/* Mask of column bits that feed the hash. */
	u32	row_xor;	/* Mask of row bits that feed the hash. */
};
69 | ||
70 | #define NUM_BANK_BITS 4 | |
71 | ||
72 | static struct { | |
73 | /* UMC::CH::AddrHashBank */ | |
74 | struct xor_bits bank[NUM_BANK_BITS]; | |
75 | ||
76 | /* UMC::CH::AddrHashPC */ | |
77 | struct xor_bits pc; | |
78 | ||
79 | /* UMC::CH::AddrHashPC2 */ | |
80 | u8 bank_xor; | |
81 | } addr_hash; | |
82 | ||
83 | #define MI300_UMC_CH_BASE 0x90000 | |
84 | #define MI300_ADDR_HASH_BANK0 (MI300_UMC_CH_BASE + 0xC8) | |
85 | #define MI300_ADDR_HASH_PC (MI300_UMC_CH_BASE + 0xE0) | |
86 | #define MI300_ADDR_HASH_PC2 (MI300_UMC_CH_BASE + 0xE4) | |
87 | ||
88 | #define ADDR_HASH_XOR_EN BIT(0) | |
89 | #define ADDR_HASH_COL_XOR GENMASK(13, 1) | |
90 | #define ADDR_HASH_ROW_XOR GENMASK(31, 14) | |
91 | #define ADDR_HASH_BANK_XOR GENMASK(5, 0) | |
92 | ||
93 | /* | |
94 | * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used | |
95 | * for hashing. Do this during module init, since the values will not | |
96 | * change during run time. | |
97 | * | |
98 | * These registers are instantiated for each UMC across each AMD Node. | |
99 | * However, they should be identically programmed due to the fixed hardware | |
100 | * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a | |
101 | * single global structure for simplicity. | |
102 | */ | |
103 | int get_addr_hash_mi300(void) | |
104 | { | |
105 | u32 temp; | |
106 | int ret; | |
107 | u8 i; | |
108 | ||
109 | for (i = 0; i < NUM_BANK_BITS; i++) { | |
110 | ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp); | |
111 | if (ret) | |
112 | return ret; | |
113 | ||
114 | addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); | |
115 | addr_hash.bank[i].col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); | |
116 | addr_hash.bank[i].row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); | |
117 | } | |
118 | ||
119 | ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp); | |
120 | if (ret) | |
121 | return ret; | |
122 | ||
123 | addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); | |
124 | addr_hash.pc.col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); | |
125 | addr_hash.pc.row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); | |
126 | ||
127 | ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp); | |
128 | if (ret) | |
129 | return ret; | |
130 | ||
131 | addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp); | |
132 | ||
133 | return 0; | |
134 | } | |
135 | ||
136 | /* | |
137 | * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must | |
138 | * be converted to the intermediate normalized address (NA) before translating to a | |
139 | * system physical address. | |
140 | * | |
141 | * The DRAM address includes bank, row, and column. Also included are bits for | |
142 | * pseudochannel (PC) and stack ID (SID). | |
143 | * | |
144 | * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero | |
145 | * | |
146 | * The MCA address format is as follows: | |
147 | * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]} | |
148 | * | |
149 | * The normalized address format is fixed in hardware and is as follows: | |
150 | * NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]} | |
151 | * | |
152 | * Additionally, the PC and Bank bits may be hashed. This must be accounted for before | |
153 | * reconstructing the normalized address. | |
154 | */ | |
155 | #define MI300_UMC_MCA_COL GENMASK(5, 1) | |
156 | #define MI300_UMC_MCA_BANK GENMASK(9, 6) | |
157 | #define MI300_UMC_MCA_ROW GENMASK(24, 10) | |
158 | #define MI300_UMC_MCA_PC BIT(25) | |
159 | #define MI300_UMC_MCA_SID GENMASK(27, 26) | |
160 | ||
161 | #define MI300_NA_COL_1_0 GENMASK(6, 5) | |
162 | #define MI300_NA_PC BIT(7) | |
163 | #define MI300_NA_COL_3_2 GENMASK(9, 8) | |
164 | #define MI300_NA_BANK_3_2 GENMASK(11, 10) | |
165 | #define MI300_NA_BANK_1_0 GENMASK(13, 12) | |
166 | #define MI300_NA_COL_4 BIT(14) | |
167 | #define MI300_NA_ROW GENMASK(28, 15) | |
168 | #define MI300_NA_SID GENMASK(30, 29) | |
169 | ||
170 | static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) | |
171 | { | |
172 | u16 i, col, row, bank, pc, sid, temp; | |
173 | ||
174 | col = FIELD_GET(MI300_UMC_MCA_COL, addr); | |
175 | bank = FIELD_GET(MI300_UMC_MCA_BANK, addr); | |
176 | row = FIELD_GET(MI300_UMC_MCA_ROW, addr); | |
177 | pc = FIELD_GET(MI300_UMC_MCA_PC, addr); | |
178 | sid = FIELD_GET(MI300_UMC_MCA_SID, addr); | |
179 | ||
180 | /* Calculate hash for each Bank bit. */ | |
181 | for (i = 0; i < NUM_BANK_BITS; i++) { | |
182 | if (!addr_hash.bank[i].xor_enable) | |
183 | continue; | |
184 | ||
185 | temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor); | |
186 | temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor); | |
187 | bank ^= temp << i; | |
188 | } | |
189 | ||
190 | /* Calculate hash for PC bit. */ | |
191 | if (addr_hash.pc.xor_enable) { | |
192 | /* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */ | |
193 | bank |= sid << 5; | |
194 | ||
195 | temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); | |
196 | temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); | |
197 | temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor); | |
198 | pc ^= temp; | |
199 | ||
200 | /* Drop SID bits for the sake of debug printing later. */ | |
201 | bank &= 0x1F; | |
202 | } | |
203 | ||
204 | /* Reconstruct the normalized address starting with NA[4:0] = 0 */ | |
205 | addr = 0; | |
206 | ||
207 | /* NA[6:5] = Column[1:0] */ | |
208 | temp = col & 0x3; | |
209 | addr |= FIELD_PREP(MI300_NA_COL_1_0, temp); | |
210 | ||
211 | /* NA[7] = PC */ | |
212 | addr |= FIELD_PREP(MI300_NA_PC, pc); | |
213 | ||
214 | /* NA[9:8] = Column[3:2] */ | |
215 | temp = (col >> 2) & 0x3; | |
216 | addr |= FIELD_PREP(MI300_NA_COL_3_2, temp); | |
217 | ||
218 | /* NA[11:10] = Bank[3:2] */ | |
219 | temp = (bank >> 2) & 0x3; | |
220 | addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp); | |
221 | ||
222 | /* NA[13:12] = Bank[1:0] */ | |
223 | temp = bank & 0x3; | |
224 | addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp); | |
225 | ||
226 | /* NA[14] = Column[4] */ | |
227 | temp = (col >> 4) & 0x1; | |
228 | addr |= FIELD_PREP(MI300_NA_COL_4, temp); | |
229 | ||
230 | /* NA[28:15] = Row[13:0] */ | |
231 | addr |= FIELD_PREP(MI300_NA_ROW, row); | |
232 | ||
233 | /* NA[30:29] = SID[1:0] */ | |
234 | addr |= FIELD_PREP(MI300_NA_SID, sid); | |
235 | ||
236 | pr_debug("Addr=0x%016lx", addr); | |
237 | pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid); | |
238 | ||
239 | return addr; | |
240 | } | |
241 | ||
3b566b30 YG |
242 | /* |
243 | * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire | |
244 | * all memory within that DRAM row. This applies to the memory within a DRAM | |
245 | * bank. | |
246 | * | |
247 | * To find the memory addresses, loop through permutations of the DRAM column | |
248 | * bits and find the System Physical address of each. The column bits are used | |
249 | * to calculate the intermediate Normalized address, so all permutations should | |
250 | * be checked. | |
251 | * | |
252 | * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats. | |
253 | */ | |
254 | #define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) | |
255 | static void retire_row_mi300(struct atl_err *a_err) | |
256 | { | |
257 | unsigned long addr; | |
258 | struct page *p; | |
259 | u8 col; | |
260 | ||
261 | for (col = 0; col < MI300_NUM_COL; col++) { | |
262 | a_err->addr &= ~MI300_UMC_MCA_COL; | |
263 | a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col); | |
264 | ||
265 | addr = amd_convert_umc_mca_addr_to_sys_addr(a_err); | |
266 | if (IS_ERR_VALUE(addr)) | |
267 | continue; | |
268 | ||
269 | addr = PHYS_PFN(addr); | |
270 | ||
271 | /* | |
272 | * Skip invalid or already poisoned pages to avoid unnecessary | |
273 | * error messages from memory_failure(). | |
274 | */ | |
275 | p = pfn_to_online_page(addr); | |
276 | if (!p) | |
277 | continue; | |
278 | ||
279 | if (PageHWPoison(p)) | |
280 | continue; | |
281 | ||
282 | memory_failure(addr, 0); | |
283 | } | |
284 | } | |
285 | ||
286 | void amd_retire_dram_row(struct atl_err *a_err) | |
287 | { | |
288 | if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) | |
289 | return retire_row_mi300(a_err); | |
290 | } | |
291 | EXPORT_SYMBOL_GPL(amd_retire_dram_row); | |
292 | ||
87a61237 YG |
293 | static unsigned long get_addr(unsigned long addr) |
294 | { | |
295 | if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) | |
296 | return convert_dram_to_norm_addr_mi300(addr); | |
297 | ||
298 | return addr; | |
299 | } | |
300 | ||
453f0ae7 | 301 | #define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) |
3f317499 YG |
302 | static u8 get_die_id(struct atl_err *err) |
303 | { | |
453f0ae7 M |
304 | /* |
305 | * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this | |
306 | * needs to be divided by 4 to get the internal Die ID. | |
307 | */ | |
308 | if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) { | |
309 | u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid); | |
310 | ||
311 | return node_id >> 2; | |
312 | } | |
313 | ||
3f317499 YG |
314 | /* |
315 | * For CPUs, this is the AMD Node ID modulo the number | |
316 | * of AMD Nodes per socket. | |
317 | */ | |
318 | return topology_die_id(err->cpu) % amd_get_nodes_per_socket(); | |
319 | } | |
320 | ||
321 | #define UMC_CHANNEL_NUM GENMASK(31, 20) | |
322 | static u8 get_coh_st_inst_id(struct atl_err *err) | |
323 | { | |
453f0ae7 M |
324 | if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) |
325 | return get_coh_st_inst_id_mi300(err); | |
326 | ||
3f317499 YG |
327 | return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); |
328 | } | |
329 | ||
330 | unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) | |
331 | { | |
332 | u8 socket_id = topology_physical_package_id(err->cpu); | |
333 | u8 coh_st_inst_id = get_coh_st_inst_id(err); | |
87a61237 | 334 | unsigned long addr = get_addr(err->addr); |
3f317499 YG |
335 | u8 die_id = get_die_id(err); |
336 | ||
337 | pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx", | |
338 | socket_id, die_id, coh_st_inst_id, addr); | |
339 | ||
340 | return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); | |
341 | } |