Commit | Line | Data |
---|---|---|
64f55e62 AG |
1 | /* |
2 | * Copyright 2019 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | */ | |
23 | ||
24 | #include "amdgpu_ras_eeprom.h" | |
25 | #include "amdgpu.h" | |
26 | #include "amdgpu_ras.h" | |
27 | #include <linux/bits.h> | |
ef1caf48 | 28 | #include "atom.h" |
64f55e62 | 29 | |
/* I2C target addresses of the RAS EEPROM for each supported ASIC/board. */
#define EEPROM_I2C_TARGET_ADDR_VEGA20		0xA0
#define EEPROM_I2C_TARGET_ADDR_ARCTURUS		0xA8
#define EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342	0xA0
#define EEPROM_I2C_TARGET_ADDR_SIENNA_CICHLID	0xA0

/*
 * The 2 macros below represent the actual size in bytes that
 * those entities occupy in the EEPROM memory.
 * EEPROM_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
 * uses uint64 to store 6-byte fields such as retired_page.
 */
#define EEPROM_TABLE_HEADER_SIZE 20
#define EEPROM_TABLE_RECORD_SIZE 24

/* Number of EEPROM address bytes placed in front of every I2C payload */
#define EEPROM_ADDRESS_SIZE 0x2

/* Table hdr is 'AMDR' */
#define EEPROM_TABLE_HDR_VAL 0x414d4452
#define EEPROM_TABLE_VER 0x00010000

/* Bad GPU tag 'BADG' */
#define EEPROM_TABLE_HDR_BAD 0x42414447

/* Assume 2 Mbit size */
#define EEPROM_SIZE_BYTES 256000
#define EEPROM_PAGE__SIZE_BYTES 256
#define EEPROM_HDR_START 0
#define EEPROM_RECORD_START (EEPROM_HDR_START + EEPROM_TABLE_HEADER_SIZE)
#define EEPROM_MAX_RECORD_NUM ((EEPROM_SIZE_BYTES - EEPROM_TABLE_HEADER_SIZE) / EEPROM_TABLE_RECORD_SIZE)
/* EEPROM address bits 17..8, i.e. everything above the offset inside a 256 byte page */
#define EEPROM_ADDR_MSB_MASK GENMASK(17, 8)

/*
 * Map an eeprom control pointer back to the amdgpu device owning the
 * struct amdgpu_ras it is embedded in. The argument and the whole expansion
 * are parenthesized so the macro is safe inside larger expressions.
 */
#define to_amdgpu_device(x) (container_of((x), struct amdgpu_ras, eeprom_control)->adev)
62 | ||
4bfb7428 JC |
63 | static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) |
64 | { | |
65 | if ((adev->asic_type == CHIP_VEGA20) || | |
3e7bc83e JC |
66 | (adev->asic_type == CHIP_ARCTURUS) || |
67 | (adev->asic_type == CHIP_SIENNA_CICHLID)) | |
4bfb7428 JC |
68 | return true; |
69 | ||
70 | return false; | |
71 | } | |
72 | ||
ef1caf48 JC |
73 | static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev, |
74 | uint16_t *i2c_addr) | |
75 | { | |
76 | struct atom_context *atom_ctx = adev->mode_info.atom_context; | |
77 | ||
78 | if (!i2c_addr || !atom_ctx) | |
79 | return false; | |
80 | ||
81 | if (strnstr(atom_ctx->vbios_version, | |
82 | "D342", | |
83 | sizeof(atom_ctx->vbios_version))) | |
84 | *i2c_addr = EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342; | |
85 | else | |
86 | *i2c_addr = EEPROM_I2C_TARGET_ADDR_ARCTURUS; | |
87 | ||
88 | return true; | |
89 | } | |
90 | ||
91 | static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, | |
92 | uint16_t *i2c_addr) | |
93 | { | |
94 | if (!i2c_addr) | |
95 | return false; | |
96 | ||
97 | switch (adev->asic_type) { | |
98 | case CHIP_VEGA20: | |
99 | *i2c_addr = EEPROM_I2C_TARGET_ADDR_VEGA20; | |
100 | break; | |
101 | ||
102 | case CHIP_ARCTURUS: | |
103 | return __get_eeprom_i2c_addr_arct(adev, i2c_addr); | |
104 | ||
3e7bc83e JC |
105 | case CHIP_SIENNA_CICHLID: |
106 | *i2c_addr = EEPROM_I2C_TARGET_ADDR_SIENNA_CICHLID; | |
107 | break; | |
108 | ||
ef1caf48 JC |
109 | default: |
110 | return false; | |
111 | } | |
112 | ||
113 | return true; | |
114 | } | |
115 | ||
64f55e62 AG |
116 | static void __encode_table_header_to_buff(struct amdgpu_ras_eeprom_table_header *hdr, |
117 | unsigned char *buff) | |
118 | { | |
119 | uint32_t *pp = (uint32_t *) buff; | |
120 | ||
121 | pp[0] = cpu_to_le32(hdr->header); | |
122 | pp[1] = cpu_to_le32(hdr->version); | |
123 | pp[2] = cpu_to_le32(hdr->first_rec_offset); | |
124 | pp[3] = cpu_to_le32(hdr->tbl_size); | |
125 | pp[4] = cpu_to_le32(hdr->checksum); | |
126 | } | |
127 | ||
128 | static void __decode_table_header_from_buff(struct amdgpu_ras_eeprom_table_header *hdr, | |
129 | unsigned char *buff) | |
130 | { | |
131 | uint32_t *pp = (uint32_t *)buff; | |
132 | ||
f3729f7b DV |
133 | hdr->header = le32_to_cpu(pp[0]); |
134 | hdr->version = le32_to_cpu(pp[1]); | |
64f55e62 | 135 | hdr->first_rec_offset = le32_to_cpu(pp[2]); |
f3729f7b DV |
136 | hdr->tbl_size = le32_to_cpu(pp[3]); |
137 | hdr->checksum = le32_to_cpu(pp[4]); | |
64f55e62 AG |
138 | } |
139 | ||
140 | static int __update_table_header(struct amdgpu_ras_eeprom_control *control, | |
141 | unsigned char *buff) | |
142 | { | |
143 | int ret = 0; | |
9015d60c | 144 | struct amdgpu_device *adev = to_amdgpu_device(control); |
64f55e62 | 145 | struct i2c_msg msg = { |
5985ebbe | 146 | .addr = 0, |
64f55e62 AG |
147 | .flags = 0, |
148 | .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE, | |
149 | .buf = buff, | |
150 | }; | |
151 | ||
152 | ||
153 | *(uint16_t *)buff = EEPROM_HDR_START; | |
154 | __encode_table_header_to_buff(&control->tbl_hdr, buff + EEPROM_ADDRESS_SIZE); | |
155 | ||
5985ebbe JC |
156 | msg.addr = control->i2c_address; |
157 | ||
40e7ed97 DL |
158 | /* i2c may be unstable in gpu reset */ |
159 | down_read(&adev->reset_sem); | |
9015d60c | 160 | ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1); |
40e7ed97 DL |
161 | up_read(&adev->reset_sem); |
162 | ||
64f55e62 AG |
163 | if (ret < 1) |
164 | DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret); | |
165 | ||
166 | return ret; | |
167 | } | |
168 | ||
db338e16 AG |
169 | static uint32_t __calc_hdr_byte_sum(struct amdgpu_ras_eeprom_control *control) |
170 | { | |
171 | int i; | |
172 | uint32_t tbl_sum = 0; | |
173 | ||
174 | /* Header checksum, skip checksum field in the calculation */ | |
175 | for (i = 0; i < sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); i++) | |
176 | tbl_sum += *(((unsigned char *)&control->tbl_hdr) + i); | |
177 | ||
178 | return tbl_sum; | |
179 | } | |
180 | ||
181 | static uint32_t __calc_recs_byte_sum(struct eeprom_table_record *records, | |
182 | int num) | |
183 | { | |
184 | int i, j; | |
185 | uint32_t tbl_sum = 0; | |
186 | ||
187 | /* Records checksum */ | |
188 | for (i = 0; i < num; i++) { | |
189 | struct eeprom_table_record *record = &records[i]; | |
190 | ||
191 | for (j = 0; j < sizeof(*record); j++) { | |
192 | tbl_sum += *(((unsigned char *)record) + j); | |
193 | } | |
194 | } | |
195 | ||
196 | return tbl_sum; | |
197 | } | |
198 | ||
/* Byte sum of the table header (checksum field excluded) plus @num records. */
static inline uint32_t __calc_tbl_byte_sum(struct amdgpu_ras_eeprom_control *control,
					   struct eeprom_table_record *records, int num)
{
	uint32_t hdr_sum = __calc_hdr_byte_sum(control);
	uint32_t recs_sum = __calc_recs_byte_sum(records, num);

	return hdr_sum + recs_sum;
}
204 | ||
205 | /* Checksum = 256 -((sum of all table entries) mod 256) */ | |
206 | static void __update_tbl_checksum(struct amdgpu_ras_eeprom_control *control, | |
207 | struct eeprom_table_record *records, int num, | |
208 | uint32_t old_hdr_byte_sum) | |
209 | { | |
210 | /* | |
211 | * This will update the table sum with new records. | |
212 | * | |
213 | * TODO: What happens when the EEPROM table is to be wrapped around | |
214 | * and old records from start will get overridden. | |
215 | */ | |
216 | ||
217 | /* need to recalculate updated header byte sum */ | |
218 | control->tbl_byte_sum -= old_hdr_byte_sum; | |
219 | control->tbl_byte_sum += __calc_tbl_byte_sum(control, records, num); | |
220 | ||
221 | control->tbl_hdr.checksum = 256 - (control->tbl_byte_sum % 256); | |
222 | } | |
223 | ||
224 | /* table sum mod 256 + checksum must equals 256 */ | |
225 | static bool __validate_tbl_checksum(struct amdgpu_ras_eeprom_control *control, | |
226 | struct eeprom_table_record *records, int num) | |
227 | { | |
228 | control->tbl_byte_sum = __calc_tbl_byte_sum(control, records, num); | |
229 | ||
230 | if (control->tbl_hdr.checksum + (control->tbl_byte_sum % 256) != 256) { | |
231 | DRM_WARN("Checksum mismatch, checksum: %u ", control->tbl_hdr.checksum); | |
232 | return false; | |
233 | } | |
234 | ||
235 | return true; | |
236 | } | |
64f55e62 | 237 | |
9b856def GC |
238 | static int amdgpu_ras_eeprom_correct_header_tag( |
239 | struct amdgpu_ras_eeprom_control *control, | |
240 | uint32_t header) | |
241 | { | |
242 | unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE]; | |
243 | struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; | |
244 | int ret = 0; | |
245 | ||
246 | memset(buff, 0, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE); | |
247 | ||
248 | mutex_lock(&control->tbl_mutex); | |
249 | hdr->header = header; | |
250 | ret = __update_table_header(control, buff); | |
251 | mutex_unlock(&control->tbl_mutex); | |
252 | ||
253 | return ret; | |
254 | } | |
255 | ||
d01b400b AG |
256 | int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) |
257 | { | |
258 | unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 }; | |
d01b400b | 259 | struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; |
db338e16 AG |
260 | int ret = 0; |
261 | ||
262 | mutex_lock(&control->tbl_mutex); | |
d01b400b AG |
263 | |
264 | hdr->header = EEPROM_TABLE_HDR_VAL; | |
265 | hdr->version = EEPROM_TABLE_VER; | |
266 | hdr->first_rec_offset = EEPROM_RECORD_START; | |
267 | hdr->tbl_size = EEPROM_TABLE_HEADER_SIZE; | |
268 | ||
db338e16 AG |
269 | control->tbl_byte_sum = 0; |
270 | __update_tbl_checksum(control, NULL, 0, 0); | |
271 | control->next_addr = EEPROM_RECORD_START; | |
272 | ||
273 | ret = __update_table_header(control, buff); | |
274 | ||
275 | mutex_unlock(&control->tbl_mutex); | |
276 | ||
277 | return ret; | |
278 | ||
d01b400b AG |
279 | } |
280 | ||
b82e65a9 GC |
281 | int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, |
282 | bool *exceed_err_limit) | |
64f55e62 AG |
283 | { |
284 | int ret = 0; | |
285 | struct amdgpu_device *adev = to_amdgpu_device(control); | |
286 | unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 }; | |
287 | struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; | |
9b856def | 288 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
64f55e62 | 289 | struct i2c_msg msg = { |
5985ebbe | 290 | .addr = 0, |
64f55e62 AG |
291 | .flags = I2C_M_RD, |
292 | .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE, | |
293 | .buf = buff, | |
294 | }; | |
295 | ||
b82e65a9 GC |
296 | *exceed_err_limit = false; |
297 | ||
4bfb7428 JC |
298 | if (!__is_ras_eeprom_supported(adev)) |
299 | return 0; | |
300 | ||
9015d60c AG |
301 | /* Verify i2c adapter is initialized */ |
302 | if (!adev->pm.smu_i2c.algo) | |
303 | return -ENOENT; | |
304 | ||
ef1caf48 JC |
305 | if (!__get_eeprom_i2c_addr(adev, &control->i2c_address)) |
306 | return -EINVAL; | |
307 | ||
64f55e62 AG |
308 | mutex_init(&control->tbl_mutex); |
309 | ||
5985ebbe | 310 | msg.addr = control->i2c_address; |
64f55e62 | 311 | /* Read/Create table header from EEPROM address 0 */ |
9015d60c | 312 | ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1); |
64f55e62 AG |
313 | if (ret < 1) { |
314 | DRM_ERROR("Failed to read EEPROM table header, ret:%d", ret); | |
315 | return ret; | |
316 | } | |
317 | ||
318 | __decode_table_header_from_buff(hdr, &buff[2]); | |
319 | ||
320 | if (hdr->header == EEPROM_TABLE_HDR_VAL) { | |
321 | control->num_recs = (hdr->tbl_size - EEPROM_TABLE_HEADER_SIZE) / | |
322 | EEPROM_TABLE_RECORD_SIZE; | |
db338e16 AG |
323 | control->tbl_byte_sum = __calc_hdr_byte_sum(control); |
324 | control->next_addr = EEPROM_RECORD_START; | |
325 | ||
64f55e62 AG |
326 | DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", |
327 | control->num_recs); | |
328 | ||
b82e65a9 GC |
329 | } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) && |
330 | (amdgpu_bad_page_threshold != 0)) { | |
9b856def GC |
331 | if (ras->bad_page_cnt_threshold > control->num_recs) { |
332 | dev_info(adev->dev, "Using one valid bigger bad page " | |
333 | "threshold and correcting eeprom header tag.\n"); | |
334 | ret = amdgpu_ras_eeprom_correct_header_tag(control, | |
335 | EEPROM_TABLE_HDR_VAL); | |
336 | } else { | |
337 | *exceed_err_limit = true; | |
338 | dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, " | |
b82e65a9 | 339 | "disabling the GPU.\n"); |
9b856def | 340 | } |
64f55e62 AG |
341 | } else { |
342 | DRM_INFO("Creating new EEPROM table"); | |
343 | ||
d01b400b | 344 | ret = amdgpu_ras_eeprom_reset_table(control); |
64f55e62 AG |
345 | } |
346 | ||
64f55e62 AG |
347 | return ret == 1 ? 0 : -EIO; |
348 | } | |
349 | ||
64f55e62 AG |
350 | static void __encode_table_record_to_buff(struct amdgpu_ras_eeprom_control *control, |
351 | struct eeprom_table_record *record, | |
352 | unsigned char *buff) | |
353 | { | |
354 | __le64 tmp = 0; | |
355 | int i = 0; | |
356 | ||
357 | /* Next are all record fields according to EEPROM page spec in LE foramt */ | |
358 | buff[i++] = record->err_type; | |
359 | ||
360 | buff[i++] = record->bank; | |
361 | ||
362 | tmp = cpu_to_le64(record->ts); | |
363 | memcpy(buff + i, &tmp, 8); | |
364 | i += 8; | |
365 | ||
366 | tmp = cpu_to_le64((record->offset & 0xffffffffffff)); | |
367 | memcpy(buff + i, &tmp, 6); | |
368 | i += 6; | |
369 | ||
370 | buff[i++] = record->mem_channel; | |
371 | buff[i++] = record->mcumc_id; | |
372 | ||
373 | tmp = cpu_to_le64((record->retired_page & 0xffffffffffff)); | |
374 | memcpy(buff + i, &tmp, 6); | |
375 | } | |
376 | ||
377 | static void __decode_table_record_from_buff(struct amdgpu_ras_eeprom_control *control, | |
378 | struct eeprom_table_record *record, | |
379 | unsigned char *buff) | |
380 | { | |
381 | __le64 tmp = 0; | |
382 | int i = 0; | |
383 | ||
384 | /* Next are all record fields according to EEPROM page spec in LE foramt */ | |
385 | record->err_type = buff[i++]; | |
386 | ||
387 | record->bank = buff[i++]; | |
388 | ||
389 | memcpy(&tmp, buff + i, 8); | |
390 | record->ts = le64_to_cpu(tmp); | |
391 | i += 8; | |
392 | ||
393 | memcpy(&tmp, buff + i, 6); | |
394 | record->offset = (le64_to_cpu(tmp) & 0xffffffffffff); | |
395 | i += 6; | |
396 | ||
4bc22340 AG |
397 | record->mem_channel = buff[i++]; |
398 | record->mcumc_id = buff[i++]; | |
64f55e62 AG |
399 | |
400 | memcpy(&tmp, buff + i, 6); | |
401 | record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff); | |
402 | } | |
403 | ||
404 | /* | |
405 | * When reaching end of EEPROM memory jump back to 0 record address | |
406 | * When next record access will go beyond EEPROM page boundary modify bits A17/A8 | |
407 | * in I2C selector to go to next page | |
408 | */ | |
409 | static uint32_t __correct_eeprom_dest_address(uint32_t curr_address) | |
410 | { | |
411 | uint32_t next_address = curr_address + EEPROM_TABLE_RECORD_SIZE; | |
412 | ||
413 | /* When all EEPROM memory used jump back to 0 address */ | |
414 | if (next_address > EEPROM_SIZE_BYTES) { | |
415 | DRM_INFO("Reached end of EEPROM memory, jumping to 0 " | |
416 | "and overriding old record"); | |
417 | return EEPROM_RECORD_START; | |
418 | } | |
419 | ||
420 | /* | |
421 | * To check if we overflow page boundary compare next address with | |
422 | * current and see if bits 17/8 of the EEPROM address will change | |
423 | * If they do start from the next 256b page | |
424 | * | |
425 | * https://www.st.com/resource/en/datasheet/m24m02-dr.pdf sec. 5.1.2 | |
426 | */ | |
427 | if ((curr_address & EEPROM_ADDR_MSB_MASK) != (next_address & EEPROM_ADDR_MSB_MASK)) { | |
92ead9fa | 428 | DRM_DEBUG_DRIVER("Reached end of EEPROM memory page, jumping to next: %lx", |
64f55e62 AG |
429 | (next_address & EEPROM_ADDR_MSB_MASK)); |
430 | ||
431 | return (next_address & EEPROM_ADDR_MSB_MASK); | |
432 | } | |
433 | ||
434 | return curr_address; | |
435 | } | |
436 | ||
11003c68 | 437 | bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) |
e8fbaf03 | 438 | { |
11003c68 | 439 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
e8fbaf03 | 440 | |
4bfb7428 | 441 | if (!__is_ras_eeprom_supported(adev)) |
11003c68 | 442 | return false; |
e8fbaf03 | 443 | |
970fd197 SY |
444 | /* skip check eeprom table for VEGA20 Gaming */ |
445 | if (!con) | |
446 | return false; | |
447 | else | |
448 | if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC))) | |
449 | return false; | |
450 | ||
451 | if (con->eeprom_control.tbl_hdr.header == EEPROM_TABLE_HDR_BAD) { | |
e8fbaf03 GC |
452 | dev_warn(adev->dev, "This GPU is in BAD status."); |
453 | dev_warn(adev->dev, "Please retire it or setting one bigger " | |
454 | "threshold value when reloading driver.\n"); | |
11003c68 | 455 | return true; |
e8fbaf03 GC |
456 | } |
457 | ||
11003c68 | 458 | return false; |
e8fbaf03 GC |
459 | } |
460 | ||
64f55e62 AG |
461 | int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, |
462 | struct eeprom_table_record *records, | |
463 | bool write, | |
464 | int num) | |
465 | { | |
466 | int i, ret = 0; | |
5222d261 GC |
467 | struct i2c_msg *msgs, *msg; |
468 | unsigned char *buffs, *buff; | |
469 | struct eeprom_table_record *record; | |
64f55e62 | 470 | struct amdgpu_device *adev = to_amdgpu_device(control); |
9c06f91f | 471 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
64f55e62 | 472 | |
4bfb7428 | 473 | if (!__is_ras_eeprom_supported(adev)) |
64f55e62 AG |
474 | return 0; |
475 | ||
476 | buffs = kcalloc(num, EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE, | |
477 | GFP_KERNEL); | |
478 | if (!buffs) | |
479 | return -ENOMEM; | |
480 | ||
481 | mutex_lock(&control->tbl_mutex); | |
482 | ||
483 | msgs = kcalloc(num, sizeof(*msgs), GFP_KERNEL); | |
484 | if (!msgs) { | |
485 | ret = -ENOMEM; | |
486 | goto free_buff; | |
487 | } | |
488 | ||
9c06f91f GC |
489 | /* |
490 | * If saved bad pages number exceeds the bad page threshold for | |
491 | * the whole VRAM, update table header to mark the BAD GPU tag | |
492 | * and schedule one ras recovery after eeprom write is done, | |
493 | * this can avoid the missing for latest records. | |
494 | * | |
495 | * This new header will be picked up and checked in the bootup | |
496 | * by ras recovery, which may break bootup process to notify | |
497 | * user this GPU is in bad state and to retire such GPU for | |
498 | * further check. | |
499 | */ | |
500 | if (write && (amdgpu_bad_page_threshold != 0) && | |
501 | ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) { | |
502 | dev_warn(adev->dev, | |
503 | "Saved bad pages(%d) reaches threshold value(%d).\n", | |
504 | control->num_recs + num, ras->bad_page_cnt_threshold); | |
505 | control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD; | |
9c06f91f GC |
506 | } |
507 | ||
64f55e62 AG |
508 | /* In case of overflow just start from beginning to not lose newest records */ |
509 | if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES)) | |
510 | control->next_addr = EEPROM_RECORD_START; | |
511 | ||
64f55e62 AG |
512 | /* |
513 | * TODO Currently makes EEPROM writes for each record, this creates | |
514 | * internal fragmentation. Optimized the code to do full page write of | |
515 | * 256b | |
516 | */ | |
517 | for (i = 0; i < num; i++) { | |
5222d261 GC |
518 | buff = &buffs[i * (EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)]; |
519 | record = &records[i]; | |
520 | msg = &msgs[i]; | |
64f55e62 AG |
521 | |
522 | control->next_addr = __correct_eeprom_dest_address(control->next_addr); | |
523 | ||
524 | /* | |
525 | * Update bits 16,17 of EEPROM address in I2C address by setting them | |
526 | * to bits 1,2 of Device address byte | |
527 | */ | |
5985ebbe JC |
528 | msg->addr = control->i2c_address | |
529 | ((control->next_addr & EEPROM_ADDR_MSB_MASK) >> 15); | |
64f55e62 AG |
530 | msg->flags = write ? 0 : I2C_M_RD; |
531 | msg->len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE; | |
532 | msg->buf = buff; | |
533 | ||
534 | /* Insert the EEPROM dest addess, bits 0-15 */ | |
535 | buff[0] = ((control->next_addr >> 8) & 0xff); | |
536 | buff[1] = (control->next_addr & 0xff); | |
537 | ||
538 | /* EEPROM table content is stored in LE format */ | |
539 | if (write) | |
540 | __encode_table_record_to_buff(control, record, buff + EEPROM_ADDRESS_SIZE); | |
541 | ||
542 | /* | |
543 | * The destination EEPROM address might need to be corrected to account | |
544 | * for page or entire memory wrapping | |
545 | */ | |
546 | control->next_addr += EEPROM_TABLE_RECORD_SIZE; | |
547 | } | |
548 | ||
40e7ed97 DL |
549 | /* i2c may be unstable in gpu reset */ |
550 | down_read(&adev->reset_sem); | |
9015d60c | 551 | ret = i2c_transfer(&adev->pm.smu_i2c, msgs, num); |
40e7ed97 DL |
552 | up_read(&adev->reset_sem); |
553 | ||
64f55e62 AG |
554 | if (ret < 1) { |
555 | DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret); | |
556 | ||
557 | /* TODO Restore prev next EEPROM address ? */ | |
558 | goto free_msgs; | |
559 | } | |
560 | ||
561 | ||
562 | if (!write) { | |
563 | for (i = 0; i < num; i++) { | |
5222d261 GC |
564 | buff = &buffs[i*(EEPROM_ADDRESS_SIZE + EEPROM_TABLE_RECORD_SIZE)]; |
565 | record = &records[i]; | |
64f55e62 AG |
566 | |
567 | __decode_table_record_from_buff(control, record, buff + EEPROM_ADDRESS_SIZE); | |
568 | } | |
569 | } | |
570 | ||
571 | if (write) { | |
572 | uint32_t old_hdr_byte_sum = __calc_hdr_byte_sum(control); | |
573 | ||
574 | /* | |
575 | * Update table header with size and CRC and account for table | |
576 | * wrap around where the assumption is that we treat it as empty | |
577 | * table | |
578 | * | |
579 | * TODO - Check the assumption is correct | |
580 | */ | |
581 | control->num_recs += num; | |
582 | control->num_recs %= EEPROM_MAX_RECORD_NUM; | |
583 | control->tbl_hdr.tbl_size += EEPROM_TABLE_RECORD_SIZE * num; | |
584 | if (control->tbl_hdr.tbl_size > EEPROM_SIZE_BYTES) | |
585 | control->tbl_hdr.tbl_size = EEPROM_TABLE_HEADER_SIZE + | |
586 | control->num_recs * EEPROM_TABLE_RECORD_SIZE; | |
587 | ||
588 | __update_tbl_checksum(control, records, num, old_hdr_byte_sum); | |
589 | ||
590 | __update_table_header(control, buffs); | |
591 | } else if (!__validate_tbl_checksum(control, records, num)) { | |
592 | DRM_WARN("EEPROM Table checksum mismatch!"); | |
593 | /* TODO Uncomment when EEPROM read/write is relliable */ | |
594 | /* ret = -EIO; */ | |
595 | } | |
596 | ||
597 | free_msgs: | |
598 | kfree(msgs); | |
599 | ||
600 | free_buff: | |
601 | kfree(buffs); | |
602 | ||
603 | mutex_unlock(&control->tbl_mutex); | |
604 | ||
605 | return ret == num ? 0 : -EIO; | |
606 | } | |
607 | ||
c84d4670 GC |
608 | inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) |
609 | { | |
610 | return EEPROM_MAX_RECORD_NUM; | |
611 | } | |
612 | ||
/* Used for testing if bugs encountered */
#if 0
/*
 * Debug helper (compiled out): write one record with a recognizable
 * pattern, rewind next_addr to the first record slot, read the record back
 * and print it. The scratch record array is released on every path.
 */
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
{
	int i;
	struct eeprom_table_record *recs = kcalloc(1, sizeof(*recs), GFP_KERNEL);

	if (!recs)
		return;

	for (i = 0; i < 1 ; i++) {
		recs[i].address = 0xdeadbeef;
		recs[i].retired_page = i;
	}

	if (amdgpu_ras_eeprom_process_recods(control, recs, true, 1)) {
		DRM_ERROR("Failed in writing to table");
		goto out;
	}

	/* Clear the buffer so the read-back below proves data came from EEPROM */
	memset(recs, 0, sizeof(*recs) * 1);

	control->next_addr = EEPROM_RECORD_START;

	if (amdgpu_ras_eeprom_process_recods(control, recs, false, 1)) {
		DRM_ERROR("Failed in reading from table");
		goto out;
	}

	for (i = 0; i < 1; i++)
		DRM_INFO("rec.address :0x%llx, rec.retired_page :%llu",
			 recs[i].address, recs[i].retired_page);

out:
	kfree(recs);
}
#endif