Commit | Line | Data |
---|---|---|
c030f2e4 | 1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #ifndef _AMDGPU_RAS_H | |
25 | #define _AMDGPU_RAS_H | |
26 | ||
27 | #include <linux/debugfs.h> | |
28 | #include <linux/list.h> | |
98b5bc87 | 29 | #include <linux/kfifo.h> |
f493dd64 | 30 | #include <linux/radix-tree.h> |
c030f2e4 | 31 | #include "ta_ras_if.h" |
64f55e62 | 32 | #include "amdgpu_ras_eeprom.h" |
5b1270be | 33 | #include "amdgpu_smuio.h" |
f5e4cc84 | 34 | #include "amdgpu_aca.h" |
c030f2e4 | 35 | |
7cab2124 | 36 | struct amdgpu_iv_entry; |
37 | ||
cce4febb HZ |
/*
 * Field extractors for a GPU error status word.
 *
 * Every macro forwards to AMDGPU_GET_REG_FIELD() with two bit positions.
 * NOTE(review): the argument order looks like (value, high bit, low bit),
 * e.g. SOCKET_ID uses (10, 8) - confirm against AMDGPU_GET_REG_FIELD.
 */
#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x)		AMDGPU_GET_REG_FIELD(x, 0, 0)
#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x)			AMDGPU_GET_REG_FIELD(x, 1, 1)
#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 2, 2)
#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 3, 3)
#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 4, 4)
#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 5, 5)
#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x)		AMDGPU_GET_REG_FIELD(x, 6, 6)
#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)		AMDGPU_GET_REG_FIELD(x, 7, 7)
#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)			AMDGPU_GET_REG_FIELD(x, 10, 8)
#define AMDGPU_RAS_GPU_ERR_AID_ID(x)			AMDGPU_GET_REG_FIELD(x, 12, 11)
#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)			AMDGPU_GET_REG_FIELD(x, 14, 13)
#define AMDGPU_RAS_GPU_ERR_DATA_ABORT(x)		AMDGPU_GET_REG_FIELD(x, 29, 29)
#define AMDGPU_RAS_GPU_ERR_UNKNOWN(x)			AMDGPU_GET_REG_FIELD(x, 30, 30)
cce4febb | 51 | |
/* Boot status polling parameters. */
#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT	100
#define AMDGPU_RAS_BOOT_STEADY_STATUS		0xBA
#define AMDGPU_RAS_BOOT_STATUS_MASK		0xFF

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		(0x1 << 0)

/*
 * Position of the instance value in sub_block_index of
 * ta_ras_trigger_error_input; the sub block itself uses the lower 12 bits.
 */
#define AMDGPU_RAS_INST_MASK 0xfffff000
#define AMDGPU_RAS_INST_SHIFT 0xc

#define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
#define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000

/*
 * Reserve 8 physical DRAM rows for possible retirement.
 * In the worst case this costs 8 * 2MB of memory in the vram domain.
 */
#define AMDGPU_RAS_RESERVED_VRAM_SIZE	(16ULL << 20)

/* The high three bits indicate the socket id; strip them from the value. */
#define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)

/* An event id with bit 63 set is treated as invalid. */
#define RAS_EVENT_INVALID_ID		(BIT_ULL(63))
#define RAS_EVENT_ID_IS_VALID(x)	(!((x) & BIT_ULL(63)))

/* Thin wrapper that forwards to amdgpu_ras_event_log_print(). */
#define RAS_EVENT_LOG(adev, id, fmt, ...)	\
	amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__)

/* Passes the return address of the calling function along with the event. */
#define amdgpu_ras_mark_ras_event(adev, type)	\
	(amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0)))
80 | ||
/*
 * Driver-side identifiers for RAS blocks.
 *
 * The ordinals are spelled out so the numbering is visible at a glance;
 * AMDGPU_RAS_BLOCK__LAST stays implicit so it always follows the final entry.
 */
enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC		= 0,
	AMDGPU_RAS_BLOCK__SDMA		= 1,
	AMDGPU_RAS_BLOCK__GFX		= 2,
	AMDGPU_RAS_BLOCK__MMHUB		= 3,
	AMDGPU_RAS_BLOCK__ATHUB		= 4,
	AMDGPU_RAS_BLOCK__PCIE_BIF	= 5,
	AMDGPU_RAS_BLOCK__HDP		= 6,
	AMDGPU_RAS_BLOCK__XGMI_WAFL	= 7,
	AMDGPU_RAS_BLOCK__DF		= 8,
	AMDGPU_RAS_BLOCK__SMN		= 9,
	AMDGPU_RAS_BLOCK__SEM		= 10,
	AMDGPU_RAS_BLOCK__MP0		= 11,
	AMDGPU_RAS_BLOCK__MP1		= 12,
	AMDGPU_RAS_BLOCK__FUSE		= 13,
	AMDGPU_RAS_BLOCK__MCA		= 14,
	AMDGPU_RAS_BLOCK__VCN		= 15,
	AMDGPU_RAS_BLOCK__JPEG		= 16,
	AMDGPU_RAS_BLOCK__IH		= 17,
	AMDGPU_RAS_BLOCK__MPIO		= 18,

	/* number of blocks; not a block itself */
	AMDGPU_RAS_BLOCK__LAST
};
104 | ||
640ae42e JC |
/* Sub-block identifiers used with the MCA block. */
enum amdgpu_ras_mca_block {
	AMDGPU_RAS_MCA_BLOCK__MP0	= 0,
	AMDGPU_RAS_MCA_BLOCK__MP1	= 1,
	AMDGPU_RAS_MCA_BLOCK__MPIO	= 2,
	AMDGPU_RAS_MCA_BLOCK__IOHC	= 3,

	/* number of MCA sub-blocks; not a sub-block itself */
	AMDGPU_RAS_MCA_BLOCK__LAST
};
893cf382 | 113 | |
/* Block counts derived from the enum terminators. */
#define AMDGPU_RAS_BLOCK_COUNT		AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_MCA_BLOCK_COUNT	AMDGPU_RAS_MCA_BLOCK__LAST
/* Bitmask with one bit set for each of the AMDGPU_RAS_BLOCK_COUNT blocks. */
#define AMDGPU_RAS_BLOCK_MASK		((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
117 | ||
dc23a08f DL |
/*
 * GFX sub-block identifiers.
 *
 * Entries are grouped per hardware unit. Each group is bracketed by
 * *_INDEX_START / *_INDEX_END enumerators that alias the first and last
 * member of the group, so the aliases do not consume extra values.
 * SQC, TCC and EA are further split into numbered sub-ranges.
 */
enum amdgpu_ras_gfx_subblock {
	/* CPC */
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
	AMDGPU_RAS_BLOCK__GFX_CPC_SCRATCH =
		AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,

	/* CPF */
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME2 =
		AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME1,
	AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPF_TAG,

	/* CPG */
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_ROQ =
		AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPG_TAG,

	/* GDS */
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_MEM = AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_INPUT_QUEUE,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_CMD_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_DATA_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,

	/* SPI */
	AMDGPU_RAS_BLOCK__GFX_SPI_SR_MEM,

	/* SQ */
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_SGPR = AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_I,
	AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,

	/* SQC (3 ranges) */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	/* SQC range 0 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_UTCL1_LFIFO =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	/* SQC range 1 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	/* SQC range 2 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END,

	/* TA */
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO =
		AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_AFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FL_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FX_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,

	/* TCA */
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_HOLE_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,

	/* TCC (5 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	/* TCC range 0 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_HIGH_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	/* TCC range 1 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_DEC =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	/* TCC range 2 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_CONTROL,
	AMDGPU_RAS_BLOCK__GFX_TCC_UC_ATOMIC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_RETURN,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_CACHE_READ,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	/* TCC range 3 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	/* TCC range 4 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRRET_TAG_WRITE_RETURN =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END,

	/* TCI */
	AMDGPU_RAS_BLOCK__GFX_TCI_WRITE_RAM,

	/* TCP */
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM =
		AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_LFIFO_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_CMD_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_VM_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_DB_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO0,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,

	/* TD */
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO =
		AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_HI,
	AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,

	/* EA (3 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	/* EA range 0 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_RRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_WRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	/* EA range 1 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_PAGEMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IORD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	/* EA range 2 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D0MEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D1MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D2MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END,

	/* UTC VM L2 bank */
	AMDGPU_RAS_BLOCK__UTC_VML2_BANK_CACHE,
	/* UTC VM walker */
	AMDGPU_RAS_BLOCK__UTC_VML2_WALKER,
	/* UTC ATC L2 2MB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_2M_BANK,
	/* UTC ATC L2 4KB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_4K_BANK,

	/* number of GFX sub-blocks; not a sub-block itself */
	AMDGPU_RAS_BLOCK__GFX_MAX
};
347 | ||
/* RAS error types; every non-zero value occupies its own bit. */
enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE			= 0,
	AMDGPU_RAS_ERROR__PARITY		= 1 << 0,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE	= 1 << 1,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE	= 1 << 2,
	AMDGPU_RAS_ERROR__POISON		= 1 << 3,
};
355 | ||
/* RAS result codes. */
enum amdgpu_ras_ret {
	AMDGPU_RAS_SUCCESS	= 0,
	AMDGPU_RAS_FAIL		= 1,
	AMDGPU_RAS_UE		= 2,
	AMDGPU_RAS_CE		= 3,
	AMDGPU_RAS_PT		= 4,
};
363 | ||
8cc0f566 HZ |
/* Ways of querying RAS errors; zero is reserved as the invalid mode. */
enum amdgpu_ras_error_query_mode {
	AMDGPU_RAS_INVALID_ERROR_QUERY,		/* 0 */
	AMDGPU_RAS_DIRECT_ERROR_QUERY,		/* 1 */
	AMDGPU_RAS_FIRMWARE_ERROR_QUERY,	/* 2 */
};
369 | ||
322a7e00 HZ |
370 | /* ras error status register fields */ |
/* Shift/mask pairs for the RAS error status register fields. */
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT	0x0
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK	0x00000001L
#define ERR_STATUS_LO__MEMORY_ID__SHIFT			0x18
#define ERR_STATUS_LO__MEMORY_ID_MASK			0xFF000000L
#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG__SHIFT	0x2
#define ERR_STATUS_HI__ERR_INFO_VALID_FLAG_MASK		0x00000004L
#define ERR_STATUS__ERR_CNT__SHIFT			0x17
#define ERR_STATUS__ERR_CNT_MASK			0x03800000L

/*
 * Expands to six comma-separated initializer values:
 * <ip>_HWIP, inst, <reg_lo>_BASE_IDX, reg_lo, <reg_hi>_BASE_IDX, reg_hi.
 */
#define AMDGPU_RAS_REG_ENTRY(ip, inst, reg_lo, reg_hi)	\
	ip##_HWIP, inst, reg_lo##_BASE_IDX, reg_lo, reg_hi##_BASE_IDX, reg_hi

/*
 * Adds reg to the base found in adev->reg_offset[hwip][ip_inst][segment].
 * The macro takes no adev argument: a variable named adev must be in scope
 * wherever it is expanded.
 */
#define AMDGPU_RAS_REG_ENTRY_OFFSET(hwip, ip_inst, segment, reg)	\
	(adev->reg_offset[hwip][ip_inst][segment] + (reg))

/* Validity flag bits. */
#define AMDGPU_RAS_ERR_INFO_VALID	(1 << 0)
#define AMDGPU_RAS_ERR_STATUS_VALID	(1 << 1)
#define AMDGPU_RAS_ERR_ADDRESS_VALID	(1 << 2)

/* GPU reset mode flag bits. */
#define AMDGPU_RAS_GPU_RESET_MODE2_RESET	(0x1 << 0)
#define AMDGPU_RAS_GPU_RESET_MODE1_RESET	(0x1 << 1)
6c47a79b | 392 | |
322a7e00 HZ |
/*
 * One entry describing a lo/hi pair of error status registers.
 *
 * The first six members are laid out in the same order as the values
 * produced by AMDGPU_RAS_REG_ENTRY().
 */
struct amdgpu_ras_err_status_reg_entry {
	uint32_t	hwip;
	uint32_t	ip_inst;
	uint32_t	seg_lo;
	uint32_t	reg_lo;
	uint32_t	seg_hi;
	uint32_t	reg_hi;
	uint32_t	reg_inst;
	uint32_t	flags;
	const char	*block_name;
};
404 | ||
/* Pairs a memory id with a printable name. */
struct amdgpu_ras_memory_id_entry {
	uint32_t	memory_id;
	const char	*name;
};
409 | ||
c030f2e4 | 410 | struct ras_common_if { |
411 | enum amdgpu_ras_block block; | |
412 | enum amdgpu_ras_error_type type; | |
413 | uint32_t sub_block_index; | |
355e3e4c | 414 | char name[32]; |
c030f2e4 | 415 | }; |
416 | ||
8882f90a SY |
/* Number of per-channel records held by struct umc_ecc_info. */
#define MAX_UMC_CHANNEL_NUM 32

/* ECC record for a single channel. */
struct ecc_info_per_ch {
	uint16_t	ce_count_lo_chip;
	uint16_t	ce_count_hi_chip;
	uint64_t	mca_umc_status;
	uint64_t	mca_umc_addr;
	uint64_t	mca_ceumc_addr;
};

/* ECC records for all channels plus a capability flag. */
struct umc_ecc_info {
	struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM];

	/*
	 * Whether the smu ecctable supports recording
	 * correctable error addresses.
	 */
	int record_ce_addr_supported;
};
435 | ||
/* RAS event categories; RAS_EVENT_TYPE_COUNT is the number of entries. */
enum ras_event_type {
	RAS_EVENT_TYPE_INVALID			= 0,
	RAS_EVENT_TYPE_FATAL			= 1,
	RAS_EVENT_TYPE_POISON_CREATION		= 2,
	RAS_EVENT_TYPE_POISON_CONSUMPTION	= 3,
	RAS_EVENT_TYPE_COUNT,
};
443 | ||
59f488be YW |
444 | struct ras_event_state { |
445 | u64 last_seqno; | |
446 | atomic64_t count; | |
447 | }; | |
448 | ||
9dc57c2a | 449 | struct ras_event_manager { |
75ac6a25 | 450 | atomic64_t seqno; |
59f488be | 451 | struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT]; |
9dc57c2a YW |
452 | }; |
453 | ||
75ac6a25 | 454 | struct ras_event_id { |
9dc57c2a YW |
455 | enum ras_event_type type; |
456 | u64 event_id; | |
457 | }; | |
458 | ||
75ac6a25 YW |
459 | struct ras_query_context { |
460 | struct ras_event_id evid; | |
461 | }; | |
462 | ||
98b5bc87 YC |
463 | typedef int (*pasid_notify)(struct amdgpu_device *adev, |
464 | uint16_t pasid, void *data); | |
465 | ||
466 | struct ras_poison_msg { | |
467 | enum amdgpu_ras_block block; | |
468 | uint16_t pasid; | |
469 | uint32_t reset; | |
470 | pasid_notify pasid_fn; | |
471 | void *data; | |
472 | }; | |
473 | ||
f493dd64 YC |
/* A counted array of page frame numbers. */
struct ras_err_pages {
	uint32_t	count;
	uint64_t	*pfn;
};

/* One logged ECC error together with the pages attached to it. */
struct ras_ecc_err {
	uint64_t		status;
	uint64_t		ipid;
	uint64_t		addr;
	uint64_t		pa_pfn;
	struct ras_err_pages	err_pages;
};
486 | ||
487 | struct ras_ecc_log_info { | |
488 | struct mutex lock; | |
f493dd64 | 489 | struct radix_tree_root de_page_tree; |
78146c1d YC |
490 | uint64_t de_queried_count; |
491 | uint64_t prev_de_queried_count; | |
f493dd64 YC |
492 | }; |
493 | ||
c030f2e4 | 494 | struct amdgpu_ras { |
495 | /* ras infrastructure */ | |
5caf466a | 496 | /* for ras itself. */ |
c030f2e4 | 497 | uint32_t features; |
625e5f38 | 498 | uint32_t schema; |
c030f2e4 | 499 | struct list_head head; |
c030f2e4 | 500 | /* sysfs */ |
501 | struct device_attribute features_attr; | |
625e5f38 AK |
502 | struct device_attribute version_attr; |
503 | struct device_attribute schema_attr; | |
59f488be | 504 | struct device_attribute event_state_attr; |
466b1793 | 505 | struct bin_attribute badpages_attr; |
c65b0805 | 506 | struct dentry *de_ras_eeprom_table; |
c030f2e4 | 507 | /* block array */ |
508 | struct ras_manager *objs; | |
509 | ||
510 | /* gpu recovery */ | |
511 | struct work_struct recovery_work; | |
512 | atomic_t in_recovery; | |
513 | struct amdgpu_device *adev; | |
514 | /* error handler data */ | |
515 | struct ras_err_handler_data *eh_data; | |
516 | struct mutex recovery_lock; | |
108c6a63 | 517 | |
518 | uint32_t flags; | |
d5ea093e | 519 | bool reboot; |
64f55e62 | 520 | struct amdgpu_ras_eeprom_control eeprom_control; |
61380faa JC |
521 | |
522 | bool error_query_ready; | |
c84d4670 GC |
523 | |
524 | /* bad page count threshold */ | |
525 | uint32_t bad_page_cnt_threshold; | |
f75e94d8 GC |
526 | |
527 | /* disable ras error count harvest in recovery */ | |
528 | bool disable_ras_err_cnt_harvest; | |
05adfd80 | 529 | |
e4348849 TZ |
530 | /* is poison mode supported */ |
531 | bool poison_supported; | |
532 | ||
05adfd80 LT |
533 | /* RAS count errors delayed work */ |
534 | struct delayed_work ras_counte_delay_work; | |
535 | atomic_t ras_ue_count; | |
536 | atomic_t ras_ce_count; | |
8882f90a SY |
537 | |
538 | /* record umc error info queried from smu */ | |
539 | struct umc_ecc_info umc_ecc; | |
69691c82 SY |
540 | |
541 | /* Indicates smu whether need update bad channel info */ | |
542 | bool update_channel_flag; | |
8096df76 | 543 | /* Record status of smu mca debug mode */ |
04c4fcd2 | 544 | bool is_aca_debug_mode; |
b95fa494 | 545 | bool is_rma; |
6c47a79b YC |
546 | |
547 | /* Record special requirements of gpu reset caller */ | |
548 | uint32_t gpu_reset_flags; | |
3fdcd0a3 YC |
549 | |
550 | struct task_struct *page_retirement_thread; | |
551 | wait_queue_head_t page_retirement_wq; | |
552 | struct mutex page_retirement_lock; | |
553 | atomic_t page_retirement_req_cnt; | |
5f08275c | 554 | atomic_t poison_creation_count; |
af730e08 | 555 | struct mutex page_rsv_lock; |
98b5bc87 | 556 | DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); |
f493dd64 | 557 | struct ras_ecc_log_info umc_ecc_log; |
2cf8e50e | 558 | struct delayed_work page_retirement_dwork; |
98b5bc87 | 559 | |
1b6ef74b LL |
560 | /* Fatal error detected flag */ |
561 | atomic_t fed; | |
9dc57c2a YW |
562 | |
563 | /* RAS event manager */ | |
564 | struct ras_event_manager __event_mgr; | |
565 | struct ras_event_manager *event_mgr; | |
566 | ||
473af28d | 567 | uint64_t reserved_pages_in_bytes; |
c030f2e4 | 568 | }; |
569 | ||
/* Name buffers for a block's sysfs and debugfs nodes. */
struct ras_fs_data {
	char	sysfs_name[48];
	char	debugfs_name[32];
};
574 | ||
5b1270be YW |
575 | struct ras_err_info { |
576 | struct amdgpu_smuio_mcm_config_info mcm_info; | |
577 | u64 ce_count; | |
578 | u64 ue_count; | |
46e2231c | 579 | u64 de_count; |
5b1270be YW |
580 | }; |
581 | ||
582 | struct ras_err_node { | |
583 | struct list_head node; | |
584 | struct ras_err_info err_info; | |
585 | }; | |
586 | ||
7af25d5b HZ |
587 | struct ras_err_data { |
588 | unsigned long ue_count; | |
589 | unsigned long ce_count; | |
46e2231c | 590 | unsigned long de_count; |
6f102dba | 591 | unsigned long err_addr_cnt; |
87d2b92f | 592 | struct eeprom_table_record *err_addr; |
e74313be | 593 | unsigned long err_addr_len; |
5b1270be YW |
594 | u32 err_list_count; |
595 | struct list_head err_node_list; | |
7af25d5b HZ |
596 | }; |
597 | ||
5b1270be YW |
598 | #define for_each_ras_error(err_node, err_data) \ |
599 | list_for_each_entry(err_node, &(err_data)->err_node_list, node) | |
600 | ||
/* Bookkeeping for the bad page records array. */
struct ras_err_handler_data {
	/* points to the bad page records array */
	struct eeprom_table_record *bps;
	/* number of entries in use */
	int count;
	/* room left for new entries */
	int space_left;
};
c030f2e4 | 609 | |
cf04dfd0 | 610 | typedef int (*ras_ih_cb)(struct amdgpu_device *adev, |
f5f06e21 | 611 | void *err_data, |
cf04dfd0 TZ |
612 | struct amdgpu_iv_entry *entry); |
613 | ||
614 | struct ras_ih_data { | |
615 | /* interrupt bottom half */ | |
616 | struct work_struct ih_work; | |
617 | int inuse; | |
618 | /* IP callback */ | |
619 | ras_ih_cb cb; | |
620 | /* full of entries */ | |
621 | unsigned char *ring; | |
622 | unsigned int ring_size; | |
623 | unsigned int element_size; | |
624 | unsigned int aligned_element_size; | |
625 | unsigned int rptr; | |
626 | unsigned int wptr; | |
627 | }; | |
628 | ||
7af25d5b HZ |
629 | struct ras_manager { |
630 | struct ras_common_if head; | |
631 | /* reference count */ | |
632 | int use; | |
633 | /* ras block link */ | |
634 | struct list_head node; | |
635 | /* the device */ | |
636 | struct amdgpu_device *adev; | |
7af25d5b HZ |
637 | /* sysfs */ |
638 | struct device_attribute sysfs_attr; | |
639 | int attr_inuse; | |
640 | ||
641 | /* fs node name */ | |
642 | struct ras_fs_data fs_data; | |
643 | ||
644 | /* IH data */ | |
645 | struct ras_ih_data ih_data; | |
646 | ||
ec3e0a91 | 647 | struct ras_err_data err_data; |
04c4fcd2 YW |
648 | |
649 | struct aca_handle aca_handle; | |
7af25d5b HZ |
650 | }; |
651 | ||
/* One bad page record: page value, size and flags. */
struct ras_badpage {
	unsigned int	bp;
	unsigned int	size;
	unsigned int	flags;
};
657 | ||
658 | /* interfaces for IP */ | |
c030f2e4 | 659 | struct ras_fs_if { |
660 | struct ras_common_if head; | |
3907c492 | 661 | const char* sysfs_name; |
c030f2e4 | 662 | char debugfs_name[32]; |
663 | }; | |
664 | ||
665 | struct ras_query_if { | |
666 | struct ras_common_if head; | |
667 | unsigned long ue_count; | |
668 | unsigned long ce_count; | |
46e2231c | 669 | unsigned long de_count; |
c030f2e4 | 670 | }; |
671 | ||
672 | struct ras_inject_if { | |
673 | struct ras_common_if head; | |
674 | uint64_t address; | |
675 | uint64_t value; | |
2c22ed0b | 676 | uint32_t instance_mask; |
c030f2e4 | 677 | }; |
678 | ||
679 | struct ras_cure_if { | |
680 | struct ras_common_if head; | |
681 | uint64_t address; | |
682 | }; | |
683 | ||
684 | struct ras_ih_if { | |
685 | struct ras_common_if head; | |
686 | ras_ih_cb cb; | |
687 | }; | |
688 | ||
689 | struct ras_dispatch_if { | |
690 | struct ras_common_if head; | |
691 | struct amdgpu_iv_entry *entry; | |
692 | }; | |
693 | ||
36ea1bd2 | 694 | struct ras_debug_if { |
695 | union { | |
696 | struct ras_common_if head; | |
697 | struct ras_inject_if inject; | |
698 | }; | |
699 | int op; | |
700 | }; | |
6492e1b0 | 701 | |
702 | struct amdgpu_ras_block_object { | |
bdb3489c | 703 | struct ras_common_if ras_comm; |
6492e1b0 | 704 | |
b6efdb02 | 705 | int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, |
706 | enum amdgpu_ras_block block, uint32_t sub_block_index); | |
4e9b1fa5 | 707 | int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block); |
01d468d9 | 708 | void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block); |
bdb3489c | 709 | ras_ih_cb ras_cb; |
6492e1b0 | 710 | const struct amdgpu_ras_block_hw_ops *hw_ops; |
711 | }; | |
712 | ||
713 | struct amdgpu_ras_block_hw_ops { | |
2c22ed0b TZ |
714 | int (*ras_error_inject)(struct amdgpu_device *adev, |
715 | void *inject_if, uint32_t instance_mask); | |
b6efdb02 | 716 | void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status); |
6492e1b0 | 717 | void (*query_ras_error_status)(struct amdgpu_device *adev); |
718 | void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); | |
719 | void (*reset_ras_error_count)(struct amdgpu_device *adev); | |
720 | void (*reset_ras_error_status)(struct amdgpu_device *adev); | |
c543dcbe | 721 | bool (*query_poison_status)(struct amdgpu_device *adev); |
66f87949 | 722 | bool (*handle_poison_consumption)(struct amdgpu_device *adev); |
6492e1b0 | 723 | }; |
724 | ||
c030f2e4 | 725 | /* work flow |
726 | * vbios | |
727 | * 1: ras feature enable (enabled by default) | |
728 | * psp | |
729 | * 2: ras framework init (in ip_init) | |
730 | * IP | |
731 | * 3: IH add | |
732 | * 4: debugfs/sysfs create | |
733 | * 5: query/inject | |
734 | * 6: debugfs/sysfs remove | |
735 | * 7: IH remove | |
736 | * 8: feature disable | |
737 | */ | |
738 | ||
c030f2e4 | 739 | |
int amdgpu_ras_recovery_init(struct amdgpu_device *adev);

/* suspend / resume hooks */
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);

/* error count query; results are returned through the pointer arguments */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				 unsigned long *ce_count,
				 unsigned long *ue_count,
				 struct ras_query_if *query_info);

/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
			     struct eeprom_table_record *bps, int pages);

int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
			      unsigned long *new_cnt);
c030f2e4 | 756 | |
828cfa29 | 757 | static inline enum ta_ras_block |
758 | amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { | |
759 | switch (block) { | |
760 | case AMDGPU_RAS_BLOCK__UMC: | |
761 | return TA_RAS_BLOCK__UMC; | |
762 | case AMDGPU_RAS_BLOCK__SDMA: | |
763 | return TA_RAS_BLOCK__SDMA; | |
764 | case AMDGPU_RAS_BLOCK__GFX: | |
765 | return TA_RAS_BLOCK__GFX; | |
766 | case AMDGPU_RAS_BLOCK__MMHUB: | |
767 | return TA_RAS_BLOCK__MMHUB; | |
768 | case AMDGPU_RAS_BLOCK__ATHUB: | |
769 | return TA_RAS_BLOCK__ATHUB; | |
770 | case AMDGPU_RAS_BLOCK__PCIE_BIF: | |
771 | return TA_RAS_BLOCK__PCIE_BIF; | |
772 | case AMDGPU_RAS_BLOCK__HDP: | |
773 | return TA_RAS_BLOCK__HDP; | |
774 | case AMDGPU_RAS_BLOCK__XGMI_WAFL: | |
775 | return TA_RAS_BLOCK__XGMI_WAFL; | |
776 | case AMDGPU_RAS_BLOCK__DF: | |
777 | return TA_RAS_BLOCK__DF; | |
778 | case AMDGPU_RAS_BLOCK__SMN: | |
779 | return TA_RAS_BLOCK__SMN; | |
780 | case AMDGPU_RAS_BLOCK__SEM: | |
781 | return TA_RAS_BLOCK__SEM; | |
782 | case AMDGPU_RAS_BLOCK__MP0: | |
783 | return TA_RAS_BLOCK__MP0; | |
784 | case AMDGPU_RAS_BLOCK__MP1: | |
785 | return TA_RAS_BLOCK__MP1; | |
786 | case AMDGPU_RAS_BLOCK__FUSE: | |
787 | return TA_RAS_BLOCK__FUSE; | |
640ae42e JC |
788 | case AMDGPU_RAS_BLOCK__MCA: |
789 | return TA_RAS_BLOCK__MCA; | |
caa4dffa SY |
790 | case AMDGPU_RAS_BLOCK__VCN: |
791 | return TA_RAS_BLOCK__VCN; | |
792 | case AMDGPU_RAS_BLOCK__JPEG: | |
793 | return TA_RAS_BLOCK__JPEG; | |
828cfa29 | 794 | default: |
795 | WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block); | |
796 | return TA_RAS_BLOCK__UMC; | |
797 | } | |
798 | } | |
799 | ||
800 | static inline enum ta_ras_error_type | |
801 | amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { | |
802 | switch (error) { | |
803 | case AMDGPU_RAS_ERROR__NONE: | |
804 | return TA_RAS_ERROR__NONE; | |
805 | case AMDGPU_RAS_ERROR__PARITY: | |
806 | return TA_RAS_ERROR__PARITY; | |
807 | case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: | |
808 | return TA_RAS_ERROR__SINGLE_CORRECTABLE; | |
809 | case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: | |
810 | return TA_RAS_ERROR__MULTI_UNCORRECTABLE; | |
811 | case AMDGPU_RAS_ERROR__POISON: | |
812 | return TA_RAS_ERROR__POISON; | |
813 | default: | |
814 | WARN_ONCE(1, "RAS ERROR: unexpected error type %d\n", error); | |
815 | return TA_RAS_ERROR__NONE; | |
816 | } | |
817 | } | |
818 | ||
/* called in ip_init and ip_fini */
int amdgpu_ras_init(struct amdgpu_device *adev);
int amdgpu_ras_late_init(struct amdgpu_device *adev);
int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
int amdgpu_ras_fini(struct amdgpu_device *adev);

/* Per-block late init/fini pair; both take the block's ras_common_if. */
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			       struct ras_common_if *ras_block);
void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
				struct ras_common_if *ras_block);
830 | ||
/* Feature enable/disable, selected by the boolean 'enable' argument. */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
			      struct ras_common_if *head,
			      bool enable);
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
				      struct ras_common_if *head,
				      bool enable);

/* sysfs / debugfs node management */
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
			    struct ras_common_if *head);
int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
			    struct ras_common_if *head);
void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev);

/* Error status query / reset / injection */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				  struct ras_query_if *info);
int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
				 enum amdgpu_ras_block block);
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
				  enum amdgpu_ras_block block);
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
			    struct ras_inject_if *info);

/* Interrupt handler registration and dispatch */
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
				     struct ras_common_if *head);
int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
					struct ras_common_if *head);
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
				  struct ras_dispatch_if *info);

/*
 * NOTE(review): returns a struct ras_manager pointer for 'head'; the
 * not-found return value is not visible from this header -- confirm
 * against the definition before relying on it.
 */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
					struct ras_common_if *head);
867 | ||
7c6e68c7 AG |
868 | extern atomic_t amdgpu_ras_in_intr; |
869 | ||
870 | static inline bool amdgpu_ras_intr_triggered(void) | |
871 | { | |
872 | return !!atomic_read(&amdgpu_ras_in_intr); | |
873 | } | |
874 | ||
00eaa571 LM |
875 | static inline void amdgpu_ras_intr_cleared(void) |
876 | { | |
877 | atomic_set(&amdgpu_ras_in_intr, 0); | |
878 | } | |
879 | ||
/* Declarations only; the implementations live outside this header. */
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev,
				      bool ready);

bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev);

void amdgpu_release_ras_context(struct amdgpu_device *adev);

int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);

const char *get_ras_block_str(struct ras_common_if *ras_block);

bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);

int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block);

int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);

/* RAS context accessors */
struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev,
			   struct amdgpu_ras *ras_con);

/* MCA / ACA debug mode and error query mode */
int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
				     unsigned int *mode);
8096df76 | 907 | |
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
				  struct amdgpu_ras_block_object *ras_block_obj);

void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev);

/*
 * NOTE(review): err_type_name is a caller-supplied buffer with no length
 * parameter; the required size is not visible from this header -- confirm
 * against the definition.
 */
void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name);

/* Per-instance helpers operating on error status register entries/lists. */
bool amdgpu_ras_inst_get_memory_id_field(
		struct amdgpu_device *adev,
		const struct amdgpu_ras_err_status_reg_entry *reg_entry,
		uint32_t instance,
		uint32_t *memory_id);

bool amdgpu_ras_inst_get_err_cnt_field(
		struct amdgpu_device *adev,
		const struct amdgpu_ras_err_status_reg_entry *reg_entry,
		uint32_t instance,
		unsigned long *err_cnt);

void amdgpu_ras_inst_query_ras_error_count(
		struct amdgpu_device *adev,
		const struct amdgpu_ras_err_status_reg_entry *reg_list,
		uint32_t reg_list_size,
		const struct amdgpu_ras_memory_id_entry *mem_list,
		uint32_t mem_list_size,
		uint32_t instance,
		uint32_t err_type,
		unsigned long *err_count);

void amdgpu_ras_inst_reset_ras_error_count(
		struct amdgpu_device *adev,
		const struct amdgpu_ras_err_status_reg_entry *reg_list,
		uint32_t reg_list_size,
		uint32_t instance);
5b1270be YW |
932 | |
933 | int amdgpu_ras_error_data_init(struct ras_err_data *err_data); | |
934 | void amdgpu_ras_error_data_fini(struct ras_err_data *err_data); | |
935 | int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, | |
671af066 YW |
936 | struct amdgpu_smuio_mcm_config_info *mcm_info, |
937 | u64 count); | |
5b1270be | 938 | int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, |
671af066 YW |
939 | struct amdgpu_smuio_mcm_config_info *mcm_info, |
940 | u64 count); | |
46e2231c | 941 | int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, |
671af066 YW |
942 | struct amdgpu_smuio_mcm_config_info *mcm_info, |
943 | u64 count); | |
cce4febb | 944 | void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances); |
04c4fcd2 YW |
945 | int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, |
946 | const struct aca_info *aca_info, void *data); | |
947 | int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk); | |
948 | ||
37973b69 YW |
949 | ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, |
950 | struct aca_handle *handle, char *buf, void *data); | |
951 | ||
1b6ef74b LL |
952 | void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); |
953 | bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); | |
954 | ||
9dc57c2a | 955 | u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); |
75ac6a25 YW |
956 | int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, |
957 | const void *caller); | |
af730e08 YC |
958 | |
959 | int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); | |
960 | ||
98b5bc87 YC |
961 | int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, |
962 | enum amdgpu_ras_block block, uint16_t pasid, | |
963 | pasid_notify pasid_fn, void *data, uint32_t reset); | |
964 | ||
7e437167 TZ |
965 | bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); |
966 | ||
b712d7c2 YW |
967 | __printf(3, 4) |
968 | void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, | |
969 | const char *fmt, ...); | |
970 | ||
792be2e2 | 971 | bool amdgpu_ras_is_rma(struct amdgpu_device *adev); |
c030f2e4 | 972 | #endif |