/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "amdgpu_reset.h"
#include "aldebaran.h"
#include "sienna_cichlid.h"
#include "smu_v13_0_10.h"

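/*
 * Suspend helper for XGMI reset-on-init: walk the IP blocks in reverse
 * init order and suspend every block that is valid and still in a
 * hardware-enabled state, skipping display (DCE), which is handled in
 * phase 1 of the regular suspend path.
 */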
static int amdgpu_reset_xgmi_reset_on_init_suspend(struct amdgpu_device *adev)
{
	int i;

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (!adev->ip_blocks[i].status.hw)
			continue;
		/* displays are handled in phase1 */
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
			continue;

		/* XXX handle errors */
		amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		adev->ip_blocks[i].status.hw = false;
	}

	/* The VCN FW shared region is in framebuffer memory, and some flags
	 * in that region are initialized during sw_init. Make sure the
	 * region is backed up.
	 */
	amdgpu_vcn_save_vcpu_bo(adev);

	return 0;
}

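/*
 * Prepare-for-reset hook: for every device on the context's reset list,
 * drop its GPU instance registration and suspend its IP blocks so the
 * hardware is quiesced before the reset is triggered.
 */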
static int amdgpu_reset_xgmi_reset_on_init_prep_hwctxt(
	struct amdgpu_reset_control *reset_ctl,
	struct amdgpu_reset_context *reset_context)
{
	struct list_head *reset_device_list = reset_context->reset_device_list;
	struct amdgpu_device *tmp_adev;
	int r;

	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		amdgpu_unregister_gpu_instance(tmp_adev);
		r = amdgpu_reset_xgmi_reset_on_init_suspend(tmp_adev);
		if (r) {
			dev_err(tmp_adev->dev,
				"xgmi reset on init: prepare for reset failed");
			return r;
		}
	}

	return 0;
}

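/*
 * Restore hook: reinitialize the hardware on all devices after the
 * reset, then finish KFD bring-up for any device whose KFD init had
 * not completed before the reset was issued.
 */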
static int amdgpu_reset_xgmi_reset_on_init_restore_hwctxt(
	struct amdgpu_reset_control *reset_ctl,
	struct amdgpu_reset_context *reset_context)
{
	struct list_head *reset_device_list = reset_context->reset_device_list;
	struct amdgpu_device *tmp_adev = NULL;
	int r;

	r = amdgpu_device_reinit_after_reset(reset_context);
	if (r)
		return r;

	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		if (!tmp_adev->kfd.init_complete) {
			kgd2kfd_init_zone_device(tmp_adev);
			amdgpu_amdkfd_device_init(tmp_adev);
			amdgpu_amdkfd_drm_client_create(tmp_adev);
		}
	}

	return r;
}

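/*
 * Perform hook: trigger the reset on every device in the list. The
 * resets are queued to system_unbound_wq so they run in parallel,
 * since an XGMI mode-1 reset must hit all devices together; each
 * device's reset lock is held across the whole sequence.
 */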
static int amdgpu_reset_xgmi_reset_on_init_perform_reset(
	struct amdgpu_reset_control *reset_ctl,
	struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
	struct list_head *reset_device_list = reset_context->reset_device_list;
	struct amdgpu_device *tmp_adev = NULL;
	int r;

	dev_dbg(adev->dev, "xgmi roi - hw reset\n");

	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		mutex_lock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset =
			amdgpu_asic_reset_method(adev);
	}

	r = 0;
	/* Mode1 reset needs to be triggered on all devices together */
	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		/* For XGMI run all resets in parallel to speed up the process */
		if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
			r = -EALREADY;
		if (r) {
			dev_err(tmp_adev->dev,
				"xgmi reset on init: reset failed with error, %d",
				r);
			break;
		}
	}

	/* For XGMI wait for all resets to complete before proceeding */
	if (!r) {
		list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
			flush_work(&tmp_adev->xgmi_reset_work);
			r = tmp_adev->asic_reset_res;
			if (r)
				break;
		}
	}

	list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
		mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
	}

	return r;
}

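/*
 * Entry point for the reset-on-init flow. Only meaningful for an XGMI
 * hive: a missing, empty, or single-entry device list is rejected, and
 * the first device in the list drives the prepare/perform sequence for
 * the whole hive.
 */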
int amdgpu_reset_do_xgmi_reset_on_init(
	struct amdgpu_reset_context *reset_context)
{
	struct list_head *reset_device_list = reset_context->reset_device_list;
	struct amdgpu_device *adev;
	int r;

	if (!reset_device_list || list_empty(reset_device_list) ||
	    list_is_singular(reset_device_list))
		return -EINVAL;

	adev = list_first_entry(reset_device_list, struct amdgpu_device,
				reset_list);
	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
	if (r)
		return r;

	return amdgpu_reset_perform_reset(adev, reset_context);
}

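/* Handler wired into the generic reset framework for resets requested
 * at init time (AMD_RESET_METHOD_ON_INIT).
 */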
struct amdgpu_reset_handler xgmi_reset_on_init_handler = {
	.reset_method = AMD_RESET_METHOD_ON_INIT,
	.prepare_env = NULL,
	.prepare_hwcontext = amdgpu_reset_xgmi_reset_on_init_prep_hwctxt,
	.perform_reset = amdgpu_reset_xgmi_reset_on_init_perform_reset,
	.restore_hwcontext = amdgpu_reset_xgmi_reset_on_init_restore_hwctxt,
	.restore_env = NULL,
	.do_reset = NULL,
};

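/*
 * Install the ASIC-specific reset controller, keyed off the MP1 (SMU)
 * IP version: the Aldebaran path covers SMU 13.0.2/6/12/14, Sienna
 * Cichlid covers SMU 11.0.7, and SMU 13.0.10 has its own handler.
 * ASICs without a dedicated controller fall through and return 0.
 */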
int amdgpu_reset_init(struct amdgpu_device *adev)
{
	int ret = 0;

	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
	case IP_VERSION(13, 0, 2):
	case IP_VERSION(13, 0, 6):
	case IP_VERSION(13, 0, 12):
	case IP_VERSION(13, 0, 14):
		ret = aldebaran_reset_init(adev);
		break;
	case IP_VERSION(11, 0, 7):
		ret = sienna_cichlid_reset_init(adev);
		break;
	case IP_VERSION(13, 0, 10):
		ret = smu_v13_0_10_reset_init(adev);
		break;
	default:
		break;
	}

	return ret;
}

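/* Tear down the ASIC-specific reset controller; the dispatch mirrors
 * amdgpu_reset_init() above.
 */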
int amdgpu_reset_fini(struct amdgpu_device *adev)
{
	int ret = 0;

	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
	case IP_VERSION(13, 0, 2):
	case IP_VERSION(13, 0, 6):
	case IP_VERSION(13, 0, 12):
	case IP_VERSION(13, 0, 14):
		ret = aldebaran_reset_fini(adev);
		break;
	case IP_VERSION(11, 0, 7):
		ret = sienna_cichlid_reset_fini(adev);
		break;
	case IP_VERSION(13, 0, 10):
		ret = smu_v13_0_10_reset_fini(adev);
		break;
	default:
		break;
	}

	return ret;
}

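/*
 * Ask the device's reset controller for a handler matching this reset
 * context and run its prepare_hwcontext stage. Returns -EOPNOTSUPP
 * when no controller is installed or no handler matches.
 */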
int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
				   struct amdgpu_reset_context *reset_context)
{
	struct amdgpu_reset_handler *reset_handler = NULL;

	if (adev->reset_cntl && adev->reset_cntl->get_reset_handler)
		reset_handler = adev->reset_cntl->get_reset_handler(
			adev->reset_cntl, reset_context);
	if (!reset_handler)
		return -EOPNOTSUPP;

	return reset_handler->prepare_hwcontext(adev->reset_cntl,
						reset_context);
}

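/*
 * Run the matching handler's perform_reset stage and, on success, its
 * restore_hwcontext stage, so callers get a fully restored device (or
 * hive) back from a single call.
 */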
int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
			       struct amdgpu_reset_context *reset_context)
{
	int ret;
	struct amdgpu_reset_handler *reset_handler = NULL;

	if (adev->reset_cntl)
		reset_handler = adev->reset_cntl->get_reset_handler(
			adev->reset_cntl, reset_context);
	if (!reset_handler)
		return -EOPNOTSUPP;

	ret = reset_handler->perform_reset(adev->reset_cntl, reset_context);
	if (ret)
		return ret;

	return reset_handler->restore_hwcontext(adev->reset_cntl,
						reset_context);
}

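/* kref release callback: destroy the domain's workqueue (which drains
 * any pending work) before freeing the domain itself.
 */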
void amdgpu_reset_destroy_reset_domain(struct kref *ref)
{
	struct amdgpu_reset_domain *reset_domain = container_of(ref,
								struct amdgpu_reset_domain,
								refcount);
	if (reset_domain->wq)
		destroy_workqueue(reset_domain->wq);

	kvfree(reset_domain);
}

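/*
 * Allocate a refcounted reset domain. All resets for one device (or
 * one XGMI hive) are funneled through the domain's single-threaded
 * workqueue, which is what serializes concurrent reset requests.
 *
 * Minimal usage sketch; the SINGLE_DEVICE type and the wq name here
 * are illustrative assumptions, not taken from this file:
 *
 *	domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE,
 *						   "amdgpu-reset-dev");
 *	if (!domain)
 *		return -ENOMEM;
 */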
struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
							     char *wq_name)
{
	struct amdgpu_reset_domain *reset_domain;

	reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL);
	if (!reset_domain) {
		DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
		return NULL;
	}

	reset_domain->type = type;
	kref_init(&reset_domain->refcount);

	reset_domain->wq = create_singlethread_workqueue(wq_name);
	if (!reset_domain->wq) {
		DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
		amdgpu_reset_put_reset_domain(reset_domain);
		return NULL;
	}

	atomic_set(&reset_domain->in_gpu_reset, 0);
	atomic_set(&reset_domain->reset_res, 0);
	init_rwsem(&reset_domain->sem);

	return reset_domain;
}

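/*
 * Lock/unlock the reset domain for an in-progress GPU reset: the write
 * side of the domain's rw semaphore excludes everyone holding the read
 * side, and the in_gpu_reset flag lets lock-free paths poll whether a
 * reset is underway.
 */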
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
	atomic_set(&reset_domain->in_gpu_reset, 1);
	down_write(&reset_domain->sem);
}

void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
	atomic_set(&reset_domain->in_gpu_reset, 0);
	up_write(&reset_domain->sem);
}

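/*
 * Format a short human-readable description of what triggered the
 * reset (hung job, RAS error, MES/HWS hang, or a user request) into
 * the caller's buffer, e.g. for logging.
 */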
void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
			   size_t len)
{
	if (!buf || !len)
		return;

	switch (rst_ctxt->src) {
	case AMDGPU_RESET_SRC_JOB:
		if (rst_ctxt->job) {
			snprintf(buf, len, "job hang on ring:%s",
				 rst_ctxt->job->base.sched->name);
		} else {
			strscpy(buf, "job hang", len);
		}
		break;
	case AMDGPU_RESET_SRC_RAS:
		strscpy(buf, "RAS error", len);
		break;
	case AMDGPU_RESET_SRC_MES:
		strscpy(buf, "MES hang", len);
		break;
	case AMDGPU_RESET_SRC_HWS:
		strscpy(buf, "HWS hang", len);
		break;
	case AMDGPU_RESET_SRC_USER:
		strscpy(buf, "user trigger", len);
		break;
	default:
		strscpy(buf, "unknown", len);
	}
}

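/* Report whether the device is reinitializing as part of reset
 * recovery, i.e. its init level is AMDGPU_INIT_LEVEL_RESET_RECOVERY.
 */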
347 | |
348 | bool amdgpu_reset_in_recovery(struct amdgpu_device *adev) | |
349 | { | |
350 | return (adev->init_lvl->level == AMDGPU_INIT_LEVEL_RESET_RECOVERY); | |
351 | } |