Commit | Line | Data |
---|---|---|
c1b69ed0 CZ |
1 | /* |
2 | * Copyright 2015 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/kthread.h> | |
25 | #include <linux/wait.h> | |
26 | #include <linux/sched.h> | |
fdf2f6c5 | 27 | |
ca4e1724 AG |
28 | #include <drm/drm_drv.h> |
29 | ||
c1b69ed0 | 30 | #include "amdgpu.h" |
7034decf | 31 | #include "amdgpu_trace.h" |
f1549c09 | 32 | #include "amdgpu_reset.h" |
c1b69ed0 | 33 | |
/*
 * Scheduler timeout callback: a job on @s_job's ring failed to signal in
 * time.  Tries the cheapest recovery first (soft recovery on the ring),
 * falling back to a full GPU reset when the device policy allows it.
 *
 * Returns DRM_GPU_SCHED_STAT_ENODEV if the device is already unplugged,
 * DRM_GPU_SCHED_STAT_NOMINAL otherwise.
 */
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info *ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;

	/* Bail out early if the device was hot-unplugged; nothing to recover */
	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);

		/* Effectively the job is aborted as the device is gone */
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	/* Flag the hang for the rest of the driver; cleared again at exit */
	adev->job_hang = true;

	/*
	 * Soft recovery: try to kill just the offending wave(s) instead of
	 * resetting the whole GPU.  Only attempted when recovery is enabled.
	 */
	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		goto exit;
	}

	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);

	/* Best-effort: log which process submitted the hanging job */
	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
	if (ti) {
		DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
			  ti->process_name, ti->tgid, ti->task_name, ti->pid);
		amdgpu_vm_put_task_info(ti);
	}

	/* Mark the job's finished fence with a timeout error for waiters */
	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			DRM_ERROR("GPU Recovery Failed: %d\n", r);
	} else {
		/* Recovery disabled: park the timeout so the job can linger */
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	adev->job_hang = false;
	drm_dev_exit(idx);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}
96 | ||
f7d66fb2 CK |
97 | int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, |
98 | struct drm_sched_entity *entity, void *owner, | |
99 | unsigned int num_ibs, struct amdgpu_job **job) | |
50838c8c | 100 | { |
50838c8c CK |
101 | if (num_ibs == 0) |
102 | return -EINVAL; | |
103 | ||
6103b2f2 | 104 | *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL); |
50838c8c CK |
105 | if (!*job) |
106 | return -ENOMEM; | |
107 | ||
a1917b73 CK |
108 | /* |
109 | * Initialize the scheduler to at least some ring so that we always | |
110 | * have a pointer to adev. | |
111 | */ | |
112 | (*job)->base.sched = &adev->rings[0]->sched; | |
c5637837 | 113 | (*job)->vm = vm; |
50838c8c | 114 | |
1b2d5eda | 115 | amdgpu_sync_create(&(*job)->explicit_sync); |
f88e295e | 116 | (*job)->generation = amdgpu_vm_generation(adev, vm); |
d8de8260 | 117 | (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; |
e86f9cee | 118 | |
f7d66fb2 CK |
119 | if (!entity) |
120 | return 0; | |
121 | ||
a78422e9 | 122 | return drm_sched_job_init(&(*job)->base, entity, 1, owner); |
50838c8c CK |
123 | } |
124 | ||
f7d66fb2 CK |
125 | int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, |
126 | struct drm_sched_entity *entity, void *owner, | |
127 | size_t size, enum amdgpu_ib_pool_type pool_type, | |
128 | struct amdgpu_job **job) | |
d71518b5 CK |
129 | { |
130 | int r; | |
131 | ||
f7d66fb2 | 132 | r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job); |
d71518b5 CK |
133 | if (r) |
134 | return r; | |
135 | ||
4624459c | 136 | (*job)->num_ibs = 1; |
c8e42d57 | 137 | r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]); |
f7d66fb2 CK |
138 | if (r) { |
139 | if (entity) | |
140 | drm_sched_job_cleanup(&(*job)->base); | |
d71518b5 | 141 | kfree(*job); |
f7d66fb2 | 142 | } |
d71518b5 CK |
143 | |
144 | return r; | |
145 | } | |
146 | ||
736ec9fa CK |
147 | void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, |
148 | struct amdgpu_bo *gws, struct amdgpu_bo *oa) | |
149 | { | |
150 | if (gds) { | |
151 | job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT; | |
152 | job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT; | |
153 | } | |
154 | if (gws) { | |
155 | job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT; | |
156 | job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT; | |
157 | } | |
158 | if (oa) { | |
159 | job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT; | |
160 | job->oa_size = amdgpu_bo_size(oa) >> PAGE_SHIFT; | |
161 | } | |
162 | } | |
163 | ||
a5fb4ec2 | 164 | void amdgpu_job_free_resources(struct amdgpu_job *job) |
50838c8c | 165 | { |
a1917b73 | 166 | struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); |
f54d1867 | 167 | struct dma_fence *f; |
1ab0d211 CK |
168 | unsigned i; |
169 | ||
74ea8e78 CK |
170 | /* Check if any fences where initialized */ |
171 | if (job->base.s_fence && job->base.s_fence->finished.ops) | |
172 | f = &job->base.s_fence->finished; | |
173 | else if (job->hw_fence.ops) | |
174 | f = &job->hw_fence; | |
175 | else | |
176 | f = NULL; | |
177 | ||
50838c8c | 178 | for (i = 0; i < job->num_ibs; ++i) |
a1917b73 | 179 | amdgpu_ib_free(ring->adev, &job->ibs[i], f); |
d71518b5 CK |
180 | } |
181 | ||
1b1f42d8 | 182 | static void amdgpu_job_free_cb(struct drm_sched_job *s_job) |
b6723c8d | 183 | { |
3320b8d2 | 184 | struct amdgpu_job *job = to_amdgpu_job(s_job); |
c5f74f78 | 185 | |
26efecf9 SM |
186 | drm_sched_job_cleanup(s_job); |
187 | ||
1b2d5eda | 188 | amdgpu_sync_free(&job->explicit_sync); |
c530b02f | 189 | |
3cb93f39 SY |
190 | /* only put the hw fence if has embedded fence */ |
191 | if (!job->hw_fence.ops) | |
192 | kfree(job); | |
193 | else | |
194 | dma_fence_put(&job->hw_fence); | |
b6723c8d ML |
195 | } |
196 | ||
68ce8b24 CK |
197 | void amdgpu_job_set_gang_leader(struct amdgpu_job *job, |
198 | struct amdgpu_job *leader) | |
199 | { | |
200 | struct dma_fence *fence = &leader->base.s_fence->scheduled; | |
201 | ||
202 | WARN_ON(job->gang_submit); | |
203 | ||
204 | /* | |
205 | * Don't add a reference when we are the gang leader to avoid circle | |
206 | * dependency. | |
207 | */ | |
208 | if (job != leader) | |
209 | dma_fence_get(fence); | |
210 | job->gang_submit = fence; | |
211 | } | |
212 | ||
/*
 * Free a job that is not (or no longer) owned by the scheduler, releasing
 * its scheduler state, IBs, sync object and gang/HW fence references.
 */
void amdgpu_job_free(struct amdgpu_job *job)
{
	/* Only clean up scheduler state if the job was actually inited */
	if (job->base.entity)
		drm_sched_job_cleanup(&job->base);

	amdgpu_job_free_resources(job);
	amdgpu_sync_free(&job->explicit_sync);
	/*
	 * Gang leaders hold their own scheduled fence without a reference
	 * (see amdgpu_job_set_gang_leader), so only put it for members.
	 */
	if (job->gang_submit != &job->base.s_fence->scheduled)
		dma_fence_put(job->gang_submit);

	/* Jobs with an embedded HW fence are freed via the fence refcount */
	if (!job->hw_fence.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
}
228 | ||
f7d66fb2 | 229 | struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job) |
d71518b5 | 230 | { |
f7d66fb2 | 231 | struct dma_fence *f; |
d71518b5 | 232 | |
dbe48d03 | 233 | drm_sched_job_arm(&job->base); |
f7d66fb2 | 234 | f = dma_fence_get(&job->base.s_fence->finished); |
a5fb4ec2 | 235 | amdgpu_job_free_resources(job); |
0e10e9a1 | 236 | drm_sched_entity_push_job(&job->base); |
d71518b5 | 237 | |
f7d66fb2 | 238 | return f; |
50838c8c CK |
239 | } |
240 | ||
ee913fd9 CK |
241 | int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, |
242 | struct dma_fence **fence) | |
243 | { | |
244 | int r; | |
245 | ||
246 | job->base.sched = &ring->sched; | |
f6a3f660 AG |
247 | r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence); |
248 | ||
ee913fd9 CK |
249 | if (r) |
250 | return r; | |
251 | ||
252 | amdgpu_job_free(job); | |
253 | return 0; | |
254 | } | |
255 | ||
/*
 * Scheduler prepare_job callback: return the next fence the scheduler must
 * wait on before the job may run, or NULL when the job is ready.  Called
 * repeatedly until no further dependency is returned.
 *
 * On error, the job's finished fence is flagged and NULL is returned so the
 * scheduler proceeds to run (and skip) the poisoned job.
 */
static struct dma_fence *
amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
		       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct dma_fence *fence = NULL;
	int r;

	/* Ignore soft recovered fences here */
	r = drm_sched_entity_error(s_entity);
	if (r && r != -ENODATA)
		goto error;

	/* Gang members may first need to wait for a gang switch */
	if (!fence && job->gang_submit)
		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);

	/* Grab a VMID; may hand back a fence to wait on before retrying */
	while (!fence && job->vm && !job->vmid) {
		r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
		if (r) {
			DRM_ERROR("Error getting VM ID (%d)\n", r);
			goto error;
		}
	}

	return fence;

error:
	dma_fence_set_error(&job->base.s_fence->finished, r);
	return NULL;
}
287 | ||
/*
 * Scheduler run_job callback: push the job's IBs to the hardware ring.
 *
 * Returns the HW fence for the submission, NULL when nothing was pushed,
 * or an ERR_PTR on scheduling failure.
 */
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct amdgpu_device *adev = ring->adev;
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	trace_amdgpu_sched_run_job(job);

	/* Skip job if VRAM is lost and never resubmit gangs */
	if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
	    (job->job_run_counter && job->gang_submit))
		dma_fence_set_error(finished, -ECANCELED);

	/* An already-poisoned finished fence (e.g. -ECANCELED above, or an
	 * error set during prepare) means the IBs must not reach the HW.
	 */
	if (finished->error < 0) {
		dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
			ring->name);
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			dev_err(adev->dev,
				"Error scheduling IBs (%d) in ring(%s)", r,
				ring->name);
	}

	job->job_run_counter++;
	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}
324 | ||
/* Recover a drm_sched_job from its entity queue node */
#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

/*
 * Abort every job on @sched, both queued and already pushed to the HW,
 * by poisoning their fences with -EHWPOISON and signaling them so all
 * waiters are released.  Used when the scheduler cannot make progress
 * anymore (e.g. fatal HW error).
 */
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
		struct drm_sched_rq *rq = sched->sched_rq[i];
		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			/* Drain each entity's job queue under the rq lock */
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->pending_list, list) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}
358 | ||
/* Callback table wiring amdgpu jobs into the common DRM GPU scheduler */
const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.prepare_job = amdgpu_job_prepare_job,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};