/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include <drm/drm_drv.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"

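/*
 * Timeout handler invoked by the DRM scheduler when a job has run past its
 * deadline. It first attempts a lightweight soft recovery of the ring and
 * only escalates to a full GPU reset when the device is deemed recoverable;
 * otherwise the scheduler timeout is suspended so the job can keep running.
 */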
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;

	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);

		/* Effectively the job is aborted as the device is gone */
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	memset(&ti, 0, sizeof(struct amdgpu_task_info));
	adev->job_hang = true;

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		goto exit;
	}

	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			DRM_ERROR("GPU Recovery Failed: %d\n", r);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	adev->job_hang = false;
	drm_dev_exit(idx);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}

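/*
 * Allocate a job together with space for @num_ibs indirect buffers. The @vm
 * pointer may be NULL for jobs that do not run on behalf of a user VM.
 */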
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
		     struct amdgpu_job **job, struct amdgpu_vm *vm)
{
	if (num_ibs == 0)
		return -EINVAL;

	*job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;

	amdgpu_sync_create(&(*job)->sync);
	amdgpu_sync_create(&(*job)->sched_sync);
	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	return 0;
}

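/*
 * Convenience wrapper that allocates a job with a single IB of @size bytes
 * from the given @pool_type. A rough sketch of typical driver-internal use
 * (the ring, size and pool choice here are illustrative only):
 *
 *	struct amdgpu_job *job;
 *	struct dma_fence *f;
 *	int r;
 *
 *	r = amdgpu_job_alloc_with_ib(adev, 64, AMDGPU_IB_POOL_DIRECT, &job);
 *	if (r)
 *		return r;
 *	// ... fill job->ibs[0] with command packets ...
 *	r = amdgpu_job_submit_direct(job, ring, &f);
 */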
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
			     enum amdgpu_ib_pool_type pool_type,
			     struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, 1, job, NULL);
	if (r)
		return r;

	(*job)->num_ibs = 1;
	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r)
		kfree(*job);

	return r;
}

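/*
 * Record the GDS, GWS and OA resource windows a job will use. Base offsets
 * and sizes are stored in units of pages (hence the PAGE_SHIFT shifts);
 * any of the BOs may be NULL when the job does not use that resource.
 */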
void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds,
			      struct amdgpu_bo *gws, struct amdgpu_bo *oa)
{
	if (gds) {
		job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
		job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT;
	}
	if (gws) {
		job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT;
		job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT;
	}
	if (oa) {
		job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT;
		job->oa_size = amdgpu_bo_size(oa) >> PAGE_SHIFT;
	}
}

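/*
 * Release the IBs of a job. The fence passed to amdgpu_ib_free() tells the
 * IB pool how long the memory must stay reserved: the scheduler's finished
 * fence when available, the hardware fence otherwise.
 */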
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
	struct dma_fence *f;
	unsigned i;

	/* use sched fence if available */
	f = job->base.s_fence ? &job->base.s_fence->finished : &job->hw_fence;
	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
}

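/*
 * Scheduler .free_job callback: tear down the scheduler bookkeeping and the
 * sync objects, then drop the job's hw_fence reference; since the fence is
 * embedded in the job, the job itself is freed through the fence's release
 * path once the refcount reaches zero.
 */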
static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);

	dma_fence_put(&job->hw_fence);
}

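/*
 * Tie @job to the gang led by @leader. The stored scheduled fence is later
 * consumed by amdgpu_job_dependency() to switch the hardware over to the
 * new gang before any member runs.
 */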
void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
				struct amdgpu_job *leader)
{
	struct dma_fence *fence = &leader->base.s_fence->scheduled;

	WARN_ON(job->gang_submit);

	/*
	 * Don't add a reference when we are the gang leader to avoid a
	 * circular dependency.
	 */
	if (job != leader)
		dma_fence_get(fence);
	job->gang_submit = fence;
}

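/*
 * Free a job that is no longer owned by the scheduler (e.g. on an error
 * path or after direct submission). Note the gang fence is only dropped
 * when the job is not its own gang leader, mirroring the reference rules
 * in amdgpu_job_set_gang_leader().
 */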
void amdgpu_job_free(struct amdgpu_job *job)
{
	amdgpu_job_free_resources(job);
	amdgpu_sync_free(&job->sync);
	amdgpu_sync_free(&job->sched_sync);
	if (job->gang_submit != &job->base.s_fence->scheduled)
		dma_fence_put(job->gang_submit);

	if (!job->hw_fence.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
}

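/*
 * Hand a job over to the scheduler entity for execution. On success the
 * caller receives a reference to the finished fence in @f; the job itself
 * is owned by the scheduler from this point on and must not be freed.
 */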
int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
		      void *owner, struct dma_fence **f)
{
	int r;

	if (!f)
		return -EINVAL;

	r = drm_sched_job_init(&job->base, entity, owner);
	if (r)
		return r;

	drm_sched_job_arm(&job->base);

	*f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base);

	return 0;
}

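/*
 * Bypass the scheduler and write the job's IBs straight to @ring, intended
 * for driver-internal submissions that must not wait on scheduling. The
 * job is freed immediately after the IBs have been committed.
 */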
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence);

	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}

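/*
 * Scheduler .dependency callback: return the next fence the job must wait
 * on before it can run. This drains the job's sync object, grabs a VMID if
 * the job still needs one, and finally emits the gang switch fence, if any.
 */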
static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
					       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct amdgpu_vm *vm = job->vm;
	struct dma_fence *fence;
	int r;

	fence = amdgpu_sync_get_fence(&job->sync);
	if (fence && drm_sched_dependency_optimized(fence, s_entity)) {
		r = amdgpu_sync_fence(&job->sched_sync, fence);
		if (r)
			DRM_ERROR("Error adding fence (%d)\n", r);
	}

	while (fence == NULL && vm && !job->vmid) {
		r = amdgpu_vmid_grab(vm, ring, &job->sync,
				     &job->base.s_fence->finished,
				     job);
		if (r)
			DRM_ERROR("Error getting VM ID (%d)\n", r);

		fence = amdgpu_sync_get_fence(&job->sync);
	}

	if (!fence && job->gang_submit)
		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);

	return fence;
}

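/*
 * Scheduler .run_job callback: actually push the job's IBs onto the ring.
 * The job is cancelled instead of run when VRAM contents were lost since
 * the job was allocated, or when a gang member is being resubmitted.
 */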
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct amdgpu_device *adev = ring->adev;
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));

	trace_amdgpu_sched_run_job(job);

	/* Skip job if VRAM is lost and never resubmit gangs */
	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
	    (job->job_run_counter && job->gang_submit))
		dma_fence_set_error(finished, -ECANCELED);

	if (finished->error < 0) {
		DRM_INFO("Skip scheduling IBs!\n");
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			DRM_ERROR("Error scheduling IBs (%d)\n", r);
	}

	job->job_run_counter++;
	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}

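/*
 * Forcefully drain one scheduler: pop every queued job from every entity
 * and signal its fences with -EHWPOISON, then do the same for jobs already
 * pushed to the hardware. Meant for situations where the device can no
 * longer execute anything, e.g. after an irrecoverable fault.
 */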
#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];
		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->pending_list, list) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

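/*
 * Callback table wiring the amdgpu job lifecycle into the common DRM GPU
 * scheduler; each ring's scheduler is initialized with these ops.
 */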
const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};