Commit | Line | Data |
---|---|---|
c1b69ed0 CZ |
1 | /* |
2 | * Copyright 2015 Advanced Micro Devices, Inc. | |
3 | * | |
4 | * Permission is hereby granted, free of charge, to any person obtaining a | |
5 | * copy of this software and associated documentation files (the "Software"), | |
6 | * to deal in the Software without restriction, including without limitation | |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
8 | * and/or sell copies of the Software, and to permit persons to whom the | |
9 | * Software is furnished to do so, subject to the following conditions: | |
10 | * | |
11 | * The above copyright notice and this permission notice shall be included in | |
12 | * all copies or substantial portions of the Software. | |
13 | * | |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
20 | * OTHER DEALINGS IN THE SOFTWARE. | |
21 | * | |
22 | * | |
23 | */ | |
24 | #include <linux/kthread.h> | |
25 | #include <linux/wait.h> | |
26 | #include <linux/sched.h> | |
fdf2f6c5 | 27 | |
ca4e1724 AG |
28 | #include <drm/drm_drv.h> |
29 | ||
c1b69ed0 | 30 | #include "amdgpu.h" |
7034decf | 31 | #include "amdgpu_trace.h" |
f1549c09 | 32 | #include "amdgpu_reset.h" |
c1b69ed0 | 33 | |
/*
 * amdgpu_job_timedout - DRM scheduler callback for a job that missed its
 * timeout.
 *
 * Tries the cheapest recovery first (soft ring recovery), then escalates to
 * a full GPU reset when the device policy allows it; otherwise the scheduler
 * timeout is merely suspended.
 *
 * @s_job: the scheduler job that timed out
 *
 * Returns DRM_GPU_SCHED_STAT_ENODEV when the device is already gone,
 * DRM_GPU_SCHED_STAT_NOMINAL otherwise.
 */
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;

	/*
	 * Guard against a hot-unplugged device: if we cannot enter the drm_dev
	 * critical section there is no hardware left to recover.
	 */
	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);

		/* Effectively the job is aborted as the device is gone */
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	memset(&ti, 0, sizeof(struct amdgpu_task_info));
	/* Mark a hang in progress; cleared again at 'exit' below. */
	adev->job_hang = true;

	/*
	 * First attempt: soft recovery on the ring (kill the guilty wave)
	 * without resetting the whole GPU.
	 */
	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		goto exit;
	}

	/* Log fence progress and the offending process for diagnostics. */
	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		/* Full GPU reset path: let the device pick the reset method. */
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			DRM_ERROR("GPU Recovery Failed: %d\n", r);
	} else {
		/* Recovery disabled: park the timeout instead of resetting. */
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	adev->job_hang = false;
	drm_dev_exit(idx);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}
91 | ||
50838c8c | 92 | int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, |
c5637837 | 93 | struct amdgpu_job **job, struct amdgpu_vm *vm) |
50838c8c | 94 | { |
50838c8c CK |
95 | if (num_ibs == 0) |
96 | return -EINVAL; | |
97 | ||
6103b2f2 | 98 | *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL); |
50838c8c CK |
99 | if (!*job) |
100 | return -ENOMEM; | |
101 | ||
a1917b73 CK |
102 | /* |
103 | * Initialize the scheduler to at least some ring so that we always | |
104 | * have a pointer to adev. | |
105 | */ | |
106 | (*job)->base.sched = &adev->rings[0]->sched; | |
c5637837 | 107 | (*job)->vm = vm; |
50838c8c | 108 | (*job)->num_ibs = num_ibs; |
50838c8c | 109 | |
e86f9cee | 110 | amdgpu_sync_create(&(*job)->sync); |
df83d1eb | 111 | amdgpu_sync_create(&(*job)->sched_sync); |
c70b78a7 | 112 | (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); |
d8de8260 | 113 | (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; |
e86f9cee | 114 | |
50838c8c CK |
115 | return 0; |
116 | } | |
117 | ||
d71518b5 | 118 | int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size, |
c8e42d57 | 119 | enum amdgpu_ib_pool_type pool_type, |
120 | struct amdgpu_job **job) | |
d71518b5 CK |
121 | { |
122 | int r; | |
123 | ||
c5637837 | 124 | r = amdgpu_job_alloc(adev, 1, job, NULL); |
d71518b5 CK |
125 | if (r) |
126 | return r; | |
127 | ||
c8e42d57 | 128 | r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]); |
d71518b5 CK |
129 | if (r) |
130 | kfree(*job); | |
131 | ||
132 | return r; | |
133 | } | |
134 | ||
a5fb4ec2 | 135 | void amdgpu_job_free_resources(struct amdgpu_job *job) |
50838c8c | 136 | { |
a1917b73 | 137 | struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); |
f54d1867 | 138 | struct dma_fence *f; |
1ab0d211 CK |
139 | unsigned i; |
140 | ||
c530b02f | 141 | /* use sched fence if available */ |
f6a3f660 | 142 | f = job->base.s_fence ? &job->base.s_fence->finished : &job->hw_fence; |
50838c8c | 143 | for (i = 0; i < job->num_ibs; ++i) |
a1917b73 | 144 | amdgpu_ib_free(ring->adev, &job->ibs[i], f); |
d71518b5 CK |
145 | } |
146 | ||
1b1f42d8 | 147 | static void amdgpu_job_free_cb(struct drm_sched_job *s_job) |
b6723c8d | 148 | { |
3320b8d2 | 149 | struct amdgpu_job *job = to_amdgpu_job(s_job); |
c5f74f78 | 150 | |
26efecf9 SM |
151 | drm_sched_job_cleanup(s_job); |
152 | ||
a79a5bdc | 153 | amdgpu_sync_free(&job->sync); |
df83d1eb | 154 | amdgpu_sync_free(&job->sched_sync); |
c530b02f | 155 | |
f6a3f660 | 156 | dma_fence_put(&job->hw_fence); |
b6723c8d ML |
157 | } |
158 | ||
1e24e31f CK |
159 | void amdgpu_job_free(struct amdgpu_job *job) |
160 | { | |
161 | amdgpu_job_free_resources(job); | |
a79a5bdc | 162 | amdgpu_sync_free(&job->sync); |
df83d1eb | 163 | amdgpu_sync_free(&job->sched_sync); |
c530b02f | 164 | |
2581c5d8 YW |
165 | if (!job->hw_fence.ops) |
166 | kfree(job); | |
167 | else | |
168 | dma_fence_put(&job->hw_fence); | |
1e24e31f CK |
169 | } |
170 | ||
0e28b10f CK |
171 | int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, |
172 | void *owner, struct dma_fence **f) | |
d71518b5 | 173 | { |
e686941a | 174 | int r; |
d71518b5 | 175 | |
e686941a ML |
176 | if (!f) |
177 | return -EINVAL; | |
178 | ||
cdc50176 | 179 | r = drm_sched_job_init(&job->base, entity, owner); |
e686941a ML |
180 | if (r) |
181 | return r; | |
d71518b5 | 182 | |
dbe48d03 DV |
183 | drm_sched_job_arm(&job->base); |
184 | ||
f54d1867 | 185 | *f = dma_fence_get(&job->base.s_fence->finished); |
a5fb4ec2 | 186 | amdgpu_job_free_resources(job); |
0e10e9a1 | 187 | drm_sched_entity_push_job(&job->base); |
d71518b5 CK |
188 | |
189 | return 0; | |
50838c8c CK |
190 | } |
191 | ||
ee913fd9 CK |
192 | int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, |
193 | struct dma_fence **fence) | |
194 | { | |
195 | int r; | |
196 | ||
197 | job->base.sched = &ring->sched; | |
f6a3f660 AG |
198 | r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence); |
199 | ||
ee913fd9 CK |
200 | if (r) |
201 | return r; | |
202 | ||
203 | amdgpu_job_free(job); | |
204 | return 0; | |
205 | } | |
206 | ||
1b1f42d8 LS |
/*
 * amdgpu_job_dependency - DRM scheduler dependency callback.
 *
 * Returns the next fence the job must wait on before it can run, or NULL
 * once all dependencies (including VMID acquisition) are satisfied.
 *
 * @sched_job: the job being scheduled
 * @s_entity:  the entity the job was queued on
 */
static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job,
					       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct amdgpu_vm *vm = job->vm;
	struct dma_fence *fence;
	int r;

	/* Pull the next unsignaled fence out of the job's sync container. */
	fence = amdgpu_sync_get_fence(&job->sync);
	if (fence && drm_sched_dependency_optimized(fence, s_entity)) {
		/*
		 * The scheduler can optimize this dependency away; remember
		 * it in sched_sync so it is still accounted for later.
		 */
		r = amdgpu_sync_fence(&job->sched_sync, fence);
		if (r)
			DRM_ERROR("Error adding fence (%d)\n", r);
	}

	/*
	 * No pending fence dependency: grab a VMID for the job's VM. Grabbing
	 * may itself add fences to job->sync, so loop until either a fence
	 * must be waited on or a VMID was assigned.
	 */
	while (fence == NULL && vm && !job->vmid) {
		r = amdgpu_vmid_grab(vm, ring, &job->sync,
				     &job->base.s_fence->finished,
				     job);
		if (r)
			DRM_ERROR("Error getting VM ID (%d)\n", r);

		fence = amdgpu_sync_get_fence(&job->sync);
	}

	return fence;
}
235 | ||
/*
 * amdgpu_job_run - DRM scheduler run_job callback: push the job's IBs to
 * the hardware ring.
 *
 * @sched_job: the job selected to run
 *
 * Returns the hardware fence for the submission, NULL when the IBs were
 * skipped, or an ERR_PTR on scheduling failure.
 */
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	/* All dependencies must be resolved before the job can run. */
	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));

	trace_amdgpu_sched_run_job(job);

	/*
	 * If VRAM was lost since the job was created its buffers are gone;
	 * cancel the finished fence so the IBs are skipped below.
	 */
	if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */

	if (finished->error < 0) {
		/* A dependency or the job itself already failed: don't submit. */
		DRM_INFO("Skip scheduling IBs!\n");
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			DRM_ERROR("Error scheduling IBs (%d)\n", r);
	}

	job->job_run_counter++;
	amdgpu_job_free_resources(job);

	/* Propagate a scheduling error to the caller via ERR_PTR. */
	fence = r ? ERR_PTR(r) : fence;
	return fence;
}
268 | ||
7c6e68c7 AG |
/* Recover the drm_sched_job from its spsc queue node. */
#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

/*
 * amdgpu_job_stop_all_jobs_on_sched - abort every job on a scheduler.
 *
 * Used when the hardware can no longer execute anything (the -EHWPOISON
 * error suggests a poisoned/fatal device state): unqueued jobs are popped
 * and signaled with an error, already-scheduled jobs get their finished
 * fence signaled with the same error.
 *
 * @sched: the scheduler whose jobs are to be aborted
 */
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];

		/* rq->lock protects the entity list while we drain queues. */
		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				/* Scheduled first, then finished-with-error. */
				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->pending_list, list) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}
302 | ||
/* Backend callbacks wiring amdgpu job handling into the DRM GPU scheduler. */
const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};