Commit | Line | Data |
---|---|---|
42de677f PY |
1 | // SPDX-License-Identifier: GPL-2.0 OR MIT |
2 | /* | |
3 | * Copyright 2020-2021 Advanced Micro Devices, Inc. | |
4 | * | |
5 | * Permission is hereby granted, free of charge, to any person obtaining a | |
6 | * copy of this software and associated documentation files (the "Software"), | |
7 | * to deal in the Software without restriction, including without limitation | |
8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
9 | * and/or sell copies of the Software, and to permit persons to whom the | |
10 | * Software is furnished to do so, subject to the following conditions: | |
11 | * | |
12 | * The above copyright notice and this permission notice shall be included in | |
13 | * all copies or substantial portions of the Software. | |
14 | * | |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
18 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
19 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
20 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
21 | * OTHER DEALINGS IN THE SOFTWARE. | |
22 | */ | |
23 | ||
24 | #include <linux/types.h> | |
8a7c184a | 25 | #include <linux/sched/task.h> |
42de677f PY |
26 | #include "amdgpu_sync.h" |
27 | #include "amdgpu_object.h" | |
28 | #include "amdgpu_vm.h" | |
d9483ecd | 29 | #include "amdgpu_hmm.h" |
b53fa124 PY |
30 | #include "amdgpu.h" |
31 | #include "amdgpu_xgmi.h" | |
42de677f PY |
32 | #include "kfd_priv.h" |
33 | #include "kfd_svm.h" | |
0b0e518d | 34 | #include "kfd_migrate.h" |
e0f1e65b | 35 | #include "kfd_smi_events.h" |
42de677f | 36 | |
a273bc99 PY |
37 | #ifdef dev_fmt |
38 | #undef dev_fmt | |
39 | #endif | |
40 | #define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__ | |
41 | ||
8a7c184a FK |
42 | #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 |
43 | ||
564d2b92 FK |
44 | /* Long enough to ensure that no retry fault arrives after the svm range is |
45 | * restored and the page table is updated. | |
46 | */ | |
e0f1e65b | 47 | #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC) |
564d2b92 | 48 | |
4959e609 PY |
49 | /* Giant svm ranges are split into smaller ranges based on this value. It is |
50 | * the minimum over all dGPUs/APUs of 1/32 of the VRAM size, clamped between | |
51 | * 2MB and 1GB and aligned to a power of two in 2MB units. | |
52 | */ | |
53 | static uint64_t max_svm_range_pages; | |
54 | ||
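/* A rough sketch of the sizing rule described above (assuming 4KiB pages); the
 * helper that actually computes max_svm_range_pages appears later in this file
 * and may differ in detail:
 *
 *   pages = adev->gmc.real_vram_size >> (PAGE_SHIFT + 5);   - 1/32 of VRAM
 *   pages = clamp(pages, 1ULL << 9, 1ULL << 18);            - 2MB .. 1GB
 *   max_svm_range_pages = min_not_zero(max_svm_range_pages,
 *                                      rounddown_pow_of_two(pages));
 */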
c2db32ce RB |
55 | struct criu_svm_metadata { |
56 | struct list_head list; | |
57 | struct kfd_criu_svm_range_priv_data data; | |
58 | }; | |
59 | ||
b41896e3 | 60 | static void svm_range_evict_svm_bo_worker(struct work_struct *work); |
b1c46c7d PY |
61 | static bool |
62 | svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, | |
63 | const struct mmu_notifier_range *range, | |
64 | unsigned long cur_seq); | |
43fc10c1 PY |
65 | static int |
66 | svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, | |
67 | uint64_t *bo_s, uint64_t *bo_l); | |
b1c46c7d PY |
68 | static const struct mmu_interval_notifier_ops svm_range_mn_ops = { |
69 | .invalidate = svm_range_cpu_invalidate_pagetables, | |
70 | }; | |
71 | ||
42de677f PY |
72 | /** |
73 | * svm_range_unlink - unlink svm_range from lists and interval tree | |
74 | * @prange: svm range structure to be removed | |
75 | * | |
e49fe404 FK |
76 | * Remove the svm_range from the svms and svm_bo lists and the svms |
77 | * interval tree. | |
42de677f PY |
78 | * |
79 | * Context: The caller must hold svms->lock | |
80 | */ | |
81 | static void svm_range_unlink(struct svm_range *prange) | |
82 | { | |
83 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, | |
84 | prange, prange->start, prange->last); | |
85 | ||
e49fe404 FK |
86 | if (prange->svm_bo) { |
87 | spin_lock(&prange->svm_bo->list_lock); | |
88 | list_del(&prange->svm_bo_list); | |
89 | spin_unlock(&prange->svm_bo->list_lock); | |
90 | } | |
91 | ||
42de677f PY |
92 | list_del(&prange->list); |
93 | if (prange->it_node.start != 0 && prange->it_node.last != 0) | |
94 | interval_tree_remove(&prange->it_node, &prange->svms->objects); | |
95 | } | |
96 | ||
b1c46c7d PY |
97 | static void |
98 | svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) | |
99 | { | |
100 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, | |
101 | prange, prange->start, prange->last); | |
102 | ||
103 | mmu_interval_notifier_insert_locked(&prange->notifier, mm, | |
104 | prange->start << PAGE_SHIFT, | |
105 | prange->npages << PAGE_SHIFT, | |
106 | &svm_range_mn_ops); | |
107 | } | |
108 | ||
42de677f PY |
109 | /** |
110 | * svm_range_add_to_svms - add svm range to svms | |
111 | * @prange: svm range structure to be added | |
112 | * | |
113 | * Add the svm range to svms interval tree and link list | |
114 | * | |
115 | * Context: The caller must hold svms->lock | |
116 | */ | |
117 | static void svm_range_add_to_svms(struct svm_range *prange) | |
118 | { | |
119 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, | |
120 | prange, prange->start, prange->last); | |
121 | ||
ef3b4137 | 122 | list_move_tail(&prange->list, &prange->svms->list); |
42de677f PY |
123 | prange->it_node.start = prange->start; |
124 | prange->it_node.last = prange->last; | |
125 | interval_tree_insert(&prange->it_node, &prange->svms->objects); | |
126 | } | |
127 | ||
b1c46c7d PY |
128 | static void svm_range_remove_notifier(struct svm_range *prange) |
129 | { | |
130 | pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", | |
131 | prange->svms, prange, | |
132 | prange->notifier.interval_tree.start >> PAGE_SHIFT, | |
133 | prange->notifier.interval_tree.last >> PAGE_SHIFT); | |
134 | ||
135 | if (prange->notifier.interval_tree.start != 0 && | |
136 | prange->notifier.interval_tree.last != 0) | |
137 | mmu_interval_notifier_remove(&prange->notifier); | |
138 | } | |
139 | ||
e7eb2137 PY |
140 | static bool |
141 | svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) | |
142 | { | |
143 | return dma_addr && !dma_mapping_error(dev, dma_addr) && | |
144 | !(dma_addr & SVM_RANGE_VRAM_DOMAIN); | |
145 | } | |
146 | ||
f80fe9d3 | 147 | static int |
1d5dbfe6 | 148 | svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, |
2f617f4d | 149 | unsigned long offset, unsigned long npages, |
1d5dbfe6 | 150 | unsigned long *hmm_pfns, uint32_t gpuidx) |
f80fe9d3 FK |
151 | { |
152 | enum dma_data_direction dir = DMA_BIDIRECTIONAL; | |
1d5dbfe6 AS |
153 | dma_addr_t *addr = prange->dma_addr[gpuidx]; |
154 | struct device *dev = adev->dev; | |
f80fe9d3 FK |
155 | struct page *page; |
156 | int i, r; | |
157 | ||
158 | if (!addr) { | |
cc9d82fc | 159 | addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL); |
f80fe9d3 FK |
160 | if (!addr) |
161 | return -ENOMEM; | |
1d5dbfe6 | 162 | prange->dma_addr[gpuidx] = addr; |
f80fe9d3 FK |
163 | } |
164 | ||
2f617f4d PY |
165 | addr += offset; |
166 | for (i = 0; i < npages; i++) { | |
e7eb2137 | 167 | if (svm_is_valid_dma_mapping_addr(dev, addr[i])) |
f80fe9d3 FK |
168 | dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); |
169 | ||
170 | page = hmm_pfn_to_page(hmm_pfns[i]); | |
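/* Device-private (VRAM) pages are not DMA-mapped below: the entry stores the
 * page's offset into the owning GPU's VRAM plus vram_base_offset, tagged with
 * SVM_RANGE_VRAM_DOMAIN so that later map/unmap code can tell it apart from a
 * real dma_addr_t (see svm_is_valid_dma_mapping_addr above).
 */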
1d5dbfe6 AS |
171 | if (is_zone_device_page(page)) { |
172 | struct amdgpu_device *bo_adev = | |
173 | amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); | |
174 | ||
175 | addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + | |
176 | bo_adev->vm_manager.vram_base_offset - | |
177 | bo_adev->kfd.dev->pgmap.range.start; | |
178 | addr[i] |= SVM_RANGE_VRAM_DOMAIN; | |
a273bc99 | 179 | pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]); |
1d5dbfe6 AS |
180 | continue; |
181 | } | |
f80fe9d3 FK |
182 | addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); |
183 | r = dma_mapping_error(dev, addr[i]); | |
184 | if (r) { | |
a273bc99 | 185 | dev_err(dev, "failed %d dma_map_page\n", r); |
f80fe9d3 FK |
186 | return r; |
187 | } | |
a273bc99 PY |
188 | pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n", |
189 | addr[i] >> PAGE_SHIFT, page_to_pfn(page)); | |
f80fe9d3 FK |
190 | } |
191 | return 0; | |
192 | } | |
193 | ||
194 | static int | |
195 | svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, | |
2f617f4d | 196 | unsigned long offset, unsigned long npages, |
f80fe9d3 FK |
197 | unsigned long *hmm_pfns) |
198 | { | |
199 | struct kfd_process *p; | |
200 | uint32_t gpuidx; | |
201 | int r; | |
202 | ||
203 | p = container_of(prange->svms, struct kfd_process, svms); | |
204 | ||
205 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { | |
206 | struct kfd_process_device *pdd; | |
f80fe9d3 FK |
207 | |
208 | pr_debug("mapping to gpu idx 0x%x\n", gpuidx); | |
209 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); | |
210 | if (!pdd) { | |
211 | pr_debug("failed to find device idx %d\n", gpuidx); | |
212 | return -EINVAL; | |
213 | } | |
f80fe9d3 | 214 | |
56c5977e | 215 | r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages, |
2f617f4d | 216 | hmm_pfns, gpuidx); |
f80fe9d3 FK |
217 | if (r) |
218 | break; | |
219 | } | |
220 | ||
221 | return r; | |
222 | } | |
223 | ||
0b0e518d FK |
224 | void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, |
225 | unsigned long offset, unsigned long npages) | |
f80fe9d3 FK |
226 | { |
227 | enum dma_data_direction dir = DMA_BIDIRECTIONAL; | |
228 | int i; | |
229 | ||
230 | if (!dma_addr) | |
231 | return; | |
232 | ||
233 | for (i = offset; i < offset + npages; i++) { | |
e7eb2137 | 234 | if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) |
f80fe9d3 | 235 | continue; |
a273bc99 | 236 | pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); |
f80fe9d3 FK |
237 | dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); |
238 | dma_addr[i] = 0; | |
239 | } | |
240 | } | |
241 | ||
0b0e518d | 242 | void svm_range_free_dma_mappings(struct svm_range *prange) |
f80fe9d3 FK |
243 | { |
244 | struct kfd_process_device *pdd; | |
245 | dma_addr_t *dma_addr; | |
246 | struct device *dev; | |
247 | struct kfd_process *p; | |
248 | uint32_t gpuidx; | |
249 | ||
250 | p = container_of(prange->svms, struct kfd_process, svms); | |
251 | ||
252 | for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { | |
253 | dma_addr = prange->dma_addr[gpuidx]; | |
254 | if (!dma_addr) | |
255 | continue; | |
256 | ||
257 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); | |
258 | if (!pdd) { | |
259 | pr_debug("failed to find device idx %d\n", gpuidx); | |
260 | continue; | |
261 | } | |
d69a3b76 | 262 | dev = &pdd->dev->adev->pdev->dev; |
f80fe9d3 FK |
263 | svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); |
264 | kvfree(dma_addr); | |
265 | prange->dma_addr[gpuidx] = NULL; | |
266 | } | |
267 | } | |
268 | ||
f9af3c16 | 269 | static void svm_range_free(struct svm_range *prange, bool update_mem_usage) |
42de677f | 270 | { |
f9af3c16 AS |
271 | uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT; |
272 | struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); | |
273 | ||
42de677f PY |
274 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, |
275 | prange->start, prange->last); | |
276 | ||
e49fe404 | 277 | svm_range_vram_node_free(prange); |
f80fe9d3 | 278 | svm_range_free_dma_mappings(prange); |
f9af3c16 AS |
279 | |
280 | if (update_mem_usage && !p->xnack_enabled) { | |
8a7c3ce1 | 281 | pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size); |
f9af3c16 AS |
282 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size, |
283 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); | |
284 | } | |
b1c46c7d | 285 | mutex_destroy(&prange->lock); |
0b0e518d | 286 | mutex_destroy(&prange->migrate_mutex); |
42de677f PY |
287 | kfree(prange); |
288 | } | |
289 | ||
290 | static void | |
291 | svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, | |
292 | uint8_t *granularity, uint32_t *flags) | |
293 | { | |
294 | *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; | |
295 | *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; | |
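/* granularity is log2 of the range block size in pages; the default of 9
 * below corresponds to 512 pages, i.e. 2MB with 4KiB pages.
 */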
296 | *granularity = 9; | |
297 | *flags = | |
298 | KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; | |
299 | } | |
300 | ||
301 | static struct | |
302 | svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, | |
f9af3c16 | 303 | uint64_t last, bool update_mem_usage) |
42de677f PY |
304 | { |
305 | uint64_t size = last - start + 1; | |
306 | struct svm_range *prange; | |
a9a76bee | 307 | struct kfd_process *p; |
42de677f PY |
308 | |
309 | prange = kzalloc(sizeof(*prange), GFP_KERNEL); | |
310 | if (!prange) | |
311 | return NULL; | |
f9af3c16 AS |
312 | |
313 | p = container_of(svms, struct kfd_process, svms); | |
314 | if (!p->xnack_enabled && update_mem_usage && | |
315 | amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT, | |
316 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) { | |
317 | pr_info("SVM mapping failed, exceeds resident system memory limit\n"); | |
318 | kfree(prange); | |
319 | return NULL; | |
320 | } | |
42de677f PY |
321 | prange->npages = size; |
322 | prange->svms = svms; | |
323 | prange->start = start; | |
324 | prange->last = last; | |
325 | INIT_LIST_HEAD(&prange->list); | |
326 | INIT_LIST_HEAD(&prange->update_list); | |
e49fe404 | 327 | INIT_LIST_HEAD(&prange->svm_bo_list); |
4683cfec PY |
328 | INIT_LIST_HEAD(&prange->deferred_list); |
329 | INIT_LIST_HEAD(&prange->child_list); | |
8a7c184a | 330 | atomic_set(&prange->invalid, 0); |
b19dbb7a | 331 | prange->validate_timestamp = 0; |
0b0e518d | 332 | mutex_init(&prange->migrate_mutex); |
b1c46c7d | 333 | mutex_init(&prange->lock); |
a9a76bee | 334 | |
a9a76bee | 335 | if (p->xnack_enabled) |
5a75ea56 FK |
336 | bitmap_copy(prange->bitmap_access, svms->bitmap_supported, |
337 | MAX_GPU_INSTANCE); | |
a9a76bee | 338 | |
42de677f PY |
339 | svm_range_set_default_attributes(&prange->preferred_loc, |
340 | &prange->prefetch_loc, | |
341 | &prange->granularity, &prange->flags); | |
342 | ||
343 | pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); | |
344 | ||
345 | return prange; | |
346 | } | |
347 | ||
e49fe404 FK |
348 | static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) |
349 | { | |
350 | if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref)) | |
351 | return false; | |
352 | ||
353 | return true; | |
354 | } | |
355 | ||
e49fe404 FK |
356 | static void svm_range_bo_release(struct kref *kref) |
357 | { | |
358 | struct svm_range_bo *svm_bo; | |
359 | ||
360 | svm_bo = container_of(kref, struct svm_range_bo, kref); | |
69879b30 PY |
361 | pr_debug("svm_bo 0x%p\n", svm_bo); |
362 | ||
e49fe404 FK |
363 | spin_lock(&svm_bo->list_lock); |
364 | while (!list_empty(&svm_bo->range_list)) { | |
365 | struct svm_range *prange = | |
366 | list_first_entry(&svm_bo->range_list, | |
367 | struct svm_range, svm_bo_list); | |
368 | /* list_del_init tells a concurrent svm_range_vram_node_new when | |
369 | * it's safe to reuse the svm_bo pointer and svm_bo_list head. | |
370 | */ | |
371 | list_del_init(&prange->svm_bo_list); | |
372 | spin_unlock(&svm_bo->list_lock); | |
373 | ||
374 | pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, | |
375 | prange->start, prange->last); | |
376 | mutex_lock(&prange->lock); | |
377 | prange->svm_bo = NULL; | |
378 | mutex_unlock(&prange->lock); | |
379 | ||
380 | spin_lock(&svm_bo->list_lock); | |
381 | } | |
382 | spin_unlock(&svm_bo->list_lock); | |
b41896e3 FK |
383 | if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { |
384 | /* We're not in the eviction worker. | |
385 | * Signal the fence and synchronize with any | |
386 | * pending eviction work. | |
387 | */ | |
388 | dma_fence_signal(&svm_bo->eviction_fence->base); | |
389 | cancel_work_sync(&svm_bo->eviction_work); | |
390 | } | |
391 | dma_fence_put(&svm_bo->eviction_fence->base); | |
e49fe404 FK |
392 | amdgpu_bo_unref(&svm_bo->bo); |
393 | kfree(svm_bo); | |
394 | } | |
395 | ||
69879b30 | 396 | static void svm_range_bo_wq_release(struct work_struct *work) |
e49fe404 | 397 | { |
69879b30 PY |
398 | struct svm_range_bo *svm_bo; |
399 | ||
400 | svm_bo = container_of(work, struct svm_range_bo, release_work); | |
401 | svm_range_bo_release(&svm_bo->kref); | |
402 | } | |
403 | ||
404 | static void svm_range_bo_release_async(struct kref *kref) | |
405 | { | |
406 | struct svm_range_bo *svm_bo; | |
e49fe404 | 407 | |
69879b30 PY |
408 | svm_bo = container_of(kref, struct svm_range_bo, kref); |
409 | pr_debug("svm_bo 0x%p\n", svm_bo); | |
410 | INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release); | |
411 | schedule_work(&svm_bo->release_work); | |
412 | } | |
413 | ||
414 | void svm_range_bo_unref_async(struct svm_range_bo *svm_bo) | |
415 | { | |
416 | kref_put(&svm_bo->kref, svm_range_bo_release_async); | |
417 | } | |
418 | ||
419 | static void svm_range_bo_unref(struct svm_range_bo *svm_bo) | |
420 | { | |
421 | if (svm_bo) | |
422 | kref_put(&svm_bo->kref, svm_range_bo_release); | |
e49fe404 FK |
423 | } |
424 | ||
1a3b2b5d FK |
425 | static bool |
426 | svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) | |
b41896e3 | 427 | { |
1a3b2b5d FK |
428 | struct amdgpu_device *bo_adev; |
429 | ||
b41896e3 FK |
430 | mutex_lock(&prange->lock); |
431 | if (!prange->svm_bo) { | |
432 | mutex_unlock(&prange->lock); | |
433 | return false; | |
434 | } | |
435 | if (prange->ttm_res) { | |
436 | /* We still have a reference, all is well */ | |
437 | mutex_unlock(&prange->lock); | |
438 | return true; | |
439 | } | |
440 | if (svm_bo_ref_unless_zero(prange->svm_bo)) { | |
1a3b2b5d FK |
441 | /* |
442 | * Migrate from GPU to GPU, remove range from source bo_adev | |
443 | * svm_bo range list, and return false to allocate svm_bo from | |
444 | * destination adev. | |
445 | */ | |
446 | bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); | |
447 | if (bo_adev != adev) { | |
448 | mutex_unlock(&prange->lock); | |
449 | ||
450 | spin_lock(&prange->svm_bo->list_lock); | |
451 | list_del_init(&prange->svm_bo_list); | |
452 | spin_unlock(&prange->svm_bo->list_lock); | |
453 | ||
454 | svm_range_bo_unref(prange->svm_bo); | |
455 | return false; | |
456 | } | |
b41896e3 FK |
457 | if (READ_ONCE(prange->svm_bo->evicting)) { |
458 | struct dma_fence *f; | |
459 | struct svm_range_bo *svm_bo; | |
460 | /* The BO is getting evicted, | |
461 | * we need to get a new one | |
462 | */ | |
463 | mutex_unlock(&prange->lock); | |
464 | svm_bo = prange->svm_bo; | |
465 | f = dma_fence_get(&svm_bo->eviction_fence->base); | |
466 | svm_range_bo_unref(prange->svm_bo); | |
467 | /* wait for the fence to avoid long spin-loop | |
468 | * at list_empty_careful | |
469 | */ | |
470 | dma_fence_wait(f, false); | |
471 | dma_fence_put(f); | |
472 | } else { | |
473 | /* The BO was still around and we got | |
474 | * a new reference to it | |
475 | */ | |
476 | mutex_unlock(&prange->lock); | |
477 | pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n", | |
478 | prange->svms, prange->start, prange->last); | |
479 | ||
d3116756 | 480 | prange->ttm_res = prange->svm_bo->bo->tbo.resource; |
b41896e3 FK |
481 | return true; |
482 | } | |
483 | ||
484 | } else { | |
485 | mutex_unlock(&prange->lock); | |
486 | } | |
487 | ||
488 | /* We need a new svm_bo. Spin-loop to wait for concurrent | |
489 | * svm_range_bo_release to finish removing this range from | |
490 | * its range list. After this, it is safe to reuse the | |
491 | * svm_bo pointer and svm_bo_list head. | |
492 | */ | |
493 | while (!list_empty_careful(&prange->svm_bo_list)) | |
494 | ; | |
495 | ||
496 | return false; | |
497 | } | |
498 | ||
e49fe404 FK |
499 | static struct svm_range_bo *svm_range_bo_new(void) |
500 | { | |
501 | struct svm_range_bo *svm_bo; | |
502 | ||
503 | svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL); | |
504 | if (!svm_bo) | |
505 | return NULL; | |
506 | ||
507 | kref_init(&svm_bo->kref); | |
508 | INIT_LIST_HEAD(&svm_bo->range_list); | |
509 | spin_lock_init(&svm_bo->list_lock); | |
510 | ||
511 | return svm_bo; | |
512 | } | |
513 | ||
514 | int | |
515 | svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, | |
516 | bool clear) | |
517 | { | |
e49fe404 FK |
518 | struct amdgpu_bo_param bp; |
519 | struct svm_range_bo *svm_bo; | |
520 | struct amdgpu_bo_user *ubo; | |
521 | struct amdgpu_bo *bo; | |
522 | struct kfd_process *p; | |
b41896e3 | 523 | struct mm_struct *mm; |
e49fe404 FK |
524 | int r; |
525 | ||
b41896e3 FK |
526 | p = container_of(prange->svms, struct kfd_process, svms); |
527 | pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, | |
528 | prange->start, prange->last); | |
e49fe404 | 529 | |
1a3b2b5d | 530 | if (svm_range_validate_svm_bo(adev, prange)) |
b41896e3 | 531 | return 0; |
e49fe404 FK |
532 | |
533 | svm_bo = svm_range_bo_new(); | |
534 | if (!svm_bo) { | |
535 | pr_debug("failed to alloc svm bo\n"); | |
536 | return -ENOMEM; | |
537 | } | |
b41896e3 FK |
538 | mm = get_task_mm(p->lead_thread); |
539 | if (!mm) { | |
540 | pr_debug("failed to get mm\n"); | |
541 | kfree(svm_bo); | |
542 | return -ESRCH; | |
543 | } | |
b41896e3 FK |
544 | svm_bo->eviction_fence = |
545 | amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), | |
546 | mm, | |
547 | svm_bo); | |
548 | mmput(mm); | |
549 | INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); | |
550 | svm_bo->evicting = 0; | |
e49fe404 FK |
551 | memset(&bp, 0, sizeof(bp)); |
552 | bp.size = prange->npages * PAGE_SIZE; | |
553 | bp.byte_align = PAGE_SIZE; | |
554 | bp.domain = AMDGPU_GEM_DOMAIN_VRAM; | |
555 | bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; | |
556 | bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; | |
fab2cc83 | 557 | bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE; |
e49fe404 FK |
558 | bp.type = ttm_bo_type_device; |
559 | bp.resv = NULL; | |
560 | ||
561 | r = amdgpu_bo_create_user(adev, &bp, &ubo); | |
562 | if (r) { | |
563 | pr_debug("failed %d to create bo\n", r); | |
b41896e3 | 564 | goto create_bo_failed; |
e49fe404 FK |
565 | } |
566 | bo = &ubo->bo; | |
e49fe404 FK |
567 | r = amdgpu_bo_reserve(bo, true); |
568 | if (r) { | |
569 | pr_debug("failed %d to reserve bo\n", r); | |
570 | goto reserve_bo_failed; | |
571 | } | |
572 | ||
c8d4c18b | 573 | r = dma_resv_reserve_fences(bo->tbo.base.resv, 1); |
e49fe404 FK |
574 | if (r) { |
575 | pr_debug("failed %d to reserve bo\n", r); | |
576 | amdgpu_bo_unreserve(bo); | |
577 | goto reserve_bo_failed; | |
578 | } | |
b41896e3 | 579 | amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true); |
e49fe404 FK |
580 | |
581 | amdgpu_bo_unreserve(bo); | |
582 | ||
583 | svm_bo->bo = bo; | |
584 | prange->svm_bo = svm_bo; | |
d3116756 | 585 | prange->ttm_res = bo->tbo.resource; |
e49fe404 FK |
586 | prange->offset = 0; |
587 | ||
588 | spin_lock(&svm_bo->list_lock); | |
589 | list_add(&prange->svm_bo_list, &svm_bo->range_list); | |
590 | spin_unlock(&svm_bo->list_lock); | |
591 | ||
592 | return 0; | |
593 | ||
594 | reserve_bo_failed: | |
e49fe404 | 595 | amdgpu_bo_unref(&bo); |
b41896e3 FK |
596 | create_bo_failed: |
597 | dma_fence_put(&svm_bo->eviction_fence->base); | |
598 | kfree(svm_bo); | |
e49fe404 FK |
599 | prange->ttm_res = NULL; |
600 | ||
601 | return r; | |
602 | } | |
603 | ||
604 | void svm_range_vram_node_free(struct svm_range *prange) | |
605 | { | |
606 | svm_range_bo_unref(prange->svm_bo); | |
607 | prange->ttm_res = NULL; | |
608 | } | |
609 | ||
610 | struct amdgpu_device * | |
611 | svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) | |
612 | { | |
613 | struct kfd_process_device *pdd; | |
614 | struct kfd_process *p; | |
615 | int32_t gpu_idx; | |
616 | ||
617 | p = container_of(prange->svms, struct kfd_process, svms); | |
618 | ||
619 | gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); | |
620 | if (gpu_idx < 0) { | |
621 | pr_debug("failed to get device by id 0x%x\n", gpu_id); | |
622 | return NULL; | |
623 | } | |
624 | pdd = kfd_process_device_from_gpuidx(p, gpu_idx); | |
625 | if (!pdd) { | |
626 | pr_debug("failed to get device by idx 0x%x\n", gpu_idx); | |
627 | return NULL; | |
628 | } | |
629 | ||
56c5977e | 630 | return pdd->dev->adev; |
e49fe404 FK |
631 | } |
632 | ||
d4ebc200 PY |
633 | struct kfd_process_device * |
634 | svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev) | |
635 | { | |
636 | struct kfd_process *p; | |
637 | int32_t gpu_idx, gpuid; | |
638 | int r; | |
639 | ||
640 | p = container_of(prange->svms, struct kfd_process, svms); | |
641 | ||
56c5977e | 642 | r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx); |
d4ebc200 PY |
643 | if (r) { |
644 | pr_debug("failed to get device id by adev %p\n", adev); | |
645 | return NULL; | |
646 | } | |
647 | ||
648 | return kfd_process_device_from_gpuidx(p, gpu_idx); | |
649 | } | |
650 | ||
f80fe9d3 FK |
651 | static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) |
652 | { | |
653 | struct ttm_operation_ctx ctx = { false, false }; | |
654 | ||
655 | amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM); | |
656 | ||
657 | return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); | |
658 | } | |
659 | ||
42de677f PY |
660 | static int |
661 | svm_range_check_attr(struct kfd_process *p, | |
662 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) | |
663 | { | |
664 | uint32_t i; | |
42de677f PY |
665 | |
666 | for (i = 0; i < nattr; i++) { | |
5a75ea56 FK |
667 | uint32_t val = attrs[i].value; |
668 | int gpuidx = MAX_GPU_INSTANCE; | |
669 | ||
42de677f PY |
670 | switch (attrs[i].type) { |
671 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: | |
5a75ea56 FK |
672 | if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM && |
673 | val != KFD_IOCTL_SVM_LOCATION_UNDEFINED) | |
674 | gpuidx = kfd_process_gpuidx_from_gpuid(p, val); | |
42de677f PY |
675 | break; |
676 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: | |
5a75ea56 FK |
677 | if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM) |
678 | gpuidx = kfd_process_gpuidx_from_gpuid(p, val); | |
42de677f PY |
679 | break; |
680 | case KFD_IOCTL_SVM_ATTR_ACCESS: | |
681 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: | |
682 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: | |
5a75ea56 | 683 | gpuidx = kfd_process_gpuidx_from_gpuid(p, val); |
42de677f PY |
684 | break; |
685 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: | |
686 | break; | |
687 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: | |
688 | break; | |
689 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: | |
690 | break; | |
691 | default: | |
692 | pr_debug("unknown attr type 0x%x\n", attrs[i].type); | |
693 | return -EINVAL; | |
694 | } | |
5a75ea56 FK |
695 | |
696 | if (gpuidx < 0) { | |
697 | pr_debug("no GPU 0x%x found\n", val); | |
698 | return -EINVAL; | |
699 | } else if (gpuidx < MAX_GPU_INSTANCE && | |
700 | !test_bit(gpuidx, p->svms.bitmap_supported)) { | |
701 | pr_debug("GPU 0x%x not supported\n", val); | |
702 | return -EINVAL; | |
703 | } | |
42de677f PY |
704 | } |
705 | ||
706 | return 0; | |
707 | } | |
708 | ||
709 | static void | |
710 | svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, | |
601354f3 PY |
711 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, |
712 | bool *update_mapping) | |
42de677f PY |
713 | { |
714 | uint32_t i; | |
715 | int gpuidx; | |
716 | ||
717 | for (i = 0; i < nattr; i++) { | |
718 | switch (attrs[i].type) { | |
719 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: | |
720 | prange->preferred_loc = attrs[i].value; | |
721 | break; | |
722 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: | |
723 | prange->prefetch_loc = attrs[i].value; | |
724 | break; | |
725 | case KFD_IOCTL_SVM_ATTR_ACCESS: | |
726 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: | |
727 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: | |
601354f3 | 728 | *update_mapping = true; |
42de677f PY |
729 | gpuidx = kfd_process_gpuidx_from_gpuid(p, |
730 | attrs[i].value); | |
731 | if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { | |
732 | bitmap_clear(prange->bitmap_access, gpuidx, 1); | |
733 | bitmap_clear(prange->bitmap_aip, gpuidx, 1); | |
734 | } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { | |
735 | bitmap_set(prange->bitmap_access, gpuidx, 1); | |
736 | bitmap_clear(prange->bitmap_aip, gpuidx, 1); | |
737 | } else { | |
738 | bitmap_clear(prange->bitmap_access, gpuidx, 1); | |
739 | bitmap_set(prange->bitmap_aip, gpuidx, 1); | |
740 | } | |
741 | break; | |
742 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: | |
601354f3 | 743 | *update_mapping = true; |
42de677f PY |
744 | prange->flags |= attrs[i].value; |
745 | break; | |
746 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: | |
601354f3 | 747 | *update_mapping = true; |
42de677f PY |
748 | prange->flags &= ~attrs[i].value; |
749 | break; | |
750 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: | |
751 | prange->granularity = attrs[i].value; | |
752 | break; | |
753 | default: | |
754 | WARN_ONCE(1, "svm_range_check_attr wasn't called?"); |
755 | } | |
756 | } | |
757 | } | |
758 | ||
f864df76 FK |
759 | static bool |
760 | svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange, | |
761 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) | |
762 | { | |
763 | uint32_t i; | |
764 | int gpuidx; | |
765 | ||
766 | for (i = 0; i < nattr; i++) { | |
767 | switch (attrs[i].type) { | |
768 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: | |
769 | if (prange->preferred_loc != attrs[i].value) | |
770 | return false; | |
771 | break; | |
772 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: | |
773 | /* Prefetch should always trigger a migration even | |
774 | * if the value of the attribute didn't change. | |
775 | */ | |
776 | return false; | |
777 | case KFD_IOCTL_SVM_ATTR_ACCESS: | |
778 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: | |
779 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: | |
780 | gpuidx = kfd_process_gpuidx_from_gpuid(p, | |
781 | attrs[i].value); | |
782 | if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { | |
783 | if (test_bit(gpuidx, prange->bitmap_access) || | |
784 | test_bit(gpuidx, prange->bitmap_aip)) | |
785 | return false; | |
786 | } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { | |
787 | if (!test_bit(gpuidx, prange->bitmap_access)) | |
788 | return false; | |
789 | } else { | |
790 | if (!test_bit(gpuidx, prange->bitmap_aip)) | |
791 | return false; | |
792 | } | |
793 | break; | |
794 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: | |
795 | if ((prange->flags & attrs[i].value) != attrs[i].value) | |
796 | return false; | |
797 | break; | |
798 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: | |
799 | if ((prange->flags & attrs[i].value) != 0) | |
800 | return false; | |
801 | break; | |
802 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: | |
803 | if (prange->granularity != attrs[i].value) | |
804 | return false; | |
805 | break; | |
806 | default: | |
807 | WARN_ONCE(1, "svm_range_check_attr wasn't called?"); |
808 | } | |
809 | } | |
810 | ||
811 | return true; | |
812 | } | |
813 | ||
42de677f PY |
814 | /** |
815 | * svm_range_debug_dump - print all range information from svms | |
816 | * @svms: svm range list header | |
817 | * | |
818 | * Debug output of svm range start, end and actual location from the svms |
819 | * list and interval tree. |
820 | * | |
821 | * Context: The caller must hold svms->lock | |
822 | */ | |
823 | static void svm_range_debug_dump(struct svm_range_list *svms) | |
824 | { | |
825 | struct interval_tree_node *node; | |
826 | struct svm_range *prange; | |
827 | ||
828 | pr_debug("dump svms 0x%p list\n", svms); | |
829 | pr_debug("range\tstart\tpage\tend\t\tlocation\n"); | |
830 | ||
831 | list_for_each_entry(prange, &svms->list, list) { | |
832 | pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", | |
833 | prange, prange->start, prange->npages, | |
834 | prange->start + prange->npages - 1, | |
835 | prange->actual_loc); | |
836 | } | |
837 | ||
838 | pr_debug("dump svms 0x%p interval tree\n", svms); | |
839 | pr_debug("range\tstart\tpage\tend\t\tlocation\n"); | |
840 | node = interval_tree_iter_first(&svms->objects, 0, ~0ULL); | |
841 | while (node) { | |
842 | prange = container_of(node, struct svm_range, it_node); | |
843 | pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n", | |
844 | prange, prange->start, prange->npages, | |
845 | prange->start + prange->npages - 1, | |
846 | prange->actual_loc); | |
847 | node = interval_tree_iter_next(node, 0, ~0ULL); | |
848 | } | |
849 | } | |
850 | ||
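/* Worked example for svm_range_split_array() below: an 8-entry per-GPU
 * dma_addr array for pages [0x100 0x107], split so that the new range keeps
 * [0x104 0x107] (new_start 0x104, new_n 4, old_n 4), copies entries 4..7 into
 * the new array and shrinks the old array to entries 0..3. Page numbers are
 * illustrative only.
 */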
f80fe9d3 FK |
851 | static int |
852 | svm_range_split_array(void *ppnew, void *ppold, size_t size, | |
853 | uint64_t old_start, uint64_t old_n, | |
854 | uint64_t new_start, uint64_t new_n) | |
855 | { | |
856 | unsigned char *new, *old, *pold; | |
857 | uint64_t d; | |
858 | ||
859 | if (!ppold) | |
860 | return 0; | |
861 | pold = *(unsigned char **)ppold; | |
862 | if (!pold) | |
863 | return 0; | |
864 | ||
865 | new = kvmalloc_array(new_n, size, GFP_KERNEL); | |
866 | if (!new) | |
867 | return -ENOMEM; | |
868 | ||
869 | d = (new_start - old_start) * size; | |
870 | memcpy(new, pold + d, new_n * size); | |
871 | ||
872 | old = kvmalloc_array(old_n, size, GFP_KERNEL); | |
873 | if (!old) { | |
874 | kvfree(new); | |
875 | return -ENOMEM; | |
876 | } | |
877 | ||
878 | d = (new_start == old_start) ? new_n * size : 0; | |
879 | memcpy(old, pold + d, old_n * size); | |
880 | ||
881 | kvfree(pold); | |
882 | *(void **)ppold = old; | |
883 | *(void **)ppnew = new; | |
884 | ||
885 | return 0; | |
886 | } | |
887 | ||
888 | static int | |
889 | svm_range_split_pages(struct svm_range *new, struct svm_range *old, | |
890 | uint64_t start, uint64_t last) | |
891 | { | |
892 | uint64_t npages = last - start + 1; | |
893 | int i, r; | |
894 | ||
895 | for (i = 0; i < MAX_GPU_INSTANCE; i++) { | |
896 | r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i], | |
897 | sizeof(*old->dma_addr[i]), old->start, | |
898 | npages, new->start, new->npages); | |
899 | if (r) | |
900 | return r; | |
901 | } | |
902 | ||
903 | return 0; | |
904 | } | |
905 | ||
e49fe404 FK |
906 | static int |
907 | svm_range_split_nodes(struct svm_range *new, struct svm_range *old, | |
908 | uint64_t start, uint64_t last) | |
909 | { | |
910 | uint64_t npages = last - start + 1; | |
911 | ||
912 | pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n", | |
913 | new->svms, new, new->start, start, last); | |
914 | ||
915 | if (new->start == old->start) { | |
916 | new->offset = old->offset; | |
917 | old->offset += new->npages; | |
918 | } else { | |
919 | new->offset = old->offset + npages; | |
920 | } | |
921 | ||
922 | new->svm_bo = svm_range_bo_ref(old->svm_bo); | |
923 | new->ttm_res = old->ttm_res; | |
924 | ||
925 | spin_lock(&new->svm_bo->list_lock); | |
926 | list_add(&new->svm_bo_list, &new->svm_bo->range_list); | |
927 | spin_unlock(&new->svm_bo->list_lock); | |
928 | ||
929 | return 0; | |
930 | } | |
931 | ||
42de677f PY |
932 | /** |
933 | * svm_range_split_adjust - split range and adjust | |
934 | * | |
935 | * @new: new range | |
936 | * @old: the old range | |
937 | * @start: the start address in pages the old range is adjusted to |
938 | * @last: the last address in pages the old range is adjusted to |
939 | * | |
e49fe404 | 940 | * Copy system memory dma_addr or vram ttm_res in old range to new |
42de677f PY |
941 | * range, starting at new_start for new->npages pages; the remaining old |
942 | * range covers start to last. |
943 | * | |
944 | * Return: | |
945 | * 0 - OK, -ENOMEM - out of memory | |
946 | */ | |
947 | static int | |
948 | svm_range_split_adjust(struct svm_range *new, struct svm_range *old, | |
949 | uint64_t start, uint64_t last) | |
950 | { | |
f80fe9d3 FK |
951 | int r; |
952 | ||
42de677f PY |
953 | pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n", |
954 | new->svms, new->start, old->start, old->last, start, last); | |
955 | ||
956 | if (new->start < old->start || | |
957 | new->last > old->last) { | |
958 | WARN_ONCE(1, "invalid new range start or last\n"); | |
959 | return -EINVAL; | |
960 | } | |
961 | ||
f80fe9d3 FK |
962 | r = svm_range_split_pages(new, old, start, last); |
963 | if (r) | |
964 | return r; | |
965 | ||
e49fe404 FK |
966 | if (old->actual_loc && old->ttm_res) { |
967 | r = svm_range_split_nodes(new, old, start, last); | |
968 | if (r) | |
969 | return r; | |
970 | } | |
971 | ||
42de677f PY |
972 | old->npages = last - start + 1; |
973 | old->start = start; | |
974 | old->last = last; | |
975 | new->flags = old->flags; | |
976 | new->preferred_loc = old->preferred_loc; | |
977 | new->prefetch_loc = old->prefetch_loc; | |
978 | new->actual_loc = old->actual_loc; | |
979 | new->granularity = old->granularity; | |
6b9c63a6 | 980 | new->mapped_to_gpu = old->mapped_to_gpu; |
42de677f PY |
981 | bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); |
982 | bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); | |
983 | ||
984 | return 0; | |
985 | } | |
986 | ||
987 | /** | |
988 | * svm_range_split - split a range in 2 ranges | |
989 | * | |
990 | * @prange: the svm range to split | |
991 | * @start: the remaining range start address in pages | |
992 | * @last: the remaining range last address in pages | |
993 | * @new: the result new range generated | |
994 | * | |
995 | * Two cases only: | |
996 | * case 1: if start == prange->start | |
997 | * prange ==> prange[start, last] | |
998 | * new range [last + 1, prange->last] | |
999 | * | |
1000 | * case 2: if last == prange->last | |
1001 | * prange ==> prange[start, last] | |
1002 | * new range [prange->start, start - 1] | |
1003 | * | |
1004 | * Return: | |
1005 | * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last | |
1006 | */ | |
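/* Example with illustrative page numbers: for prange [0x1000 0x1fff],
 * svm_range_split(prange, 0x1000, 0x17ff, &new) keeps prange as
 * [0x1000 0x17ff] and returns the new range [0x1800 0x1fff] (case 1);
 * svm_range_split(prange, 0x1800, 0x1fff, &new) is the corresponding case 2.
 */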
1007 | static int | |
1008 | svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, | |
1009 | struct svm_range **new) | |
1010 | { | |
1011 | uint64_t old_start = prange->start; | |
1012 | uint64_t old_last = prange->last; | |
1013 | struct svm_range_list *svms; | |
1014 | int r = 0; | |
1015 | ||
1016 | pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms, | |
1017 | old_start, old_last, start, last); | |
1018 | ||
1019 | if (old_start != start && old_last != last) | |
1020 | return -EINVAL; | |
1021 | if (start < old_start || last > old_last) | |
1022 | return -EINVAL; | |
1023 | ||
1024 | svms = prange->svms; | |
1025 | if (old_start == start) | |
f9af3c16 | 1026 | *new = svm_range_new(svms, last + 1, old_last, false); |
42de677f | 1027 | else |
f9af3c16 | 1028 | *new = svm_range_new(svms, old_start, start - 1, false); |
42de677f PY |
1029 | if (!*new) |
1030 | return -ENOMEM; | |
1031 | ||
1032 | r = svm_range_split_adjust(*new, prange, start, last); | |
1033 | if (r) { | |
1034 | pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", | |
1035 | r, old_start, old_last, start, last); | |
f9af3c16 | 1036 | svm_range_free(*new, false); |
42de677f PY |
1037 | *new = NULL; |
1038 | } | |
1039 | ||
1040 | return r; | |
1041 | } | |
1042 | ||
1043 | static int | |
726be406 | 1044 | svm_range_split_tail(struct svm_range *prange, |
42de677f PY |
1045 | uint64_t new_last, struct list_head *insert_list) |
1046 | { | |
1047 | struct svm_range *tail; | |
1048 | int r = svm_range_split(prange, prange->start, new_last, &tail); | |
1049 | ||
1050 | if (!r) | |
ef3b4137 | 1051 | list_add(&tail->list, insert_list); |
42de677f PY |
1052 | return r; |
1053 | } | |
1054 | ||
1055 | static int | |
726be406 | 1056 | svm_range_split_head(struct svm_range *prange, |
42de677f PY |
1057 | uint64_t new_start, struct list_head *insert_list) |
1058 | { | |
1059 | struct svm_range *head; | |
1060 | int r = svm_range_split(prange, new_start, prange->last, &head); | |
1061 | ||
1062 | if (!r) | |
ef3b4137 | 1063 | list_add(&head->list, insert_list); |
42de677f PY |
1064 | return r; |
1065 | } | |
1066 | ||
4683cfec PY |
1067 | static void |
1068 | svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, | |
1069 | struct svm_range *pchild, enum svm_work_list_ops op) | |
1070 | { | |
1071 | pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n", | |
1072 | pchild, pchild->start, pchild->last, prange, op); | |
1073 | ||
1074 | pchild->work_item.mm = mm; | |
1075 | pchild->work_item.op = op; | |
1076 | list_add_tail(&pchild->child_list, &prange->child_list); | |
1077 | } | |
1078 | ||
48ff079b FK |
1079 | /** |
1080 | * svm_range_split_by_granularity - collect ranges within granularity boundary | |
1081 | * | |
1082 | * @p: the process with svms list | |
1083 | * @mm: mm structure | |
1084 | * @addr: the vm fault address in pages, to split the prange | |
1085 | * @parent: parent range if prange is from child list | |
1086 | * @prange: prange to split | |
1087 | * | |
1088 | * Trims @prange to be a single aligned block of prange->granularity if | |
1089 | * possible. The head and tail are added to the child_list in @parent. | |
1090 | * | |
1091 | * Context: caller must hold mmap_read_lock and prange->lock | |
1092 | * | |
1093 | * Return: | |
1094 | * 0 - OK, otherwise error code | |
1095 | */ | |
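/* Example: with the default granularity of 9 (512 pages, i.e. 2MB with 4KiB
 * pages), a fault at page 0x12345 trims the range to [0x12200 0x123ff]; any
 * head and tail pieces outside that window become children of @parent. The
 * numbers are illustrative only.
 */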
1096 | int | |
1097 | svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm, | |
1098 | unsigned long addr, struct svm_range *parent, | |
1099 | struct svm_range *prange) | |
1100 | { | |
1101 | struct svm_range *head, *tail; | |
1102 | unsigned long start, last, size; | |
1103 | int r; | |
1104 | ||
1105 | /* Align the split range start and size to the granularity size, so that a |
1106 | * single PTE will be used for the whole range; this reduces the number of |
1107 | * PTEs updated and the L1 TLB space used for translation. |
1108 | */ | |
1109 | size = 1UL << prange->granularity; | |
1110 | start = ALIGN_DOWN(addr, size); | |
1111 | last = ALIGN(addr + 1, size) - 1; | |
1112 | ||
1113 | pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n", | |
1114 | prange->svms, prange->start, prange->last, start, last, size); | |
1115 | ||
1116 | if (start > prange->start) { | |
1117 | r = svm_range_split(prange, start, prange->last, &head); | |
1118 | if (r) | |
1119 | return r; | |
1120 | svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE); | |
1121 | } | |
1122 | ||
1123 | if (last < prange->last) { | |
1124 | r = svm_range_split(prange, prange->start, last, &tail); | |
1125 | if (r) | |
1126 | return r; | |
1127 | svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); | |
1128 | } | |
1129 | ||
90d7d3ed FK |
1130 | /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ |
1131 | if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) { | |
1132 | prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP; | |
1133 | pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n", | |
1134 | prange, prange->start, prange->last, | |
1135 | SVM_OP_ADD_RANGE_AND_MAP); | |
1136 | } | |
48ff079b FK |
1137 | return 0; |
1138 | } | |
1139 | ||
f80fe9d3 | 1140 | static uint64_t |
1d5dbfe6 AS |
1141 | svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, |
1142 | int domain) | |
f80fe9d3 | 1143 | { |
b53fa124 | 1144 | struct amdgpu_device *bo_adev; |
f80fe9d3 | 1145 | uint32_t flags = prange->flags; |
b53fa124 | 1146 | uint32_t mapping_flags = 0; |
f80fe9d3 | 1147 | uint64_t pte_flags; |
1d5dbfe6 | 1148 | bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); |
b53fa124 PY |
1149 | bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; |
1150 | ||
1d5dbfe6 | 1151 | if (domain == SVM_RANGE_VRAM_DOMAIN) |
b53fa124 PY |
1152 | bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); |
1153 | ||
046e674b GS |
1154 | switch (KFD_GC_VERSION(adev->kfd.dev)) { |
1155 | case IP_VERSION(9, 4, 1): | |
1d5dbfe6 | 1156 | if (domain == SVM_RANGE_VRAM_DOMAIN) { |
b53fa124 PY |
1157 | if (bo_adev == adev) { |
1158 | mapping_flags |= coherent ? | |
1159 | AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; | |
1160 | } else { | |
0c6f7777 FK |
1161 | mapping_flags |= coherent ? |
1162 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; | |
b53fa124 PY |
1163 | if (amdgpu_xgmi_same_hive(adev, bo_adev)) |
1164 | snoop = true; | |
1165 | } | |
1166 | } else { | |
1167 | mapping_flags |= coherent ? | |
1168 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; | |
1169 | } | |
1170 | break; | |
046e674b | 1171 | case IP_VERSION(9, 4, 2): |
1d5dbfe6 | 1172 | if (domain == SVM_RANGE_VRAM_DOMAIN) { |
b53fa124 PY |
1173 | if (bo_adev == adev) { |
1174 | mapping_flags |= coherent ? | |
1175 | AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; | |
1176 | if (adev->gmc.xgmi.connected_to_cpu) | |
1177 | snoop = true; | |
1178 | } else { | |
2b2339ee FK |
1179 | mapping_flags |= coherent ? |
1180 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; | |
b53fa124 PY |
1181 | if (amdgpu_xgmi_same_hive(adev, bo_adev)) |
1182 | snoop = true; | |
1183 | } | |
1184 | } else { | |
1185 | mapping_flags |= coherent ? | |
1186 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; | |
1187 | } | |
1188 | break; | |
1189 | default: | |
1190 | mapping_flags |= coherent ? | |
1191 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; | |
1192 | } | |
f80fe9d3 | 1193 | |
b53fa124 | 1194 | mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; |
f80fe9d3 FK |
1195 | |
1196 | if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) | |
1197 | mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; | |
1198 | if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) | |
1199 | mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; | |
f80fe9d3 | 1200 | |
b53fa124 | 1201 | pte_flags = AMDGPU_PTE_VALID; |
1d5dbfe6 | 1202 | pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; |
b53fa124 | 1203 | pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; |
f80fe9d3 FK |
1204 | |
1205 | pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); | |
f80fe9d3 FK |
1206 | return pte_flags; |
1207 | } | |
1208 | ||
1209 | static int | |
1210 | svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, | |
1211 | uint64_t start, uint64_t last, | |
1212 | struct dma_fence **fence) | |
1213 | { | |
1214 | uint64_t init_pte_value = 0; | |
1215 | ||
1216 | pr_debug("[0x%llx 0x%llx]\n", start, last); | |
1217 | ||
30671b44 CK |
1218 | return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start, |
1219 | last, init_pte_value, 0, 0, NULL, NULL, | |
1220 | fence); | |
f80fe9d3 FK |
1221 | } |
1222 | ||
1223 | static int | |
1224 | svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, | |
46ae2af9 | 1225 | unsigned long last, uint32_t trigger) |
f80fe9d3 FK |
1226 | { |
1227 | DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); | |
1228 | struct kfd_process_device *pdd; | |
1229 | struct dma_fence *fence = NULL; | |
f80fe9d3 FK |
1230 | struct kfd_process *p; |
1231 | uint32_t gpuidx; | |
1232 | int r = 0; | |
1233 | ||
6b9c63a6 PY |
1234 | if (!prange->mapped_to_gpu) { |
1235 | pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n", | |
1236 | prange, prange->start, prange->last); | |
1237 | return 0; | |
1238 | } | |
1239 | ||
1240 | if (prange->start == start && prange->last == last) { | |
1241 | pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange); | |
1242 | prange->mapped_to_gpu = false; | |
1243 | } | |
1244 | ||
f80fe9d3 FK |
1245 | bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, |
1246 | MAX_GPU_INSTANCE); | |
1247 | p = container_of(prange->svms, struct kfd_process, svms); | |
1248 | ||
1249 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { | |
1250 | pr_debug("unmap from gpu idx 0x%x\n", gpuidx); | |
1251 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); | |
1252 | if (!pdd) { | |
1253 | pr_debug("failed to find device idx %d\n", gpuidx); | |
1254 | return -EINVAL; | |
1255 | } | |
f80fe9d3 | 1256 | |
46ae2af9 PY |
1257 | kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid, |
1258 | start, last, trigger); | |
1259 | ||
6bfc7c7e GS |
1260 | r = svm_range_unmap_from_gpu(pdd->dev->adev, |
1261 | drm_priv_to_vm(pdd->drm_priv), | |
f80fe9d3 FK |
1262 | start, last, &fence); |
1263 | if (r) | |
1264 | break; | |
1265 | ||
1266 | if (fence) { | |
1267 | r = dma_fence_wait(fence, false); | |
1268 | dma_fence_put(fence); | |
1269 | fence = NULL; | |
1270 | if (r) | |
1271 | break; | |
1272 | } | |
6c1a7867 | 1273 | kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT); |
f80fe9d3 FK |
1274 | } |
1275 | ||
1276 | return r; | |
1277 | } | |
1278 | ||
1279 | static int | |
6c1a7867 MJ |
1280 | svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, |
1281 | unsigned long offset, unsigned long npages, bool readonly, | |
1282 | dma_addr_t *dma_addr, struct amdgpu_device *bo_adev, | |
601354f3 | 1283 | struct dma_fence **fence, bool flush_tlb) |
f80fe9d3 | 1284 | { |
6c1a7867 MJ |
1285 | struct amdgpu_device *adev = pdd->dev->adev; |
1286 | struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv); | |
f80fe9d3 | 1287 | uint64_t pte_flags; |
1d5dbfe6 AS |
1288 | unsigned long last_start; |
1289 | int last_domain; | |
f80fe9d3 | 1290 | int r = 0; |
e7eb2137 | 1291 | int64_t i, j; |
f80fe9d3 | 1292 | |
2f617f4d PY |
1293 | last_start = prange->start + offset; |
1294 | ||
1295 | pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms, | |
1296 | last_start, last_start + npages - 1, readonly); | |
f80fe9d3 | 1297 | |
2f617f4d | 1298 | for (i = offset; i < offset + npages; i++) { |
1d5dbfe6 AS |
1299 | last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; |
1300 | dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; | |
1aed4828 PY |
1301 | |
1302 | /* Collect all pages in the same address range and memory domain | |
1303 | * that can be mapped with a single call to update mapping. | |
1304 | */ | |
1305 | if (i < offset + npages - 1 && | |
1d5dbfe6 AS |
1306 | last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) |
1307 | continue; | |
1308 | ||
1309 | pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", | |
1310 | last_start, prange->start + i, last_domain ? "GPU" : "CPU"); | |
2f617f4d | 1311 | |
1d5dbfe6 | 1312 | pte_flags = svm_range_get_pte_flags(adev, prange, last_domain); |
2f617f4d PY |
1313 | if (readonly) |
1314 | pte_flags &= ~AMDGPU_PTE_WRITEABLE; | |
1315 | ||
1316 | pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n", | |
1317 | prange->svms, last_start, prange->start + i, | |
1318 | (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, | |
1319 | pte_flags); | |
1320 | ||
601354f3 | 1321 | r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL, |
30671b44 CK |
1322 | last_start, prange->start + i, |
1323 | pte_flags, | |
88467db6 | 1324 | (last_start - prange->start) << PAGE_SHIFT, |
96621ca5 | 1325 | bo_adev ? bo_adev->vm_manager.vram_base_offset : 0, |
30671b44 | 1326 | NULL, dma_addr, &vm->last_update); |
e7eb2137 PY |
1327 | |
1328 | for (j = last_start - prange->start; j <= i; j++) | |
1329 | dma_addr[j] |= last_domain; | |
1330 | ||
1d5dbfe6 AS |
1331 | if (r) { |
1332 | pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start); | |
1333 | goto out; | |
1334 | } | |
1335 | last_start = prange->start + i + 1; | |
f80fe9d3 FK |
1336 | } |
1337 | ||
1338 | r = amdgpu_vm_update_pdes(adev, vm, false); | |
1339 | if (r) { | |
1340 | pr_debug("failed %d to update directories 0x%lx\n", r, | |
1341 | prange->start); | |
1342 | goto out; | |
1343 | } | |
1344 | ||
1345 | if (fence) | |
1346 | *fence = dma_fence_get(vm->last_update); | |
1347 | ||
1348 | out: | |
1349 | return r; | |
1350 | } | |
1351 | ||
2f617f4d PY |
1352 | static int |
1353 | svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, | |
1354 | unsigned long npages, bool readonly, | |
601354f3 | 1355 | unsigned long *bitmap, bool wait, bool flush_tlb) |
f80fe9d3 FK |
1356 | { |
1357 | struct kfd_process_device *pdd; | |
b53fa124 | 1358 | struct amdgpu_device *bo_adev; |
f80fe9d3 FK |
1359 | struct kfd_process *p; |
1360 | struct dma_fence *fence = NULL; | |
1361 | uint32_t gpuidx; | |
1362 | int r = 0; | |
1363 | ||
b53fa124 PY |
1364 | if (prange->svm_bo && prange->ttm_res) |
1365 | bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); | |
1366 | else | |
1367 | bo_adev = NULL; | |
1368 | ||
f80fe9d3 FK |
1369 | p = container_of(prange->svms, struct kfd_process, svms); |
1370 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { | |
1a3b2b5d | 1371 | pr_debug("mapping to gpu idx 0x%x\n", gpuidx); |
f80fe9d3 FK |
1372 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
1373 | if (!pdd) { | |
1374 | pr_debug("failed to find device idx %d\n", gpuidx); | |
1375 | return -EINVAL; | |
1376 | } | |
f80fe9d3 FK |
1377 | |
1378 | pdd = kfd_bind_process_to_device(pdd->dev, p); | |
1379 | if (IS_ERR(pdd)) | |
1380 | return -EINVAL; | |
1381 | ||
56c5977e GS |
1382 | if (bo_adev && pdd->dev->adev != bo_adev && |
1383 | !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { | |
b53fa124 PY |
1384 | pr_debug("cannot map to device idx %d\n", gpuidx); |
1385 | continue; | |
1386 | } | |
1387 | ||
6c1a7867 | 1388 | r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly, |
2f617f4d | 1389 | prange->dma_addr[gpuidx], |
601354f3 PY |
1390 | bo_adev, wait ? &fence : NULL, |
1391 | flush_tlb); | |
f80fe9d3 FK |
1392 | if (r) |
1393 | break; | |
1394 | ||
1395 | if (fence) { | |
1396 | r = dma_fence_wait(fence, false); | |
1397 | dma_fence_put(fence); | |
1398 | fence = NULL; | |
1399 | if (r) { | |
1400 | pr_debug("failed %d to dma fence wait\n", r); | |
1401 | break; | |
1402 | } | |
1403 | } | |
4d30a83c CK |
1404 | |
1405 | kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY); | |
f80fe9d3 FK |
1406 | } |
1407 | ||
1408 | return r; | |
1409 | } | |
1410 | ||
1411 | struct svm_validate_context { | |
1412 | struct kfd_process *process; | |
1413 | struct svm_range *prange; | |
1414 | bool intr; | |
3925f9b4 | 1415 | DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); |
ec6abe83 | 1416 | struct ttm_validate_buffer tv[MAX_GPU_INSTANCE]; |
f80fe9d3 FK |
1417 | struct list_head validate_list; |
1418 | struct ww_acquire_ctx ticket; | |
1419 | }; | |
1420 | ||
1421 | static int svm_range_reserve_bos(struct svm_validate_context *ctx) | |
1422 | { | |
1423 | struct kfd_process_device *pdd; | |
f80fe9d3 FK |
1424 | struct amdgpu_vm *vm; |
1425 | uint32_t gpuidx; | |
1426 | int r; | |
1427 | ||
1428 | INIT_LIST_HEAD(&ctx->validate_list); | |
1429 | for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { | |
1430 | pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); | |
1431 | if (!pdd) { | |
1432 | pr_debug("failed to find device idx %d\n", gpuidx); | |
1433 | return -EINVAL; | |
1434 | } | |
f80fe9d3 FK |
1435 | vm = drm_priv_to_vm(pdd->drm_priv); |
1436 | ||
391629bd | 1437 | ctx->tv[gpuidx].bo = &vm->root.bo->tbo; |
f80fe9d3 FK |
1438 | ctx->tv[gpuidx].num_shared = 4; |
1439 | list_add(&ctx->tv[gpuidx].head, &ctx->validate_list); | |
1440 | } | |
1441 | ||
1442 | r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list, | |
1443 | ctx->intr, NULL); | |
1444 | if (r) { | |
1445 | pr_debug("failed %d to reserve bo\n", r); | |
1446 | return r; | |
1447 | } | |
1448 | ||
1449 | for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { | |
1450 | pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); | |
1451 | if (!pdd) { | |
1452 | pr_debug("failed to find device idx %d\n", gpuidx); | |
1453 | r = -EINVAL; | |
1454 | goto unreserve_out; | |
1455 | } | |
f80fe9d3 | 1456 | |
56c5977e GS |
1457 | r = amdgpu_vm_validate_pt_bos(pdd->dev->adev, |
1458 | drm_priv_to_vm(pdd->drm_priv), | |
f80fe9d3 FK |
1459 | svm_range_bo_validate, NULL); |
1460 | if (r) { | |
1461 | pr_debug("failed %d validate pt bos\n", r); | |
1462 | goto unreserve_out; | |
1463 | } | |
1464 | } | |
1465 | ||
1466 | return 0; | |
1467 | ||
1468 | unreserve_out: | |
1469 | ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); | |
1470 | return r; | |
1471 | } | |
1472 | ||
1473 | static void svm_range_unreserve_bos(struct svm_validate_context *ctx) | |
1474 | { | |
1475 | ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); | |
1476 | } | |
1477 | ||
1fc160cf AS |
1478 | static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) |
1479 | { | |
1480 | struct kfd_process_device *pdd; | |
1fc160cf AS |
1481 | |
1482 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); | |
1fc160cf | 1483 | |
56c5977e | 1484 | return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev); |
1fc160cf AS |
1485 | } |
1486 | ||
b1c46c7d PY |
1487 | /* |
1488 | * Validation+GPU mapping with concurrent invalidation (MMU notifiers) | |
1489 | * | |
1490 | * To prevent concurrent destruction or change of range attributes, the | |
1491 | * svm_read_lock must be held. The caller must not hold the svm_write_lock | |
1492 | * because that would block concurrent evictions and lead to deadlocks. To | |
1493 | * serialize concurrent migrations or validations of the same range, the | |
1494 | * prange->migrate_mutex must be held. | |
1495 | * | |
1496 | * For VRAM ranges, the SVM BO must be allocated and valid (protected by its | |
1497 | * eviction fence). | |
1498 | * | |
1499 | * The following sequence ensures race-free validation and GPU mapping: | |
1500 | * | |
1501 | * 1. Reserve page table (and SVM BO if range is in VRAM) | |
1502 | * 2. hmm_range_fault to get page addresses (if system memory) | |
1503 | * 3. DMA-map pages (if system memory) | |
1504 | * 4-a. Take notifier lock | |
1505 | * 4-b. Check that the pages are still valid (mmu_interval_read_retry) | |
1506 | * 4-c. Check that the range was not split or otherwise invalidated | |
1507 | * 4-d. Update GPU page table | |
1508 | * 4-e. Release notifier lock | |
1509 | * 5. Release page table (and SVM BO) reservation | |
1510 | */ | |
1511 | static int svm_range_validate_and_map(struct mm_struct *mm, | |
601354f3 PY |
1512 | struct svm_range *prange, int32_t gpuidx, |
1513 | bool intr, bool wait, bool flush_tlb) | |
b1c46c7d | 1514 | { |
f80fe9d3 | 1515 | struct svm_validate_context ctx; |
2f617f4d | 1516 | unsigned long start, end, addr; |
1fc160cf AS |
1517 | struct kfd_process *p; |
1518 | void *owner; | |
1519 | int32_t idx; | |
b1c46c7d PY |
1520 | int r = 0; |
1521 | ||
f80fe9d3 FK |
1522 | ctx.process = container_of(prange->svms, struct kfd_process, svms); |
1523 | ctx.prange = prange; | |
1524 | ctx.intr = intr; | |
1525 | ||
1526 | if (gpuidx < MAX_GPU_INSTANCE) { | |
1527 | bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE); | |
1528 | bitmap_set(ctx.bitmap, gpuidx, 1); | |
cda0f85b FK |
1529 | } else if (ctx.process->xnack_enabled) { |
1530 | bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); | |
1531 | ||
1532 | /* If prefetching the range to a GPU, or a GPU retry fault migrated the | |
1533 | * range to a GPU which has the ACCESS attribute for the range, create the | |
1534 | * mapping on that GPU. | |
1535 | */ | |
1536 | if (prange->actual_loc) { | |
1537 | gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process, | |
1538 | prange->actual_loc); | |
1539 | if (gpuidx < 0) { | |
1540 | WARN_ONCE(1, "failed get device by id 0x%x\n", | |
1541 | prange->actual_loc); | |
1542 | return -EINVAL; | |
1543 | } | |
1544 | if (test_bit(gpuidx, prange->bitmap_access)) | |
1545 | bitmap_set(ctx.bitmap, gpuidx, 1); | |
1546 | } | |
f80fe9d3 FK |
1547 | } else { |
1548 | bitmap_or(ctx.bitmap, prange->bitmap_access, | |
1549 | prange->bitmap_aip, MAX_GPU_INSTANCE); | |
1550 | } | |
1551 | ||
601354f3 PY |
1552 | if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) { |
1553 | if (!prange->mapped_to_gpu) | |
1554 | return 0; | |
1555 | ||
1556 | bitmap_copy(ctx.bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); | |
1557 | } | |
f80fe9d3 | 1558 | |
e49fe404 FK |
1559 | if (prange->actual_loc && !prange->ttm_res) { |
1560 | /* This should never happen. actual_loc gets set by | |
1561 | * svm_migrate_ram_to_vram after allocating a BO. | |
1562 | */ | |
a273bc99 | 1563 | WARN_ONCE(1, "VRAM BO missing during validation\n"); |
e49fe404 FK |
1564 | return -EINVAL; |
1565 | } | |
1566 | ||
f80fe9d3 FK |
1567 | 	r = svm_range_reserve_bos(&ctx);
 | 	if (r)
 | 		return r;
1568 | ||
278a7087 AS |
1569 | p = container_of(prange->svms, struct kfd_process, svms); |
1570 | owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap, | |
1571 | MAX_GPU_INSTANCE)); | |
1572 | for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) { | |
1573 | if (kfd_svm_page_owner(p, idx) != owner) { | |
1574 | owner = NULL; | |
1575 | break; | |
f80fe9d3 | 1576 | } |
278a7087 | 1577 | } |
0b0e518d | 1578 | |
2f617f4d PY |
1579 | start = prange->start << PAGE_SHIFT; |
1580 | end = (prange->last + 1) << PAGE_SHIFT; | |
1581 | for (addr = start; addr < end && !r; ) { | |
1582 | struct hmm_range *hmm_range; | |
1583 | struct vm_area_struct *vma; | |
1584 | unsigned long next; | |
1585 | unsigned long offset; | |
1586 | unsigned long npages; | |
1587 | bool readonly; | |
1588 | ||
3a3e841d DW |
1589 | vma = vma_lookup(mm, addr); |
1590 | if (!vma) { | |
2f617f4d PY |
1591 | r = -EFAULT; |
1592 | goto unreserve_out; | |
1593 | } | |
1594 | readonly = !(vma->vm_flags & VM_WRITE); | |
b1c46c7d | 1595 | |
2f617f4d PY |
1596 | next = min(vma->vm_end, end); |
1597 | npages = (next - addr) >> PAGE_SHIFT; | |
a6283010 | 1598 | WRITE_ONCE(p->svms.faulting_task, current); |
d4cbff46 CK |
1599 | r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages, |
1600 | readonly, owner, NULL, | |
1601 | &hmm_range); | |
a6283010 | 1602 | WRITE_ONCE(p->svms.faulting_task, NULL); |
2f617f4d PY |
1603 | if (r) { |
1604 | pr_debug("failed %d to get svm range pages\n", r); | |
1605 | goto unreserve_out; | |
1606 | } | |
278a7087 | 1607 | |
2f617f4d PY |
1608 | offset = (addr - start) >> PAGE_SHIFT; |
1609 | r = svm_range_dma_map(prange, ctx.bitmap, offset, npages, | |
1610 | hmm_range->hmm_pfns); | |
1611 | if (r) { | |
1612 | pr_debug("failed %d to dma map range\n", r); | |
1613 | goto unreserve_out; | |
1614 | } | |
b1c46c7d | 1615 | |
2f617f4d PY |
1616 | svm_range_lock(prange); |
1617 | if (amdgpu_hmm_range_get_pages_done(hmm_range)) { | |
1618 | pr_debug("hmm update the range, need validate again\n"); | |
1619 | r = -EAGAIN; | |
1620 | goto unlock_out; | |
1621 | } | |
1622 | if (!list_empty(&prange->child_list)) { | |
1623 | pr_debug("range split by unmap in parallel, validate again\n"); | |
1624 | r = -EAGAIN; | |
1625 | goto unlock_out; | |
1626 | } | |
1627 | ||
1628 | r = svm_range_map_to_gpus(prange, offset, npages, readonly, | |
601354f3 | 1629 | ctx.bitmap, wait, flush_tlb); |
b1c46c7d PY |
1630 | |
1631 | unlock_out: | |
2f617f4d PY |
1632 | svm_range_unlock(prange); |
1633 | ||
1634 | addr = next; | |
1635 | } | |
1636 | ||
6b9c63a6 | 1637 | if (addr == end) { |
2f617f4d | 1638 | prange->validated_once = true; |
6b9c63a6 PY |
1639 | prange->mapped_to_gpu = true; |
1640 | } | |
2f617f4d | 1641 | |
b1c46c7d | 1642 | unreserve_out: |
f80fe9d3 | 1643 | svm_range_unreserve_bos(&ctx); |
b1c46c7d | 1644 | |
564d2b92 | 1645 | if (!r) |
e0f1e65b | 1646 | prange->validate_timestamp = ktime_get_boottime(); |
564d2b92 | 1647 | |
b1c46c7d PY |
1648 | return r; |
1649 | } | |
1650 | ||
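/*
 * Minimal userspace sketch of the retry scheme that steps 1-5 above describe
 * for svm_range_validate_and_map(). Illustrative only: the demo_* names and
 * the pthread/stdatomic plumbing are assumptions, not driver code. The slow
 * work (faulting and DMA-mapping pages) happens outside the notifier lock,
 * and the result is committed under the lock only if no invalidation bumped
 * the sequence count in the meantime; otherwise the step is retried.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long invalidate_seq_demo;	/* bumped by invalidations */
static pthread_mutex_t notifier_lock_demo = PTHREAD_MUTEX_INITIALIZER;

static unsigned long read_begin_demo(void)
{
        return atomic_load(&invalidate_seq_demo);
}

static int read_retry_demo(unsigned long seq)
{
        return atomic_load(&invalidate_seq_demo) != seq;
}

static int validate_and_map_demo(void)
{
        unsigned long seq;

        do {
                seq = read_begin_demo();
                /* ... fault in pages and dma-map them here (slow path) ... */
                pthread_mutex_lock(&notifier_lock_demo);
                if (read_retry_demo(seq)) {
                        pthread_mutex_unlock(&notifier_lock_demo);
                        continue;       /* pages changed, start over (-EAGAIN) */
                }
                /* ... commit the GPU page table update here ... */
                pthread_mutex_unlock(&notifier_lock_demo);
                return 0;
        } while (1);
}

int main(void)
{
        printf("mapped: %d\n", validate_and_map_demo());
        return 0;
}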
4683cfec PY |
1651 | /** |
1652 | * svm_range_list_lock_and_flush_work - flush pending deferred work | |
1653 | * | |
1654 | * @svms: the svm range list | |
1655 | * @mm: the mm structure | |
1656 | * | |
1657 | * Context: Returns with mmap write lock held, pending deferred work flushed | |
1658 | * | |
1659 | */ | |
6bdfc37b | 1660 | void |
4683cfec PY |
1661 | svm_range_list_lock_and_flush_work(struct svm_range_list *svms, |
1662 | struct mm_struct *mm) | |
1663 | { | |
1664 | retry_flush_work: | |
1665 | flush_work(&svms->deferred_list_work); | |
1666 | mmap_write_lock(mm); | |
1667 | ||
1668 | if (list_empty(&svms->deferred_range_list)) | |
1669 | return; | |
1670 | mmap_write_unlock(mm); | |
1671 | pr_debug("retry flush\n"); | |
1672 | goto retry_flush_work; | |
1673 | } | |
1674 | ||
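/*
 * Illustrative userspace analogue (hypothetical demo_* names, not driver
 * code) of svm_range_list_lock_and_flush_work() above: flush the deferred
 * work, take the lock, and retry the whole sequence if new deferred work
 * sneaked in before the lock was acquired, so the caller returns holding the
 * lock with the deferred list known to be empty.
 */
#include <pthread.h>
#include <stdbool.h>

struct demo_svms {
        pthread_mutex_t lock;
        bool deferred_work_pending;     /* stand-in for the deferred list */
};

/* stand-in for flush_work(): process everything queued so far */
static void demo_flush_deferred_work(struct demo_svms *svms)
{
        pthread_mutex_lock(&svms->lock);
        svms->deferred_work_pending = false;
        pthread_mutex_unlock(&svms->lock);
}

/* Returns with svms->lock held and no deferred work pending. */
static void demo_lock_and_flush_work(struct demo_svms *svms)
{
retry:
        demo_flush_deferred_work(svms);
        pthread_mutex_lock(&svms->lock);
        if (!svms->deferred_work_pending)
                return;                 /* lock held, list empty */
        pthread_mutex_unlock(&svms->lock);
        goto retry;                     /* work raced in, flush again */
}

int main(void)
{
        struct demo_svms svms = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .deferred_work_pending = true,
        };

        demo_lock_and_flush_work(&svms);
        pthread_mutex_unlock(&svms.lock);
        return 0;
}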
8a7c184a FK |
1675 | static void svm_range_restore_work(struct work_struct *work) |
1676 | { | |
1677 | struct delayed_work *dwork = to_delayed_work(work); | |
e433d684 | 1678 | struct amdkfd_process_info *process_info; |
8a7c184a FK |
1679 | struct svm_range_list *svms; |
1680 | struct svm_range *prange; | |
1681 | struct kfd_process *p; | |
1682 | struct mm_struct *mm; | |
1683 | int evicted_ranges; | |
1684 | int invalid; | |
1685 | int r; | |
1686 | ||
1687 | svms = container_of(dwork, struct svm_range_list, restore_work); | |
1688 | evicted_ranges = atomic_read(&svms->evicted_ranges); | |
1689 | if (!evicted_ranges) | |
1690 | return; | |
1691 | ||
1692 | pr_debug("restore svm ranges\n"); | |
1693 | ||
8a7c184a | 1694 | p = container_of(svms, struct kfd_process, svms); |
e433d684 | 1695 | process_info = p->kgd_process_info; |
6225bb3a PY |
1696 | |
1697 | /* Keep an mm reference while svm_range_validate_and_map validates the ranges */ | |
1698 | mm = get_task_mm(p->lead_thread); | |
1699 | if (!mm) { | |
1700 | pr_debug("svms 0x%p process mm gone\n", svms); | |
8a7c184a | 1701 | return; |
6225bb3a | 1702 | } |
8a7c184a | 1703 | |
e433d684 | 1704 | mutex_lock(&process_info->lock); |
8a7c184a FK |
1705 | svm_range_list_lock_and_flush_work(svms, mm); |
1706 | mutex_lock(&svms->lock); | |
1707 | ||
1708 | evicted_ranges = atomic_read(&svms->evicted_ranges); | |
1709 | ||
1710 | list_for_each_entry(prange, &svms->list, list) { | |
1711 | invalid = atomic_read(&prange->invalid); | |
1712 | if (!invalid) | |
1713 | continue; | |
1714 | ||
1715 | pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n", | |
1716 | prange->svms, prange, prange->start, prange->last, | |
1717 | invalid); | |
1718 | ||
0b0e518d FK |
1719 | /* |
1720 | * If the range is migrating, wait until the migration is done. | |
1721 | */ | |
1722 | mutex_lock(&prange->migrate_mutex); | |
1723 | ||
8a7c184a | 1724 | r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, |
601354f3 | 1725 | false, true, false); |
0b0e518d | 1726 | if (r) |
8a7c184a FK |
1727 | pr_debug("failed %d to map 0x%lx to gpus\n", r, |
1728 | prange->start); | |
0b0e518d FK |
1729 | |
1730 | mutex_unlock(&prange->migrate_mutex); | |
1731 | if (r) | |
1732 | goto out_reschedule; | |
8a7c184a FK |
1733 | |
1734 | if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid) | |
0b0e518d | 1735 | goto out_reschedule; |
8a7c184a FK |
1736 | } |
1737 | ||
1738 | if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) != | |
1739 | evicted_ranges) | |
0b0e518d | 1740 | goto out_reschedule; |
8a7c184a FK |
1741 | |
1742 | evicted_ranges = 0; | |
1743 | ||
1744 | r = kgd2kfd_resume_mm(mm); | |
1745 | if (r) { | |
1746 | /* No recovery from this failure. Probably the CP is | |
1747 | * hanging. No point trying again. | |
1748 | */ | |
1749 | pr_debug("failed %d to resume KFD\n", r); | |
1750 | } | |
1751 | ||
1752 | pr_debug("restore svm ranges successfully\n"); | |
1753 | ||
0b0e518d | 1754 | out_reschedule: |
8a7c184a FK |
1755 | mutex_unlock(&svms->lock); |
1756 | mmap_write_unlock(mm); | |
e433d684 | 1757 | mutex_unlock(&process_info->lock); |
8a7c184a FK |
1758 | |
1759 | /* If validation failed, reschedule another attempt */ | |
1760 | if (evicted_ranges) { | |
1761 | pr_debug("reschedule to restore svm range\n"); | |
1762 | schedule_delayed_work(&svms->restore_work, | |
1763 | msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); | |
c7f21978 PY |
1764 | |
1765 | kfd_smi_event_queue_restore_rescheduled(mm); | |
8a7c184a | 1766 | } |
c7f21978 | 1767 | mmput(mm); |
8a7c184a FK |
1768 | } |
1769 | ||
1770 | /** | |
1771 | * svm_range_evict - evict svm range | |
bbe04dec IB |
1772 | * @prange: svm range structure |
1773 | * @mm: current process mm_struct | |
1774 | * @start: first page of the invalidated range (page address) | |
1775 | * @last: last page of the invalidated range (page address) | |
8a7c184a FK |
1776 | * |
1777 | * Stop all queues of the process to ensure the GPU doesn't access the memory, | |
1778 | * then return to let the CPU evict the buffer and proceed with the CPU page table update. | |
1779 | * | |
1780 | * No lock is needed to synchronize CPU page table invalidation with GPU | |
1781 | * execution. If an invalidation happens while the restore work is running, | |
1782 | * the restore work restarts to pick up the latest CPU page mapping for the | |
1783 | * GPU, then starts the queues. | |
1784 | */ | |
1785 | static int | |
1786 | svm_range_evict(struct svm_range *prange, struct mm_struct *mm, | |
46ae2af9 PY |
1787 | unsigned long start, unsigned long last, |
1788 | enum mmu_notifier_event event) | |
8a7c184a FK |
1789 | { |
1790 | struct svm_range_list *svms = prange->svms; | |
9e4a91cd | 1791 | struct svm_range *pchild; |
90d7d3ed | 1792 | struct kfd_process *p; |
8a7c184a FK |
1793 | int r = 0; |
1794 | ||
90d7d3ed | 1795 | p = container_of(svms, struct kfd_process, svms); |
8a7c184a | 1796 | |
90d7d3ed FK |
1797 | pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n", |
1798 | svms, prange->start, prange->last, start, last); | |
8a7c184a | 1799 | |
7d261c50 EH |
1800 | if (!p->xnack_enabled || |
1801 | (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) { | |
90d7d3ed | 1802 | int evicted_ranges; |
f72fc9bd | 1803 | bool mapped = prange->mapped_to_gpu; |
8a7c184a | 1804 | |
9e4a91cd | 1805 | list_for_each_entry(pchild, &prange->child_list, child_list) { |
f72fc9bd EH |
1806 | if (!pchild->mapped_to_gpu) |
1807 | continue; | |
1808 | mapped = true; | |
9e4a91cd AS |
1809 | mutex_lock_nested(&pchild->lock, 1); |
1810 | if (pchild->start <= last && pchild->last >= start) { | |
1811 | pr_debug("increment pchild invalid [0x%lx 0x%lx]\n", | |
1812 | pchild->start, pchild->last); | |
1813 | atomic_inc(&pchild->invalid); | |
1814 | } | |
1815 | mutex_unlock(&pchild->lock); | |
1816 | } | |
1817 | ||
f72fc9bd EH |
1818 | if (!mapped) |
1819 | return r; | |
1820 | ||
9e4a91cd AS |
1821 | if (prange->start <= last && prange->last >= start) |
1822 | atomic_inc(&prange->invalid); | |
1823 | ||
90d7d3ed FK |
1824 | evicted_ranges = atomic_inc_return(&svms->evicted_ranges); |
1825 | if (evicted_ranges != 1) | |
1826 | return r; | |
1827 | ||
1828 | pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n", | |
1829 | prange->svms, prange->start, prange->last); | |
1830 | ||
1831 | /* First eviction, stop the queues */ | |
c7f21978 | 1832 | r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM); |
90d7d3ed FK |
1833 | if (r) |
1834 | pr_debug("failed to quiesce KFD\n"); | |
1835 | ||
1836 | pr_debug("schedule to restore svm %p ranges\n", svms); | |
1837 | schedule_delayed_work(&svms->restore_work, | |
1838 | msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); | |
1839 | } else { | |
90d7d3ed | 1840 | unsigned long s, l; |
46ae2af9 PY |
1841 | uint32_t trigger; |
1842 | ||
1843 | if (event == MMU_NOTIFY_MIGRATE) | |
1844 | trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE; | |
1845 | else | |
1846 | trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY; | |
90d7d3ed FK |
1847 | |
1848 | pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n", | |
1849 | prange->svms, start, last); | |
1850 | list_for_each_entry(pchild, &prange->child_list, child_list) { | |
1851 | mutex_lock_nested(&pchild->lock, 1); | |
1852 | s = max(start, pchild->start); | |
1853 | l = min(last, pchild->last); | |
1854 | if (l >= s) | |
46ae2af9 | 1855 | svm_range_unmap_from_gpus(pchild, s, l, trigger); |
90d7d3ed FK |
1856 | mutex_unlock(&pchild->lock); |
1857 | } | |
1858 | s = max(start, prange->start); | |
1859 | l = min(last, prange->last); | |
1860 | if (l >= s) | |
46ae2af9 | 1861 | svm_range_unmap_from_gpus(prange, s, l, trigger); |
90d7d3ed | 1862 | } |
8a7c184a FK |
1863 | |
1864 | return r; | |
1865 | } | |
1866 | ||
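/*
 * Illustrative userspace sketch (hypothetical demo_* names, not driver code)
 * of the eviction bookkeeping shared by svm_range_evict() and
 * svm_range_restore_work() above: only the first eviction stops the queues,
 * every eviction bumps the counter, and the restore worker only resumes the
 * queues if the counter did not move while it was revalidating (the
 * compare-and-swap back to 0).
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic int evicted_ranges_demo;

static void evict_demo(void)
{
        if (atomic_fetch_add(&evicted_ranges_demo, 1) + 1 == 1)
                printf("first eviction: stop the queues\n");
}

static bool restore_demo(void)
{
        int seen = atomic_load(&evicted_ranges_demo);

        /* ... revalidate and remap all invalidated ranges here ... */

        /* only resume if no new eviction raced in while restoring */
        if (!atomic_compare_exchange_strong(&evicted_ranges_demo, &seen, 0))
                return false;           /* reschedule the restore work */
        printf("resume the queues\n");
        return true;
}

int main(void)
{
        evict_demo();
        evict_demo();                   /* second eviction: queues already stopped */
        if (!restore_demo())
                restore_demo();         /* retry as the delayed work would */
        return 0;
}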
42de677f PY |
1867 | static struct svm_range *svm_range_clone(struct svm_range *old) |
1868 | { | |
1869 | struct svm_range *new; | |
1870 | ||
f9af3c16 | 1871 | new = svm_range_new(old->svms, old->start, old->last, false); |
42de677f PY |
1872 | if (!new) |
1873 | return NULL; | |
1874 | ||
e49fe404 FK |
1875 | if (old->svm_bo) { |
1876 | new->ttm_res = old->ttm_res; | |
1877 | new->offset = old->offset; | |
1878 | new->svm_bo = svm_range_bo_ref(old->svm_bo); | |
1879 | spin_lock(&new->svm_bo->list_lock); | |
1880 | list_add(&new->svm_bo_list, &new->svm_bo->range_list); | |
1881 | spin_unlock(&new->svm_bo->list_lock); | |
1882 | } | |
42de677f PY |
1883 | new->flags = old->flags; |
1884 | new->preferred_loc = old->preferred_loc; | |
1885 | new->prefetch_loc = old->prefetch_loc; | |
1886 | new->actual_loc = old->actual_loc; | |
1887 | new->granularity = old->granularity; | |
6b9c63a6 | 1888 | new->mapped_to_gpu = old->mapped_to_gpu; |
42de677f PY |
1889 | bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); |
1890 | bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE); | |
1891 | ||
1892 | return new; | |
1893 | } | |
1894 | ||
4959e609 PY |
1895 | void svm_range_set_max_pages(struct amdgpu_device *adev) |
1896 | { | |
1897 | uint64_t max_pages; | |
1898 | uint64_t pages, _pages; | |
1899 | ||
1900 | /* 1/32 VRAM size in pages */ | |
1901 | pages = adev->gmc.real_vram_size >> 17; | |
1902 | pages = clamp(pages, 1ULL << 9, 1ULL << 18); | |
1903 | pages = rounddown_pow_of_two(pages); | |
1904 | do { | |
1905 | max_pages = READ_ONCE(max_svm_range_pages); | |
1906 | _pages = min_not_zero(max_pages, pages); | |
1907 | } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages); | |
1908 | } | |
1909 | ||
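/*
 * Illustrative userspace sketch (hypothetical demo names, not driver code) of
 * the lock-free "min_not_zero + cmpxchg" update used by
 * svm_range_set_max_pages() above: several GPUs may race to publish their
 * 1/32-of-VRAM limit, and the smallest non-zero value wins.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t max_svm_range_pages_demo;       /* 0 means "unset" */

static void set_max_pages_demo(uint64_t pages)
{
        uint64_t old, new;

        old = atomic_load(&max_svm_range_pages_demo);
        do {
                /* keep the smaller non-zero limit */
                new = (old && old < pages) ? old : pages;
        } while (!atomic_compare_exchange_weak(&max_svm_range_pages_demo,
                                               &old, new));
}

int main(void)
{
        set_max_pages_demo(1 << 14);    /* first GPU publishes 16K pages  */
        set_max_pages_demo(1 << 12);    /* smaller GPU lowers it to 4K    */
        set_max_pages_demo(1 << 16);    /* larger GPU does not raise it   */
        printf("0x%llx\n",
               (unsigned long long)atomic_load(&max_svm_range_pages_demo));
        return 0;
}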
5640cb89 PY |
1910 | static int |
1911 | svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last, | |
1912 | uint64_t max_pages, struct list_head *insert_list, | |
1913 | struct list_head *update_list) | |
1914 | { | |
1915 | struct svm_range *prange; | |
1916 | uint64_t l; | |
1917 | ||
1918 | pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n", | |
1919 | max_pages, start, last); | |
1920 | ||
1921 | while (last >= start) { | |
1922 | l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1); | |
1923 | ||
f9af3c16 | 1924 | prange = svm_range_new(svms, start, l, true); |
5640cb89 PY |
1925 | if (!prange) |
1926 | return -ENOMEM; | |
1927 | list_add(&prange->list, insert_list); | |
1928 | list_add(&prange->update_list, update_list); | |
1929 | ||
1930 | start = l + 1; | |
1931 | } | |
1932 | return 0; | |
1933 | } | |
1934 | ||
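/*
 * Illustrative userspace sketch (hypothetical demo names, not driver code) of
 * the chunking arithmetic in svm_range_split_new() above: a [start, last]
 * page range is cut at max_pages-aligned boundaries (max_pages is a power of
 * two) so that no new range crosses an alignment boundary or exceeds
 * max_pages pages.
 */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN_U64(x, a) ((x) & ~((a) - 1))         /* a: power of two */

static void split_demo(uint64_t start, uint64_t last, uint64_t max_pages)
{
        while (last >= start) {
                uint64_t l = ALIGN_DOWN_U64(start + max_pages, max_pages) - 1;

                if (l > last)
                        l = last;
                printf("new range [0x%llx 0x%llx]\n",
                       (unsigned long long)start, (unsigned long long)l);
                start = l + 1;
        }
}

int main(void)
{
        /* 0x500 pages starting at 0x300 with a 0x400-page limit split into
         * [0x300 0x3ff] and [0x400 0x7ff].
         */
        split_demo(0x300, 0x7ff, 0x400);
        return 0;
}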
42de677f | 1935 | /** |
726be406 FK |
1936 | * svm_range_add - add svm range and handle overlap |
1937 | * @p: the process whose svms the range is added to | |
1938 | * @start: page size aligned | |
1939 | * @size: page size aligned | |
1940 | * @nattr: number of attributes | |
1941 | * @attrs: array of attributes | |
1942 | * @update_list: output, the ranges need validate and update GPU mapping | |
1943 | * @insert_list: output, the ranges need insert to svms | |
1944 | * @remove_list: output, the ranges are replaced and need remove from svms | |
42de677f | 1945 | * |
726be406 FK |
1946 | * Check if the virtual address range has overlap with any existing ranges, |
1947 | * split partly overlapping ranges and add new ranges in the gaps. All changes | |
1948 | * should be applied to the range_list and interval tree transactionally. If | |
1949 | * any range split or allocation fails, the entire update fails. Therefore any | |
1950 | * existing overlapping svm_ranges are cloned and the original svm_ranges left | |
1951 | * unchanged. | |
42de677f | 1952 | * |
726be406 FK |
1953 | * If the transaction succeeds, the caller can update and insert clones and |
1954 | * new ranges, then free the originals. | |
42de677f | 1955 | * |
726be406 FK |
1956 | * Otherwise the caller can free the clones and new ranges, while the old |
1957 | * svm_ranges remain unchanged. | |
1958 | * | |
1959 | * Context: Process context, caller must hold svms->lock | |
1960 | * | |
1961 | * Return: | |
1962 | * 0 - OK, otherwise error code | |
42de677f PY |
1963 | */ |
1964 | static int | |
726be406 FK |
1965 | svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, |
1966 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, | |
1967 | struct list_head *update_list, struct list_head *insert_list, | |
1968 | struct list_head *remove_list) | |
42de677f | 1969 | { |
726be406 FK |
1970 | unsigned long last = start + size - 1UL; |
1971 | struct svm_range_list *svms = &p->svms; | |
42de677f PY |
1972 | struct interval_tree_node *node; |
1973 | struct svm_range *prange; | |
1974 | struct svm_range *tmp; | |
f9af3c16 | 1975 | struct list_head new_list; |
42de677f PY |
1976 | int r = 0; |
1977 | ||
726be406 FK |
1978 | pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last); |
1979 | ||
42de677f PY |
1980 | INIT_LIST_HEAD(update_list); |
1981 | INIT_LIST_HEAD(insert_list); | |
1982 | INIT_LIST_HEAD(remove_list); | |
f9af3c16 | 1983 | INIT_LIST_HEAD(&new_list); |
42de677f PY |
1984 | |
1985 | node = interval_tree_iter_first(&svms->objects, start, last); | |
1986 | while (node) { | |
1987 | struct interval_tree_node *next; | |
42de677f PY |
1988 | unsigned long next_start; |
1989 | ||
1990 | pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start, | |
1991 | node->last); | |
1992 | ||
4853cbcd | 1993 | prange = container_of(node, struct svm_range, it_node); |
42de677f PY |
1994 | next = interval_tree_iter_next(node, start, last); |
1995 | next_start = min(node->last, last) + 1; | |
1996 | ||
4853cbcd FK |
1997 | if (svm_range_is_same_attrs(p, prange, nattr, attrs)) { |
1998 | /* nothing to do */ | |
1999 | } else if (node->start < start || node->last > last) { | |
2000 | /* node intersects the update range and its attributes | |
2001 | * will change. Clone and split it, apply updates only | |
2002 | * to the overlapping part | |
2003 | */ | |
2004 | struct svm_range *old = prange; | |
2005 | ||
42de677f PY |
2006 | prange = svm_range_clone(old); |
2007 | if (!prange) { | |
2008 | r = -ENOMEM; | |
2009 | goto out; | |
2010 | } | |
2011 | ||
b121862c | 2012 | list_add(&old->update_list, remove_list); |
ef3b4137 | 2013 | list_add(&prange->list, insert_list); |
4853cbcd | 2014 | list_add(&prange->update_list, update_list); |
42de677f PY |
2015 | |
2016 | if (node->start < start) { | |
2017 | pr_debug("change old range start\n"); | |
726be406 | 2018 | r = svm_range_split_head(prange, start, |
42de677f PY |
2019 | insert_list); |
2020 | if (r) | |
2021 | goto out; | |
2022 | } | |
2023 | if (node->last > last) { | |
2024 | pr_debug("change old range last\n"); | |
726be406 | 2025 | r = svm_range_split_tail(prange, last, |
42de677f PY |
2026 | insert_list); |
2027 | if (r) | |
2028 | goto out; | |
2029 | } | |
2030 | } else { | |
2031 | /* The node is contained within start..last, | |
2032 | * just update it | |
2033 | */ | |
42de677f | 2034 | list_add(&prange->update_list, update_list); |
4853cbcd | 2035 | } |
42de677f PY |
2036 | |
2037 | /* insert a new node if needed */ | |
2038 | if (node->start > start) { | |
5640cb89 PY |
2039 | r = svm_range_split_new(svms, start, node->start - 1, |
2040 | READ_ONCE(max_svm_range_pages), | |
f9af3c16 | 2041 | &new_list, update_list); |
5640cb89 | 2042 | if (r) |
42de677f | 2043 | goto out; |
42de677f PY |
2044 | } |
2045 | ||
2046 | node = next; | |
2047 | start = next_start; | |
2048 | } | |
2049 | ||
726be406 | 2050 | /* add a final range at the end if needed */ |
5640cb89 PY |
2051 | if (start <= last) |
2052 | r = svm_range_split_new(svms, start, last, | |
2053 | READ_ONCE(max_svm_range_pages), | |
f9af3c16 | 2054 | &new_list, update_list); |
42de677f PY |
2055 | |
2056 | out: | |
f9af3c16 | 2057 | if (r) { |
ef3b4137 | 2058 | list_for_each_entry_safe(prange, tmp, insert_list, list) |
f9af3c16 AS |
2059 | svm_range_free(prange, false); |
2060 | list_for_each_entry_safe(prange, tmp, &new_list, list) | |
2061 | svm_range_free(prange, true); | |
2062 | } else { | |
2063 | list_splice(&new_list, insert_list); | |
2064 | } | |
42de677f PY |
2065 | |
2066 | return r; | |
2067 | } | |
2068 | ||
4683cfec PY |
2069 | static void |
2070 | svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, | |
2071 | struct svm_range *prange) | |
2072 | { | |
2073 | unsigned long start; | |
2074 | unsigned long last; | |
2075 | ||
2076 | start = prange->notifier.interval_tree.start >> PAGE_SHIFT; | |
2077 | last = prange->notifier.interval_tree.last >> PAGE_SHIFT; | |
2078 | ||
2079 | if (prange->start == start && prange->last == last) | |
2080 | return; | |
2081 | ||
2082 | pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", | |
2083 | prange->svms, prange, start, last, prange->start, | |
2084 | prange->last); | |
2085 | ||
2086 | if (start != 0 && last != 0) { | |
2087 | interval_tree_remove(&prange->it_node, &prange->svms->objects); | |
2088 | svm_range_remove_notifier(prange); | |
2089 | } | |
2090 | prange->it_node.start = prange->start; | |
2091 | prange->it_node.last = prange->last; | |
2092 | ||
2093 | interval_tree_insert(&prange->it_node, &prange->svms->objects); | |
2094 | svm_range_add_notifier_locked(mm, prange); | |
2095 | } | |
2096 | ||
2097 | static void | |
367c9b0f PY |
2098 | svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange, |
2099 | struct mm_struct *mm) | |
4683cfec | 2100 | { |
4683cfec PY |
2101 | switch (prange->work_item.op) { |
2102 | case SVM_OP_NULL: | |
2103 | pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n", | |
2104 | svms, prange, prange->start, prange->last); | |
2105 | break; | |
2106 | case SVM_OP_UNMAP_RANGE: | |
2107 | pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n", | |
2108 | svms, prange, prange->start, prange->last); | |
2109 | svm_range_unlink(prange); | |
2110 | svm_range_remove_notifier(prange); | |
f9af3c16 | 2111 | svm_range_free(prange, true); |
4683cfec PY |
2112 | break; |
2113 | case SVM_OP_UPDATE_RANGE_NOTIFIER: | |
2114 | pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n", | |
2115 | svms, prange, prange->start, prange->last); | |
2116 | svm_range_update_notifier_and_interval_tree(mm, prange); | |
2117 | break; | |
90d7d3ed FK |
2118 | case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: |
2119 | pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", | |
2120 | svms, prange, prange->start, prange->last); | |
2121 | svm_range_update_notifier_and_interval_tree(mm, prange); | |
2122 | /* TODO: implement deferred validation and mapping */ | |
2123 | break; | |
4683cfec PY |
2124 | case SVM_OP_ADD_RANGE: |
2125 | pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange, | |
2126 | prange->start, prange->last); | |
2127 | svm_range_add_to_svms(prange); | |
2128 | svm_range_add_notifier_locked(mm, prange); | |
2129 | break; | |
90d7d3ed FK |
2130 | case SVM_OP_ADD_RANGE_AND_MAP: |
2131 | pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, | |
2132 | prange, prange->start, prange->last); | |
2133 | svm_range_add_to_svms(prange); | |
2134 | svm_range_add_notifier_locked(mm, prange); | |
2135 | /* TODO: implement deferred validation and mapping */ | |
2136 | break; | |
4683cfec PY |
2137 | default: |
2138 | WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange, | |
2139 | prange->work_item.op); | |
2140 | } | |
2141 | } | |
2142 | ||
373e3ccd PY |
2143 | static void svm_range_drain_retry_fault(struct svm_range_list *svms) |
2144 | { | |
2145 | struct kfd_process_device *pdd; | |
373e3ccd | 2146 | struct kfd_process *p; |
2e447728 | 2147 | int drain; |
373e3ccd PY |
2148 | uint32_t i; |
2149 | ||
2150 | p = container_of(svms, struct kfd_process, svms); | |
2151 | ||
2e447728 PY |
2152 | restart: |
2153 | drain = atomic_read(&svms->drain_pagefaults); | |
2154 | if (!drain) | |
2155 | return; | |
2156 | ||
5a75ea56 | 2157 | for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { |
373e3ccd PY |
2158 | pdd = p->pdds[i]; |
2159 | if (!pdd) | |
2160 | continue; | |
2161 | ||
2162 | pr_debug("drain retry fault gpu %d svms %p\n", i, svms); | |
373e3ccd | 2163 | |
3c2d6ea2 | 2164 | amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, |
56c5977e | 2165 | &pdd->dev->adev->irq.ih1); |
373e3ccd PY |
2166 | pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); |
2167 | } | |
2e447728 PY |
2168 | if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) |
2169 | goto restart; | |
373e3ccd PY |
2170 | } |
2171 | ||
4683cfec PY |
2172 | static void svm_range_deferred_list_work(struct work_struct *work) |
2173 | { | |
2174 | struct svm_range_list *svms; | |
2175 | struct svm_range *prange; | |
2176 | struct mm_struct *mm; | |
2177 | ||
2178 | svms = container_of(work, struct svm_range_list, deferred_list_work); | |
2179 | pr_debug("enter svms 0x%p\n", svms); | |
2180 | ||
2181 | spin_lock(&svms->deferred_list_lock); | |
2182 | while (!list_empty(&svms->deferred_range_list)) { | |
2183 | prange = list_first_entry(&svms->deferred_range_list, | |
2184 | struct svm_range, deferred_list); | |
2185 | spin_unlock(&svms->deferred_list_lock); | |
a0c55ece | 2186 | |
4683cfec PY |
2187 | pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange, |
2188 | prange->start, prange->last, prange->work_item.op); | |
2189 | ||
367c9b0f PY |
2190 | mm = prange->work_item.mm; |
2191 | retry: | |
2192 | mmap_write_lock(mm); | |
2193 | ||
2194 | /* Checking for the need to drain retry faults must be inside | |
2195 | * mmap write lock to serialize with munmap notifiers. | |
2196 | */ | |
2197 | if (unlikely(atomic_read(&svms->drain_pagefaults))) { | |
2198 | mmap_write_unlock(mm); | |
2199 | svm_range_drain_retry_fault(svms); | |
2200 | goto retry; | |
2201 | } | |
2202 | ||
2203 | /* Remove from deferred_list must be inside mmap write lock, for | |
2204 | * two race cases: | |
2205 | * 1. unmap_from_cpu may change work_item.op and add the range | |
2206 | * to deferred_list again, causing a use-after-free bug. | |
2207 | * 2. svm_range_list_lock_and_flush_work may hold mmap write | |
2208 | * lock and continue because deferred_list is empty, but | |
2209 | * deferred_list work is actually waiting for mmap lock. | |
2210 | */ | |
2211 | spin_lock(&svms->deferred_list_lock); | |
2212 | list_del_init(&prange->deferred_list); | |
2213 | spin_unlock(&svms->deferred_list_lock); | |
2214 | ||
4683cfec | 2215 | mutex_lock(&svms->lock); |
0b0e518d | 2216 | mutex_lock(&prange->migrate_mutex); |
4683cfec PY |
2217 | while (!list_empty(&prange->child_list)) { |
2218 | struct svm_range *pchild; | |
2219 | ||
2220 | pchild = list_first_entry(&prange->child_list, | |
2221 | struct svm_range, child_list); | |
2222 | pr_debug("child prange 0x%p op %d\n", pchild, | |
2223 | pchild->work_item.op); | |
2224 | list_del_init(&pchild->child_list); | |
367c9b0f | 2225 | svm_range_handle_list_op(svms, pchild, mm); |
4683cfec | 2226 | } |
0b0e518d | 2227 | mutex_unlock(&prange->migrate_mutex); |
4683cfec | 2228 | |
367c9b0f | 2229 | svm_range_handle_list_op(svms, prange, mm); |
4683cfec | 2230 | mutex_unlock(&svms->lock); |
367c9b0f PY |
2231 | mmap_write_unlock(mm); |
2232 | ||
2233 | /* Pairs with mmget in svm_range_add_list_work */ | |
2234 | mmput(mm); | |
4683cfec PY |
2235 | |
2236 | spin_lock(&svms->deferred_list_lock); | |
2237 | } | |
2238 | spin_unlock(&svms->deferred_list_lock); | |
4683cfec PY |
2239 | pr_debug("exit svms 0x%p\n", svms); |
2240 | } | |
2241 | ||
48ff079b | 2242 | void |
4683cfec PY |
2243 | svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, |
2244 | struct mm_struct *mm, enum svm_work_list_ops op) | |
2245 | { | |
2246 | spin_lock(&svms->deferred_list_lock); | |
2247 | /* if prange is on the deferred list */ | |
2248 | if (!list_empty(&prange->deferred_list)) { | |
2249 | pr_debug("update exist prange 0x%p work op %d\n", prange, op); | |
2250 | WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n"); | |
2251 | if (op != SVM_OP_NULL && | |
2252 | prange->work_item.op != SVM_OP_UNMAP_RANGE) | |
2253 | prange->work_item.op = op; | |
2254 | } else { | |
2255 | prange->work_item.op = op; | |
367c9b0f PY |
2256 | |
2257 | /* Pairs with mmput in deferred_list_work */ | |
2258 | mmget(mm); | |
4683cfec PY |
2259 | prange->work_item.mm = mm; |
2260 | list_add_tail(&prange->deferred_list, | |
2261 | &prange->svms->deferred_range_list); | |
2262 | pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", | |
2263 | prange, prange->start, prange->last, op); | |
2264 | } | |
2265 | spin_unlock(&svms->deferred_list_lock); | |
2266 | } | |
2267 | ||
48ff079b | 2268 | void schedule_deferred_list_work(struct svm_range_list *svms) |
4683cfec PY |
2269 | { |
2270 | spin_lock(&svms->deferred_list_lock); | |
2271 | if (!list_empty(&svms->deferred_range_list)) | |
2272 | schedule_work(&svms->deferred_list_work); | |
2273 | spin_unlock(&svms->deferred_list_lock); | |
2274 | } | |
2275 | ||
2276 | static void | |
2277 | svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, | |
2278 | struct svm_range *prange, unsigned long start, | |
2279 | unsigned long last) | |
2280 | { | |
2281 | struct svm_range *head; | |
2282 | struct svm_range *tail; | |
2283 | ||
2284 | if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { | |
2285 | pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange, | |
2286 | prange->start, prange->last); | |
2287 | return; | |
2288 | } | |
2289 | if (start > prange->last || last < prange->start) | |
2290 | return; | |
2291 | ||
2292 | head = tail = prange; | |
2293 | if (start > prange->start) | |
2294 | svm_range_split(prange, prange->start, start - 1, &tail); | |
2295 | if (last < tail->last) | |
2296 | svm_range_split(tail, last + 1, tail->last, &head); | |
2297 | ||
2298 | if (head != prange && tail != prange) { | |
2299 | svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); | |
2300 | svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); | |
2301 | } else if (tail != prange) { | |
2302 | svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); | |
2303 | } else if (head != prange) { | |
2304 | svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); | |
2305 | } else if (parent != prange) { | |
2306 | prange->work_item.op = SVM_OP_UNMAP_RANGE; | |
2307 | } | |
2308 | } | |
2309 | ||
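/*
 * Illustrative sketch (hypothetical demo names, not driver code) of the
 * interval arithmetic behind svm_range_unmap_split() above: unmapping [s, l]
 * out of a mapped range [start, last] leaves at most a head piece before s
 * and a tail piece after l, while the part overlapping [s, l] is the one to
 * remove. The driver additionally records these pieces as child ranges with
 * SVM_OP_UNMAP_RANGE/SVM_OP_ADD_RANGE work items, which is not shown here.
 */
#include <stdio.h>

static void unmap_split_demo(unsigned long start, unsigned long last,
                             unsigned long s, unsigned long l)
{
        if (s > last || l < start) {
                printf("no overlap, nothing to do\n");
                return;
        }
        if (s > start)
                printf("keep head  [0x%lx 0x%lx]\n", start, s - 1);
        if (l < last)
                printf("keep tail  [0x%lx 0x%lx]\n", l + 1, last);
        printf("unmap part [0x%lx 0x%lx]\n",
               s > start ? s : start, l < last ? l : last);
}

int main(void)
{
        /* unmap the middle of [0x1000 0x1fff] */
        unmap_split_demo(0x1000, 0x1fff, 0x1400, 0x17ff);
        return 0;
}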
2310 | static void | |
2311 | svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, | |
2312 | unsigned long start, unsigned long last) | |
2313 | { | |
46ae2af9 | 2314 | uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU; |
4683cfec PY |
2315 | struct svm_range_list *svms; |
2316 | struct svm_range *pchild; | |
2317 | struct kfd_process *p; | |
f80fe9d3 | 2318 | unsigned long s, l; |
4683cfec PY |
2319 | bool unmap_parent; |
2320 | ||
2321 | p = kfd_lookup_process_by_mm(mm); | |
2322 | if (!p) | |
2323 | return; | |
2324 | svms = &p->svms; | |
2325 | ||
2326 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms, | |
2327 | prange, prange->start, prange->last, start, last); | |
2328 | ||
2e447728 PY |
2329 | /* Make sure pending page faults are drained in the deferred worker |
2330 | * before the range is freed to avoid straggler interrupts on | |
2331 | * unmapped memory causing "phantom faults". | |
2332 | */ | |
2333 | atomic_inc(&svms->drain_pagefaults); | |
2334 | ||
4683cfec PY |
2335 | unmap_parent = start <= prange->start && last >= prange->last; |
2336 | ||
f80fe9d3 FK |
2337 | list_for_each_entry(pchild, &prange->child_list, child_list) { |
2338 | mutex_lock_nested(&pchild->lock, 1); | |
2339 | s = max(start, pchild->start); | |
2340 | l = min(last, pchild->last); | |
2341 | if (l >= s) | |
46ae2af9 | 2342 | svm_range_unmap_from_gpus(pchild, s, l, trigger); |
4683cfec | 2343 | svm_range_unmap_split(mm, prange, pchild, start, last); |
f80fe9d3 FK |
2344 | mutex_unlock(&pchild->lock); |
2345 | } | |
2346 | s = max(start, prange->start); | |
2347 | l = min(last, prange->last); | |
2348 | if (l >= s) | |
46ae2af9 | 2349 | svm_range_unmap_from_gpus(prange, s, l, trigger); |
4683cfec PY |
2350 | svm_range_unmap_split(mm, prange, prange, start, last); |
2351 | ||
2352 | if (unmap_parent) | |
2353 | svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); | |
2354 | else | |
2355 | svm_range_add_list_work(svms, prange, mm, | |
2356 | SVM_OP_UPDATE_RANGE_NOTIFIER); | |
2357 | schedule_deferred_list_work(svms); | |
2358 | ||
2359 | kfd_unref_process(p); | |
2360 | } | |
2361 | ||
b1c46c7d PY |
2362 | /** |
2363 | * svm_range_cpu_invalidate_pagetables - interval notifier callback | |
bbe04dec IB |
2364 | * @mni: mmu_interval_notifier struct |
2365 | * @range: mmu_notifier_range struct | |
2366 | * @cur_seq: value to pass to mmu_interval_set_seq() | |
b1c46c7d | 2367 | * |
48ff079b FK |
2368 | * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it |
2369 | * is from migration, or CPU page invalidation callback. | |
2370 | * | |
2371 | * For unmap event, unmap range from GPUs, remove prange from svms in a delayed | |
2372 | * work thread, and split prange if only part of prange is unmapped. | |
8a7c184a | 2373 | * |
48ff079b FK |
2374 | * For invalidation event, if GPU retry fault is not enabled, evict the queues, |
2375 | * then schedule svm_range_restore_work to update GPU mapping and resume queues. | |
2376 | * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will | |
2377 | * update GPU mapping to recover. | |
2378 | * | |
2379 | * Context: mmap lock, notifier_invalidate_start lock are held | |
2380 | * for invalidate event, prange lock is held if this is from migration | |
b1c46c7d PY |
2381 | */ |
2382 | static bool | |
2383 | svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, | |
2384 | const struct mmu_notifier_range *range, | |
2385 | unsigned long cur_seq) | |
2386 | { | |
4683cfec PY |
2387 | struct svm_range *prange; |
2388 | unsigned long start; | |
2389 | unsigned long last; | |
2390 | ||
2391 | if (range->event == MMU_NOTIFY_RELEASE) | |
2392 | return true; | |
fa582c6f PY |
2393 | if (!mmget_not_zero(mni->mm)) |
2394 | return true; | |
4683cfec PY |
2395 | |
2396 | start = mni->interval_tree.start; | |
2397 | last = mni->interval_tree.last; | |
d999bc81 CD |
2398 | start = max(start, range->start) >> PAGE_SHIFT; |
2399 | last = min(last, range->end - 1) >> PAGE_SHIFT; | |
4683cfec PY |
2400 | pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n", |
2401 | start, last, range->start >> PAGE_SHIFT, | |
2402 | (range->end - 1) >> PAGE_SHIFT, | |
2403 | mni->interval_tree.start >> PAGE_SHIFT, | |
2404 | mni->interval_tree.last >> PAGE_SHIFT, range->event); | |
2405 | ||
2406 | prange = container_of(mni, struct svm_range, notifier); | |
2407 | ||
2408 | svm_range_lock(prange); | |
2409 | mmu_interval_set_seq(mni, cur_seq); | |
2410 | ||
2411 | switch (range->event) { | |
2412 | case MMU_NOTIFY_UNMAP: | |
2413 | svm_range_unmap_from_cpu(mni->mm, prange, start, last); | |
2414 | break; | |
2415 | default: | |
46ae2af9 | 2416 | svm_range_evict(prange, mni->mm, start, last, range->event); |
4683cfec PY |
2417 | break; |
2418 | } | |
2419 | ||
2420 | svm_range_unlock(prange); | |
fa582c6f | 2421 | mmput(mni->mm); |
4683cfec | 2422 | |
b1c46c7d PY |
2423 | return true; |
2424 | } | |
2425 | ||
48ff079b FK |
2426 | /** |
2427 | * svm_range_from_addr - find svm range from fault address | |
2428 | * @svms: svm range list header | |
2429 | * @addr: address to search range interval tree, in pages | |
2430 | * @parent: parent range if range is on child list | |
2431 | * | |
2432 | * Context: The caller must hold svms->lock | |
2433 | * | |
2434 | * Return: the svm_range found or NULL | |
2435 | */ | |
2436 | struct svm_range * | |
2437 | svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, | |
2438 | struct svm_range **parent) | |
2439 | { | |
2440 | struct interval_tree_node *node; | |
2441 | struct svm_range *prange; | |
2442 | struct svm_range *pchild; | |
2443 | ||
2444 | node = interval_tree_iter_first(&svms->objects, addr, addr); | |
2445 | if (!node) | |
2446 | return NULL; | |
2447 | ||
2448 | prange = container_of(node, struct svm_range, it_node); | |
2449 | pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n", | |
2450 | addr, prange->start, prange->last, node->start, node->last); | |
2451 | ||
2452 | if (addr >= prange->start && addr <= prange->last) { | |
2453 | if (parent) | |
2454 | *parent = prange; | |
2455 | return prange; | |
2456 | } | |
2457 | list_for_each_entry(pchild, &prange->child_list, child_list) | |
2458 | if (addr >= pchild->start && addr <= pchild->last) { | |
2459 | pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n", | |
2460 | addr, pchild->start, pchild->last); | |
2461 | if (parent) | |
2462 | *parent = prange; | |
2463 | return pchild; | |
2464 | } | |
2465 | ||
2466 | return NULL; | |
2467 | } | |
2468 | ||
cda0f85b FK |
2469 | /* svm_range_best_restore_location - decide the best fault restore location |
2470 | * @prange: svm range structure | |
2471 | * @adev: the GPU on which vm fault happened | |
2472 | * | |
2473 | * This is only called when xnack is on, to decide the best location to restore | |
2474 | * the range mapping after a GPU vm fault. The caller migrates the range if | |
2475 | * the actual location is not the best location, then updates the GPU page | |
2476 | * table mapping to the best location. | |
2477 | * | |
297753a0 | 2478 | * If the preferred loc is accessible by faulting GPU, use preferred loc. |
cda0f85b FK |
2479 | * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu |
2480 | * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then | |
2481 | * if range actual loc is cpu, best_loc is cpu | |
2482 | * if the vm fault gpu is in the same xgmi hive as the actual loc gpu, | |
2483 | * best_loc is the range actual loc. | |
2484 | * Otherwise, GPU no access, best_loc is -1. | |
2485 | * | |
2486 | * Return: | |
2487 | * -1 means vm fault GPU no access | |
2488 | * 0 for CPU or GPU id | |
2489 | */ | |
2490 | static int32_t | |
2491 | svm_range_best_restore_location(struct svm_range *prange, | |
2492 | struct amdgpu_device *adev, | |
2493 | int32_t *gpuidx) | |
2494 | { | |
297753a0 | 2495 | struct amdgpu_device *bo_adev, *preferred_adev; |
cda0f85b FK |
2496 | struct kfd_process *p; |
2497 | uint32_t gpuid; | |
2498 | int r; | |
2499 | ||
2500 | p = container_of(prange->svms, struct kfd_process, svms); | |
2501 | ||
56c5977e | 2502 | r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); |
cda0f85b FK |
2503 | if (r < 0) { |
2504 | pr_debug("failed to get gpuid from kgd\n"); | |
2505 | return -1; | |
2506 | } | |
2507 | ||
297753a0 FK |
2508 | if (prange->preferred_loc == gpuid || |
2509 | prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { | |
cda0f85b | 2510 | return prange->preferred_loc; |
297753a0 FK |
2511 | } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { |
2512 | preferred_adev = svm_range_get_adev_by_id(prange, | |
2513 | prange->preferred_loc); | |
2514 | if (amdgpu_xgmi_same_hive(adev, preferred_adev)) | |
2515 | return prange->preferred_loc; | |
2516 | /* fall through */ | |
2517 | } | |
cda0f85b FK |
2518 | |
2519 | if (test_bit(*gpuidx, prange->bitmap_access)) | |
2520 | return gpuid; | |
2521 | ||
2522 | if (test_bit(*gpuidx, prange->bitmap_aip)) { | |
2523 | if (!prange->actual_loc) | |
2524 | return 0; | |
2525 | ||
2526 | bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); | |
2527 | if (amdgpu_xgmi_same_hive(adev, bo_adev)) | |
2528 | return prange->actual_loc; | |
2529 | else | |
2530 | return 0; | |
2531 | } | |
2532 | ||
2533 | return -1; | |
2534 | } | |
43fc10c1 | 2535 | |
b19dbb7a AS |
2536 | static int |
2537 | svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, | |
12fcf0a7 FK |
2538 | unsigned long *start, unsigned long *last, |
2539 | bool *is_heap_stack) | |
b19dbb7a AS |
2540 | { |
2541 | struct vm_area_struct *vma; | |
2542 | struct interval_tree_node *node; | |
2543 | unsigned long start_limit, end_limit; | |
2544 | ||
3a3e841d DW |
2545 | vma = vma_lookup(p->mm, addr << PAGE_SHIFT); |
2546 | if (!vma) { | |
b19dbb7a AS |
2547 | pr_debug("VMA does not exist in address [0x%llx]\n", addr); |
2548 | return -EFAULT; | |
2549 | } | |
12fcf0a7 FK |
2550 | |
2551 | *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && | |
2552 | vma->vm_end >= vma->vm_mm->start_brk) || | |
2553 | (vma->vm_start <= vma->vm_mm->start_stack && | |
2554 | vma->vm_end >= vma->vm_mm->start_stack); | |
2555 | ||
b19dbb7a AS |
2556 | start_limit = max(vma->vm_start >> PAGE_SHIFT, |
2557 | (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); | |
2558 | end_limit = min(vma->vm_end >> PAGE_SHIFT, | |
2559 | (unsigned long)ALIGN(addr + 1, 2UL << 8)); | |
2560 | /* First range that starts after the fault address */ | |
2561 | node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX); | |
2562 | if (node) { | |
2563 | end_limit = min(end_limit, node->start); | |
2564 | /* Last range that ends before the fault address */ | |
2565 | node = container_of(rb_prev(&node->rb), | |
2566 | struct interval_tree_node, rb); | |
2567 | } else { | |
2568 | /* Last range must end before addr because | |
2569 | * there was no range after addr | |
2570 | */ | |
2571 | node = container_of(rb_last(&p->svms.objects.rb_root), | |
2572 | struct interval_tree_node, rb); | |
2573 | } | |
2574 | if (node) { | |
2575 | if (node->last >= addr) { | |
2576 | WARN(1, "Overlap with prev node and page fault addr\n"); | |
2577 | return -EFAULT; | |
2578 | } | |
2579 | start_limit = max(start_limit, node->last + 1); | |
2580 | } | |
2581 | ||
2582 | *start = start_limit; | |
2583 | *last = end_limit - 1; | |
2584 | ||
12fcf0a7 FK |
2585 | pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n", |
2586 | vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, | |
2587 | *start, *last, *is_heap_stack); | |
b19dbb7a AS |
2588 | |
2589 | return 0; | |
43fc10c1 PY |
2590 | } |
2591 | ||
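/*
 * Illustrative userspace sketch (hypothetical demo names, not driver code) of
 * the boundary computation in svm_range_get_range_boundaries() above: a new
 * unregistered range is grown around the fault address up to a 512-page
 * (2 MiB with 4K pages) aligned window, clipped to the enclosing VMA. The
 * driver additionally clips against neighbouring registered ranges, which is
 * not shown here.
 */
#include <stdio.h>

#define GRANULARITY_PAGES 512UL         /* 2UL << 8, as in the driver */

static void fault_window_demo(unsigned long addr, unsigned long vma_start,
                              unsigned long vma_end /* exclusive, in pages */)
{
        unsigned long start = addr & ~(GRANULARITY_PAGES - 1);
        unsigned long end = (addr + GRANULARITY_PAGES) & ~(GRANULARITY_PAGES - 1);

        if (start < vma_start)
                start = vma_start;
        if (end > vma_end)
                end = vma_end;
        printf("fault 0x%lx -> range [0x%lx 0x%lx]\n", addr, start, end - 1);
}

int main(void)
{
        /* fault at page 0x1234 inside a VMA spanning pages [0x1000, 0x2000) */
        fault_window_demo(0x1234, 0x1000, 0x2000);
        return 0;
}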
2592 | static int | |
2593 | svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, | |
2594 | uint64_t *bo_s, uint64_t *bo_l) | |
2595 | { | |
2596 | struct amdgpu_bo_va_mapping *mapping; | |
2597 | struct interval_tree_node *node; | |
2598 | struct amdgpu_bo *bo = NULL; | |
2599 | unsigned long userptr; | |
2600 | uint32_t i; | |
2601 | int r; | |
b19dbb7a | 2602 | |
43fc10c1 PY |
2603 | for (i = 0; i < p->n_pdds; i++) { |
2604 | struct amdgpu_vm *vm; | |
2605 | ||
2606 | if (!p->pdds[i]->drm_priv) | |
2607 | continue; | |
2608 | ||
2609 | vm = drm_priv_to_vm(p->pdds[i]->drm_priv); | |
2610 | r = amdgpu_bo_reserve(vm->root.bo, false); | |
2611 | if (r) | |
2612 | return r; | |
2613 | ||
2614 | /* Check userptr by searching entire vm->va interval tree */ | |
2615 | node = interval_tree_iter_first(&vm->va, 0, ~0ULL); | |
2616 | while (node) { | |
2617 | mapping = container_of((struct rb_node *)node, | |
2618 | struct amdgpu_bo_va_mapping, rb); | |
2619 | bo = mapping->bo_va->base.bo; | |
2620 | ||
2621 | if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, | |
2622 | start << PAGE_SHIFT, | |
2623 | last << PAGE_SHIFT, | |
2624 | &userptr)) { | |
2625 | node = interval_tree_iter_next(node, 0, ~0ULL); | |
2626 | continue; | |
2627 | } | |
2628 | ||
2629 | pr_debug("[0x%llx 0x%llx] already userptr mapped\n", | |
2630 | start, last); | |
2631 | if (bo_s && bo_l) { | |
2632 | *bo_s = userptr >> PAGE_SHIFT; | |
2633 | *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; | |
2634 | } | |
2635 | amdgpu_bo_unreserve(vm->root.bo); | |
2636 | return -EADDRINUSE; | |
2637 | } | |
2638 | amdgpu_bo_unreserve(vm->root.bo); | |
2639 | } | |
2640 | return 0; | |
b19dbb7a | 2641 | } |
43fc10c1 | 2642 | |
b19dbb7a AS |
2643 | static struct |
2644 | svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, | |
2645 | struct kfd_process *p, | |
2646 | struct mm_struct *mm, | |
2647 | int64_t addr) | |
2648 | { | |
2649 | struct svm_range *prange = NULL; | |
2650 | unsigned long start, last; | |
2651 | uint32_t gpuid, gpuidx; | |
12fcf0a7 | 2652 | bool is_heap_stack; |
43fc10c1 PY |
2653 | uint64_t bo_s = 0; |
2654 | uint64_t bo_l = 0; | |
2655 | int r; | |
b19dbb7a | 2656 | |
12fcf0a7 FK |
2657 | if (svm_range_get_range_boundaries(p, addr, &start, &last, |
2658 | &is_heap_stack)) | |
b19dbb7a AS |
2659 | return NULL; |
2660 | ||
43fc10c1 PY |
2661 | r = svm_range_check_vm(p, start, last, &bo_s, &bo_l); |
2662 | if (r != -EADDRINUSE) | |
2663 | r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l); | |
2664 | ||
2665 | if (r == -EADDRINUSE) { | |
2666 | if (addr >= bo_s && addr <= bo_l) | |
2667 | return NULL; | |
2668 | ||
2669 | /* Create one page svm range if 2MB range overlapping */ | |
2670 | start = addr; | |
2671 | last = addr; | |
2672 | } | |
2673 | ||
f9af3c16 | 2674 | prange = svm_range_new(&p->svms, start, last, true); |
b19dbb7a | 2675 | if (!prange) { |
a9a76bee | 2676 | pr_debug("Failed to create prange in address [0x%llx]\n", addr); |
b19dbb7a AS |
2677 | return NULL; |
2678 | } | |
56c5977e | 2679 | if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { |
b19dbb7a | 2680 | pr_debug("failed to get gpuid from kgd\n"); |
f9af3c16 | 2681 | svm_range_free(prange, true); |
b19dbb7a AS |
2682 | return NULL; |
2683 | } | |
a9a76bee | 2684 | |
12fcf0a7 FK |
2685 | if (is_heap_stack) |
2686 | prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; | |
2687 | ||
b19dbb7a AS |
2688 | svm_range_add_to_svms(prange); |
2689 | svm_range_add_notifier_locked(mm, prange); | |
2690 | ||
2691 | return prange; | |
2692 | } | |
cda0f85b | 2693 | |
373e3ccd PY |
2694 | /* svm_range_skip_recover - decide if prange can be recovered |
2695 | * @prange: svm range structure | |
2696 | * | |
2697 | * The GPU vm retry fault handler skips recovering the range in these cases: |
2698 | * 1. prange is on the deferred list to be removed after unmap; this is a stale |
2699 | * fault, and the deferred list work will drain it before freeing the prange. |
2700 | * 2. prange is on the deferred list to add the interval notifier after a split, or |
2701 | * 3. prange is a child range split from a parent prange; recover later, |
2702 | * after the interval notifier is added. |
2703 | * | |
2704 | * Return: true to skip recover, false to recover | |
2705 | */ | |
2706 | static bool svm_range_skip_recover(struct svm_range *prange) | |
2707 | { | |
2708 | struct svm_range_list *svms = prange->svms; | |
2709 | ||
2710 | spin_lock(&svms->deferred_list_lock); | |
2711 | if (list_empty(&prange->deferred_list) && | |
2712 | list_empty(&prange->child_list)) { | |
2713 | spin_unlock(&svms->deferred_list_lock); | |
2714 | return false; | |
2715 | } | |
2716 | spin_unlock(&svms->deferred_list_lock); | |
2717 | ||
2718 | if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { | |
2719 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n", | |
2720 | svms, prange, prange->start, prange->last); | |
2721 | return true; | |
2722 | } | |
2723 | if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP || | |
2724 | prange->work_item.op == SVM_OP_ADD_RANGE) { | |
2725 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n", | |
2726 | svms, prange, prange->start, prange->last); | |
2727 | return true; | |
2728 | } | |
2729 | return false; | |
2730 | } | |
2731 | ||
d4ebc200 PY |
2732 | static void |
2733 | svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p, | |
5017bf82 | 2734 | int32_t gpuidx) |
d4ebc200 PY |
2735 | { |
2736 | struct kfd_process_device *pdd; | |
2737 | ||
5017bf82 PY |
2738 | /* fault is on different page of same range |
2739 | * or fault is skipped to recover later | |
2740 | * or fault is on invalid virtual address | |
2741 | */ | |
2742 | if (gpuidx == MAX_GPU_INSTANCE) { | |
2743 | uint32_t gpuid; | |
2744 | int r; | |
d4ebc200 | 2745 | |
56c5977e | 2746 | r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx); |
5017bf82 PY |
2747 | if (r < 0) |
2748 | return; | |
2749 | } | |
2750 | ||
2751 | /* fault is recovered | |
2752 | * or fault cannot recover because GPU no access on the range | |
2753 | */ | |
2754 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); | |
d4ebc200 PY |
2755 | if (pdd) |
2756 | WRITE_ONCE(pdd->faults, pdd->faults + 1); | |
2757 | } | |
2758 | ||
ff891a2e | 2759 | static bool |
7ad153db | 2760 | svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) |
ff891a2e PY |
2761 | { |
2762 | unsigned long requested = VM_READ; | |
ff891a2e PY |
2763 | |
2764 | if (write_fault) | |
2765 | requested |= VM_WRITE; | |
2766 | ||
ff891a2e PY |
2767 | pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested, |
2768 | vma->vm_flags); | |
2769 | return (vma->vm_flags & requested) == requested; | |
2770 | } | |
2771 | ||
2383f56b FK |
2772 | int |
2773 | svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, | |
ff891a2e | 2774 | uint64_t addr, bool write_fault) |
2383f56b | 2775 | { |
2383f56b | 2776 | struct mm_struct *mm = NULL; |
2383f56b | 2777 | struct svm_range_list *svms; |
cda0f85b | 2778 | struct svm_range *prange; |
2383f56b | 2779 | struct kfd_process *p; |
e0f1e65b | 2780 | ktime_t timestamp = ktime_get_boottime(); |
d4ebc200 PY |
2781 | int32_t best_loc; |
2782 | int32_t gpuidx = MAX_GPU_INSTANCE; | |
b19dbb7a | 2783 | bool write_locked = false; |
7ad153db | 2784 | struct vm_area_struct *vma; |
e0f1e65b | 2785 | bool migration = false; |
cda0f85b | 2786 | int r = 0; |
2383f56b | 2787 | |
5a75ea56 FK |
2788 | if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { |
2789 | pr_debug("device does not support SVM\n"); | |
2790 | return -EFAULT; | |
2791 | } | |
2792 | ||
2383f56b FK |
2793 | p = kfd_lookup_process_by_pasid(pasid); |
2794 | if (!p) { | |
2795 | pr_debug("kfd process not found pasid 0x%x\n", pasid); |
a0c55ece | 2796 | return 0; |
2383f56b | 2797 | } |
2383f56b FK |
2798 | svms = &p->svms; |
2799 | ||
2800 | pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr); | |
2801 | ||
2e447728 PY |
2802 | if (atomic_read(&svms->drain_pagefaults)) { |
2803 | pr_debug("draining retry fault, drop fault 0x%llx\n", addr); | |
71f8f119 | 2804 | r = 0; |
2e447728 PY |
2805 | goto out; |
2806 | } | |
2807 | ||
edd11922 PY |
2808 | if (!p->xnack_enabled) { |
2809 | pr_debug("XNACK not enabled for pasid 0x%x\n", pasid); | |
2810 | r = -EFAULT; | |
2811 | goto out; | |
2812 | } | |
2813 | ||
a0c55ece PY |
2814 | /* p->lead_thread is available as kfd_process_wq_release flushes the work |
2815 | * before releasing task ref. | |
2816 | */ | |
2383f56b FK |
2817 | mm = get_task_mm(p->lead_thread); |
2818 | if (!mm) { | |
2819 | pr_debug("svms 0x%p failed to get mm\n", svms); | |
71f8f119 | 2820 | r = 0; |
2383f56b FK |
2821 | goto out; |
2822 | } | |
2823 | ||
2824 | mmap_read_lock(mm); | |
b19dbb7a | 2825 | retry_write_locked: |
2383f56b FK |
2826 | mutex_lock(&svms->lock); |
2827 | prange = svm_range_from_addr(svms, addr, NULL); | |
2383f56b FK |
2828 | if (!prange) { |
2829 | pr_debug("failed to find prange svms 0x%p address [0x%llx]\n", | |
2830 | svms, addr); | |
b19dbb7a AS |
2831 | if (!write_locked) { |
2832 | /* Need the write lock to create new range with MMU notifier. | |
2833 | * Also flush pending deferred work to make sure the interval | |
2834 | * tree is up to date before we add a new range | |
2835 | */ | |
2836 | mutex_unlock(&svms->lock); | |
2837 | mmap_read_unlock(mm); | |
2838 | mmap_write_lock(mm); | |
2839 | write_locked = true; | |
2840 | goto retry_write_locked; | |
2841 | } | |
2842 | prange = svm_range_create_unregistered_range(adev, p, mm, addr); | |
2843 | if (!prange) { | |
dd57e65f | 2844 | pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", |
b19dbb7a AS |
2845 | svms, addr); |
2846 | mmap_write_downgrade(mm); | |
2847 | r = -EFAULT; | |
2848 | goto out_unlock_svms; | |
2849 | } | |
2383f56b | 2850 | } |
b19dbb7a AS |
2851 | if (write_locked) |
2852 | mmap_write_downgrade(mm); | |
2383f56b FK |
2853 | |
2854 | mutex_lock(&prange->migrate_mutex); | |
373e3ccd | 2855 | |
b3dc91f9 PY |
2856 | if (svm_range_skip_recover(prange)) { |
2857 | amdgpu_gmc_filter_faults_remove(adev, addr, pasid); | |
71f8f119 | 2858 | r = 0; |
373e3ccd | 2859 | goto out_unlock_range; |
b3dc91f9 | 2860 | } |
373e3ccd | 2861 | |
564d2b92 | 2862 | /* skip duplicate vm fault on different pages of same range */ |
e0f1e65b PY |
2863 | if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp, |
2864 | AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) { | |
564d2b92 FK |
2865 | pr_debug("svms 0x%p [0x%lx %lx] already restored\n", |
2866 | svms, prange->start, prange->last); | |
71f8f119 | 2867 | r = 0; |
564d2b92 FK |
2868 | goto out_unlock_range; |
2869 | } | |
2383f56b | 2870 | |
7ad153db PY |
2871 | /* The VMA was removed by __do_munmap; return success as we are handling a |
2872 | * stale retry fault. |
2873 | */ | |
3a3e841d DW |
2874 | vma = vma_lookup(mm, addr << PAGE_SHIFT); |
2875 | if (!vma) { | |
7ad153db PY |
2876 | pr_debug("address 0x%llx VMA is removed\n", addr); |
2877 | r = 0; | |
2878 | goto out_unlock_range; | |
2879 | } | |
2880 | ||
2881 | if (!svm_fault_allowed(vma, write_fault)) { | |
ff891a2e PY |
2882 | pr_debug("fault addr 0x%llx no %s permission\n", addr, |
2883 | write_fault ? "write" : "read"); | |
2884 | r = -EPERM; | |
2885 | goto out_unlock_range; | |
2886 | } | |
2887 | ||
cda0f85b FK |
2888 | best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); |
2889 | if (best_loc == -1) { | |
2890 | pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", | |
2383f56b | 2891 | svms, prange->start, prange->last); |
cda0f85b FK |
2892 | r = -EACCES; |
2893 | goto out_unlock_range; | |
2894 | } | |
2895 | ||
2896 | pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", | |
2897 | svms, prange->start, prange->last, best_loc, | |
2898 | prange->actual_loc); | |
2899 | ||
e0f1e65b PY |
2900 | kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr, |
2901 | write_fault, timestamp); | |
2902 | ||
cda0f85b | 2903 | if (prange->actual_loc != best_loc) { |
e0f1e65b | 2904 | migration = true; |
cda0f85b | 2905 | if (best_loc) { |
acac270d PY |
2906 | r = svm_migrate_to_vram(prange, best_loc, mm, |
2907 | KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); | |
cda0f85b FK |
2908 | if (r) { |
2909 | pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n", | |
2910 | r, addr); | |
2911 | /* Fallback to system memory if migration to | |
2912 | * VRAM failed | |
2913 | */ | |
2914 | if (prange->actual_loc) | |
acac270d | 2915 | r = svm_migrate_vram_to_ram(prange, mm, |
16ce101d AP |
2916 | KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, |
2917 | NULL); | |
cda0f85b FK |
2918 | else |
2919 | r = 0; | |
2920 | } | |
2921 | } else { | |
acac270d | 2922 | r = svm_migrate_vram_to_ram(prange, mm, |
16ce101d AP |
2923 | KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, |
2924 | NULL); | |
cda0f85b FK |
2925 | } |
2926 | if (r) { | |
2927 | pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", | |
2928 | r, svms, prange->start, prange->last); | |
2929 | goto out_unlock_range; | |
2930 | } | |
2931 | } | |
2932 | ||
601354f3 | 2933 | r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false); |
cda0f85b FK |
2934 | if (r) |
2935 | pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", | |
2936 | r, svms, prange->start, prange->last); | |
2383f56b | 2937 | |
e0f1e65b PY |
2938 | kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr, |
2939 | migration); | |
2940 | ||
cda0f85b | 2941 | out_unlock_range: |
2383f56b FK |
2942 | mutex_unlock(&prange->migrate_mutex); |
2943 | out_unlock_svms: | |
2944 | mutex_unlock(&svms->lock); | |
2945 | mmap_read_unlock(mm); | |
d4ebc200 | 2946 | |
5017bf82 | 2947 | svm_range_count_fault(adev, p, gpuidx); |
d4ebc200 | 2948 | |
2383f56b FK |
2949 | mmput(mm); |
2950 | out: | |
2951 | kfd_unref_process(p); | |
2952 | ||
4999e398 PY |
2953 | if (r == -EAGAIN) { |
2954 | pr_debug("recover vm fault later\n"); | |
b3dc91f9 | 2955 | amdgpu_gmc_filter_faults_remove(adev, addr, pasid); |
4999e398 PY |
2956 | r = 0; |
2957 | } | |
2383f56b FK |
2958 | return r; |
2959 | } | |
2960 | ||
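/**
 * svm_range_switch_xnack_reserve_mem - adjust USERPTR memory accounting on XNACK switch
 * @p: the process whose SVM ranges are affected
 * @xnack_enabled: the XNACK mode being switched to
 *
 * With XNACK disabled, SVM ranges are counted against the USERPTR memory
 * limit, so switching XNACK off reserves that limit for every range and
 * child range below; switching XNACK on releases it again. If any
 * reservation fails, everything reserved so far is unreserved and the
 * XNACK mode is left unchanged.
 *
 * Return: 0 on success, otherwise the error from the failed reservation.
 */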
8a7c3ce1 PY |
2961 | int |
2962 | svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) | |
2963 | { | |
2964 | struct svm_range *prange, *pchild; | |
2965 | uint64_t reserved_size = 0; | |
2966 | uint64_t size; | |
2967 | int r = 0; | |
2968 | ||
2969 | pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled); | |
2970 | ||
2971 | mutex_lock(&p->svms.lock); | |
2972 | ||
2973 | list_for_each_entry(prange, &p->svms.list, list) { | |
2974 | svm_range_lock(prange); | |
2975 | list_for_each_entry(pchild, &prange->child_list, child_list) { | |
2976 | size = (pchild->last - pchild->start + 1) << PAGE_SHIFT; | |
2977 | if (xnack_enabled) { | |
2978 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size, | |
2979 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); | |
2980 | } else { | |
2981 | r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, | |
2982 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); | |
2983 | if (r) | |
2984 | goto out_unlock; | |
2985 | reserved_size += size; | |
2986 | } | |
2987 | } | |
2988 | ||
2989 | size = (prange->last - prange->start + 1) << PAGE_SHIFT; | |
2990 | if (xnack_enabled) { | |
2991 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size, | |
2992 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); | |
2993 | } else { | |
2994 | r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, | |
2995 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); | |
2996 | if (r) | |
2997 | goto out_unlock; | |
2998 | reserved_size += size; | |
2999 | } | |
3000 | out_unlock: | |
3001 | svm_range_unlock(prange); | |
3002 | if (r) | |
3003 | break; | |
3004 | } | |
3005 | ||
3006 | if (r) | |
3007 | amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size, | |
3008 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); | |
3009 | else | |
3010 | /* Changing the xnack mode must be done inside the svms lock, to avoid racing
3011 | * with svm_range_deferred_list_work unreserving memory in parallel.
3012 | */ | |
3013 | p->xnack_enabled = xnack_enabled; | |
3014 | ||
3015 | mutex_unlock(&p->svms.lock); | |
3016 | return r; | |
3017 | } | |
3018 | ||
42de677f PY |
3019 | void svm_range_list_fini(struct kfd_process *p) |
3020 | { | |
e49fe404 FK |
3021 | struct svm_range *prange; |
3022 | struct svm_range *next; | |
42de677f PY |
3023 | |
3024 | pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms); | |
4683cfec | 3025 | |
6225bb3a PY |
3026 | cancel_delayed_work_sync(&p->svms.restore_work); |
3027 | ||
4683cfec PY |
3028 | /* Ensure list work is finished before process is destroyed */ |
3029 | flush_work(&p->svms.deferred_list_work); | |
e49fe404 | 3030 | |
a0c55ece PY |
3031 | /* |
3032 | * Ensure no retry fault comes in afterwards, as the page fault handler will
3033 | * not be able to find the kfd process and take the mm lock to recover the fault.
3034 | */ | |
2e447728 | 3035 | atomic_inc(&p->svms.drain_pagefaults); |
a0c55ece PY |
3036 | svm_range_drain_retry_fault(&p->svms); |
3037 | ||
e49fe404 FK |
3038 | list_for_each_entry_safe(prange, next, &p->svms.list, list) { |
3039 | svm_range_unlink(prange); | |
3040 | svm_range_remove_notifier(prange); | |
f9af3c16 | 3041 | svm_range_free(prange, true); |
e49fe404 FK |
3042 | } |
3043 | ||
3044 | mutex_destroy(&p->svms.lock); | |
3045 | ||
3046 | pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms); | |
42de677f PY |
3047 | } |
3048 | ||
3049 | int svm_range_list_init(struct kfd_process *p) | |
3050 | { | |
3051 | struct svm_range_list *svms = &p->svms; | |
5a75ea56 | 3052 | int i; |
42de677f PY |
3053 | |
3054 | svms->objects = RB_ROOT_CACHED; | |
3055 | mutex_init(&svms->lock); | |
3056 | INIT_LIST_HEAD(&svms->list); | |
8a7c184a | 3057 | atomic_set(&svms->evicted_ranges, 0); |
2e447728 | 3058 | atomic_set(&svms->drain_pagefaults, 0); |
8a7c184a | 3059 | INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work); |
4683cfec PY |
3060 | INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work); |
3061 | INIT_LIST_HEAD(&svms->deferred_range_list); | |
c2db32ce | 3062 | INIT_LIST_HEAD(&svms->criu_svm_metadata_list); |
4683cfec | 3063 | spin_lock_init(&svms->deferred_list_lock); |
42de677f | 3064 | |
5a75ea56 FK |
3065 | for (i = 0; i < p->n_pdds; i++) |
3066 | if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) | |
3067 | bitmap_set(svms->bitmap_supported, i, 1); | |
3068 | ||
42de677f PY |
3069 | return 0; |
3070 | } | |
3071 | ||
71cbfeb3 AS |
3072 | /** |
3073 | * svm_range_check_vm - check if virtual address range mapped already | |
3074 | * @p: current kfd_process | |
3075 | * @start: range start address, in pages | |
3076 | * @last: range last address, in pages | |
43fc10c1 PY |
3077 | * @bo_s: mapping start address in pages if address range already mapped |
3078 | * @bo_l: mapping last address in pages if address range already mapped | |
71cbfeb3 AS |
3079 | * |
3080 | * The purpose is to avoid clashing with virtual address ranges already
3081 | * allocated by the kfd_ioctl_alloc_memory_of_gpu ioctl.
3082 | * It checks the VM of each pdd in the kfd_process.
3083 | * | |
3084 | * Context: Process context | |
3085 | * | |
3086 | * Return 0 - OK, if the range is not mapped. | |
3087 | * Otherwise error code: | |
3088 | * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu | |
3089 | * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by | |
3090 | * a signal. Release all buffer reservations and return to user-space. | |
3091 | */ | |
3092 | static int | |
43fc10c1 PY |
3093 | svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, |
3094 | uint64_t *bo_s, uint64_t *bo_l) | |
71cbfeb3 | 3095 | { |
43fc10c1 PY |
3096 | struct amdgpu_bo_va_mapping *mapping; |
3097 | struct interval_tree_node *node; | |
71cbfeb3 AS |
3098 | uint32_t i; |
3099 | int r; | |
3100 | ||
3101 | for (i = 0; i < p->n_pdds; i++) { | |
3102 | struct amdgpu_vm *vm; | |
3103 | ||
3104 | if (!p->pdds[i]->drm_priv) | |
3105 | continue; | |
3106 | ||
3107 | vm = drm_priv_to_vm(p->pdds[i]->drm_priv); | |
3108 | r = amdgpu_bo_reserve(vm->root.bo, false); | |
3109 | if (r) | |
3110 | return r; | |
43fc10c1 PY |
3111 | |
3112 | node = interval_tree_iter_first(&vm->va, start, last); | |
3113 | if (node) { | |
3114 | pr_debug("range [0x%llx 0x%llx] already TTM mapped\n", | |
3115 | start, last); | |
3116 | mapping = container_of((struct rb_node *)node, | |
3117 | struct amdgpu_bo_va_mapping, rb); | |
3118 | if (bo_s && bo_l) { | |
3119 | *bo_s = mapping->start; | |
3120 | *bo_l = mapping->last; | |
3121 | } | |
71cbfeb3 AS |
3122 | amdgpu_bo_unreserve(vm->root.bo); |
3123 | return -EADDRINUSE; | |
3124 | } | |
3125 | amdgpu_bo_unreserve(vm->root.bo); | |
3126 | } | |
3127 | ||
3128 | return 0; | |
3129 | } | |
3130 | ||
42de677f PY |
3131 | /** |
3132 | * svm_range_is_valid - check if virtual address range is valid | |
71cbfeb3 | 3133 | * @p: current kfd_process |
42de677f PY |
3134 | * @start: range start address, in pages |
3135 | * @size: range size, in pages | |
3136 | * | |
3137 | * Valid virtual address range means it belongs to one or more VMAs | |
3138 | * | |
3139 | * Context: Process context | |
3140 | * | |
3141 | * Return: | |
71cbfeb3 | 3142 | * 0 - OK, otherwise error code |
42de677f | 3143 | */ |
71cbfeb3 AS |
3144 | static int |
3145 | svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) | |
42de677f PY |
3146 | { |
3147 | const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; | |
3148 | struct vm_area_struct *vma; | |
3149 | unsigned long end; | |
71cbfeb3 | 3150 | unsigned long start_unchg = start; |
42de677f PY |
3151 | |
3152 | start <<= PAGE_SHIFT; | |
3153 | end = start + (size << PAGE_SHIFT); | |
42de677f | 3154 | do { |
3a3e841d DW |
3155 | vma = vma_lookup(p->mm, start); |
3156 | if (!vma || (vma->vm_flags & device_vma)) | |
71cbfeb3 | 3157 | return -EFAULT; |
42de677f PY |
3158 | start = min(end, vma->vm_end); |
3159 | } while (start < end); | |
3160 | ||
43fc10c1 PY |
3161 | return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL, |
3162 | NULL); | |
42de677f PY |
3163 | } |
3164 | ||
eff8cbf0 PY |
3165 | /** |
3166 | * svm_range_best_prefetch_location - decide the best prefetch location | |
0b0e518d FK |
3167 | * @prange: svm range structure |
3168 | * | |
3169 | * For xnack off: | |
eff8cbf0 | 3170 | * If the range maps to a single GPU, the best prefetch location is prefetch_loc, which |
0b0e518d FK |
3171 | * can be CPU or GPU. |
3172 | * | |
eff8cbf0 PY |
3173 | * If range is ACCESS or ACCESS_IN_PLACE by mGPUs, the best prefetch location is
3174 | * the prefetch_loc GPU only when the mGPUs are connected in the same XGMI hive;
3175 | * otherwise the best prefetch location is always CPU, because a GPU cannot map
3176 | * the VRAM of other GPUs coherently, even with a large-BAR PCIe connection.
0b0e518d FK |
3177 | * |
3178 | * For xnack on: | |
eff8cbf0 PY |
3179 | * If range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
3180 | * prefetch_loc; access from another GPU will generate a vm fault and trigger migration.
3181 | * | |
3182 | * If range is ACCESS_IN_PLACE by mGPUs, the best prefetch location is the
3183 | * prefetch_loc GPU only when the mGPUs are connected in the same XGMI hive;
3184 | * otherwise the best prefetch location is always CPU.
0b0e518d FK |
3185 | * |
3186 | * Context: Process context | |
3187 | * | |
3188 | * Return: | |
3189 | * 0 for CPU or GPU id | |
3190 | */ | |
cda0f85b FK |
3191 | static uint32_t |
3192 | svm_range_best_prefetch_location(struct svm_range *prange) | |
0b0e518d FK |
3193 | { |
3194 | DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); | |
3195 | uint32_t best_loc = prange->prefetch_loc; | |
3196 | struct kfd_process_device *pdd; | |
3197 | struct amdgpu_device *bo_adev; | |
0b0e518d FK |
3198 | struct kfd_process *p; |
3199 | uint32_t gpuidx; | |
3200 | ||
3201 | p = container_of(prange->svms, struct kfd_process, svms); | |
3202 | ||
0b0e518d FK |
3203 | if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) |
3204 | goto out; | |
3205 | ||
3206 | bo_adev = svm_range_get_adev_by_id(prange, best_loc); | |
1a3b2b5d FK |
3207 | if (!bo_adev) { |
3208 | WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc); | |
3209 | best_loc = 0; | |
3210 | goto out; | |
3211 | } | |
eff8cbf0 PY |
3212 | |
3213 | if (p->xnack_enabled) | |
3214 | bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); | |
3215 | else | |
3216 | bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip, | |
3217 | MAX_GPU_INSTANCE); | |
0b0e518d FK |
3218 | |
3219 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { | |
3220 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); | |
3221 | if (!pdd) { | |
3222 | pr_debug("failed to get device by idx 0x%x\n", gpuidx); | |
3223 | continue; | |
3224 | } | |
0b0e518d | 3225 | |
56c5977e | 3226 | if (pdd->dev->adev == bo_adev) |
0b0e518d FK |
3227 | continue; |
3228 | ||
56c5977e | 3229 | if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { |
0b0e518d FK |
3230 | best_loc = 0; |
3231 | break; | |
3232 | } | |
3233 | } | |
3234 | ||
3235 | out: | |
3236 | pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n", | |
3237 | p->xnack_enabled, &p->svms, prange->start, prange->last, | |
3238 | best_loc); | |
3239 | ||
3240 | return best_loc; | |
3241 | } | |
3242 | ||
0b0e518d FK |
3243 | /* svm_range_trigger_migration - start page migration if prefetch loc changed |
3244 | * @mm: current process mm_struct | |
3245 | * @prange: svm range structure | |
3246 | * @migrated: output, true if migration is triggered | |
3247 | * | |
3248 | * If the range's prefetch_loc is a GPU and the actual loc is cpu 0, migrate the
3249 | * range from ram to vram.
3250 | * If the range's prefetch_loc is cpu 0 and the actual loc is a GPU, migrate the
3251 | * range from vram to ram.
3252 | * | |
3253 | * If GPU vm fault retry is not enabled, migration interacts with the MMU
3254 | * notifier and restore work:
3255 | * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
3256 | * svm_range_evict stops all queues and schedules restore work
3257 | * 2. svm_range_restore_work waits for migration to finish because
3258 | * a. svm_range_validate_vram takes prange->migrate_mutex
3259 | * b. svm_range_validate_ram HMM get pages waits for the CPU fault handler to return
3260 | * 3. restore work updates the GPU mappings and resumes all queues.
3261 | * | |
3262 | * Context: Process context | |
3263 | * | |
3264 | * Return: | |
3265 | * 0 - OK, otherwise - error code of migration | |
3266 | */ | |
3267 | static int | |
3268 | svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, | |
3269 | bool *migrated) | |
3270 | { | |
3271 | uint32_t best_loc; | |
3272 | int r = 0; | |
3273 | ||
3274 | *migrated = false; | |
cda0f85b | 3275 | best_loc = svm_range_best_prefetch_location(prange); |
0b0e518d FK |
3276 | |
3277 | if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED || | |
3278 | best_loc == prange->actual_loc) | |
3279 | return 0; | |
3280 | ||
1a3b2b5d | 3281 | if (!best_loc) { |
16ce101d AP |
3282 | r = svm_migrate_vram_to_ram(prange, mm, |
3283 | KFD_MIGRATE_TRIGGER_PREFETCH, NULL); | |
1a3b2b5d FK |
3284 | *migrated = !r; |
3285 | return r; | |
0b0e518d FK |
3286 | } |
3287 | ||
acac270d | 3288 | r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH); |
1a3b2b5d | 3289 | *migrated = !r; |
48ff079b | 3290 | |
0b0e518d FK |
3291 | return r; |
3292 | } | |
3293 | ||
b41896e3 FK |
3294 | int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) |
3295 | { | |
3296 | if (!fence) | |
3297 | return -EINVAL; | |
3298 | ||
3299 | if (dma_fence_is_signaled(&fence->base)) | |
3300 | return 0; | |
3301 | ||
3302 | if (fence->svm_bo) { | |
3303 | WRITE_ONCE(fence->svm_bo->evicting, 1); | |
3304 | schedule_work(&fence->svm_bo->eviction_work); | |
3305 | } | |
3306 | ||
3307 | return 0; | |
3308 | } | |
3309 | ||
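/* Worker scheduled by svm_range_schedule_evict_svm_bo: while the eviction
 * fence's mm is still alive, migrate every range attached to the svm_bo from
 * VRAM back to system memory (retrying a few times per range), detach ranges
 * whose VRAM pages are gone, then signal the eviction fence and drop the
 * worker's svm_bo reference.
 */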
3310 | static void svm_range_evict_svm_bo_worker(struct work_struct *work) | |
3311 | { | |
3312 | struct svm_range_bo *svm_bo; | |
b41896e3 | 3313 | struct mm_struct *mm; |
9527b9ca | 3314 | int r = 0; |
b41896e3 FK |
3315 | |
3316 | svm_bo = container_of(work, struct svm_range_bo, eviction_work); | |
3317 | if (!svm_bo_ref_unless_zero(svm_bo)) | |
3318 | return; /* svm_bo was freed while eviction was pending */ | |
3319 | ||
c0289557 FK |
3320 | if (mmget_not_zero(svm_bo->eviction_fence->mm)) { |
3321 | mm = svm_bo->eviction_fence->mm; | |
3322 | } else { | |
3323 | svm_range_bo_unref(svm_bo); | |
b41896e3 | 3324 | return; |
c0289557 | 3325 | } |
b41896e3 FK |
3326 | |
3327 | mmap_read_lock(mm); | |
3328 | spin_lock(&svm_bo->list_lock); | |
9527b9ca | 3329 | while (!list_empty(&svm_bo->range_list) && !r) { |
b41896e3 FK |
3330 | struct svm_range *prange = |
3331 | list_first_entry(&svm_bo->range_list, | |
3332 | struct svm_range, svm_bo_list); | |
740a451b FK |
3333 | int retries = 3; |
3334 | ||
b41896e3 FK |
3335 | list_del_init(&prange->svm_bo_list); |
3336 | spin_unlock(&svm_bo->list_lock); | |
3337 | ||
3338 | pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, | |
3339 | prange->start, prange->last); | |
3340 | ||
3341 | mutex_lock(&prange->migrate_mutex); | |
740a451b | 3342 | do { |
c0289557 | 3343 | r = svm_migrate_vram_to_ram(prange, mm, |
16ce101d | 3344 | KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); |
9527b9ca | 3345 | } while (!r && prange->actual_loc && --retries); |
b41896e3 | 3346 | |
9527b9ca PY |
3347 | if (!r && prange->actual_loc) |
3348 | pr_info_once("Migration failed during eviction"); | |
b41896e3 | 3349 | |
9527b9ca PY |
3350 | if (!prange->actual_loc) { |
3351 | mutex_lock(&prange->lock); | |
3352 | prange->svm_bo = NULL; | |
3353 | mutex_unlock(&prange->lock); | |
3354 | } | |
b41896e3 FK |
3355 | mutex_unlock(&prange->migrate_mutex); |
3356 | ||
3357 | spin_lock(&svm_bo->list_lock); | |
3358 | } | |
3359 | spin_unlock(&svm_bo->list_lock); | |
3360 | mmap_read_unlock(mm); | |
c0289557 | 3361 | mmput(mm); |
b41896e3 FK |
3362 | |
3363 | dma_fence_signal(&svm_bo->eviction_fence->base); | |
9527b9ca | 3364 | |
b41896e3 FK |
3365 | /* This is the last reference to svm_bo, after svm_range_vram_node_free |
3366 | * has been called in svm_migrate_vram_to_ram | |
3367 | */ | |
9527b9ca | 3368 | WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); |
b41896e3 FK |
3369 | svm_range_bo_unref(svm_bo); |
3370 | } | |
3371 | ||
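/* Apply a set of SVM attributes to [start, start + size):
 * 1. validate the attributes and the virtual address range,
 * 2. under the mmap write lock and svms->lock, add the new range and split
 *    or remove overlapping ranges as one transaction,
 * 3. after downgrading to the mmap read lock, trigger prefetch migrations
 *    and revalidate/map the updated ranges to the GPUs as needed.
 */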
42de677f | 3372 | static int |
d1289b41 RB |
3373 | svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, |
3374 | uint64_t start, uint64_t size, uint32_t nattr, | |
3375 | struct kfd_ioctl_svm_attribute *attrs) | |
42de677f | 3376 | { |
e433d684 | 3377 | struct amdkfd_process_info *process_info = p->kgd_process_info; |
42de677f PY |
3378 | struct list_head update_list; |
3379 | struct list_head insert_list; | |
3380 | struct list_head remove_list; | |
3381 | struct svm_range_list *svms; | |
3382 | struct svm_range *prange; | |
3383 | struct svm_range *next; | |
601354f3 PY |
3384 | bool update_mapping = false; |
3385 | bool flush_tlb; | |
42de677f PY |
3386 | int r = 0; |
3387 | ||
3388 | pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", | |
3389 | p->pasid, &p->svms, start, start + size - 1, size); | |
3390 | ||
3391 | r = svm_range_check_attr(p, nattr, attrs); | |
3392 | if (r) | |
3393 | return r; | |
3394 | ||
3395 | svms = &p->svms; | |
3396 | ||
e433d684 PY |
3397 | mutex_lock(&process_info->lock); |
3398 | ||
4683cfec | 3399 | svm_range_list_lock_and_flush_work(svms, mm); |
42de677f | 3400 | |
71cbfeb3 AS |
3401 | r = svm_range_is_valid(p, start, size); |
3402 | if (r) { | |
3403 | pr_debug("invalid range r=%d\n", r); | |
42de677f PY |
3404 | mmap_write_unlock(mm); |
3405 | goto out; | |
3406 | } | |
3407 | ||
3408 | mutex_lock(&svms->lock); | |
3409 | ||
3410 | /* Add new range and split existing ranges as needed */ | |
3411 | r = svm_range_add(p, start, size, nattr, attrs, &update_list, | |
3412 | &insert_list, &remove_list); | |
3413 | if (r) { | |
3414 | mutex_unlock(&svms->lock); | |
3415 | mmap_write_unlock(mm); | |
3416 | goto out; | |
3417 | } | |
3418 | /* Apply changes as a transaction */ | |
ef3b4137 | 3419 | list_for_each_entry_safe(prange, next, &insert_list, list) { |
42de677f | 3420 | svm_range_add_to_svms(prange); |
b1c46c7d | 3421 | svm_range_add_notifier_locked(mm, prange); |
42de677f PY |
3422 | } |
3423 | list_for_each_entry(prange, &update_list, update_list) { | |
601354f3 | 3424 | svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping); |
42de677f PY |
3425 | /* TODO: unmap ranges from GPU that lost access */ |
3426 | } | |
b121862c | 3427 | list_for_each_entry_safe(prange, next, &remove_list, update_list) { |
42de677f PY |
3428 | pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", |
3429 | prange->svms, prange, prange->start, | |
3430 | prange->last); | |
3431 | svm_range_unlink(prange); | |
b1c46c7d | 3432 | svm_range_remove_notifier(prange); |
f9af3c16 | 3433 | svm_range_free(prange, false); |
42de677f PY |
3434 | } |
3435 | ||
3436 | mmap_write_downgrade(mm); | |
3437 | /* Trigger migrations and revalidate and map to GPUs as needed. If | |
3438 | * this fails we may be left with partially completed actions. There | |
3439 | * is no clean way of rolling back to the previous state in such a | |
3440 | * case because the rollback wouldn't be guaranteed to work either. | |
3441 | */ | |
3442 | list_for_each_entry(prange, &update_list, update_list) { | |
0b0e518d FK |
3443 | bool migrated; |
3444 | ||
3445 | mutex_lock(&prange->migrate_mutex); | |
3446 | ||
3447 | r = svm_range_trigger_migration(mm, prange, &migrated); | |
3448 | if (r) | |
3449 | goto out_unlock_range; | |
3450 | ||
7d261c50 | 3451 | if (migrated && (!p->xnack_enabled || |
f72fc9bd EH |
3452 | (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) && |
3453 | prange->mapped_to_gpu) { | |
0b0e518d FK |
3454 | pr_debug("restore_work will update mappings of GPUs\n"); |
3455 | mutex_unlock(&prange->migrate_mutex); | |
3456 | continue; | |
3457 | } | |
3458 | ||
601354f3 PY |
3459 | if (!migrated && !update_mapping) { |
3460 | mutex_unlock(&prange->migrate_mutex); | |
3461 | continue; | |
3462 | } | |
3463 | ||
3464 | flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu; | |
3465 | ||
b1c46c7d | 3466 | r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE, |
601354f3 | 3467 | true, true, flush_tlb); |
0b0e518d FK |
3468 | if (r) |
3469 | pr_debug("failed %d to map svm range\n", r); | |
3470 | ||
3471 | out_unlock_range: | |
3472 | mutex_unlock(&prange->migrate_mutex); | |
3473 | if (r) | |
b1c46c7d | 3474 | break; |
42de677f PY |
3475 | } |
3476 | ||
3477 | svm_range_debug_dump(svms); | |
3478 | ||
3479 | mutex_unlock(&svms->lock); | |
3480 | mmap_read_unlock(mm); | |
3481 | out: | |
e433d684 PY |
3482 | mutex_unlock(&process_info->lock); |
3483 | ||
42de677f PY |
3484 | pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, |
3485 | &p->svms, start, start + size - 1, r); | |
3486 | ||
3487 | return r; | |
3488 | } | |
3489 | ||
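/* Aggregate the requested attributes over every range intersecting
 * [start, start + size - 1]: preferred/prefetch locations collapse to
 * UNDEFINED when ranges disagree, granularity is the minimum, access
 * bitmaps are AND-ed together, and flags are reported as the intersection
 * (SET_FLAGS) and the complement of the union (CLR_FLAGS). If no range
 * intersects, the default attribute values are returned.
 */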
c5e2e478 | 3490 | static int |
d1289b41 RB |
3491 | svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm, |
3492 | uint64_t start, uint64_t size, uint32_t nattr, | |
3493 | struct kfd_ioctl_svm_attribute *attrs) | |
c5e2e478 PY |
3494 | { |
3495 | DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); | |
3496 | DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); | |
3497 | bool get_preferred_loc = false; | |
3498 | bool get_prefetch_loc = false; | |
3499 | bool get_granularity = false; | |
3500 | bool get_accessible = false; | |
3501 | bool get_flags = false; | |
3502 | uint64_t last = start + size - 1UL; | |
c5e2e478 PY |
3503 | uint8_t granularity = 0xff; |
3504 | struct interval_tree_node *node; | |
3505 | struct svm_range_list *svms; | |
3506 | struct svm_range *prange; | |
3507 | uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; | |
3508 | uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; | |
a43e2a0e FK |
3509 | uint32_t flags_and = 0xffffffff; |
3510 | uint32_t flags_or = 0; | |
c5e2e478 PY |
3511 | int gpuidx; |
3512 | uint32_t i; | |
71cbfeb3 | 3513 | int r = 0; |
c5e2e478 PY |
3514 | |
3515 | pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start, | |
3516 | start + size - 1, nattr); | |
3517 | ||
2bbab7ce YZ |
3518 | /* Flush pending deferred work to avoid racing with deferred actions from |
3519 | * previous memory map changes (e.g. munmap). Concurrent memory map changes | |
3520 | * can still race with get_attr because we don't hold the mmap lock. But that | |
3521 | * would be a race condition in the application anyway, and undefined | |
3522 | * behaviour is acceptable in that case. | |
3523 | */ | |
3524 | flush_work(&p->svms.deferred_list_work); | |
3525 | ||
c5e2e478 | 3526 | mmap_read_lock(mm); |
71cbfeb3 | 3527 | r = svm_range_is_valid(p, start, size); |
c5e2e478 | 3528 | mmap_read_unlock(mm); |
71cbfeb3 AS |
3529 | if (r) { |
3530 | pr_debug("invalid range r=%d\n", r); | |
3531 | return r; | |
3532 | } | |
c5e2e478 PY |
3533 | |
3534 | for (i = 0; i < nattr; i++) { | |
3535 | switch (attrs[i].type) { | |
3536 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: | |
3537 | get_preferred_loc = true; | |
3538 | break; | |
3539 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: | |
3540 | get_prefetch_loc = true; | |
3541 | break; | |
3542 | case KFD_IOCTL_SVM_ATTR_ACCESS: | |
3543 | get_accessible = true; | |
3544 | break; | |
3545 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: | |
a43e2a0e | 3546 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: |
c5e2e478 PY |
3547 | get_flags = true; |
3548 | break; | |
3549 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: | |
3550 | get_granularity = true; | |
3551 | break; | |
c5e2e478 PY |
3552 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: |
3553 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: | |
3554 | fallthrough; | |
3555 | default: | |
3556 | pr_debug("get invalid attr type 0x%x\n", attrs[i].type); | |
3557 | return -EINVAL; | |
3558 | } | |
3559 | } | |
3560 | ||
3561 | svms = &p->svms; | |
3562 | ||
3563 | mutex_lock(&svms->lock); | |
3564 | ||
3565 | node = interval_tree_iter_first(&svms->objects, start, last); | |
3566 | if (!node) { | |
3567 | pr_debug("range attrs not found return default values\n"); | |
3568 | svm_range_set_default_attributes(&location, &prefetch_loc, | |
a43e2a0e FK |
3569 | &granularity, &flags_and); |
3570 | flags_or = flags_and; | |
c5e2e478 | 3571 | if (p->xnack_enabled) |
5a75ea56 FK |
3572 | bitmap_copy(bitmap_access, svms->bitmap_supported, |
3573 | MAX_GPU_INSTANCE); | |
63f1af83 AS |
3574 | else |
3575 | bitmap_zero(bitmap_access, MAX_GPU_INSTANCE); | |
3576 | bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE); | |
c5e2e478 PY |
3577 | goto fill_values; |
3578 | } | |
5a75ea56 FK |
3579 | bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); |
3580 | bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE); | |
c5e2e478 PY |
3581 | |
3582 | while (node) { | |
3583 | struct interval_tree_node *next; | |
3584 | ||
3585 | prange = container_of(node, struct svm_range, it_node); | |
3586 | next = interval_tree_iter_next(node, start, last); | |
3587 | ||
3588 | if (get_preferred_loc) { | |
3589 | if (prange->preferred_loc == | |
3590 | KFD_IOCTL_SVM_LOCATION_UNDEFINED || | |
3591 | (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && | |
3592 | location != prange->preferred_loc)) { | |
3593 | location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; | |
3594 | get_preferred_loc = false; | |
3595 | } else { | |
3596 | location = prange->preferred_loc; | |
3597 | } | |
3598 | } | |
3599 | if (get_prefetch_loc) { | |
3600 | if (prange->prefetch_loc == | |
3601 | KFD_IOCTL_SVM_LOCATION_UNDEFINED || | |
3602 | (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && | |
3603 | prefetch_loc != prange->prefetch_loc)) { | |
3604 | prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; | |
3605 | get_prefetch_loc = false; | |
3606 | } else { | |
3607 | prefetch_loc = prange->prefetch_loc; | |
3608 | } | |
3609 | } | |
3610 | if (get_accessible) { | |
3611 | bitmap_and(bitmap_access, bitmap_access, | |
3612 | prange->bitmap_access, MAX_GPU_INSTANCE); | |
3613 | bitmap_and(bitmap_aip, bitmap_aip, | |
3614 | prange->bitmap_aip, MAX_GPU_INSTANCE); | |
3615 | } | |
a43e2a0e FK |
3616 | if (get_flags) { |
3617 | flags_and &= prange->flags; | |
3618 | flags_or |= prange->flags; | |
3619 | } | |
c5e2e478 PY |
3620 | |
3621 | if (get_granularity && prange->granularity < granularity) | |
3622 | granularity = prange->granularity; | |
3623 | ||
3624 | node = next; | |
3625 | } | |
3626 | fill_values: | |
3627 | mutex_unlock(&svms->lock); | |
3628 | ||
3629 | for (i = 0; i < nattr; i++) { | |
3630 | switch (attrs[i].type) { | |
3631 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: | |
3632 | attrs[i].value = location; | |
3633 | break; | |
3634 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: | |
3635 | attrs[i].value = prefetch_loc; | |
3636 | break; | |
3637 | case KFD_IOCTL_SVM_ATTR_ACCESS: | |
3638 | gpuidx = kfd_process_gpuidx_from_gpuid(p, | |
3639 | attrs[i].value); | |
3640 | if (gpuidx < 0) { | |
3641 | pr_debug("invalid gpuid %x\n", attrs[i].value); | |
3642 | return -EINVAL; | |
3643 | } | |
3644 | if (test_bit(gpuidx, bitmap_access)) | |
3645 | attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS; | |
3646 | else if (test_bit(gpuidx, bitmap_aip)) | |
3647 | attrs[i].type = | |
3648 | KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE; | |
3649 | else | |
3650 | attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; | |
3651 | break; | |
3652 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: | |
a43e2a0e FK |
3653 | attrs[i].value = flags_and; |
3654 | break; | |
3655 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: | |
3656 | attrs[i].value = ~flags_or; | |
c5e2e478 PY |
3657 | break; |
3658 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: | |
3659 | attrs[i].value = (uint32_t)granularity; | |
3660 | break; | |
3661 | } | |
3662 | } | |
3663 | ||
3664 | return 0; | |
3665 | } | |
3666 | ||
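/* Replay the SVM ranges collected by kfd_criu_restore_svm(): for each queued
 * criu_svm_metadata entry, sanitize attributes that cannot be restored as-is
 * (an undefined prefetch location), append a CLR_FLAGS attribute derived from
 * the saved SET_FLAGS value, and re-apply everything with
 * svm_range_set_attr() on the restored process's mm.
 */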
2a909ae7 RB |
3667 | int kfd_criu_resume_svm(struct kfd_process *p) |
3668 | { | |
3669 | struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL; | |
3670 | int nattr_common = 4, nattr_accessibility = 1; | |
3671 | struct criu_svm_metadata *criu_svm_md = NULL; | |
3672 | struct svm_range_list *svms = &p->svms; | |
3673 | struct criu_svm_metadata *next = NULL; | |
3674 | uint32_t set_flags = 0xffffffff; | |
3675 | int i, j, num_attrs, ret = 0; | |
3676 | uint64_t set_attr_size; | |
3677 | struct mm_struct *mm; | |
3678 | ||
3679 | if (list_empty(&svms->criu_svm_metadata_list)) { | |
3680 | pr_debug("No SVM data from CRIU restore stage 2\n"); | |
3681 | return ret; | |
3682 | } | |
3683 | ||
3684 | mm = get_task_mm(p->lead_thread); | |
3685 | if (!mm) { | |
3686 | pr_err("failed to get mm for the target process\n"); | |
3687 | return -ESRCH; | |
3688 | } | |
3689 | ||
3690 | num_attrs = nattr_common + (nattr_accessibility * p->n_pdds); | |
3691 | ||
3692 | i = j = 0; | |
3693 | list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) { | |
3694 | pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n", | |
3695 | i, criu_svm_md->data.start_addr, criu_svm_md->data.size); | |
3696 | ||
3697 | for (j = 0; j < num_attrs; j++) { | |
b010a46b | 3698 | pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n", |
2a909ae7 RB |
3699 | i, j, criu_svm_md->data.attrs[j].type, |
3700 | i, j, criu_svm_md->data.attrs[j].value); | |
3701 | switch (criu_svm_md->data.attrs[j].type) { | |
3702 | /* During Checkpoint operation, the query for | |
3703 | * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might | |
3704 | * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if it was
3705 | * not used by the range which was checkpointed. Care
3706 | * must be taken to not restore with an invalid value | |
3707 | * otherwise the gpuidx value will be invalid and | |
3708 | * set_attr would eventually fail so just replace those | |
3709 | * with another dummy attribute such as | |
3710 | * KFD_IOCTL_SVM_ATTR_SET_FLAGS. | |
3711 | */ | |
3712 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: | |
3713 | if (criu_svm_md->data.attrs[j].value == | |
3714 | KFD_IOCTL_SVM_LOCATION_UNDEFINED) { | |
3715 | criu_svm_md->data.attrs[j].type = | |
3716 | KFD_IOCTL_SVM_ATTR_SET_FLAGS; | |
3717 | criu_svm_md->data.attrs[j].value = 0; | |
3718 | } | |
3719 | break; | |
3720 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: | |
3721 | set_flags = criu_svm_md->data.attrs[j].value; | |
3722 | break; | |
3723 | default: | |
3724 | break; | |
3725 | } | |
3726 | } | |
3727 | ||
3728 | /* CLR_FLAGS is not available via get_attr during checkpoint but | |
3729 | * it needs to be inserted before restoring the ranges so | |
3730 | * allocate extra space for it before calling set_attr | |
3731 | */ | |
3732 | set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * | |
3733 | (num_attrs + 1); | |
3734 | set_attr_new = krealloc(set_attr, set_attr_size, | |
3735 | GFP_KERNEL); | |
3736 | if (!set_attr_new) { | |
3737 | ret = -ENOMEM; | |
3738 | goto exit; | |
3739 | } | |
3740 | set_attr = set_attr_new; | |
3741 | ||
3742 | memcpy(set_attr, criu_svm_md->data.attrs, num_attrs * | |
3743 | sizeof(struct kfd_ioctl_svm_attribute)); | |
3744 | set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS; | |
3745 | set_attr[num_attrs].value = ~set_flags; | |
3746 | ||
3747 | ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr, | |
3748 | criu_svm_md->data.size, num_attrs + 1, | |
3749 | set_attr); | |
3750 | if (ret) { | |
3751 | pr_err("CRIU: failed to set range attributes\n"); | |
3752 | goto exit; | |
3753 | } | |
3754 | ||
3755 | i++; | |
3756 | } | |
3757 | exit: | |
3758 | kfree(set_attr); | |
3759 | list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) { | |
3760 | pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n", | |
3761 | criu_svm_md->data.start_addr); | |
3762 | kfree(criu_svm_md); | |
3763 | } | |
3764 | ||
3765 | mmput(mm); | |
3766 | return ret; | |
3767 | ||
3768 | } | |
3769 | ||
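/* CRIU restore stage 2: copy one checkpointed SVM range (its header plus the
 * per-GPU attribute array) from the user private data blob into a
 * criu_svm_metadata node and queue it on svms->criu_svm_metadata_list. The
 * queued ranges are replayed by kfd_criu_resume_svm() once the restored
 * process resumes.
 */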
c2db32ce RB |
3770 | int kfd_criu_restore_svm(struct kfd_process *p, |
3771 | uint8_t __user *user_priv_ptr, | |
3772 | uint64_t *priv_data_offset, | |
3773 | uint64_t max_priv_data_size) | |
3774 | { | |
3775 | uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size; | |
3776 | int nattr_common = 4, nattr_accessibility = 1; | |
3777 | struct criu_svm_metadata *criu_svm_md = NULL; | |
3778 | struct svm_range_list *svms = &p->svms; | |
3779 | uint32_t num_devices; | |
3780 | int ret = 0; | |
3781 | ||
3782 | num_devices = p->n_pdds; | |
3783 | /* Handle one SVM range object at a time. The number of gpus is also
3784 | * assumed to be the same on the restore node; this must be checked while
b010a46b RB |
3785 | * evaluating the topology earlier |
3786 | */ | |
c2db32ce RB |
3787 | |
3788 | svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) * | |
3789 | (nattr_common + nattr_accessibility * num_devices); | |
3790 | svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size; | |
3791 | ||
3792 | svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) + | |
3793 | svm_attrs_size; | |
3794 | ||
3795 | criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL); | |
3796 | if (!criu_svm_md) { | |
3797 | pr_err("failed to allocate memory to store svm metadata\n"); | |
3798 | return -ENOMEM; | |
3799 | } | |
3800 | if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) { | |
3801 | ret = -EINVAL; | |
3802 | goto exit; | |
3803 | } | |
3804 | ||
3805 | ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset, | |
3806 | svm_priv_data_size); | |
3807 | if (ret) { | |
3808 | ret = -EFAULT; | |
3809 | goto exit; | |
3810 | } | |
3811 | *priv_data_offset += svm_priv_data_size; | |
3812 | ||
3813 | list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list); | |
3814 | ||
3815 | return 0; | |
3816 | ||
3817 | ||
3818 | exit: | |
3819 | kfree(criu_svm_md); | |
3820 | return ret; | |
3821 | } | |
3822 | ||
08a987a8 RB |
3823 | int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, |
3824 | uint64_t *svm_priv_data_size) | |
3825 | { | |
3826 | uint64_t total_size, accessibility_size, common_attr_size; | |
3827 | int nattr_common = 4, nattr_accessibility = 1; | |
3828 | int num_devices = p->n_pdds; | |
3829 | struct svm_range_list *svms; | |
3830 | struct svm_range *prange; | |
3831 | uint32_t count = 0; | |
3832 | ||
3833 | *svm_priv_data_size = 0; | |
3834 | ||
3835 | svms = &p->svms; | |
3836 | if (!svms) | |
3837 | return -EINVAL; | |
3838 | ||
3839 | mutex_lock(&svms->lock); | |
3840 | list_for_each_entry(prange, &svms->list, list) { | |
3841 | pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n", | |
3842 | prange, prange->start, prange->npages, | |
3843 | prange->start + prange->npages - 1); | |
3844 | count++; | |
3845 | } | |
3846 | mutex_unlock(&svms->lock); | |
3847 | ||
3848 | *num_svm_ranges = count; | |
3849 | /* Only the accessibility attributes need to be queried for all the gpus
3850 | * individually; the remaining ones apply across the entire process
3851 | * regardless of the various gpu nodes. Of the remaining attributes,
3852 | * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved. | |
3853 | * | |
3854 | * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC | |
3855 | * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC | |
3856 | * KFD_IOCTL_SVM_ATTR_SET_FLAGS | |
3857 | * KFD_IOCTL_SVM_ATTR_GRANULARITY | |
3858 | * | |
3859 | * ** ACCESSIBILITY ATTRIBUTES **
3860 | * (Considered as one, type is altered during query, value is gpuid) | |
3861 | * KFD_IOCTL_SVM_ATTR_ACCESS | |
3862 | * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE | |
3863 | * KFD_IOCTL_SVM_ATTR_NO_ACCESS | |
3864 | */ | |
3865 | if (*num_svm_ranges > 0) { | |
3866 | common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * | |
3867 | nattr_common; | |
3868 | accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) * | |
3869 | nattr_accessibility * num_devices; | |
3870 | ||
3871 | total_size = sizeof(struct kfd_criu_svm_range_priv_data) + | |
3872 | common_attr_size + accessibility_size; | |
3873 | ||
3874 | *svm_priv_data_size = *num_svm_ranges * total_size; | |
3875 | } | |
3876 | ||
3877 | pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges, | |
3878 | *svm_priv_data_size); | |
3879 | return 0; | |
3880 | } | |
3881 | ||
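/* Illustrative sizing only (hypothetical numbers, not taken from this file):
 * for a process with num_devices = 2 GPUs, each checkpointed range needs
 *   sizeof(struct kfd_criu_svm_range_priv_data)
 *     + (4 common + 2 accessibility) * sizeof(struct kfd_ioctl_svm_attribute)
 * bytes of private data, and *svm_priv_data_size is that amount multiplied
 * by the number of ranges counted above.
 */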
9d5dabfe RB |
3882 | int kfd_criu_checkpoint_svm(struct kfd_process *p, |
3883 | uint8_t __user *user_priv_data, | |
3884 | uint64_t *priv_data_offset) | |
3885 | { | |
3886 | struct kfd_criu_svm_range_priv_data *svm_priv = NULL; | |
3887 | struct kfd_ioctl_svm_attribute *query_attr = NULL; | |
3888 | uint64_t svm_priv_data_size, query_attr_size = 0; | |
3889 | int index, nattr_common = 4, ret = 0; | |
3890 | struct svm_range_list *svms; | |
3891 | int num_devices = p->n_pdds; | |
3892 | struct svm_range *prange; | |
3893 | struct mm_struct *mm; | |
3894 | ||
3895 | svms = &p->svms; | |
3896 | if (!svms) | |
3897 | return -EINVAL; | |
3898 | ||
3899 | mm = get_task_mm(p->lead_thread); | |
3900 | if (!mm) { | |
3901 | pr_err("failed to get mm for the target process\n"); | |
3902 | return -ESRCH; | |
3903 | } | |
3904 | ||
3905 | query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * | |
3906 | (nattr_common + num_devices); | |
3907 | ||
3908 | query_attr = kzalloc(query_attr_size, GFP_KERNEL); | |
3909 | if (!query_attr) { | |
3910 | ret = -ENOMEM; | |
3911 | goto exit; | |
3912 | } | |
3913 | ||
3914 | query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC; | |
3915 | query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC; | |
3916 | query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS; | |
3917 | query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY; | |
3918 | ||
3919 | for (index = 0; index < num_devices; index++) { | |
3920 | struct kfd_process_device *pdd = p->pdds[index]; | |
3921 | ||
3922 | query_attr[index + nattr_common].type = | |
3923 | KFD_IOCTL_SVM_ATTR_ACCESS; | |
3924 | query_attr[index + nattr_common].value = pdd->user_gpu_id; | |
3925 | } | |
3926 | ||
3927 | svm_priv_data_size = sizeof(*svm_priv) + query_attr_size; | |
3928 | ||
3929 | svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL); | |
3930 | if (!svm_priv) { | |
3931 | ret = -ENOMEM; | |
3932 | goto exit_query; | |
3933 | } | |
3934 | ||
3935 | index = 0; | |
3936 | list_for_each_entry(prange, &svms->list, list) { | |
3937 | ||
3938 | svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE; | |
3939 | svm_priv->start_addr = prange->start; | |
3940 | svm_priv->size = prange->npages; | |
3941 | memcpy(&svm_priv->attrs, query_attr, query_attr_size); | |
3942 | pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n", | |
3943 | prange, prange->start, prange->npages, | |
3944 | prange->start + prange->npages - 1, | |
3945 | prange->npages * PAGE_SIZE); | |
3946 | ||
3947 | ret = svm_range_get_attr(p, mm, svm_priv->start_addr, | |
3948 | svm_priv->size, | |
3949 | (nattr_common + num_devices), | |
3950 | svm_priv->attrs); | |
3951 | if (ret) { | |
3952 | pr_err("CRIU: failed to obtain range attributes\n"); | |
3953 | goto exit_priv; | |
3954 | } | |
3955 | ||
5aa71bd7 DC |
3956 | if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv, |
3957 | svm_priv_data_size)) { | |
9d5dabfe | 3958 | pr_err("Failed to copy svm priv to user\n"); |
5aa71bd7 | 3959 | ret = -EFAULT; |
9d5dabfe RB |
3960 | goto exit_priv; |
3961 | } | |
3962 | ||
3963 | *priv_data_offset += svm_priv_data_size; | |
3964 | ||
3965 | } | |
3966 | ||
3967 | ||
3968 | exit_priv: | |
3969 | kfree(svm_priv); | |
3970 | exit_query: | |
3971 | kfree(query_attr); | |
3972 | exit: | |
3973 | mmput(mm); | |
3974 | return ret; | |
3975 | } | |
3976 | ||
42de677f PY |
3977 | int |
3978 | svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, | |
3979 | uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs) | |
3980 | { | |
d1289b41 | 3981 | struct mm_struct *mm = current->mm; |
42de677f PY |
3982 | int r; |
3983 | ||
3984 | start >>= PAGE_SHIFT; | |
3985 | size >>= PAGE_SHIFT; | |
3986 | ||
3987 | switch (op) { | |
3988 | case KFD_IOCTL_SVM_OP_SET_ATTR: | |
d1289b41 | 3989 | r = svm_range_set_attr(p, mm, start, size, nattrs, attrs); |
42de677f | 3990 | break; |
c5e2e478 | 3991 | case KFD_IOCTL_SVM_OP_GET_ATTR: |
d1289b41 | 3992 | r = svm_range_get_attr(p, mm, start, size, nattrs, attrs); |
c5e2e478 | 3993 | break; |
42de677f PY |
3994 | default: |
3995 | r = -EINVAL;
3996 | break; | |
3997 | } | |
3998 | ||
3999 | return r; | |
4000 | } |
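For reference, the entry point above receives start and size in bytes and shifts them to page units before dispatching to the set/get handlers. The following is a minimal, untested user-space sketch of how the SET_ATTR path could be driven; it assumes the uapi definitions from <linux/kfd_ioctl.h> (struct kfd_ioctl_svm_args, struct kfd_ioctl_svm_attribute, AMDKFD_IOC_SVM), the helper name svm_prefetch_to_sysmem is made up for illustration, and the queue/VM setup that the ROCm runtime normally performs before KFD accepts SVM ioctls is omitted.

#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kfd_ioctl.h>

/* Hint KFD to prefetch one SVM buffer into system memory (location 0). */
static int svm_prefetch_to_sysmem(int kfd_fd, void *buf, size_t bytes)
{
	size_t sz = sizeof(struct kfd_ioctl_svm_args) +
		    sizeof(struct kfd_ioctl_svm_attribute);
	struct kfd_ioctl_svm_args *args = calloc(1, sz);
	int r;

	if (!args)
		return -1;

	args->start_addr = (uintptr_t)buf;	/* bytes; the kernel shifts to pages */
	args->size = bytes;
	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
	args->nattr = 1;
	args->attrs[0].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
	args->attrs[0].value = 0;		/* 0 selects system memory */

	r = ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
	free(args);
	return r;
}

A real client would obtain kfd_fd by opening /dev/kfd read-write and would already have acquired a VM for each GPU through the runtime; error handling is elided here.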