// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>

#include <generated/xe_wa_oob.h>

#include "instructions/xe_mi_commands.h"
#include "regs/xe_gpu_commands.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace.h"
#include "xe_vm.h"
#include "xe_wa.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/**
	 * @cleared_mem_ofs: VM offset of a NULL-mapped region used as an
	 * always-zero source when clearing CCS metadata.
	 */
	u64 cleared_mem_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
	/** @min_chunk_size: For dgfx, minimum chunk size in bytes */
	u64 min_chunk_size;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
#define NUM_KERNEL_PDE 17
#define NUM_PT_SLOTS 32
#define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
#define MAX_NUM_PTE 512

/*
 * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest
 * legal value accepted.  Since that instruction field is always stored in
 * (val-2) format, this translates to 0x400 dwords for the true maximum length
 * of the instruction.  Subtracting the instruction header (1 dword) and
 * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values.
 */
#define MAX_PTE_PER_SDI 0x1FE
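
/*
 * Worked example (editor's note, derived from the comment above): a
 * maximally-sized MI_STORE_DATA_IMM is 0x400 dwords total, split as
 * 1 (header) + 2 (address) + 0x3FD (payload), and 0x3FD dwords hold
 * floor(0x3FD / 2) = 0x1FE qword PTEs, with one dword left over.
 */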

/**
 * xe_tile_migrate_engine() - Get this tile's migrate engine.
 * @tile: The tile.
 *
 * Returns the default migrate engine of this tile.
 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
 *
 * Return: The default migrate engine
 */
struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
{
	return tile->migrate->q;
}

static void xe_migrate_fini(struct drm_device *dev, void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}
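
/*
 * Editor's note: assuming xe_pt_shift(n) returns the virtual-address shift
 * covered by a level-n entry (12 + 9 * n with 4KiB pages), level == 0 above
 * yields (slot + 1) << 21, i.e. slot 0 occupies the 2MiB window at 2MiB,
 * slot 1 the window at 4MiB, and so on.
 */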

static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
{
	/*
	 * Remove the DPA to get a correct offset into identity table for the
	 * migrate offset
	 */
	addr -= xe->mem.vram.dpa_base;
	return addr + (256ULL << xe_pt_shift(2));
}
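
/*
 * Editor's note: 256ULL << xe_pt_shift(2) is the base of the VRAM identity
 * map that xe_migrate_prepare_vm() builds at the 256GiB mark of the migrate
 * VM, so the value returned here is a virtual address inside that mapping.
 */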

static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
				 struct xe_vm *vm)
{
	struct xe_device *xe = tile_to_xe(tile);
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
	u8 id = tile->id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
	u32 map_ofs, level, i;
	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
	u64 entry;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
				  num_entries * XE_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				  XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAG_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
		for (i = 0; i < batch->size;
		     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
		     XE_PAGE_SIZE) {
			entry = vm->pt_ops->pte_encode_bo(batch, i,
							  pat_index, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
		if (xe->info.has_usm) {
			xe_tile_assert(tile, batch->size == SZ_1M);

			batch = tile->primary_gt->usm.bb_pool->bo;
			m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M;
			xe_tile_assert(tile, batch->size == SZ_512K);

			for (i = 0; i < batch->size;
			     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
			     XE_PAGE_SIZE) {
				entry = vm->pt_ops->pte_encode_bo(batch, i,
								  pat_index, 0);

				xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
					  entry);
				level++;
			}
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);

		if (xe->info.has_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) *
						  XE_PAGE_SIZE, pat_index);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < num_entries - num_level; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE,
						  pat_index);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Set up a 1GiB NULL mapping at 255GiB offset. */
	level = 2;
	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
		  | XE_PTE_NULL);
	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pos, ofs, flags;

		level = 2;
		ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
		flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
						    true, 0);

		/*
		 * Use 1GB pages; it shouldn't matter that the physical amount
		 * of vram is less, as long as we don't access it.
		 */
		for (pos = xe->mem.vram.dpa_base;
		     pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
		     pos += SZ_1G, ofs += 8)
			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64KiB or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4KiB PTE's
	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4KiB PTE's
	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2MiB in the vm point to themselves, and with a few
	 * writes and flushes, the other parts of the VM can be used for
	 * copying and clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make it easier to work, each scratch PT is put in slot (1 + PT #)
	 * everywhere, this allows lockless updates to scratch pages by using
	 * the different addresses in VM.
	 */
#define NUM_VMUSA_UNIT_PER_PAGE	32
#define VM_SA_UPDATE_UNIT_SIZE		(XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}
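
/*
 * Editor's note: since map_ofs is (NUM_PT_SLOTS - num_level) pages, the
 * suballocator above is sized to (NUM_PT_SLOTS - num_level - NUM_KERNEL_PDE)
 * pages of NUM_VMUSA_UNIT_PER_PAGE units each; e.g. with num_level == 3
 * that is (32 - 3 - 17) * 32 = 384 VM-update suballocation units.
 */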

/*
 * Due to workaround 16017236439, odd instance hardware copy engines are
 * faster than even instance ones.
 * This function returns the mask involving all fast copy engines and the
 * reserved copy engine to be used as logical mask for migrate engine.
 * Including the reserved copy engine is required to avoid deadlocks where
 * migrate jobs servicing the faults get stuck behind the job that faulted.
 */
static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
	u32 logical_mask = 0;
	struct xe_hw_engine *hwe;
	enum xe_hw_engine_id id;

	for_each_hw_engine(hwe, gt, id) {
		if (hwe->class != XE_ENGINE_CLASS_COPY)
			continue;

		if (!XE_WA(gt, 16017236439) ||
		    xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
			logical_mask |= BIT(hwe->logical_instance);
	}

	return logical_mask;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @tile: Back-pointer to the tile we're initializing for.
 *
 * Return: Pointer to a migrate context on success. Error pointer on error.
 */
struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
{
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_gt *primary_gt = tile->primary_gt;
	struct xe_migrate *m;
	struct xe_vm *vm;
	int err;

	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
	if (!m)
		return ERR_PTR(-ENOMEM);

	m->tile = tile;

	/* Special layout, prepared below.. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_TILE_ID(tile));
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	xe_vm_lock(vm, false);
	err = xe_migrate_prepare_vm(tile, m, vm);
	xe_vm_unlock(vm);
	if (err) {
		xe_vm_close_and_put(vm);
		return ERR_PTR(err);
	}

	if (xe->info.has_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
							   XE_ENGINE_CLASS_COPY,
							   primary_gt->usm.reserved_bcs_instance,
							   false);
		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);

		if (!hwe || !logical_mask)
			return ERR_PTR(-EINVAL);

		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
					    EXEC_QUEUE_FLAG_KERNEL |
					    EXEC_QUEUE_FLAG_PERMANENT |
					    EXEC_QUEUE_FLAG_HIGH_PRIORITY, 0);
	} else {
		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
						  XE_ENGINE_CLASS_COPY,
						  EXEC_QUEUE_FLAG_KERNEL |
						  EXEC_QUEUE_FLAG_PERMANENT);
	}
	if (IS_ERR(m->q)) {
		xe_vm_close_and_put(vm);
		return ERR_CAST(m->q);
	}

	mutex_init(&m->job_mutex);

	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
	if (err)
		return ERR_PTR(err);

	if (IS_DGFX(xe)) {
		if (xe_device_has_flat_ccs(xe))
			/* min chunk size corresponds to 4K of CCS Metadata */
			m->min_chunk_size = SZ_4K * SZ_64K /
				xe_device_ccs_bytes(xe, SZ_64K);
		else
			/* Somewhat arbitrary to avoid a huge amount of blits */
			m->min_chunk_size = SZ_64K;
		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
			(unsigned long long)m->min_chunk_size);
	}

	return m;
}
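
/*
 * Editor's note: assuming the 1:256 main-memory-to-CCS ratio used by
 * flat-CCS platforms, xe_device_ccs_bytes(xe, SZ_64K) would return 256,
 * giving min_chunk_size = SZ_4K * SZ_64K / 256 = 1MiB before the
 * power-of-two round-up above.
 */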

static u64 max_mem_transfer_per_pass(struct xe_device *xe)
{
	if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
		return MAX_CCS_LIMITED_TRANSFER;

	return MAX_PREEMPTDISABLE_TRANSFER;
}

static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);

	if (mem_type_is_vram(cur->mem_type)) {
		/*
		 * VRAM we want to blit in chunks with sizes aligned to
		 * min_chunk_size in order for the offset to CCS metadata to be
		 * page-aligned. If it's the last chunk it may be smaller.
		 *
		 * Another constraint is that we need to limit the blit to
		 * the VRAM block size, unless size is smaller than
		 * min_chunk_size.
		 */
		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);

		size = min_t(u64, size, chunk);
		if (size > m->min_chunk_size)
			size = round_down(size, m->min_chunk_size);
	}

	return size;
}

static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
{
	/* If the chunk is not fragmented, allow identity map. */
	return cur->size >= size;
}

static u32 pte_update_size(struct xe_migrate *m,
			   bool is_vram,
			   struct ttm_resource *res,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;

	*L0_pt = pt_ofs;
	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
		/* Offset into identity map. */
		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
					      cur->start + vram_region_gpu_offset(res));
		cmds += cmd_size;
	} else {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT;

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI);

		/* PTE qwords */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	}

	return cmds;
}
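
/*
 * Editor's example: emitting PTEs for one full 2MiB window (512 4KiB pages)
 * through the page-table path above costs 3 * DIV_ROUND_UP(512, 0x1FE) = 6
 * dwords of MI_STORE_DATA_IMM prefixes plus 512 * 2 = 1024 dwords of PTE
 * payload, on top of the blit command itself (cmd_size dwords).
 */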

static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram, bool is_comp_pte,
		     struct xe_res_cursor *cur,
		     u32 size, struct ttm_resource *res)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	struct xe_vm *vm = m->q->vm;
	u16 pat_index;
	u32 ptes;
	u64 ofs = (u64)at_pt * XE_PAGE_SIZE;
	u64 cur_ofs;

	/* Indirect access needs the compression-enabled, uncached PAT index */
	if (GRAPHICS_VERx100(xe) >= 2000)
		pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
					  xe->pat.idx[XE_CACHE_WB];
	else
		pat_index = xe->pat.idx[XE_CACHE_WB];

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(MAX_PTE_PER_SDI, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr, flags = 0;
			bool devmem = false;

			addr = xe_res_dma(cur) & PAGE_MASK;
			if (is_vram) {
				if (vm->flags & XE_VM_FLAG_64K) {
					u64 va = cur_ofs * XE_PAGE_SIZE / 8;

					xe_assert(xe, (va & (SZ_64K - 1)) ==
						  (addr & (SZ_64K - 1)));

					flags |= XE_PTE_PS64;
				}

				addr += vram_region_gpu_offset(res);
				devmem = true;
			}

			addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
							   addr, pat_index,
							   0, devmem, flags);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
			cur_ofs += 8;
		}
	}
}
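
/*
 * Editor's note: ofs starts at at_pt * XE_PAGE_SIZE because the migrate VM
 * maps its own page tables at the bottom of its address space (see the
 * layout comment in xe_migrate_prepare_vm()), so PTEs can be written by
 * storing straight into that window with MI_STORE_DATA_IMM.
 */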

#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 num_pages;
	u32 ccs_copy_size;
	u32 mocs;

	if (GRAPHICS_VERx100(xe) >= 2000) {
		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
		xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));

		ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);

	} else {
		num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
					    NUM_CCS_BYTES_PER_BLOCK);
		xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));

		ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
	}

	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		ccs_copy_size;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) | mocs;
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) | mocs;

	bb->len = cs - bb->cs;
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 mocs = 0;
	u32 tile_y = 0;

	xe_gt_assert(gt, size / pitch <= S16_MAX);
	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
	xe_gt_assert(gt, pitch <= U16_MAX);

	if (GRAPHICS_VER(xe) >= 20)
		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);

	if (GRAPHICS_VERx100(xe) >= 1250)
		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;

	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch | mocs;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
			enum dma_resv_usage usage)
{
	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}

static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_indirect,
			       u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	u32 flush_flags = 0;

	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) {
		/*
		 * If the src is already in vram, then it should already
		 * have been cleared by us, or has been populated by the
		 * user. Make sure we copy the CCS aux state as-is.
		 *
		 * Otherwise if the bo doesn't have any CCS metadata attached,
		 * we still need to clear it for security reasons.
		 */
		u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;

		emit_copy_ccs(gt, bb,
			      dst_ofs, true,
			      ccs_src_ofs, src_is_indirect, dst_size);

		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_indirect)
			src_ofs = ccs_ofs;
		else if (!dst_is_indirect)
			dst_ofs = ccs_ofs;

		xe_gt_assert(gt, src_is_indirect || dst_is_indirect);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
			      src_is_indirect, dst_size);
		if (dst_is_indirect)
			flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}

/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @dst_bo: If copying between resources created for the same bo, set this to
 * the same value as @src_bo. If copying between buffer objects, set it to
 * the buffer object @dst is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 * @copy_only_ccs: If true, copy only CCS metadata.
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst,
				  bool copy_only_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = src_bo->size;
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_pltt = src->mem_type == XE_PL_TT;
	bool dst_is_pltt = dst->mem_type == XE_PL_TT;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);

	/* Copying CCS between two different BOs is not supported yet. */
	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
		return ERR_PTR(-EINVAL);

	if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
		return ERR_PTR(-EINVAL);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
	else
		xe_res_first(src, 0, size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
	else
		xe_res_first(dst, 0, size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = xe_migrate_res_sizes(m, &src_it);
		dst_L0 = xe_migrate_res_sizes(m, &dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
					      &dst_L0_ofs, &dst_L0_pt, 0,
					      avail_pts, avail_pts);

		if (copy_system_ccs) {
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * avail_pts,
						      avail_pts);
			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
		}

		/* Add copy commands size here */
		batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
			((xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0));

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
			xe_res_next(&src_it, src_L0);
		else
			emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs,
				 &src_it, src_L0, src);

		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
			xe_res_next(&dst_it, src_L0);
		else
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
				 &dst_it, src_L0, dst);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!copy_only_ccs)
			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);

		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
						  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
						  dst_L0_ofs,
						  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
						  src_L0, ccs_ofs, copy_ccs);

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			err = job_add_deps(job, src_bo->ttm.base.resv,
					   DMA_RESV_USAGE_BOOKKEEP);
			if (!err && src_bo != dst_bo)
				err = job_add_deps(job, dst_bo->ttm.base.resv,
						   DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}
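
/*
 * Editor's note: each pass above builds one batch with two sections split
 * at update_idx: the PTE writes (terminated by MI_BATCH_BUFFER_END) come
 * first and the blit commands follow, so the ring code can execute the
 * page-table update and only then run the copy through the freshly
 * written mapping.
 */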

static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = PVC_MEM_SET_CMD_LEN_DW;

	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
	*cs++ = pitch - 1;
	*cs++ = (size / pitch) - 1;
	*cs++ = pitch - 1;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;

	if (GRAPHICS_VERx100(xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	else
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static bool has_service_copy_support(struct xe_gt *gt)
{
	/*
	 * What we care about is whether the architecture was designed with
	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
	 * instructions) so check the architectural engine list rather than the
	 * actual list since these instructions are usable on BCS0 even if
	 * all of the actual service copy engines (BCS1-BCS8) have been fused
	 * off.
	 */
	return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
						XE_HW_ENGINE_BCS1);
}

static u32 emit_clear_cmd_len(struct xe_gt *gt)
{
	if (has_service_copy_support(gt))
		return PVC_MEM_SET_CMD_LEN_DW;
	else
		return XY_FAST_COLOR_BLT_DW;
}

static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		       u32 size, u32 pitch, bool is_vram)
{
	if (has_service_copy_support(gt))
		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
	else
		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
				     is_vram);
}

/**
 * xe_migrate_clear() - Clear content of a TTM resource.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 *
 * Clear the contents of @dst to zero. On flat CCS devices,
 * the CCS metadata is cleared to zero as well on VRAM destinations.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;
	int pass = 0;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);

	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		clear_L0 = xe_migrate_res_sizes(m, &src_it);

		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);

		/* Calculate final sizes and batch size.. */
		batch_size = 2 +
			pte_update_size(m, clear_vram, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
					avail_pts);

		if (xe_device_has_flat_ccs(xe))
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;
		/* Preemption is enabled again by the ring ops. */
		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
			xe_res_next(&src_it, clear_L0);
		else
			emit_pte(m, bb, clear_L0_pt, clear_vram, clear_system_ccs,
				 &src_it, clear_L0, dst);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!clear_system_ccs)
			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);

		if (xe_device_has_flat_ccs(xe)) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_mem_ofs, false, clear_L0);
			flush_flags = MI_FLUSH_DW_CCS;
		}

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			/*
			 * There can't be anything userspace related at this
			 * point, so we just need to respect any potential move
			 * fences, which are always tracked as
			 * DMA_RESV_USAGE_KERNEL.
			 */
			err = job_add_deps(job, bo->ttm.base.resv,
					   DMA_RESV_USAGE_KERNEL);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	if (clear_system_ccs)
		bo->ccs_cleared = true;

	return fence;
}
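
/*
 * Editor's note: on flat-CCS devices the CCS clear above reuses the 1GiB
 * NULL mapping at m->cleared_mem_ofs (set up in xe_migrate_prepare_vm())
 * as an always-zero source, so zeroed compression metadata can be written
 * without allocating a real cleared buffer.
 */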

static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords < MAX_NUM_PTE);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE));

	do {
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(size, MAX_PTE_PER_SDI);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
			      update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_vm *vm, struct xe_bo *bo,
			       const struct xe_vm_pgtable_update *updates,
			       u32 num_updates, bool wait_vm,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
				to_migrate_test_params
				(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct dma_fence *fence;
	int err;
	u32 i;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
					  DMA_RESV_USAGE_KERNEL))
		return ERR_PTR(-ETIME);

	if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
					       DMA_RESV_USAGE_BOOKKEEP))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}
	for (i = 0; i < num_updates; i++) {
		const struct xe_vm_pgtable_update *update = &updates[i];

		ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
			      update->ofs, update->qwords, update);
	}

	if (vm) {
		trace_xe_vm_cpu_bind(vm);
		xe_device_wmb(vm->xe);
	}

	fence = dma_fence_get_stub();

	return fence;
}
1214 | } | |
1215 | ||
eb9702ad MB |
1216 | static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q, |
1217 | struct xe_sync_entry *syncs, u32 num_syncs) | |
dd08ebf6 | 1218 | { |
eb9702ad | 1219 | struct dma_fence *fence; |
dd08ebf6 MB |
1220 | int i; |
1221 | ||
1222 | for (i = 0; i < num_syncs; i++) { | |
eb9702ad | 1223 | fence = syncs[i].fence; |
dd08ebf6 MB |
1224 | |
1225 | if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, | |
1226 | &fence->flags)) | |
1227 | return false; | |
1228 | } | |
eb9702ad MB |
1229 | if (q) { |
1230 | fence = xe_exec_queue_last_fence_get(q, vm); | |
a856b67a MB |
1231 | if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { |
1232 | dma_fence_put(fence); | |
eb9702ad | 1233 | return false; |
a856b67a MB |
1234 | } |
1235 | dma_fence_put(fence); | |
eb9702ad | 1236 | } |
dd08ebf6 MB |
1237 | |
1238 | return true; | |
1239 | } | |
1240 | ||
e9d285ff TH |
1241 | /** |
1242 | * xe_migrate_update_pgtables() - Pipelined page-table update | |
1243 | * @m: The migrate context. | |
1244 | * @vm: The vm we'll be updating. | |
1245 | * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr. | |
9b9529ce | 1246 | * @q: The exec queue to be used for the update or NULL if the default |
e9d285ff TH |
1247 | * migration engine is to be used. |
1248 | * @updates: An array of update descriptors. | |
1249 | * @num_updates: Number of descriptors in @updates. | |
1250 | * @syncs: Array of xe_sync_entry to await before updating. Note that waits | |
1251 | * will block the engine timeline. | |
1252 | * @num_syncs: Number of entries in @syncs. | |
1253 | * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains | |
1254 | * pointers to callback functions and, if subclassed, private arguments to | |
1255 | * those. | |
1256 | * | |
1257 | * Perform a pipelined page-table update. The update descriptors are typically | |
1258 | * built under the same lock critical section as a call to this function. If | |
1259 | * using the default engine for the updates, they will be performed in the | |
1260 | * order they grab the job_mutex. If different engines are used, external | |
1261 | * synchronization is needed for overlapping updates to maintain page-table | |
1262 | * consistency. Note that the meaing of "overlapping" is that the updates | |
1263 | * touch the same page-table, which might be a higher-level page-directory. | |
1264 | * If no pipelining is needed, then updates may be performed by the cpu. | |
1265 | * | |
1266 | * Return: A dma_fence that, when signaled, indicates the update completion. | |
1267 | */ | |
dd08ebf6 MB |
1268 | struct dma_fence * |
1269 | xe_migrate_update_pgtables(struct xe_migrate *m, | |
1270 | struct xe_vm *vm, | |
1271 | struct xe_bo *bo, | |
9b9529ce | 1272 | struct xe_exec_queue *q, |
dd08ebf6 MB |
1273 | const struct xe_vm_pgtable_update *updates, |
1274 | u32 num_updates, | |
1275 | struct xe_sync_entry *syncs, u32 num_syncs, | |
1276 | struct xe_migrate_pt_update *pt_update) | |
1277 | { | |
1278 | const struct xe_migrate_pt_update_ops *ops = pt_update->ops; | |
08dea767 | 1279 | struct xe_tile *tile = m->tile; |
f6929e80 | 1280 | struct xe_gt *gt = tile->primary_gt; |
08dea767 | 1281 | struct xe_device *xe = tile_to_xe(tile); |
dd08ebf6 MB |
1282 | struct xe_sched_job *job; |
1283 | struct dma_fence *fence; | |
1284 | struct drm_suballoc *sa_bo = NULL; | |
1285 | struct xe_vma *vma = pt_update->vma; | |
1286 | struct xe_bb *bb; | |
1287 | u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0; | |
1288 | u64 addr; | |
1289 | int err = 0; | |
5a92da34 | 1290 | bool usm = !q && xe->info.has_usm; |
b06d47be MB |
1291 | bool first_munmap_rebind = vma && |
1292 | vma->gpuva.flags & XE_VMA_FIRST_REBIND; | |
9b9529ce | 1293 | struct xe_exec_queue *q_override = !q ? m->q : q; |
e814389f | 1294 | u16 pat_index = xe->pat.idx[XE_CACHE_WB]; |
dd08ebf6 MB |
1295 | |
1296 | /* Use the CPU if no in syncs and engine is idle */ | |
eb9702ad | 1297 | if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) { |
dd08ebf6 MB |
1298 | fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates, |
1299 | num_updates, | |
1300 | first_munmap_rebind, | |
1301 | pt_update); | |
1302 | if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN)) | |
1303 | return fence; | |
1304 | } | |
1305 | ||
1306 | /* fixed + PTE entries */ | |
1307 | if (IS_DGFX(xe)) | |
1308 | batch_size = 2; | |
1309 | else | |
1310 | batch_size = 6 + num_updates * 2; | |
1311 | ||
1312 | for (i = 0; i < num_updates; i++) { | |
ca630876 | 1313 | u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, MAX_PTE_PER_SDI); |
dd08ebf6 MB |
1314 | |
1315 | /* align noop + MI_STORE_DATA_IMM cmd prefix */ | |
1316 | batch_size += 4 * num_cmds + updates[i].qwords * 2; | |
1317 | } | |
1318 | ||
1319 | /* | |
1320 | * XXX: Create temp bo to copy from, if batch_size becomes too big? | |
1321 | * | |
1322 | * Worst case: Sum(2 * (each lower level page size) + (top level page size)) | |
1323 | * Should be reasonably bound.. | |
1324 | */ | |
c73acc1e | 1325 | xe_tile_assert(tile, batch_size < SZ_128K); |
dd08ebf6 | 1326 | |
5a92da34 | 1327 | bb = xe_bb_new(gt, batch_size, !q && xe->info.has_usm); |
dd08ebf6 MB |
1328 | if (IS_ERR(bb)) |
1329 | return ERR_CAST(bb); | |
1330 | ||
1331 | /* For sysmem PTE's, need to map them in our hole.. */ | |
1332 | if (!IS_DGFX(xe)) { | |
1333 | ppgtt_ofs = NUM_KERNEL_PDE - 1; | |
9b9529ce | 1334 | if (q) { |
c73acc1e | 1335 | xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT); |
dd08ebf6 MB |
1336 | |
1337 | sa_bo = drm_suballoc_new(&m->vm_update_sa, 1, | |
1338 | GFP_KERNEL, true, 0); | |
1339 | if (IS_ERR(sa_bo)) { | |
1340 | err = PTR_ERR(sa_bo); | |
1341 | goto err; | |
1342 | } | |
1343 | ||
1344 | ppgtt_ofs = NUM_KERNEL_PDE + | |
1345 | (drm_suballoc_soffset(sa_bo) / | |
1346 | NUM_VMUSA_UNIT_PER_PAGE); | |
1347 | page_ofs = (drm_suballoc_soffset(sa_bo) % | |
1348 | NUM_VMUSA_UNIT_PER_PAGE) * | |
1349 | VM_SA_UPDATE_UNIT_SIZE; | |
1350 | } | |
1351 | ||
dd08ebf6 | 1352 | /* Map our PT's to gtt */ |
14a1e6a4 | 1353 | bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates); |
58e19acf | 1354 | bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs; |
dd08ebf6 MB |
1355 | bb->cs[bb->len++] = 0; /* upper_32_bits */ |
1356 | ||
1357 | for (i = 0; i < num_updates; i++) { | |
1358 | struct xe_bo *pt_bo = updates[i].pt_bo; | |
1359 | ||
c73acc1e | 1360 | xe_tile_assert(tile, pt_bo->size == SZ_4K); |
dd08ebf6 | 1361 | |
e814389f | 1362 | addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0); |
dd08ebf6 MB |
1363 | bb->cs[bb->len++] = lower_32_bits(addr); |
1364 | bb->cs[bb->len++] = upper_32_bits(addr); | |
1365 | } | |
1366 | ||
1367 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; | |
1368 | update_idx = bb->len; | |
1369 | ||
1370 | addr = xe_migrate_vm_addr(ppgtt_ofs, 0) + | |
58e19acf | 1371 | (page_ofs / sizeof(u64)) * XE_PAGE_SIZE; |
dd08ebf6 | 1372 | for (i = 0; i < num_updates; i++) |
876611c2 | 1373 | write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE, |
dd08ebf6 MB |
1374 | &updates[i], pt_update); |
1375 | } else { | |
1376 | /* phys pages, no preamble required */ | |
1377 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; | |
1378 | update_idx = bb->len; | |
1379 | ||
dd08ebf6 | 1380 | for (i = 0; i < num_updates; i++) |
876611c2 | 1381 | write_pgtable(tile, bb, 0, &updates[i], pt_update); |
dd08ebf6 MB |
1382 | } |
1383 | ||
9b9529ce | 1384 | if (!q) |
dd08ebf6 MB |
1385 | mutex_lock(&m->job_mutex); |
1386 | ||
9b9529ce | 1387 | job = xe_bb_create_migration_job(q ?: m->q, bb, |
dd08ebf6 MB |
1388 | xe_migrate_batch_base(m, usm), |
1389 | update_idx); | |
1390 | if (IS_ERR(job)) { | |
1391 | err = PTR_ERR(job); | |
1392 | goto err_bb; | |
1393 | } | |
1394 | ||
1395 | /* Wait on BO move */ | |
1396 | if (bo) { | |
1397 | err = job_add_deps(job, bo->ttm.base.resv, | |
1398 | DMA_RESV_USAGE_KERNEL); | |
1399 | if (err) | |
1400 | goto err_job; | |
1401 | } | |
1402 | ||
1403 | /* | |
1404 | * Munmap style VM unbind, need to wait for all jobs to be complete / | |
1405 | * trigger preempts before moving forward | |
1406 | */ | |
1407 | if (first_munmap_rebind) { | |
b06d47be | 1408 | err = job_add_deps(job, xe_vm_resv(vm), |
dd08ebf6 MB |
1409 | DMA_RESV_USAGE_BOOKKEEP); |
1410 | if (err) | |
1411 | goto err_job; | |
1412 | } | |
1413 | ||
eb9702ad | 1414 | err = xe_sched_job_last_fence_add_dep(job, vm); |
dd08ebf6 MB |
1415 | for (i = 0; !err && i < num_syncs; i++) |
1416 | err = xe_sync_entry_add_deps(&syncs[i], job); | |
1417 | ||
1418 | if (err) | |
1419 | goto err_job; | |
1420 | ||
1421 | if (ops->pre_commit) { | |
fd84041d | 1422 | pt_update->job = job; |
dd08ebf6 MB |
1423 | err = ops->pre_commit(pt_update); |
1424 | if (err) | |
1425 | goto err_job; | |
1426 | } | |
1427 | xe_sched_job_arm(job); | |
1428 | fence = dma_fence_get(&job->drm.s_fence->finished); | |
1429 | xe_sched_job_push(job); | |
1430 | ||
9b9529ce | 1431 | if (!q) |
dd08ebf6 MB |
1432 | mutex_unlock(&m->job_mutex); |
1433 | ||
1434 | xe_bb_free(bb, fence); | |
1435 | drm_suballoc_free(sa_bo, fence); | |
1436 | ||
1437 | return fence; | |
1438 | ||
1439 | err_job: | |
1440 | xe_sched_job_put(job); | |
1441 | err_bb: | |
9b9529ce | 1442 | if (!q) |
dd08ebf6 MB |
1443 | mutex_unlock(&m->job_mutex); |
1444 | xe_bb_free(bb, NULL); | |
1445 | err: | |
1446 | drm_suballoc_free(sa_bo, NULL); | |
1447 | return ERR_PTR(err); | |
1448 | } | |
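
/*
 * Editor's example: in xe_migrate_update_pgtables() above, a full 512-qword
 * update adds DIV_ROUND_UP(512, MAX_PTE_PER_SDI) = 2 MI_STORE_DATA_IMM
 * chunks to the batch, i.e. 4 * 2 + 512 * 2 = 1032 dwords (the 4 covers
 * the alignment noop plus the 3-dword command prefix per chunk).
 */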

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif