Commit | Line | Data |
---|---|---|
dd08ebf6 MB |
1 | // SPDX-License-Identifier: MIT |
2 | /* | |
3 | * Copyright © 2020 Intel Corporation | |
4 | */ | |
ea9f879d | 5 | |
dd08ebf6 MB |
6 | #include "xe_migrate.h" |
7 | ||
8cb49012 | 8 | #include <linux/bitfield.h> |
ea9f879d LDM |
9 | #include <linux/sizes.h> |
10 | ||
11 | #include <drm/drm_managed.h> | |
12 | #include <drm/ttm/ttm_tt.h> | |
87d8ecf0 | 13 | #include <uapi/drm/xe_drm.h> |
ea9f879d | 14 | |
a24d9099 DH |
15 | #include <generated/xe_wa_oob.h> |
16 | ||
62010b3c | 17 | #include "instructions/xe_gpu_commands.h" |
0134f130 | 18 | #include "instructions/xe_mi_commands.h" |
48651e18 | 19 | #include "regs/xe_gtt_defs.h" |
7cba3396 | 20 | #include "tests/xe_test.h" |
c73acc1e | 21 | #include "xe_assert.h" |
dd08ebf6 MB |
22 | #include "xe_bb.h" |
23 | #include "xe_bo.h" | |
c22a4ed0 | 24 | #include "xe_exec_queue.h" |
dd08ebf6 MB |
25 | #include "xe_ggtt.h" |
26 | #include "xe_gt.h" | |
27 | #include "xe_hw_engine.h" | |
28 | #include "xe_lrc.h" | |
29 | #include "xe_map.h" | |
30 | #include "xe_mocs.h" | |
31 | #include "xe_pt.h" | |
32 | #include "xe_res_cursor.h" | |
33 | #include "xe_sched_job.h" | |
34 | #include "xe_sync.h" | |
e46d3f81 | 35 | #include "xe_trace_bo.h" |
dd08ebf6 MB |
36 | #include "xe_vm.h" |
37 | ||
e9d285ff TH |
38 | /** |
39 | * struct xe_migrate - migrate context. | |
40 | */ | |
dd08ebf6 | 41 | struct xe_migrate { |
9b9529ce FD |
42 | /** @q: Default exec queue used for migration */ |
43 | struct xe_exec_queue *q; | |
08dea767 MR |
44 | /** @tile: Backpointer to the tile this struct xe_migrate belongs to. */ |
45 | struct xe_tile *tile; | |
e9d285ff | 46 | /** @job_mutex: Timeline mutex for @q. */
dd08ebf6 | 47 | struct mutex job_mutex; |
e9d285ff | 48 | /** @pt_bo: Page-table buffer object. */ |
dd08ebf6 | 49 | struct xe_bo *pt_bo; |
e9d285ff | 50 | /** @batch_base_ofs: VM offset of the migration batch buffer */ |
dd08ebf6 | 51 | u64 batch_base_ofs; |
e9d285ff | 52 | /** @usm_batch_base_ofs: VM offset of the usm batch buffer */ |
dd08ebf6 | 53 | u64 usm_batch_base_ofs; |
9116eabb HPG |
54 | /** @cleared_mem_ofs: VM offset of the 1GiB NULL mapping used as a cleared-memory source. */ |
55 | u64 cleared_mem_ofs; | |
e9d285ff TH |
56 | /** |
57 | * @fence: dma-fence representing the last migration job batch. | |
58 | * Protected by @job_mutex. | |
59 | */ | |
dd08ebf6 | 60 | struct dma_fence *fence; |
e9d285ff TH |
61 | /** |
62 | * @vm_update_sa: For integrated, used to suballocate page-tables | |
63 | * out of the pt_bo. | |
64 | */ | |
dd08ebf6 | 65 | struct drm_suballoc_manager vm_update_sa; |
ef51d754 TH |
66 | /** @min_chunk_size: For dgfx, the minimum chunk size used for VRAM blits */ |
67 | u64 min_chunk_size; | |
dd08ebf6 MB |
68 | }; |
69 | ||
70 | #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */ | |
09427526 | 71 | #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */ |
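| /* i.e. XE2_CCS_SIZE_MASK holds (num_pages - 1), so one XY_CTRL_SURF_COPY_BLT covers at most 1024 x 4K = 4M */ |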
6d3581ed | 72 | #define NUM_KERNEL_PDE 15 |
dd08ebf6 | 73 | #define NUM_PT_SLOTS 32 |
09427526 | 74 | #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M |
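| /* one level-0 page table: 512 PTEs x 4K pages = 2M of mappings */ |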
348769d1 | 75 | #define MAX_NUM_PTE 512 |
2b808d6b | 76 | #define IDENTITY_OFFSET 256ULL |
dd08ebf6 | 77 | |
ca630876 MR |
78 | /* |
79 | * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest | |
80 | * legal value accepted. Since that instruction field is always stored in | |
81 | * (val-2) format, this translates to 0x400 dwords for the true maximum length | |
82 | * of the instruction. Subtracting the instruction header (1 dword) and | |
83 | * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values. | |
84 | */ | |
2d5cff2b | 85 | #define MAX_PTE_PER_SDI 0x1FEU |
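| /* 0x3FE + 2 = 0x400 dwords total; minus the opcode dword and two address dwords leaves 0x3FD dwords, i.e. 0x1FE whole qwords */ |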
ca630876 | 86 | |
e9d285ff | 87 | /** |
67d90d67 | 88 | * xe_tile_migrate_exec_queue() - Get this tile's migrate exec queue. |
08dea767 | 89 | * @tile: The tile. |
e9d285ff | 90 | * |
67d90d67 | 91 | * Returns the default migrate exec queue of this tile. |
e9d285ff | 92 | * |
67d90d67 | 93 | * Return: The default migrate exec queue |
e9d285ff | 94 | */ |
67d90d67 | 95 | struct xe_exec_queue *xe_tile_migrate_exec_queue(struct xe_tile *tile) |
dd08ebf6 | 96 | { |
9b9529ce | 97 | return tile->migrate->q; |
dd08ebf6 MB |
98 | } |
99 | ||
c045e036 | 100 | static void xe_migrate_fini(void *arg) |
dd08ebf6 MB |
101 | { |
102 | struct xe_migrate *m = arg; | |
dd08ebf6 | 103 | |
d00e9cc2 | 104 | xe_vm_lock(m->q->vm, false); |
dd08ebf6 | 105 | xe_bo_unpin(m->pt_bo); |
d00e9cc2 | 106 | xe_vm_unlock(m->q->vm); |
dd08ebf6 MB |
107 | |
108 | dma_fence_put(m->fence); | |
dd08ebf6 MB |
109 | xe_bo_put(m->pt_bo); |
110 | drm_suballoc_manager_fini(&m->vm_update_sa); | |
111 | mutex_destroy(&m->job_mutex); | |
9b9529ce FD |
112 | xe_vm_close_and_put(m->q->vm); |
113 | xe_exec_queue_put(m->q); | |
dd08ebf6 MB |
114 | } |
115 | ||
116 | static u64 xe_migrate_vm_addr(u64 slot, u32 level) | |
117 | { | |
99fea682 | 118 | XE_WARN_ON(slot >= NUM_PT_SLOTS); |
dd08ebf6 MB |
119 | |
120 | /* First slot is reserved for mapping of PT bo and bb, start from 1 */ | |
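| /* e.g. with 4K pages xe_pt_shift(1) is 21, so level-0 slots land 2M apart: slot 0 -> 2M, slot 1 -> 4M, ... */ |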
121 | return (slot + 1ULL) << xe_pt_shift(level + 1); | |
122 | } | |
123 | ||
2b808d6b | 124 | static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte) |
dd08ebf6 | 125 | { |
d9e85dd5 DK |
126 | /* |
127 | * Remove the DPA to get a correct offset into identity table for the | |
128 | * migrate offset | |
129 | */ | |
2b808d6b AJ |
130 | u64 identity_offset = IDENTITY_OFFSET; |
131 | ||
132 | if (GRAPHICS_VER(xe) >= 20 && is_comp_pte) | |
133 | identity_offset += DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G); | |
134 | ||
d9e85dd5 | 135 | addr -= xe->mem.vram.dpa_base; |
2b808d6b | 136 | return addr + (identity_offset << xe_pt_shift(2)); |
dd08ebf6 MB |
137 | } |
138 | ||
8d79acd5 AJ |
139 | static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo, |
140 | u64 map_ofs, u64 vram_offset, u16 pat_index, u64 pt_2m_ofs) | |
141 | { | |
142 | u64 pos, ofs, flags; | |
143 | u64 entry; | |
144 | /* XXX: Unclear if this should be usable_size? */ | |
145 | u64 vram_limit = xe->mem.vram.actual_physical_size + | |
146 | xe->mem.vram.dpa_base; | |
147 | u32 level = 2; | |
148 | ||
149 | ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8; | |
150 | flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, | |
151 | true, 0); | |
152 | ||
153 | xe_assert(xe, IS_ALIGNED(xe->mem.vram.usable_size, SZ_2M)); | |
154 | ||
155 | /* | |
156 | * Use 1GB pages when possible; the last chunk always uses 2M | |
157 | * pages, as mixing reserved memory (stolen, WOCPM) into a single | |
158 | * mapping is not allowed on certain platforms. | |
159 | */ | |
160 | for (pos = xe->mem.vram.dpa_base; pos < vram_limit; | |
161 | pos += SZ_1G, ofs += 8) { | |
162 | if (pos + SZ_1G >= vram_limit) { | |
163 | entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs, | |
164 | pat_index); | |
165 | xe_map_wr(xe, &bo->vmap, ofs, u64, entry); | |
166 | ||
167 | flags = vm->pt_ops->pte_encode_addr(xe, 0, | |
168 | pat_index, | |
169 | level - 1, | |
170 | true, 0); | |
171 | ||
172 | for (ofs = pt_2m_ofs; pos < vram_limit; | |
173 | pos += SZ_2M, ofs += 8) | |
174 | xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); | |
175 | break; /* Ensure pos == vram_limit assert correct */ | |
176 | } | |
177 | ||
178 | xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); | |
179 | } | |
180 | ||
181 | xe_assert(xe, pos == vram_limit); | |
182 | } | |
183 | ||
08dea767 | 184 | static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, |
dd08ebf6 MB |
185 | struct xe_vm *vm) |
186 | { | |
08dea767 | 187 | struct xe_device *xe = tile_to_xe(tile); |
e814389f | 188 | u16 pat_index = xe->pat.idx[XE_CACHE_WB]; |
08dea767 | 189 | u8 id = tile->id; |
2b808d6b AJ |
190 | u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level; |
191 | #define VRAM_IDENTITY_MAP_COUNT 2 | |
192 | u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT; | |
193 | #undef VRAM_IDENTITY_MAP_COUNT | |
dd08ebf6 | 194 | u32 map_ofs, level, i; |
876611c2 | 195 | struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo; |
2b808d6b | 196 | u64 entry, pt29_ofs; |
dd08ebf6 MB |
197 | |
198 | /* Can't bump NUM_PT_SLOTS too high */ | |
58e19acf | 199 | BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE); |
dd08ebf6 | 200 | /* Must be a multiple of 64K to support all platforms */ |
58e19acf | 201 | BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K); |
dd08ebf6 MB |
202 | /* And one slot reserved for the 4KiB page table updates */ |
203 | BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1)); | |
204 | ||
205 | /* Need to be sure everything fits in the first PT, or create more */ | |
c73acc1e | 206 | xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M); |
dd08ebf6 | 207 | |
876611c2 | 208 | bo = xe_bo_create_pin_map(vm->xe, tile, vm, |
58e19acf | 209 | num_entries * XE_PAGE_SIZE, |
dd08ebf6 | 210 | ttm_bo_type_kernel, |
62742d12 | 211 | XE_BO_FLAG_VRAM_IF_DGFX(tile) | |
febc689b | 212 | XE_BO_FLAG_PAGETABLE); |
dd08ebf6 MB |
213 | if (IS_ERR(bo)) |
214 | return PTR_ERR(bo); | |
215 | ||
2b808d6b AJ |
216 | /* PT30 & PT31 reserved for 2M identity map */ |
217 | pt29_ofs = bo->size - 3 * XE_PAGE_SIZE; | |
218 | entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs, pat_index); | |
dd08ebf6 MB |
219 | xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry); |
220 | ||
6d3581ed | 221 | map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE; |
dd08ebf6 MB |
222 | |
223 | /* Map the entire BO in our level 0 pt */ | |
224 | for (i = 0, level = 0; i < num_entries; level++) { | |
0e5e77bd | 225 | entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE, |
e814389f | 226 | pat_index, 0); |
dd08ebf6 MB |
227 | |
228 | xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry); | |
229 | ||
0d39b6da | 230 | if (vm->flags & XE_VM_FLAG_64K) |
dd08ebf6 MB |
231 | i += 16; |
232 | else | |
233 | i += 1; | |
234 | } | |
235 | ||
236 | if (!IS_DGFX(xe)) { | |
dd08ebf6 | 237 | /* Write out batch too */ |
58e19acf | 238 | m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE; |
dd08ebf6 | 239 | for (i = 0; i < batch->size; |
0d39b6da | 240 | i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : |
58e19acf | 241 | XE_PAGE_SIZE) { |
0e5e77bd | 242 | entry = vm->pt_ops->pte_encode_bo(batch, i, |
e814389f | 243 | pat_index, 0); |
dd08ebf6 MB |
244 | |
245 | xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, | |
246 | entry); | |
247 | level++; | |
248 | } | |
72f86ed3 MB |
249 | if (xe->info.has_usm) { |
250 | xe_tile_assert(tile, batch->size == SZ_1M); | |
251 | ||
252 | batch = tile->primary_gt->usm.bb_pool->bo; | |
253 | m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M; | |
254 | xe_tile_assert(tile, batch->size == SZ_512K); | |
255 | ||
256 | for (i = 0; i < batch->size; | |
257 | i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : | |
258 | XE_PAGE_SIZE) { | |
259 | entry = vm->pt_ops->pte_encode_bo(batch, i, | |
260 | pat_index, 0); | |
261 | ||
262 | xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, | |
263 | entry); | |
264 | level++; | |
265 | } | |
266 | } | |
dd08ebf6 | 267 | } else { |
937b4be7 | 268 | u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); |
dd08ebf6 | 269 | |
2b808d6b | 270 | m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false); |
dd08ebf6 | 271 | |
5a92da34 | 272 | if (xe->info.has_usm) { |
f6929e80 | 273 | batch = tile->primary_gt->usm.bb_pool->bo; |
937b4be7 | 274 | batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); |
2b808d6b | 275 | m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false); |
dd08ebf6 MB |
276 | } |
277 | } | |
278 | ||
279 | for (level = 1; level < num_level; level++) { | |
280 | u32 flags = 0; | |
281 | ||
0d39b6da | 282 | if (vm->flags & XE_VM_FLAG_64K && level == 1) |
58e19acf | 283 | flags = XE_PDE_64K; |
dd08ebf6 | 284 | |
34820967 | 285 | entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) * |
e814389f | 286 | XE_PAGE_SIZE, pat_index); |
58e19acf | 287 | xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64, |
dd08ebf6 MB |
288 | entry | flags); |
289 | } | |
290 | ||
291 | /* Write PDE's that point to our BO. */ | |
6d3581ed | 292 | for (i = 0; i < map_ofs / PAGE_SIZE; i++) { |
34820967 | 293 | entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE, |
e814389f | 294 | pat_index); |
dd08ebf6 | 295 | |
58e19acf | 296 | xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE + |
dd08ebf6 MB |
297 | (i + 1) * 8, u64, entry); |
298 | } | |
299 | ||
9116eabb HPG |
300 | /* Set up a 1GiB NULL mapping at 255GiB offset. */ |
301 | level = 2; | |
302 | xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64, | |
303 | vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0) | |
304 | | XE_PTE_NULL); | |
305 | m->cleared_mem_ofs = (255ULL << xe_pt_shift(level)); | |
306 | ||
dd08ebf6 MB |
307 | /* Identity map the entire vram at 256GiB offset */ |
308 | if (IS_DGFX(xe)) { | |
2b808d6b | 309 | u64 pt30_ofs = bo->size - 2 * XE_PAGE_SIZE; |
6d3581ed | 310 | |
2b808d6b AJ |
311 | xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET, |
312 | pat_index, pt30_ofs); | |
313 | xe_assert(xe, xe->mem.vram.actual_physical_size <= | |
314 | (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G); | |
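| /* PDE slots 256..511 of the 512-entry level-2 table each cover 1GiB, so at most 256GiB of VRAM fits in this identity map */ |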
315 | ||
316 | /* | |
317 | * Identity map the entire vram for compressed pat_index for xe2+ | |
318 | * if flat ccs is enabled. | |
319 | */ | |
320 | if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) { | |
321 | u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION]; | |
322 | u64 vram_offset = IDENTITY_OFFSET + | |
323 | DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G); | |
324 | u64 pt31_ofs = bo->size - XE_PAGE_SIZE; | |
325 | ||
326 | xe_assert(xe, xe->mem.vram.actual_physical_size <= (MAX_NUM_PTE - | |
327 | IDENTITY_OFFSET - IDENTITY_OFFSET / 2) * SZ_1G); | |
328 | xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset, | |
329 | comp_pat_index, pt31_ofs); | |
330 | } | |
dd08ebf6 MB |
331 | } |
332 | ||
333 | /* | |
334 | * Example layout created above, with root level = 3: | |
335 | * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's | |
336 | * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's | |
2b808d6b AJ |
337 | * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's |
338 | * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map] | |
dd08ebf6 MB |
339 | * |
340 | * This makes the lowest part of the VM point to the pagetables. | |
341 | * Hence the lowest 2M in the vm should point to itself; with a few writes |
342 | * and flushes, other parts of the VM can be used for either copying or |
343 | * clearing. |
344 | * | |
345 | * For performance, the kernel reserves PDE's, so about 20 are left | |
346 | * for async VM updates. | |
347 | * | |
348 | * To make this easier to work with, each scratch PT is put in slot (1 + PT #) |
349 | * everywhere; this allows lockless updates to scratch pages by using |
350 | * the different addresses in the VM. |
351 | */ | |
352 | #define NUM_VMUSA_UNIT_PER_PAGE 32 | |
58e19acf | 353 | #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) |
dd08ebf6 MB |
354 | #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) |
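| /* with a 4K XE_PAGE_SIZE: 4096 / 32 = 128 bytes per suballocation unit, i.e. 16 qword PTE writes per unit */ |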
355 | drm_suballoc_manager_init(&m->vm_update_sa, | |
34820967 | 356 | (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * |
dd08ebf6 MB |
357 | NUM_VMUSA_UNIT_PER_PAGE, 0); |
358 | ||
359 | m->pt_bo = bo; | |
360 | return 0; | |
361 | } | |
362 | ||
a043fbab | 363 | /* |
a043fbab NV |
364 | * Including the reserved copy engine is required to avoid deadlocks caused by | |
365 | * migrate jobs servicing the faults getting stuck behind the job that faulted. | |
366 | */ | |
367 | static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt) | |
368 | { | |
369 | u32 logical_mask = 0; | |
370 | struct xe_hw_engine *hwe; | |
371 | enum xe_hw_engine_id id; | |
372 | ||
373 | for_each_hw_engine(hwe, gt, id) { | |
374 | if (hwe->class != XE_ENGINE_CLASS_COPY) | |
375 | continue; | |
376 | ||
04f4a70a | 377 | if (xe_gt_is_usm_hwe(gt, hwe)) |
a043fbab NV |
378 | logical_mask |= BIT(hwe->logical_instance); |
379 | } | |
380 | ||
381 | return logical_mask; | |
382 | } | |
383 | ||
108c972a AJ |
384 | static bool xe_migrate_needs_ccs_emit(struct xe_device *xe) |
385 | { | |
386 | return xe_device_has_flat_ccs(xe) && !(GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe)); | |
387 | } | |
388 | ||
e9d285ff TH |
389 | /** |
390 | * xe_migrate_init() - Initialize a migrate context | |
08dea767 | 391 | * @tile: Back-pointer to the tile we're initializing for. |
e9d285ff TH |
392 | * |
393 | * Return: Pointer to a migrate context on success. Error pointer on error. | |
394 | */ | |
08dea767 | 395 | struct xe_migrate *xe_migrate_init(struct xe_tile *tile) |
dd08ebf6 | 396 | { |
08dea767 | 397 | struct xe_device *xe = tile_to_xe(tile); |
f6929e80 | 398 | struct xe_gt *primary_gt = tile->primary_gt; |
dd08ebf6 MB |
399 | struct xe_migrate *m; |
400 | struct xe_vm *vm; | |
dd08ebf6 MB |
401 | int err; |
402 | ||
c045e036 | 403 | m = devm_kzalloc(xe->drm.dev, sizeof(*m), GFP_KERNEL); |
dd08ebf6 MB |
404 | if (!m) |
405 | return ERR_PTR(-ENOMEM); | |
406 | ||
08dea767 | 407 | m->tile = tile; |
dd08ebf6 MB |
408 | |
409 | /* Special layout, prepared below.. */ | |
410 | vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION | | |
08dea767 | 411 | XE_VM_FLAG_SET_TILE_ID(tile)); |
dd08ebf6 MB |
412 | if (IS_ERR(vm)) |
413 | return ERR_CAST(vm); | |
414 | ||
d00e9cc2 | 415 | xe_vm_lock(vm, false); |
08dea767 | 416 | err = xe_migrate_prepare_vm(tile, m, vm); |
d00e9cc2 | 417 | xe_vm_unlock(vm); |
dd08ebf6 MB |
418 | if (err) { |
419 | xe_vm_close_and_put(vm); | |
420 | return ERR_PTR(err); | |
421 | } | |
422 | ||
5a92da34 | 423 | if (xe->info.has_usm) { |
08dea767 | 424 | struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt, |
dd08ebf6 | 425 | XE_ENGINE_CLASS_COPY, |
08dea767 | 426 | primary_gt->usm.reserved_bcs_instance, |
dd08ebf6 | 427 | false); |
a043fbab NV |
428 | u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt); |
429 | ||
430 | if (!hwe || !logical_mask) | |
dd08ebf6 MB |
431 | return ERR_PTR(-EINVAL); |
432 | ||
04f4a70a MB |
433 | /* |
434 | * XXX: Currently only reserving 1 (likely slow) BCS instance on | |
435 | * PVC, may want to revisit if performance is needed. | |
436 | */ | |
a043fbab | 437 | m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, |
923e4238 | 438 | EXEC_QUEUE_FLAG_KERNEL | |
a8004af3 | 439 | EXEC_QUEUE_FLAG_PERMANENT | |
25ce7c50 | 440 | EXEC_QUEUE_FLAG_HIGH_PRIORITY, 0); |
dd08ebf6 | 441 | } else { |
9b9529ce FD |
442 | m->q = xe_exec_queue_create_class(xe, primary_gt, vm, |
443 | XE_ENGINE_CLASS_COPY, | |
923e4238 | 444 | EXEC_QUEUE_FLAG_KERNEL | |
852856e3 | 445 | EXEC_QUEUE_FLAG_PERMANENT, 0); |
dd08ebf6 | 446 | } |
9b9529ce | 447 | if (IS_ERR(m->q)) { |
dd08ebf6 | 448 | xe_vm_close_and_put(vm); |
9b9529ce | 449 | return ERR_CAST(m->q); |
dd08ebf6 MB |
450 | } |
451 | ||
452 | mutex_init(&m->job_mutex); | |
50e52592 TH |
453 | fs_reclaim_acquire(GFP_KERNEL); |
454 | might_lock(&m->job_mutex); | |
455 | fs_reclaim_release(GFP_KERNEL); | |
dd08ebf6 | 456 | |
c045e036 | 457 | err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m); |
dd08ebf6 MB |
458 | if (err) |
459 | return ERR_PTR(err); | |
460 | ||
ef51d754 | 461 | if (IS_DGFX(xe)) { |
108c972a | 462 | if (xe_migrate_needs_ccs_emit(xe)) |
ef51d754 TH |
463 | /* min chunk size corresponds to 4K of CCS Metadata */ |
464 | m->min_chunk_size = SZ_4K * SZ_64K / | |
465 | xe_device_ccs_bytes(xe, SZ_64K); | |
466 | else | |
467 | /* Somewhat arbitrary to avoid a huge amount of blits */ | |
468 | m->min_chunk_size = SZ_64K; | |
469 | m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size); | |
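| /* e.g. assuming the usual 1 CCS byte per 256 bytes of main memory, xe_device_ccs_bytes(xe, SZ_64K) is 256 and min_chunk_size works out to SZ_1M */ |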
470 | drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n", | |
471 | (unsigned long long)m->min_chunk_size); | |
472 | } | |
473 | ||
dd08ebf6 MB |
474 | return m; |
475 | } | |
476 | ||
09427526 HPG |
477 | static u64 max_mem_transfer_per_pass(struct xe_device *xe) |
478 | { | |
479 | if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe)) | |
480 | return MAX_CCS_LIMITED_TRANSFER; | |
481 | ||
482 | return MAX_PREEMPTDISABLE_TRANSFER; | |
483 | } | |
484 | ||
ef51d754 | 485 | static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur) |
dd08ebf6 | 486 | { |
ef51d754 TH |
487 | struct xe_device *xe = tile_to_xe(m->tile); |
488 | u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining); | |
489 | ||
490 | if (mem_type_is_vram(cur->mem_type)) { | |
491 | /* | |
492 | * VRAM we want to blit in chunks with sizes aligned to | |
493 | * min_chunk_size in order for the offset to CCS metadata to be | |
494 | * page-aligned. If it's the last chunk it may be smaller. | |
495 | * | |
496 | * Another constraint is that we need to limit the blit to | |
497 | * the VRAM block size, unless size is smaller than | |
498 | * min_chunk_size. | |
499 | */ | |
500 | u64 chunk = max_t(u64, cur->size, m->min_chunk_size); | |
501 | ||
502 | size = min_t(u64, size, chunk); | |
503 | if (size > m->min_chunk_size) | |
504 | size = round_down(size, m->min_chunk_size); | |
505 | } | |
506 | ||
507 | return size; | |
508 | } | |
509 | ||
510 | static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur) | |
511 | { | |
512 | /* If the chunk is not fragmented, allow identity map. */ | |
513 | return cur->size >= size; | |
dd08ebf6 MB |
514 | } |
515 | ||
2b808d6b AJ |
516 | #define PTE_UPDATE_FLAG_IS_VRAM BIT(0) |
517 | #define PTE_UPDATE_FLAG_IS_COMP_PTE BIT(1) | |
518 | ||
dd08ebf6 | 519 | static u32 pte_update_size(struct xe_migrate *m, |
2b808d6b | 520 | u32 flags, |
c33a7219 | 521 | struct ttm_resource *res, |
dd08ebf6 MB |
522 | struct xe_res_cursor *cur, |
523 | u64 *L0, u64 *L0_ofs, u32 *L0_pt, | |
524 | u32 cmd_size, u32 pt_ofs, u32 avail_pts) | |
525 | { | |
526 | u32 cmds = 0; | |
2b808d6b AJ |
527 | bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags; |
528 | bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags; | |
dd08ebf6 MB |
529 | |
530 | *L0_pt = pt_ofs; | |
ef51d754 TH |
531 | if (is_vram && xe_migrate_allow_identity(*L0, cur)) { |
532 | /* Offset into identity map. */ | |
533 | *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), | |
2b808d6b AJ |
534 | cur->start + vram_region_gpu_offset(res), |
535 | is_comp_pte); | |
ef51d754 TH |
536 | cmds += cmd_size; |
537 | } else { | |
dd08ebf6 MB |
538 | /* Clip L0 to available size */ |
539 | u64 size = min(*L0, (u64)avail_pts * SZ_2M); | |
1408784b | 540 | u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT; |
dd08ebf6 MB |
541 | |
542 | *L0 = size; | |
543 | *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0); | |
544 | ||
545 | /* MI_STORE_DATA_IMM */ | |
ca630876 | 546 | cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI); |
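| /* 3 dwords per chunk: the MI_STORE_DATA_IMM opcode dword plus two address dwords */ |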
dd08ebf6 MB |
547 | |
548 | /* PDE qwords */ | |
549 | cmds += num_4k_pages * 2; | |
550 | ||
551 | /* Each chunk has a single blit command */ | |
552 | cmds += cmd_size; | |
dd08ebf6 MB |
553 | } |
554 | ||
555 | return cmds; | |
556 | } | |
557 | ||
558 | static void emit_pte(struct xe_migrate *m, | |
559 | struct xe_bb *bb, u32 at_pt, | |
65ef8dba | 560 | bool is_vram, bool is_comp_pte, |
dd08ebf6 | 561 | struct xe_res_cursor *cur, |
ef51d754 | 562 | u32 size, struct ttm_resource *res) |
dd08ebf6 | 563 | { |
65ef8dba | 564 | struct xe_device *xe = tile_to_xe(m->tile); |
ef51d754 | 565 | struct xe_vm *vm = m->q->vm; |
65ef8dba | 566 | u16 pat_index; |
dd08ebf6 | 567 | u32 ptes; |
34820967 | 568 | u64 ofs = (u64)at_pt * XE_PAGE_SIZE; |
dd08ebf6 MB |
569 | u64 cur_ofs; |
570 | ||
65ef8dba HPG |
571 | /* Indirect access needs the compression-enabled, uncached PAT index */ |
572 | if (GRAPHICS_VERx100(xe) >= 2000) | |
573 | pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] : | |
6a028675 | 574 | xe->pat.idx[XE_CACHE_WB]; |
65ef8dba HPG |
575 | else |
576 | pat_index = xe->pat.idx[XE_CACHE_WB]; | |
577 | ||
58e19acf | 578 | ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); |
dd08ebf6 MB |
579 | |
580 | while (ptes) { | |
ca630876 | 581 | u32 chunk = min(MAX_PTE_PER_SDI, ptes); |
dd08ebf6 | 582 | |
88fca61b | 583 | bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); |
dd08ebf6 MB |
584 | bb->cs[bb->len++] = ofs; |
585 | bb->cs[bb->len++] = 0; | |
586 | ||
587 | cur_ofs = ofs; | |
588 | ofs += chunk * 8; | |
589 | ptes -= chunk; | |
590 | ||
591 | while (chunk--) { | |
23c8495e LDM |
592 | u64 addr, flags = 0; |
593 | bool devmem = false; | |
dd08ebf6 | 594 | |
e89b384c | 595 | addr = xe_res_dma(cur) & PAGE_MASK; |
dd08ebf6 | 596 | if (is_vram) { |
ef51d754 TH |
597 | if (vm->flags & XE_VM_FLAG_64K) { |
598 | u64 va = cur_ofs * XE_PAGE_SIZE / 8; | |
599 | ||
600 | xe_assert(xe, (va & (SZ_64K - 1)) == | |
601 | (addr & (SZ_64K - 1))); | |
602 | ||
23c8495e | 603 | flags |= XE_PTE_PS64; |
dd08ebf6 MB |
604 | } |
605 | ||
ef51d754 | 606 | addr += vram_region_gpu_offset(res); |
23c8495e | 607 | devmem = true; |
dd08ebf6 | 608 | } |
23c8495e | 609 | |
ef51d754 TH |
610 | addr = vm->pt_ops->pte_encode_addr(m->tile->xe, |
611 | addr, pat_index, | |
612 | 0, devmem, flags); | |
dd08ebf6 MB |
613 | bb->cs[bb->len++] = lower_32_bits(addr); |
614 | bb->cs[bb->len++] = upper_32_bits(addr); | |
615 | ||
72e8d73b | 616 | xe_res_next(cur, min_t(u32, size, PAGE_SIZE)); |
dd08ebf6 MB |
617 | cur_ofs += 8; |
618 | } | |
619 | } | |
620 | } | |
621 | ||
622 | #define EMIT_COPY_CCS_DW 5 | |
623 | static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb, | |
624 | u64 dst_ofs, bool dst_is_indirect, | |
625 | u64 src_ofs, bool src_is_indirect, | |
626 | u32 size) | |
627 | { | |
30603b5b | 628 | struct xe_device *xe = gt_to_xe(gt); |
dd08ebf6 MB |
629 | u32 *cs = bb->cs + bb->len; |
630 | u32 num_ccs_blks; | |
9cca4902 HPG |
631 | u32 num_pages; |
632 | u32 ccs_copy_size; | |
30603b5b | 633 | u32 mocs; |
dd08ebf6 | 634 | |
9cca4902 HPG |
635 | if (GRAPHICS_VERx100(xe) >= 2000) { |
636 | num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE); | |
637 | xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1)); | |
30603b5b | 638 | |
9cca4902 | 639 | ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1); |
30603b5b | 640 | mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index); |
9cca4902 HPG |
641 | |
642 | } else { | |
643 | num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size), | |
644 | NUM_CCS_BYTES_PER_BLOCK); | |
645 | xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1)); | |
646 | ||
647 | ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1); | |
30603b5b | 648 | mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index); |
9cca4902 | 649 | } |
30603b5b | 650 | |
dd08ebf6 MB |
651 | *cs++ = XY_CTRL_SURF_COPY_BLT | |
652 | (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT | | |
653 | (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT | | |
9cca4902 | 654 | ccs_copy_size; |
dd08ebf6 | 655 | *cs++ = lower_32_bits(src_ofs); |
30603b5b | 656 | *cs++ = upper_32_bits(src_ofs) | mocs; |
dd08ebf6 | 657 | *cs++ = lower_32_bits(dst_ofs); |
30603b5b | 658 | *cs++ = upper_32_bits(dst_ofs) | mocs; |
dd08ebf6 MB |
659 | |
660 | bb->len = cs - bb->cs; | |
661 | } | |
662 | ||
663 | #define EMIT_COPY_DW 10 | |
664 | static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, | |
665 | u64 src_ofs, u64 dst_ofs, unsigned int size, | |
3e8e7ee6 | 666 | unsigned int pitch) |
dd08ebf6 | 667 | { |
4bdd8c2e | 668 | struct xe_device *xe = gt_to_xe(gt); |
30603b5b HK |
669 | u32 mocs = 0; |
670 | u32 tile_y = 0; | |
4bdd8c2e | 671 | |
270172f6 | 672 | xe_gt_assert(gt, !(pitch & 3)); |
c73acc1e FD |
673 | xe_gt_assert(gt, size / pitch <= S16_MAX); |
674 | xe_gt_assert(gt, pitch / 4 <= S16_MAX); | |
675 | xe_gt_assert(gt, pitch <= U16_MAX); | |
dd08ebf6 | 676 | |
30603b5b HK |
677 | if (GRAPHICS_VER(xe) >= 20) |
678 | mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index); | |
679 | ||
4bdd8c2e | 680 | if (GRAPHICS_VERx100(xe) >= 1250) |
30603b5b HK |
681 | tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4; |
682 | ||
683 | bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2); | |
684 | bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs; | |
dd08ebf6 MB |
685 | bb->cs[bb->len++] = 0; |
686 | bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4; | |
687 | bb->cs[bb->len++] = lower_32_bits(dst_ofs); | |
688 | bb->cs[bb->len++] = upper_32_bits(dst_ofs); | |
689 | bb->cs[bb->len++] = 0; | |
30603b5b | 690 | bb->cs[bb->len++] = pitch | mocs; |
dd08ebf6 MB |
691 | bb->cs[bb->len++] = lower_32_bits(src_ofs); |
692 | bb->cs[bb->len++] = upper_32_bits(src_ofs); | |
693 | } | |
694 | ||
dd08ebf6 MB |
695 | static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm) |
696 | { | |
697 | return usm ? m->usm_batch_base_ofs : m->batch_base_ofs; | |
698 | } | |
699 | ||
700 | static u32 xe_migrate_ccs_copy(struct xe_migrate *m, | |
701 | struct xe_bb *bb, | |
266c8588 HPG |
702 | u64 src_ofs, bool src_is_indirect, |
703 | u64 dst_ofs, bool dst_is_indirect, u32 dst_size, | |
dd08ebf6 MB |
704 | u64 ccs_ofs, bool copy_ccs) |
705 | { | |
f6929e80 | 706 | struct xe_gt *gt = m->tile->primary_gt; |
dd08ebf6 MB |
707 | u32 flush_flags = 0; |
708 | ||
523f191c | 709 | if (!copy_ccs && dst_is_indirect) { |
dd08ebf6 | 710 | /* |
a2f9f4ff MA |
711 | * If the src is already in vram, then it should already |
712 | * have been cleared by us, or has been populated by the | |
713 | * user. Make sure we copy the CCS aux state as-is. | |
714 | * | |
715 | * Otherwise if the bo doesn't have any CCS metadata attached, | |
716 | * we still need to clear it for security reasons. | |
dd08ebf6 | 717 | */ |
266c8588 | 718 | u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs; |
a2f9f4ff MA |
719 | |
720 | emit_copy_ccs(gt, bb, | |
721 | dst_ofs, true, | |
266c8588 | 722 | ccs_src_ofs, src_is_indirect, dst_size); |
a2f9f4ff | 723 | |
dd08ebf6 MB |
724 | flush_flags = MI_FLUSH_DW_CCS; |
725 | } else if (copy_ccs) { | |
266c8588 | 726 | if (!src_is_indirect) |
dd08ebf6 | 727 | src_ofs = ccs_ofs; |
266c8588 | 728 | else if (!dst_is_indirect) |
dd08ebf6 MB |
729 | dst_ofs = ccs_ofs; |
730 | ||
266c8588 | 731 | xe_gt_assert(gt, src_is_indirect || dst_is_indirect); |
dd08ebf6 | 732 | |
266c8588 HPG |
733 | emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs, |
734 | src_is_indirect, dst_size); | |
735 | if (dst_is_indirect) | |
dd08ebf6 MB |
736 | flush_flags = MI_FLUSH_DW_CCS; |
737 | } | |
738 | ||
739 | return flush_flags; | |
740 | } | |
741 | ||
e9d285ff TH |
742 | /** |
743 | * xe_migrate_copy() - Copy content of TTM resources. | |
744 | * @m: The migration context. | |
3690a01b TH |
745 | * @src_bo: The buffer object @src is currently bound to. |
746 | * @dst_bo: If copying between resources created for the same bo, set this to | |
747 | * the same value as @src_bo. If copying between buffer objects, set it to | |
748 | * the buffer object @dst is currently bound to. | |
e9d285ff TH |
749 | * @src: The source TTM resource. |
750 | * @dst: The dst TTM resource. | |
266c8588 | 751 | * @copy_only_ccs: If true, copy only CCS metadata |
e9d285ff TH |
752 | * |
753 | * Copies the contents of @src to @dst. On flat CCS devices, |
754 | * the CCS metadata is copied as well if needed; if not present, |
755 | * the CCS metadata of @dst is cleared for security reasons. |
e9d285ff TH |
756 | * |
757 | * Return: Pointer to a dma_fence representing the last copy batch, or | |
758 | * an error pointer on failure. If there is a failure, any copy operation | |
759 | * started by the function call has been synced. | |
760 | */ | |
dd08ebf6 | 761 | struct dma_fence *xe_migrate_copy(struct xe_migrate *m, |
3690a01b TH |
762 | struct xe_bo *src_bo, |
763 | struct xe_bo *dst_bo, | |
dd08ebf6 | 764 | struct ttm_resource *src, |
266c8588 HPG |
765 | struct ttm_resource *dst, |
766 | bool copy_only_ccs) | |
dd08ebf6 | 767 | { |
f6929e80 | 768 | struct xe_gt *gt = m->tile->primary_gt; |
dd08ebf6 MB |
769 | struct xe_device *xe = gt_to_xe(gt); |
770 | struct dma_fence *fence = NULL; | |
3690a01b | 771 | u64 size = src_bo->size; |
dd08ebf6 MB |
772 | struct xe_res_cursor src_it, dst_it, ccs_it; |
773 | u64 src_L0_ofs, dst_L0_ofs; | |
774 | u32 src_L0_pt, dst_L0_pt; | |
775 | u64 src_L0, dst_L0; | |
776 | int pass = 0; | |
777 | int err; | |
266c8588 HPG |
778 | bool src_is_pltt = src->mem_type == XE_PL_TT; |
779 | bool dst_is_pltt = dst->mem_type == XE_PL_TT; | |
dd08ebf6 MB |
780 | bool src_is_vram = mem_type_is_vram(src->mem_type); |
781 | bool dst_is_vram = mem_type_is_vram(dst->mem_type); | |
58fa61ce MA |
782 | bool type_device = src_bo->ttm.type == ttm_bo_type_device; |
783 | bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe); | |
3690a01b TH |
784 | bool copy_ccs = xe_device_has_flat_ccs(xe) && |
785 | xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo); | |
dd08ebf6 | 786 | bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram); |
58fa61ce | 787 | bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) && |
7657d7c9 | 788 | GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram; |
dd08ebf6 | 789 | |
3690a01b TH |
790 | /* Copying CCS between two different BOs is not supported yet. */ |
791 | if (XE_WARN_ON(copy_ccs && src_bo != dst_bo)) | |
792 | return ERR_PTR(-EINVAL); | |
793 | ||
794 | if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size)) | |
795 | return ERR_PTR(-EINVAL); | |
796 | ||
dd08ebf6 | 797 | if (!src_is_vram) |
a21fe5ee | 798 | xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it); |
dd08ebf6 | 799 | else |
e89b384c | 800 | xe_res_first(src, 0, size, &src_it); |
dd08ebf6 | 801 | if (!dst_is_vram) |
a21fe5ee | 802 | xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it); |
dd08ebf6 | 803 | else |
e89b384c | 804 | xe_res_first(dst, 0, size, &dst_it); |
dd08ebf6 MB |
805 | |
806 | if (copy_system_ccs) | |
a21fe5ee | 807 | xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo), |
dd08ebf6 MB |
808 | PAGE_ALIGN(xe_device_ccs_bytes(xe, size)), |
809 | &ccs_it); | |
810 | ||
811 | while (size) { | |
812 | u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */ | |
813 | struct xe_sched_job *job; | |
814 | struct xe_bb *bb; | |
523f191c | 815 | u32 flush_flags = 0; |
dd08ebf6 MB |
816 | u32 update_idx; |
817 | u64 ccs_ofs, ccs_size; | |
818 | u32 ccs_pt; | |
2b808d6b | 819 | u32 pte_flags; |
09427526 | 820 | |
5a92da34 | 821 | bool usm = xe->info.has_usm; |
09427526 | 822 | u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; |
dd08ebf6 | 823 | |
ef51d754 TH |
824 | src_L0 = xe_migrate_res_sizes(m, &src_it); |
825 | dst_L0 = xe_migrate_res_sizes(m, &dst_it); | |
dd08ebf6 MB |
826 | |
827 | drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n", | |
828 | pass++, src_L0, dst_L0); | |
829 | ||
830 | src_L0 = min(src_L0, dst_L0); | |
831 | ||
2b808d6b | 832 | pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; |
523f191c | 833 | pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0; |
2b808d6b | 834 | batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0, |
dd08ebf6 | 835 | &src_L0_ofs, &src_L0_pt, 0, 0, |
09427526 | 836 | avail_pts); |
dd08ebf6 | 837 | |
2b808d6b AJ |
838 | pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; |
839 | batch_size += pte_update_size(m, pte_flags, dst, &dst_it, &src_L0, | |
dd08ebf6 | 840 | &dst_L0_ofs, &dst_L0_pt, 0, |
09427526 | 841 | avail_pts, avail_pts); |
dd08ebf6 MB |
842 | |
843 | if (copy_system_ccs) { | |
58fa61ce | 844 | xe_assert(xe, type_device); |
dd08ebf6 | 845 | ccs_size = xe_device_ccs_bytes(xe, src_L0); |
2b808d6b | 846 | batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, |
dd08ebf6 | 847 | &ccs_ofs, &ccs_pt, 0, |
09427526 HPG |
848 | 2 * avail_pts, |
849 | avail_pts); | |
ef51d754 | 850 | xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE)); |
dd08ebf6 MB |
851 | } |
852 | ||
853 | /* Add copy commands size here */ | |
266c8588 | 854 | batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) + |
58fa61ce | 855 | ((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0)); |
dd08ebf6 MB |
856 | |
857 | bb = xe_bb_new(gt, batch_size, usm); | |
858 | if (IS_ERR(bb)) { | |
859 | err = PTR_ERR(bb); | |
860 | goto err_sync; | |
861 | } | |
862 | ||
ef51d754 | 863 | if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it)) |
dd08ebf6 | 864 | xe_res_next(&src_it, src_L0); |
dd08ebf6 | 865 | else |
fee58ca1 | 866 | emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs || use_comp_pat, |
6a028675 | 867 | &src_it, src_L0, src); |
ef51d754 TH |
868 | |
869 | if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it)) | |
dd08ebf6 | 870 | xe_res_next(&dst_it, src_L0); |
ef51d754 | 871 | else |
6a028675 HPG |
872 | emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs, |
873 | &dst_it, src_L0, dst); | |
dd08ebf6 MB |
874 | |
875 | if (copy_system_ccs) | |
ef51d754 | 876 | emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src); |
dd08ebf6 MB |
877 | |
878 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; | |
879 | update_idx = bb->len; | |
880 | ||
266c8588 HPG |
881 | if (!copy_only_ccs) |
882 | emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE); | |
883 | ||
58fa61ce | 884 | if (needs_ccs_emit) |
523f191c AJ |
885 | flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, |
886 | IS_DGFX(xe) ? src_is_vram : src_is_pltt, | |
887 | dst_L0_ofs, | |
888 | IS_DGFX(xe) ? dst_is_vram : dst_is_pltt, | |
889 | src_L0, ccs_ofs, copy_ccs); | |
dd08ebf6 | 890 | |
9b9529ce | 891 | job = xe_bb_create_migration_job(m->q, bb, |
dd08ebf6 MB |
892 | xe_migrate_batch_base(m, usm), |
893 | update_idx); | |
894 | if (IS_ERR(job)) { | |
895 | err = PTR_ERR(job); | |
896 | goto err; | |
897 | } | |
898 | ||
899 | xe_sched_job_add_migrate_flush(job, flush_flags); | |
900 | if (!fence) { | |
de8390b1 FD |
901 | err = xe_sched_job_add_deps(job, src_bo->ttm.base.resv, |
902 | DMA_RESV_USAGE_BOOKKEEP); | |
3690a01b | 903 | if (!err && src_bo != dst_bo) |
de8390b1 FD |
904 | err = xe_sched_job_add_deps(job, dst_bo->ttm.base.resv, |
905 | DMA_RESV_USAGE_BOOKKEEP); | |
dd08ebf6 MB |
906 | if (err) |
907 | goto err_job; | |
908 | } | |
909 | ||
50e52592 | 910 | mutex_lock(&m->job_mutex); |
dd08ebf6 MB |
911 | xe_sched_job_arm(job); |
912 | dma_fence_put(fence); | |
913 | fence = dma_fence_get(&job->drm.s_fence->finished); | |
914 | xe_sched_job_push(job); | |
915 | ||
916 | dma_fence_put(m->fence); | |
917 | m->fence = dma_fence_get(fence); | |
918 | ||
919 | mutex_unlock(&m->job_mutex); | |
920 | ||
921 | xe_bb_free(bb, fence); | |
922 | size -= src_L0; | |
923 | continue; | |
924 | ||
925 | err_job: | |
926 | xe_sched_job_put(job); | |
927 | err: | |
dd08ebf6 MB |
928 | xe_bb_free(bb, NULL); |
929 | ||
930 | err_sync: | |
e9d285ff | 931 | /* Sync partial copy if any. FIXME: under job_mutex? */ |
dd08ebf6 MB |
932 | if (fence) { |
933 | dma_fence_wait(fence, false); | |
934 | dma_fence_put(fence); | |
935 | } | |
936 | ||
937 | return ERR_PTR(err); | |
938 | } | |
939 | ||
940 | return fence; | |
941 | } | |
942 | ||
11a2407e BV |
943 | static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, |
944 | u32 size, u32 pitch) | |
dd08ebf6 | 945 | { |
30603b5b | 946 | struct xe_device *xe = gt_to_xe(gt); |
11a2407e | 947 | u32 *cs = bb->cs + bb->len; |
11a2407e BV |
948 | u32 len = PVC_MEM_SET_CMD_LEN_DW; |
949 | ||
c690f0e6 | 950 | *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2); |
11a2407e BV |
951 | *cs++ = pitch - 1; |
952 | *cs++ = (size / pitch) - 1; | |
953 | *cs++ = pitch - 1; | |
954 | *cs++ = lower_32_bits(src_ofs); | |
955 | *cs++ = upper_32_bits(src_ofs); | |
30603b5b HK |
956 | if (GRAPHICS_VERx100(xe) >= 2000) |
957 | *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); | |
958 | else | |
959 | *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); | |
11a2407e | 960 | |
c73acc1e | 961 | xe_gt_assert(gt, cs - bb->cs == len + bb->len); |
11a2407e BV |
962 | |
963 | bb->len += len; | |
964 | } | |
965 | ||
966 | static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb, | |
967 | u64 src_ofs, u32 size, u32 pitch, bool is_vram) | |
968 | { | |
969 | struct xe_device *xe = gt_to_xe(gt); | |
dd08ebf6 MB |
970 | u32 *cs = bb->cs + bb->len; |
971 | u32 len = XY_FAST_COLOR_BLT_DW; | |
dd08ebf6 | 972 | |
11a2407e | 973 | if (GRAPHICS_VERx100(xe) < 1250) |
dd08ebf6 MB |
974 | len = 11; |
975 | ||
976 | *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 | | |
977 | (len - 2); | |
30603b5b HK |
978 | if (GRAPHICS_VERx100(xe) >= 2000) |
979 | *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) | | |
980 | (pitch - 1); | |
981 | else | |
982 | *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) | | |
983 | (pitch - 1); | |
dd08ebf6 MB |
984 | *cs++ = 0; |
985 | *cs++ = (size / pitch) << 16 | pitch / 4; | |
986 | *cs++ = lower_32_bits(src_ofs); | |
987 | *cs++ = upper_32_bits(src_ofs); | |
988 | *cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT; | |
11a2407e | 989 | *cs++ = 0; |
dd08ebf6 MB |
990 | *cs++ = 0; |
991 | *cs++ = 0; | |
992 | *cs++ = 0; | |
993 | ||
994 | if (len > 11) { | |
995 | *cs++ = 0; | |
996 | *cs++ = 0; | |
997 | *cs++ = 0; | |
998 | *cs++ = 0; | |
999 | *cs++ = 0; | |
1000 | } | |
1001 | ||
c73acc1e | 1002 | xe_gt_assert(gt, cs - bb->cs == len + bb->len); |
11a2407e | 1003 | |
dd08ebf6 | 1004 | bb->len += len; |
11a2407e BV |
1005 | } |
1006 | ||
1951dad5 | 1007 | static bool has_service_copy_support(struct xe_gt *gt) |
11a2407e | 1008 | { |
1951dad5 MR |
1009 | /* |
1010 | * What we care about is whether the architecture was designed with | |
1011 | * service copy functionality (specifically the new MEM_SET / MEM_COPY | |
1012 | * instructions) so check the architectural engine list rather than the | |
1013 | * actual list since these instructions are usable on BCS0 even if | |
1014 | * all of the actual service copy engines (BCS1-BCS8) have been fused | |
1015 | * off. | |
1016 | */ | |
61549a2e LDM |
1017 | return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8, |
1018 | XE_HW_ENGINE_BCS1); | |
1951dad5 MR |
1019 | } |
1020 | ||
1021 | static u32 emit_clear_cmd_len(struct xe_gt *gt) | |
1022 | { | |
1023 | if (has_service_copy_support(gt)) | |
11a2407e BV |
1024 | return PVC_MEM_SET_CMD_LEN_DW; |
1025 | else | |
1026 | return XY_FAST_COLOR_BLT_DW; | |
1027 | } | |
1028 | ||
1951dad5 MR |
1029 | static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, |
1030 | u32 size, u32 pitch, bool is_vram) | |
11a2407e | 1031 | { |
1951dad5 | 1032 | if (has_service_copy_support(gt)) |
11a2407e | 1033 | emit_clear_link_copy(gt, bb, src_ofs, size, pitch); |
1951dad5 | 1034 | else |
11a2407e BV |
1035 | emit_clear_main_copy(gt, bb, src_ofs, size, pitch, |
1036 | is_vram); | |
dd08ebf6 MB |
1037 | } |
1038 | ||
e9d285ff TH |
1039 | /** |
1040 | * xe_migrate_clear() - Clear content of TTM resources. |
1041 | * @m: The migration context. | |
1042 | * @bo: The buffer object @dst is currently bound to. | |
1043 | * @dst: The dst TTM resource to be cleared. | |
b8cdc47a | 1044 | * @clear_flags: flags to specify which data to clear: CCS, BO, or both. |
e9d285ff | 1045 | * |
b8cdc47a ND |
1046 | * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set. |
1047 | * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA. | |
1048 | * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear bo as well as CCS metadata. | |
e9d285ff TH |
1049 | * TODO: Eliminate the @bo argument. |
1050 | * | |
1051 | * Return: Pointer to a dma_fence representing the last clear batch, or | |
1052 | * an error pointer on failure. If there is a failure, any clear operation | |
1053 | * started by the function call has been synced. | |
1054 | */ | |
dd08ebf6 MB |
1055 | struct dma_fence *xe_migrate_clear(struct xe_migrate *m, |
1056 | struct xe_bo *bo, | |
b8cdc47a ND |
1057 | struct ttm_resource *dst, |
1058 | u32 clear_flags) | |
dd08ebf6 MB |
1059 | { |
1060 | bool clear_vram = mem_type_is_vram(dst->mem_type); | |
b8cdc47a ND |
1061 | bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags; |
1062 | bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags; | |
f6929e80 | 1063 | struct xe_gt *gt = m->tile->primary_gt; |
dd08ebf6 | 1064 | struct xe_device *xe = gt_to_xe(gt); |
b8cdc47a | 1065 | bool clear_only_system_ccs = false; |
dd08ebf6 MB |
1066 | struct dma_fence *fence = NULL; |
1067 | u64 size = bo->size; | |
1068 | struct xe_res_cursor src_it; | |
1069 | struct ttm_resource *src = dst; | |
1070 | int err; | |
dd08ebf6 | 1071 | |
b8cdc47a ND |
1072 | if (WARN_ON(!clear_bo_data && !clear_ccs)) |
1073 | return NULL; | |
1074 | ||
1075 | if (!clear_bo_data && clear_ccs && !IS_DGFX(xe)) | |
1076 | clear_only_system_ccs = true; | |
1077 | ||
dd08ebf6 | 1078 | if (!clear_vram) |
a21fe5ee | 1079 | xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it); |
dd08ebf6 MB |
1080 | else |
1081 | xe_res_first(src, 0, bo->size, &src_it); | |
1082 | ||
1083 | while (size) { | |
1084 | u64 clear_L0_ofs; | |
1085 | u32 clear_L0_pt; | |
1086 | u32 flush_flags = 0; | |
1087 | u64 clear_L0; | |
1088 | struct xe_sched_job *job; | |
1089 | struct xe_bb *bb; | |
1090 | u32 batch_size, update_idx; | |
2b808d6b | 1091 | u32 pte_flags; |
09427526 | 1092 | |
5a92da34 | 1093 | bool usm = xe->info.has_usm; |
09427526 HPG |
1094 | u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; |
1095 | ||
ef51d754 | 1096 | clear_L0 = xe_migrate_res_sizes(m, &src_it); |
dd08ebf6 | 1097 | |
dd08ebf6 | 1098 | /* Calculate final sizes and batch size.. */ |
2b808d6b | 1099 | pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; |
dd08ebf6 | 1100 | batch_size = 2 + |
2b808d6b | 1101 | pte_update_size(m, pte_flags, src, &src_it, |
dd08ebf6 | 1102 | &clear_L0, &clear_L0_ofs, &clear_L0_pt, |
b8cdc47a | 1103 | clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0, |
09427526 | 1104 | avail_pts); |
266c8588 | 1105 | |
108c972a | 1106 | if (xe_migrate_needs_ccs_emit(xe)) |
dd08ebf6 MB |
1107 | batch_size += EMIT_COPY_CCS_DW; |
1108 | ||
1109 | /* Clear commands */ | |
1110 | ||
1111 | if (WARN_ON_ONCE(!clear_L0)) | |
1112 | break; | |
1113 | ||
1114 | bb = xe_bb_new(gt, batch_size, usm); | |
1115 | if (IS_ERR(bb)) { | |
1116 | err = PTR_ERR(bb); | |
1117 | goto err_sync; | |
1118 | } | |
1119 | ||
1120 | size -= clear_L0; | |
dd08ebf6 | 1121 | /* Preemption is enabled again by the ring ops. */ |
ef51d754 | 1122 | if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) |
dd08ebf6 | 1123 | xe_res_next(&src_it, clear_L0); |
ef51d754 | 1124 | else |
b8cdc47a | 1125 | emit_pte(m, bb, clear_L0_pt, clear_vram, clear_only_system_ccs, |
6a028675 | 1126 | &src_it, clear_L0, dst); |
ef51d754 | 1127 | |
dd08ebf6 MB |
1128 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; |
1129 | update_idx = bb->len; | |
1130 | ||
b8cdc47a | 1131 | if (clear_bo_data) |
266c8588 HPG |
1132 | emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram); |
1133 | ||
108c972a | 1134 | if (xe_migrate_needs_ccs_emit(xe)) { |
dd08ebf6 | 1135 | emit_copy_ccs(gt, bb, clear_L0_ofs, true, |
9116eabb | 1136 | m->cleared_mem_ofs, false, clear_L0); |
dd08ebf6 MB |
1137 | flush_flags = MI_FLUSH_DW_CCS; |
1138 | } | |
1139 | ||
9b9529ce | 1140 | job = xe_bb_create_migration_job(m->q, bb, |
dd08ebf6 MB |
1141 | xe_migrate_batch_base(m, usm), |
1142 | update_idx); | |
1143 | if (IS_ERR(job)) { | |
1144 | err = PTR_ERR(job); | |
1145 | goto err; | |
1146 | } | |
1147 | ||
1148 | xe_sched_job_add_migrate_flush(job, flush_flags); | |
a667cf56 MA |
1149 | if (!fence) { |
1150 | /* | |
1151 | * There can't be anything userspace related at this | |
1152 | * point, so we just need to respect any potential move | |
1153 | * fences, which are always tracked as | |
1154 | * DMA_RESV_USAGE_KERNEL. | |
1155 | */ | |
de8390b1 FD |
1156 | err = xe_sched_job_add_deps(job, bo->ttm.base.resv, |
1157 | DMA_RESV_USAGE_KERNEL); | |
a667cf56 MA |
1158 | if (err) |
1159 | goto err_job; | |
1160 | } | |
dd08ebf6 | 1161 | |
50e52592 | 1162 | mutex_lock(&m->job_mutex); |
dd08ebf6 MB |
1163 | xe_sched_job_arm(job); |
1164 | dma_fence_put(fence); | |
1165 | fence = dma_fence_get(&job->drm.s_fence->finished); | |
1166 | xe_sched_job_push(job); | |
1167 | ||
1168 | dma_fence_put(m->fence); | |
1169 | m->fence = dma_fence_get(fence); | |
1170 | ||
1171 | mutex_unlock(&m->job_mutex); | |
1172 | ||
1173 | xe_bb_free(bb, fence); | |
1174 | continue; | |
1175 | ||
a667cf56 MA |
1176 | err_job: |
1177 | xe_sched_job_put(job); | |
dd08ebf6 | 1178 | err: |
dd08ebf6 MB |
1179 | xe_bb_free(bb, NULL); |
1180 | err_sync: | |
e9d285ff | 1181 | /* Sync partial copies if any. FIXME: job_mutex? */ |
dd08ebf6 | 1182 | if (fence) { |
762b7e95 | 1183 | dma_fence_wait(fence, false); |
dd08ebf6 MB |
1184 | dma_fence_put(fence); |
1185 | } | |
1186 | ||
1187 | return ERR_PTR(err); | |
1188 | } | |
1189 | ||
b8cdc47a | 1190 | if (clear_ccs) |
266c8588 HPG |
1191 | bo->ccs_cleared = true; |
1192 | ||
dd08ebf6 MB |
1193 | return fence; |
1194 | } | |
1195 | ||
876611c2 | 1196 | static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs, |
e8babb28 | 1197 | const struct xe_vm_pgtable_update_op *pt_op, |
dd08ebf6 MB |
1198 | const struct xe_vm_pgtable_update *update, |
1199 | struct xe_migrate_pt_update *pt_update) | |
1200 | { | |
1201 | const struct xe_migrate_pt_update_ops *ops = pt_update->ops; | |
1202 | u32 chunk; | |
1203 | u32 ofs = update->ofs, size = update->qwords; | |
1204 | ||
1205 | /* | |
1206 | * If we have 512 entries (max), we would populate it ourselves, | |
1207 | * and update the PDE above it to the new pointer. | |
1208 | * The only time this can happen is if we have to update the top |
1209 | * PDE. This requires a BO that is almost vm->size big. | |
1210 | * | |
1211 | * This shouldn't be possible in practice.. might change when 16K | |
c73acc1e | 1212 | * pages are used. Hence the assert. |
dd08ebf6 | 1213 | */ |
348769d1 | 1214 | xe_tile_assert(tile, update->qwords < MAX_NUM_PTE); |
d9e85dd5 DK |
1215 | if (!ppgtt_ofs) |
1216 | ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile), | |
1217 | xe_bo_addr(update->pt_bo, 0, | |
2b808d6b | 1218 | XE_PAGE_SIZE), false); |
dd08ebf6 MB |
1219 | |
1220 | do { | |
1221 | u64 addr = ppgtt_ofs + ofs * 8; | |
3e8e7ee6 | 1222 | |
43d48379 | 1223 | chunk = min(size, MAX_PTE_PER_SDI); |
dd08ebf6 MB |
1224 | |
1225 | /* Ensure populatefn can do memset64 by aligning bb->cs */ | |
1226 | if (!(bb->len & 1)) | |
1227 | bb->cs[bb->len++] = MI_NOOP; | |
1228 | ||
88fca61b | 1229 | bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); |
dd08ebf6 MB |
1230 | bb->cs[bb->len++] = lower_32_bits(addr); |
1231 | bb->cs[bb->len++] = upper_32_bits(addr); | |
e8babb28 MB |
1232 | if (pt_op->bind) |
1233 | ops->populate(pt_update, tile, NULL, bb->cs + bb->len, | |
1234 | ofs, chunk, update); | |
1235 | else | |
1236 | ops->clear(pt_update, tile, NULL, bb->cs + bb->len, | |
1237 | ofs, chunk, update); | |
dd08ebf6 MB |
1238 | |
1239 | bb->len += chunk * 2; | |
1240 | ofs += chunk; | |
1241 | size -= chunk; | |
1242 | } while (size); | |
1243 | } | |
1244 | ||
1245 | struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m) | |
1246 | { | |
9b9529ce | 1247 | return xe_vm_get(m->q->vm); |
dd08ebf6 MB |
1248 | } |
1249 | ||
7cba3396 TH |
1250 | #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) |
1251 | struct migrate_test_params { | |
1252 | struct xe_test_priv base; | |
1253 | bool force_gpu; | |
1254 | }; | |
1255 | ||
1256 | #define to_migrate_test_params(_priv) \ | |
1257 | container_of(_priv, struct migrate_test_params, base) | |
1258 | #endif | |
1259 | ||
dd08ebf6 MB |
1260 | static struct dma_fence * |
1261 | xe_migrate_update_pgtables_cpu(struct xe_migrate *m, | |
dd08ebf6 MB |
1262 | struct xe_migrate_pt_update *pt_update) |
1263 | { | |
7cba3396 TH |
1264 | XE_TEST_DECLARE(struct migrate_test_params *test = |
1265 | to_migrate_test_params | |
1266 | (xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));) | |
dd08ebf6 | 1267 | const struct xe_migrate_pt_update_ops *ops = pt_update->ops; |
e8babb28 MB |
1268 | struct xe_vm *vm = pt_update->vops->vm; |
1269 | struct xe_vm_pgtable_update_ops *pt_update_ops = | |
1270 | &pt_update->vops->pt_update_ops[pt_update->tile_id]; | |
dd08ebf6 | 1271 | int err; |
e8babb28 | 1272 | u32 i, j; |
dd08ebf6 | 1273 | |
7cba3396 TH |
1274 | if (XE_TEST_ONLY(test && test->force_gpu)) |
1275 | return ERR_PTR(-ETIME); | |
1276 | ||
dd08ebf6 | 1277 | if (ops->pre_commit) { |
fd84041d | 1278 | pt_update->job = NULL; |
dd08ebf6 MB |
1279 | err = ops->pre_commit(pt_update); |
1280 | if (err) | |
1281 | return ERR_PTR(err); | |
1282 | } | |
dd08ebf6 | 1283 | |
e8babb28 MB |
1284 | for (i = 0; i < pt_update_ops->num_ops; ++i) { |
1285 | const struct xe_vm_pgtable_update_op *pt_op = | |
1286 | &pt_update_ops->ops[i]; | |
1287 | ||
1288 | for (j = 0; j < pt_op->num_entries; j++) { | |
1289 | const struct xe_vm_pgtable_update *update = | |
1290 | &pt_op->entries[j]; | |
1291 | ||
1292 | if (pt_op->bind) | |
1293 | ops->populate(pt_update, m->tile, | |
1294 | &update->pt_bo->vmap, NULL, | |
1295 | update->ofs, update->qwords, | |
1296 | update); | |
1297 | else | |
1298 | ops->clear(pt_update, m->tile, | |
1299 | &update->pt_bo->vmap, NULL, | |
1300 | update->ofs, update->qwords, update); | |
a856b67a | 1301 | } |
eb9702ad | 1302 | } |
dd08ebf6 | 1303 | |
e8babb28 MB |
1304 | trace_xe_vm_cpu_bind(vm); |
1305 | xe_device_wmb(vm->xe); | |
1306 | ||
1307 | return dma_fence_get_stub(); | |
dd08ebf6 MB |
1308 | } |
1309 | ||
e8babb28 MB |
1310 | static struct dma_fence * |
1311 | __xe_migrate_update_pgtables(struct xe_migrate *m, | |
1312 | struct xe_migrate_pt_update *pt_update, | |
1313 | struct xe_vm_pgtable_update_ops *pt_update_ops) | |
dd08ebf6 MB |
1314 | { |
1315 | const struct xe_migrate_pt_update_ops *ops = pt_update->ops; | |
08dea767 | 1316 | struct xe_tile *tile = m->tile; |
f6929e80 | 1317 | struct xe_gt *gt = tile->primary_gt; |
08dea767 | 1318 | struct xe_device *xe = tile_to_xe(tile); |
dd08ebf6 MB |
1319 | struct xe_sched_job *job; |
1320 | struct dma_fence *fence; | |
1321 | struct drm_suballoc *sa_bo = NULL; | |
dd08ebf6 | 1322 | struct xe_bb *bb; |
e8babb28 MB |
1323 | u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs = 0; |
1324 | u32 num_updates = 0, current_update = 0; | |
dd08ebf6 MB |
1325 | u64 addr; |
1326 | int err = 0; | |
e8babb28 MB |
1327 | bool is_migrate = pt_update_ops->q == m->q; |
1328 | bool usm = is_migrate && xe->info.has_usm; | |
1329 | ||
1330 | for (i = 0; i < pt_update_ops->num_ops; ++i) { | |
1331 | struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[i]; | |
1332 | struct xe_vm_pgtable_update *updates = pt_op->entries; | |
1333 | ||
1334 | num_updates += pt_op->num_entries; | |
1335 | for (j = 0; j < pt_op->num_entries; ++j) { | |
1336 | u32 num_cmds = DIV_ROUND_UP(updates[j].qwords, | |
1337 | MAX_PTE_PER_SDI); | |
dd08ebf6 | 1338 | |
e8babb28 MB |
1339 | /* align noop + MI_STORE_DATA_IMM cmd prefix */ |
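| /* i.e. up to 4 dwords of prefix per command (MI_NOOP pad + 3-dword MI_STORE_DATA_IMM header) plus 2 dwords per qword written */ |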
1340 | batch_size += 4 * num_cmds + updates[j].qwords * 2; | |
1341 | } | |
dd08ebf6 MB |
1342 | } |
1343 | ||
1344 | /* fixed + PTE entries */ | |
1345 | if (IS_DGFX(xe)) | |
e8babb28 | 1346 | batch_size += 2; |
dd08ebf6 | 1347 | else |
e8babb28 MB |
1348 | batch_size += 6 * (num_updates / MAX_PTE_PER_SDI + 1) + |
1349 | num_updates * 2; | |
dd08ebf6 | 1350 | |
e8babb28 | 1351 | bb = xe_bb_new(gt, batch_size, usm); |
dd08ebf6 MB |
1352 | if (IS_ERR(bb)) |
1353 | return ERR_CAST(bb); | |
1354 | ||
1355 | /* For sysmem PTE's, need to map them in our hole.. */ | |
1356 | if (!IS_DGFX(xe)) { | |
f3dc9246 | 1357 | u16 pat_index = xe->pat.idx[XE_CACHE_WB]; |
e8babb28 MB |
1358 | u32 ptes, ofs; |
1359 | ||
dd08ebf6 | 1360 | ppgtt_ofs = NUM_KERNEL_PDE - 1; |
e8babb28 MB |
1361 | if (!is_migrate) { |
1362 | u32 num_units = DIV_ROUND_UP(num_updates, | |
1363 | NUM_VMUSA_WRITES_PER_UNIT); | |
dd08ebf6 | 1364 | |
e8babb28 MB |
1365 | if (num_units > m->vm_update_sa.size) { |
1366 | err = -ENOBUFS; | |
1367 | goto err_bb; | |
1368 | } | |
1369 | sa_bo = drm_suballoc_new(&m->vm_update_sa, num_units, | |
dd08ebf6 MB |
1370 | GFP_KERNEL, true, 0); |
1371 | if (IS_ERR(sa_bo)) { | |
1372 | err = PTR_ERR(sa_bo); | |
ce6b6333 | 1373 | goto err_bb; |
dd08ebf6 MB |
1374 | } |
1375 | ||
1376 | ppgtt_ofs = NUM_KERNEL_PDE + | |
1377 | (drm_suballoc_soffset(sa_bo) / | |
1378 | NUM_VMUSA_UNIT_PER_PAGE); | |
1379 | page_ofs = (drm_suballoc_soffset(sa_bo) % | |
1380 | NUM_VMUSA_UNIT_PER_PAGE) * | |
1381 | VM_SA_UPDATE_UNIT_SIZE; | |
1382 | } | |
1383 | ||
dd08ebf6 | 1384 | /* Map our PT's to gtt */ |
e8babb28 MB |
1385 | i = 0; |
1386 | j = 0; | |
1387 | ptes = num_updates; | |
1388 | ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs; | |
1389 | while (ptes) { | |
1390 | u32 chunk = min(MAX_PTE_PER_SDI, ptes); | |
1391 | u32 idx = 0; | |
1392 | ||
1393 | bb->cs[bb->len++] = MI_STORE_DATA_IMM | | |
88fca61b | 1394 | MI_SDI_NUM_QW(chunk); |
e8babb28 MB |
1395 | bb->cs[bb->len++] = ofs; |
1396 | bb->cs[bb->len++] = 0; /* upper_32_bits */ | |
1397 | ||
1398 | for (; i < pt_update_ops->num_ops; ++i) { | |
1399 | struct xe_vm_pgtable_update_op *pt_op = | |
1400 | &pt_update_ops->ops[i]; | |
1401 | struct xe_vm_pgtable_update *updates = pt_op->entries; | |
1402 | ||
1403 | for (; j < pt_op->num_entries; ++j, ++current_update, ++idx) { | |
1404 | struct xe_vm *vm = pt_update->vops->vm; | |
1405 | struct xe_bo *pt_bo = updates[j].pt_bo; | |
1406 | ||
1407 | if (idx == chunk) | |
1408 | goto next_cmd; | |
1409 | ||
1410 | xe_tile_assert(tile, pt_bo->size == SZ_4K); | |
1411 | ||
1412 | /* Map a PT at most once */ | |
1413 | if (pt_bo->update_index < 0) | |
1414 | pt_bo->update_index = current_update; | |
1415 | ||
1416 | addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, | |
f3dc9246 | 1417 | pat_index, 0); |
e8babb28 MB |
1418 | bb->cs[bb->len++] = lower_32_bits(addr); |
1419 | bb->cs[bb->len++] = upper_32_bits(addr); | |
1420 | } | |
dd08ebf6 | 1421 | |
e8babb28 MB |
1422 | j = 0; |
1423 | } | |
dd08ebf6 | 1424 | |
e8babb28 MB |
1425 | next_cmd: |
1426 | ptes -= chunk; | |
1427 | ofs += chunk * sizeof(u64); | |
dd08ebf6 MB |
1428 | } |
1429 | ||
1430 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; | |
1431 | update_idx = bb->len; | |
1432 | ||
1433 | addr = xe_migrate_vm_addr(ppgtt_ofs, 0) + | |
58e19acf | 1434 | (page_ofs / sizeof(u64)) * XE_PAGE_SIZE; |
e8babb28 MB |
1435 | for (i = 0; i < pt_update_ops->num_ops; ++i) { |
1436 | struct xe_vm_pgtable_update_op *pt_op = | |
1437 | &pt_update_ops->ops[i]; | |
1438 | struct xe_vm_pgtable_update *updates = pt_op->entries; | |
1439 | ||
1440 | for (j = 0; j < pt_op->num_entries; ++j) { | |
1441 | struct xe_bo *pt_bo = updates[j].pt_bo; | |
1442 | ||
1443 | write_pgtable(tile, bb, addr + | |
1444 | pt_bo->update_index * XE_PAGE_SIZE, | |
1445 | pt_op, &updates[j], pt_update); | |
1446 | } | |
1447 | } | |
dd08ebf6 MB |
1448 | } else { |
1449 | /* phys pages, no preamble required */ | |
1450 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; | |
1451 | update_idx = bb->len; | |
1452 | ||
e8babb28 MB |
1453 | for (i = 0; i < pt_update_ops->num_ops; ++i) { |
1454 | struct xe_vm_pgtable_update_op *pt_op = | |
1455 | &pt_update_ops->ops[i]; | |
1456 | struct xe_vm_pgtable_update *updates = pt_op->entries; | |
1457 | ||
1458 | for (j = 0; j < pt_op->num_entries; ++j) | |
1459 | write_pgtable(tile, bb, 0, pt_op, &updates[j], | |
1460 | pt_update); | |
1461 | } | |
dd08ebf6 MB |
1462 | } |
1463 | ||
e8babb28 | 1464 | job = xe_bb_create_migration_job(pt_update_ops->q, bb, |
dd08ebf6 MB |
1465 | xe_migrate_batch_base(m, usm), |
1466 | update_idx); | |
1467 | if (IS_ERR(job)) { | |
1468 | err = PTR_ERR(job); | |
ce6b6333 | 1469 | goto err_sa; |
dd08ebf6 MB |
1470 | } |
1471 | ||
dd08ebf6 | 1472 | if (ops->pre_commit) { |
fd84041d | 1473 | pt_update->job = job; |
dd08ebf6 MB |
1474 | err = ops->pre_commit(pt_update); |
1475 | if (err) | |
1476 | goto err_job; | |
1477 | } | |
e8babb28 | 1478 | if (is_migrate) |
50e52592 TH |
1479 | mutex_lock(&m->job_mutex); |
1480 | ||
dd08ebf6 MB |
1481 | xe_sched_job_arm(job); |
1482 | fence = dma_fence_get(&job->drm.s_fence->finished); | |
1483 | xe_sched_job_push(job); | |
1484 | ||
e8babb28 | 1485 | if (is_migrate) |
dd08ebf6 MB |
1486 | mutex_unlock(&m->job_mutex); |
1487 | ||
1488 | xe_bb_free(bb, fence); | |
1489 | drm_suballoc_free(sa_bo, fence); | |
1490 | ||
1491 | return fence; | |
1492 | ||
1493 | err_job: | |
1494 | xe_sched_job_put(job); | |
ce6b6333 MA |
1495 | err_sa: |
1496 | drm_suballoc_free(sa_bo, NULL); | |
dd08ebf6 | 1497 | err_bb: |
dd08ebf6 | 1498 | xe_bb_free(bb, NULL); |
dd08ebf6 MB |
1499 | return ERR_PTR(err); |
1500 | } | |
1501 | ||
e8babb28 MB |
1502 | /** |
1503 | * xe_migrate_update_pgtables() - Pipelined page-table update | |
1504 | * @m: The migrate context. | |
1505 | * @pt_update: PT update arguments | |
1506 | * | |
1507 | * Perform a pipelined page-table update. The update descriptors are typically | |
1508 | * built under the same lock critical section as a call to this function. If | |
1509 | * using the default engine for the updates, they will be performed in the | |
1510 | * order they grab the job_mutex. If different engines are used, external | |
1511 | * synchronization is needed for overlapping updates to maintain page-table | |
75fd04f2 | 1512 | * consistency. Note that the meaning of "overlapping" is that the updates |
e8babb28 MB |
1513 | * touch the same page-table, which might be a higher-level page-directory. |
1514 | * If no pipelining is needed, then updates may be performed by the CPU. |
1515 | * | |
1516 | * Return: A dma_fence that, when signaled, indicates the update completion. | |
1517 | */ | |
1518 | struct dma_fence * | |
1519 | xe_migrate_update_pgtables(struct xe_migrate *m, | |
1520 | struct xe_migrate_pt_update *pt_update) | |
1521 | ||
1522 | { | |
1523 | struct xe_vm_pgtable_update_ops *pt_update_ops = | |
1524 | &pt_update->vops->pt_update_ops[pt_update->tile_id]; | |
1525 | struct dma_fence *fence; | |
1526 | ||
1527 | fence = xe_migrate_update_pgtables_cpu(m, pt_update); | |
1528 | ||
1529 | /* -ETIME indicates a job is needed, anything else is a legitimate error */ |
1530 | if (!IS_ERR(fence) || PTR_ERR(fence) != -ETIME) | |
1531 | return fence; | |
1532 | ||
1533 | return __xe_migrate_update_pgtables(m, pt_update, pt_update_ops); | |
1534 | } | |
1535 | ||
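Editorial note: a minimal caller-side sketch of the function above, assuming a struct xe_migrate_pt_update that was already populated under the VM lock as the kernel-doc describes. The helper name is hypothetical and real callers would normally install the returned fence in dma-resv objects or out-syncs rather than block on it.

	static int example_commit_pt_update(struct xe_migrate *m,
					    struct xe_migrate_pt_update *pt_update)
	{
		struct dma_fence *fence;

		/* CPU path is tried first internally; -ETIME falls back to a GPU job */
		fence = xe_migrate_update_pgtables(m, pt_update);
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		/* Blocking wait keeps the sketch short */
		dma_fence_wait(fence, false);
		dma_fence_put(fence);
		return 0;
	}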
e9d285ff TH |
1536 | /** |
1537 | * xe_migrate_wait() - Complete all operations using the xe_migrate context | |
1538 | * @m: Migrate context to wait for. | |
1539 | * | |
1540 | * Waits until the GPU no longer uses the migrate context's default engine | |
1541 | * or its page-table objects. FIXME: What about separate page-table update | |
1542 | * engines? | |
1543 | */ | |
dd08ebf6 MB |
1544 | void xe_migrate_wait(struct xe_migrate *m) |
1545 | { | |
1546 | if (m->fence) | |
1547 | dma_fence_wait(m->fence, false); | |
1548 | } | |
1549 | ||
9c44fd5f MB |
1550 | static u32 pte_update_cmd_size(u64 size) |
1551 | { | |
1552 | u32 num_dword; | |
c9092257 | 1553 | u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE); |
9c44fd5f MB |
1554 | |
1555 | XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER); | |
2d5cff2b | 1556 | |
9c44fd5f MB |
1557 | /* |
1558 | * The MI_STORE_DATA_IMM command is used to update the page table. Each |
2d5cff2b JY |
1559 | * instruction can update at most MAX_PTE_PER_SDI PTE entries. To |
1560 | * update n (n <= MAX_PTE_PER_SDI) PTE entries, we need: |
1561 | * | |
1562 | * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc.) |
1563 | * - 2 dwords for the page table's physical location |
1564 | * - 2*n dwords for the PTE values to fill (each PTE entry is 2 dwords) |
9c44fd5f | 1565 | */ |
2d5cff2b | 1566 | num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI); |
9c44fd5f MB |
1567 | num_dword += entries * 2; |
1568 | ||
1569 | return num_dword; | |
1570 | } | |
1571 | ||
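Editorial note: a worked instance of the dword accounting above, assuming MAX_PTE_PER_SDI is 0x1FE (510) qwords as the comment at the top of this file describes; the helper is illustrative only.

	static inline u32 example_pte_update_dwords_for_8m(void)
	{
		u32 entries = SZ_8M / XE_PAGE_SIZE;	/* 2048 PTEs */
		u32 cmds = DIV_ROUND_UP(entries, 510);	/* 5 MI_STORE_DATA_IMM commands */

		/* 5 * (1 + 2) header/address dwords + 2048 * 2 PTE dwords = 4111 */
		return cmds * 3 + entries * 2;
	}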
1572 | static void build_pt_update_batch_sram(struct xe_migrate *m, | |
1573 | struct xe_bb *bb, u32 pt_offset, | |
1574 | dma_addr_t *sram_addr, u32 size) | |
1575 | { | |
1576 | u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB]; | |
1577 | u32 ptes; | |
1578 | int i = 0; | |
1579 | ||
1580 | ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); | |
1581 | while (ptes) { | |
2d5cff2b | 1582 | u32 chunk = min(MAX_PTE_PER_SDI, ptes); |
9c44fd5f MB |
1583 | |
1584 | bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); | |
1585 | bb->cs[bb->len++] = pt_offset; | |
1586 | bb->cs[bb->len++] = 0; | |
1587 | ||
1588 | pt_offset += chunk * 8; | |
1589 | ptes -= chunk; | |
1590 | ||
1591 | while (chunk--) { | |
1592 | u64 addr = sram_addr[i++] & PAGE_MASK; | |
1593 | ||
1594 | xe_tile_assert(m->tile, addr); | |
1595 | addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, | |
1596 | addr, pat_index, | |
1597 | 0, false, 0); | |
1598 | bb->cs[bb->len++] = lower_32_bits(addr); | |
1599 | bb->cs[bb->len++] = upper_32_bits(addr); | |
1600 | } | |
1601 | } | |
1602 | } | |
1603 | ||
1604 | enum xe_migrate_copy_dir { | |
1605 | XE_MIGRATE_COPY_TO_VRAM, | |
1606 | XE_MIGRATE_COPY_TO_SRAM, | |
1607 | }; | |
1608 | ||
270172f6 MB |
1609 | #define XE_CACHELINE_BYTES 64ull |
1610 | #define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1) | |
1611 | ||
9c44fd5f | 1612 | static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, |
270172f6 MB |
1613 | unsigned long len, |
1614 | unsigned long sram_offset, | |
9c44fd5f MB |
1615 | dma_addr_t *sram_addr, u64 vram_addr, |
1616 | const enum xe_migrate_copy_dir dir) | |
1617 | { | |
1618 | struct xe_gt *gt = m->tile->primary_gt; | |
1619 | struct xe_device *xe = gt_to_xe(gt); | |
4c200754 | 1620 | bool use_usm_batch = xe->info.has_usm; |
9c44fd5f MB |
1621 | struct dma_fence *fence = NULL; |
1622 | u32 batch_size = 2; | |
1623 | u64 src_L0_ofs, dst_L0_ofs; | |
9c44fd5f MB |
1624 | struct xe_sched_job *job; |
1625 | struct xe_bb *bb; | |
1626 | u32 update_idx, pt_slot = 0; | |
270172f6 MB |
1627 | unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE); |
1628 | unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ? | |
1629 | PAGE_SIZE : 4; | |
9c44fd5f MB |
1630 | int err; |
1631 | ||
270172f6 MB |
1632 | if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) || |
1633 | (sram_offset | vram_addr) & XE_CACHELINE_MASK)) | |
1634 | return ERR_PTR(-EOPNOTSUPP); | |
9c44fd5f | 1635 | |
270172f6 MB |
1636 | xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER); |
1637 | ||
1638 | batch_size += pte_update_cmd_size(len); | |
9c44fd5f MB |
1639 | batch_size += EMIT_COPY_DW; |
1640 | ||
4c200754 | 1641 | bb = xe_bb_new(gt, batch_size, use_usm_batch); |
9c44fd5f MB |
1642 | if (IS_ERR(bb)) { |
1643 | err = PTR_ERR(bb); | |
1644 | return ERR_PTR(err); | |
1645 | } | |
1646 | ||
1647 | build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE, | |
270172f6 | 1648 | sram_addr, len + sram_offset); |
9c44fd5f MB |
1649 | |
1650 | if (dir == XE_MIGRATE_COPY_TO_VRAM) { | |
270172f6 | 1651 | src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset; |
9c44fd5f MB |
1652 | dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false); |
1653 | ||
1654 | } else { | |
1655 | src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false); | |
270172f6 | 1656 | dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset; |
9c44fd5f MB |
1657 | } |
1658 | ||
1659 | bb->cs[bb->len++] = MI_BATCH_BUFFER_END; | |
1660 | update_idx = bb->len; | |
1661 | ||
270172f6 | 1662 | emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch); |
9c44fd5f MB |
1663 | |
1664 | job = xe_bb_create_migration_job(m->q, bb, | |
4c200754 | 1665 | xe_migrate_batch_base(m, use_usm_batch), |
9c44fd5f MB |
1666 | update_idx); |
1667 | if (IS_ERR(job)) { | |
1668 | err = PTR_ERR(job); | |
1669 | goto err; | |
1670 | } | |
1671 | ||
1672 | xe_sched_job_add_migrate_flush(job, 0); | |
1673 | ||
1674 | mutex_lock(&m->job_mutex); | |
1675 | xe_sched_job_arm(job); | |
1676 | fence = dma_fence_get(&job->drm.s_fence->finished); | |
1677 | xe_sched_job_push(job); | |
1678 | ||
1679 | dma_fence_put(m->fence); | |
1680 | m->fence = dma_fence_get(fence); | |
1681 | mutex_unlock(&m->job_mutex); | |
1682 | ||
1683 | xe_bb_free(bb, fence); | |
1684 | ||
1685 | return fence; | |
1686 | ||
1687 | err: | |
1688 | xe_bb_free(bb, NULL); | |
1689 | ||
1690 | return ERR_PTR(err); | |
1691 | } | |
1692 | ||
1693 | /** | |
1694 | * xe_migrate_to_vram() - Migrate to VRAM | |
1695 | * @m: The migration context. | |
1696 | * @npages: Number of pages to migrate. | |
1697 | * @src_addr: Array of dma addresses (source of migrate) | |
1698 | * @dst_addr: Device physical address of VRAM (destination of migrate) | |
1699 | * | |
1700 | * Copy from an array of DMA addresses to a VRAM device physical address. |
1701 | * | |
1702 | * Return: dma fence for migrate to signal completion on success, ERR_PTR on |
1703 | * failure | |
1704 | */ | |
1705 | struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m, | |
1706 | unsigned long npages, | |
1707 | dma_addr_t *src_addr, | |
1708 | u64 dst_addr) | |
1709 | { | |
270172f6 | 1710 | return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr, |
9c44fd5f MB |
1711 | XE_MIGRATE_COPY_TO_VRAM); |
1712 | } | |
1713 | ||
1714 | /** | |
1715 | * xe_migrate_from_vram() - Migrate from VRAM | |
1716 | * @m: The migration context. | |
1717 | * @npages: Number of pages to migrate. | |
1718 | * @src_addr: Device physical address of VRAM (source of migrate) | |
1719 | * @dst_addr: Array of dma addresses (destination of migrate) | |
1720 | * | |
1721 | * Copy from a VRAM device physical address to an array of DMA addresses. |
1722 | * | |
1723 | * Return: dma fence for migrate to signal completion on success, ERR_PTR on |
1724 | * failure | |
1725 | */ | |
1726 | struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m, | |
1727 | unsigned long npages, | |
1728 | u64 src_addr, | |
1729 | dma_addr_t *dst_addr) | |
1730 | { | |
270172f6 | 1731 | return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr, |
9c44fd5f MB |
1732 | XE_MIGRATE_COPY_TO_SRAM); |
1733 | } | |
1734 | ||
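Editorial note: a hypothetical caller of the wrapper above, assuming "dma_addrs" already holds one DMA-mapped address per page and "vram_addr" is a device physical VRAM address; the helper name is not part of the driver.

	static int example_copy_pages_to_vram(struct xe_migrate *m,
					      dma_addr_t *dma_addrs,
					      unsigned long npages, u64 vram_addr)
	{
		struct dma_fence *fence;

		fence = xe_migrate_to_vram(m, npages, dma_addrs, vram_addr);
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		/* Wait for the blitter copy to land before reusing the pages */
		dma_fence_wait(fence, false);
		dma_fence_put(fence);
		return 0;
	}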
270172f6 MB |
1735 | static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr, |
1736 | int len, int write) | |
1737 | { | |
1738 | unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE); | |
1739 | ||
1740 | for (i = 0; i < npages; ++i) { | |
1741 | if (!dma_addr[i]) | |
1742 | break; | |
1743 | ||
1744 | dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE, | |
1745 | write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | |
1746 | } | |
1747 | kfree(dma_addr); | |
1748 | } | |
1749 | ||
1750 | static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe, | |
1751 | void *buf, int len, int write) | |
1752 | { | |
1753 | dma_addr_t *dma_addr; | |
1754 | unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE); | |
1755 | ||
1756 | dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL); | |
1757 | if (!dma_addr) | |
1758 | return ERR_PTR(-ENOMEM); | |
1759 | ||
1760 | for (i = 0; i < npages; ++i) { | |
1761 | dma_addr_t addr; | |
1762 | struct page *page; | |
1763 | ||
1764 | if (is_vmalloc_addr(buf)) | |
1765 | page = vmalloc_to_page(buf); | |
1766 | else | |
1767 | page = virt_to_page(buf); | |
1768 | ||
1769 | addr = dma_map_page(xe->drm.dev, | |
1770 | page, 0, PAGE_SIZE, | |
1771 | write ? DMA_TO_DEVICE : | |
1772 | DMA_FROM_DEVICE); | |
1773 | if (dma_mapping_error(xe->drm.dev, addr)) | |
1774 | goto err_fault; | |
1775 | ||
1776 | dma_addr[i] = addr; | |
1777 | buf += PAGE_SIZE; | |
1778 | } | |
1779 | ||
1780 | return dma_addr; | |
1781 | ||
1782 | err_fault: | |
1783 | xe_migrate_dma_unmap(xe, dma_addr, len, write); | |
1784 | return ERR_PTR(-EFAULT); | |
1785 | } | |
1786 | ||
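Editorial note: a sketch of how the map/unmap helpers above are intended to pair up around a copy of a kernel buffer that will be written to the device; the copy itself is elided and the helper name is hypothetical.

	static int example_map_buf_for_write(struct xe_device *xe, void *buf, int len)
	{
		dma_addr_t *dma_addr = xe_migrate_dma_map(xe, buf, len, 1);

		if (IS_ERR(dma_addr))
			return PTR_ERR(dma_addr);

		/* ... issue xe_migrate_vram() copies against dma_addr here ... */

		xe_migrate_dma_unmap(xe, dma_addr, len, 1);
		return 0;
	}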
1787 | /** | |
1788 | * xe_migrate_access_memory - Access memory of a BO via GPU | |
1789 | * | |
1790 | * @m: The migration context. | |
1791 | * @bo: buffer object | |
1792 | * @offset: access offset into buffer object | |
1793 | * @buf: pointer to caller memory to read into or write from | |
1794 | * @len: length of access | |
1795 | * @write: write access | |
1796 | * | |
1797 | * Access memory of a BO via the GPU, either reading into or writing from a |
1798 | * passed-in pointer. The pointer is DMA-mapped for GPU access, and GPU |
1799 | * commands are issued to read from or write to it. |
1800 | * | |
1801 | * Returns: | |
1802 | * 0 if successful, negative error code on failure. | |
1803 | */ | |
1804 | int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, | |
1805 | unsigned long offset, void *buf, int len, | |
1806 | int write) | |
1807 | { | |
1808 | struct xe_tile *tile = m->tile; | |
1809 | struct xe_device *xe = tile_to_xe(tile); | |
1810 | struct xe_res_cursor cursor; | |
1811 | struct dma_fence *fence = NULL; | |
1812 | dma_addr_t *dma_addr; | |
1813 | unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK; | |
1814 | int bytes_left = len, current_page = 0; | |
1815 | void *orig_buf = buf; | |
1816 | ||
1817 | xe_bo_assert_held(bo); | |
1818 | ||
1819 | /* Use a bounce buffer for small and unaligned accesses */ |
1820 | if (len & XE_CACHELINE_MASK || | |
1821 | ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) { | |
1822 | int buf_offset = 0; | |
1823 | ||
1824 | /* | |
1825 | * Less than ideal for large unaligned access but this should be | |
1826 | * fairly rare, can fixup if this becomes common. | |
1827 | */ | |
1828 | do { | |
1829 | u8 bounce[XE_CACHELINE_BYTES]; | |
1830 | void *ptr = (void *)bounce; | |
1831 | int err; | |
1832 | int copy_bytes = min_t(int, bytes_left, | |
1833 | XE_CACHELINE_BYTES - | |
1834 | (offset & XE_CACHELINE_MASK)); | |
1835 | int ptr_offset = offset & XE_CACHELINE_MASK; | |
1836 | ||
1837 | err = xe_migrate_access_memory(m, bo, | |
1838 | offset & | |
1839 | ~XE_CACHELINE_MASK, | |
1840 | (void *)ptr, | |
1841 | sizeof(bounce), 0); | |
1842 | if (err) | |
1843 | return err; | |
1844 | ||
1845 | if (write) { | |
1846 | memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes); | |
1847 | ||
1848 | err = xe_migrate_access_memory(m, bo, | |
1849 | offset & ~XE_CACHELINE_MASK, | |
1850 | (void *)ptr, | |
1851 | sizeof(bounce), write); |
1852 | if (err) | |
1853 | return err; | |
1854 | } else { | |
1855 | memcpy(buf + buf_offset, ptr + ptr_offset, | |
1856 | copy_bytes); | |
1857 | } | |
1858 | ||
1859 | bytes_left -= copy_bytes; | |
1860 | buf_offset += copy_bytes; | |
1861 | offset += copy_bytes; | |
1862 | } while (bytes_left); | |
1863 | ||
1864 | return 0; | |
1865 | } | |
1866 | ||
1867 | dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write); | |
1868 | if (IS_ERR(dma_addr)) | |
1869 | return PTR_ERR(dma_addr); | |
1870 | ||
1871 | xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor); | |
1872 | ||
1873 | do { | |
1874 | struct dma_fence *__fence; | |
1875 | u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) + | |
1876 | cursor.start; | |
1877 | int current_bytes; | |
1878 | ||
1879 | if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER) | |
1880 | current_bytes = min_t(int, bytes_left, | |
1881 | MAX_PREEMPTDISABLE_TRANSFER); | |
1882 | else | |
1883 | current_bytes = min_t(int, bytes_left, cursor.size); | |
1884 | ||
1885 | if (fence) | |
1886 | dma_fence_put(fence); | |
1887 | ||
1888 | __fence = xe_migrate_vram(m, current_bytes, | |
1889 | (unsigned long)buf & ~PAGE_MASK, | |
1890 | dma_addr + current_page, | |
1891 | vram_addr, write ? | |
1892 | XE_MIGRATE_COPY_TO_VRAM : | |
1893 | XE_MIGRATE_COPY_TO_SRAM); | |
1894 | if (IS_ERR(__fence)) { | |
1895 | if (fence) | |
1896 | dma_fence_wait(fence, false); | |
1897 | fence = __fence; | |
1898 | goto out_err; | |
1899 | } | |
1900 | fence = __fence; | |
1901 | ||
1902 | buf += current_bytes; | |
1903 | offset += current_bytes; | |
1904 | current_page = (int)(buf - orig_buf) / PAGE_SIZE; | |
1905 | bytes_left -= current_bytes; | |
1906 | if (bytes_left) | |
1907 | xe_res_next(&cursor, current_bytes); | |
1908 | } while (bytes_left); | |
1909 | ||
1910 | dma_fence_wait(fence, false); | |
1911 | dma_fence_put(fence); | |
1912 | ||
1913 | out_err: | |
1914 | xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write); | |
1915 | return IS_ERR(fence) ? PTR_ERR(fence) : 0; | |
1916 | } | |
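Editorial note: a hypothetical write-side caller of xe_migrate_access_memory(); the BO's reservation lock is assumed to be held, as xe_bo_assert_held() requires, and the helper name is not part of the driver.

	static int example_poke_bo(struct xe_migrate *m, struct xe_bo *bo,
				   unsigned long offset, void *data, int len)
	{
		/* write == 1: copy from "data" into the BO at "offset" */
		return xe_migrate_access_memory(m, bo, offset, data, len, 1);
	}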
6c55404d | 1917 | |
dd08ebf6 MB |
1918 | #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) |
1919 | #include "tests/xe_migrate.c" | |
1920 | #endif |