drivers/gpu/drm/i915/gt/intel_migrate.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5
6 #include "i915_drv.h"
7 #include "intel_context.h"
8 #include "intel_gpu_commands.h"
9 #include "intel_gt.h"
10 #include "intel_gtt.h"
11 #include "intel_migrate.h"
12 #include "intel_ring.h"
13 #include "gem/i915_gem_lmem.h"
14
15 struct insert_pte_data {
16         u64 offset;
17 };
18
19 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
20
21 #define GET_CCS_BYTES(i915, size)       (HAS_FLAT_CCS(i915) ? \
22                                          DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
23 static bool engine_supports_migration(struct intel_engine_cs *engine)
24 {
25         if (!engine)
26                 return false;
27
28         /*
29          * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
30          * the ability to write PTEs using inline data (MI_STORE_DATA_IMM)
31          * and of course the ability to do the block transfer (blits).
32          */
33         GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
34
35         return true;
36 }
37
38 static void xehpsdv_toggle_pdes(struct i915_address_space *vm,
39                                 struct i915_page_table *pt,
40                                 void *data)
41 {
42         struct insert_pte_data *d = data;
43
44         /*
45          * Insert a dummy PTE into every PT that will map to LMEM to ensure
46          * we have a correctly setup PDE structure for later use.
47          */
48         vm->insert_page(vm, 0, d->offset, I915_CACHE_NONE, PTE_LM);
49         GEM_BUG_ON(!pt->is_compact);
50         d->offset += SZ_2M;
51 }
52
53 static void xehpsdv_insert_pte(struct i915_address_space *vm,
54                                struct i915_page_table *pt,
55                                void *data)
56 {
57         struct insert_pte_data *d = data;
58
59         /*
60          * We are playing tricks here, since the actual pt, from the hw
61          * pov, is only 256 bytes with 32 entries, or 4096 bytes with 512
62          * entries, but we are still guaranteed that the physical
63          * alignment is 64K underneath for the pt, and we are careful
64          * not to access the space in the void.
65          */
66         vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE, PTE_LM);
67         d->offset += SZ_64K;
68 }
69
70 static void insert_pte(struct i915_address_space *vm,
71                        struct i915_page_table *pt,
72                        void *data)
73 {
74         struct insert_pte_data *d = data;
75
76         vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
77                         i915_gem_object_is_lmem(pt->base) ? PTE_LM : 0);
78         d->offset += PAGE_SIZE;
79 }
80
81 static struct i915_address_space *migrate_vm(struct intel_gt *gt)
82 {
83         struct i915_vm_pt_stash stash = {};
84         struct i915_ppgtt *vm;
85         int err;
86         int i;
87
88         /*
89          * We construct a very special VM for use by all migration contexts,
90          * it is kept pinned so that it can be used at any time. As we need
91          * to pre-allocate the page directories for the migration VM, this
92          * limits us to only using a small number of prepared vma.
93          *
94          * To be able to pipeline and reschedule migration operations while
95          * avoiding unnecessary contention on the vm itself, the PTE updates
96          * are inline with the blits. All the blits use the same fixed
97          * addresses, with the backing store redirection being updated on the
98          * fly. Only 2 implicit vma are used for all migration operations.
99          *
100          * We lay the ppGTT out as:
101          *
102          *      [0, CHUNK_SZ) -> first object
103          *      [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
104          *      [2 * CHUNK_SZ, 2 * CHUNK_SZ + (2 * CHUNK_SZ >> 9)) -> PTE
105          *
106          * By exposing the dma addresses of the page directories themselves
107          * within the ppGTT, we are then able to rewrite the PTE prior to use.
108          * But the PTE update and subsequent migration operation must be atomic,
109          * i.e. within the same non-preemptible window so that we do not switch
110          * to another migration context that overwrites the PTE.
111          *
112          * This changes quite a bit on platforms with HAS_64K_PAGES support,
113          * where we instead have three windows, each CHUNK_SZ in size. The
114          * first is reserved for mapping system-memory, and that just uses the
115          * 512 entry layout using 4K GTT pages. The other two windows just map
116          * lmem pages and must use the new compact 32 entry layout using 64K GTT
117          * pages, which ensures we can address any lmem object that the user
118          * throws at us. We then also use xehpsdv_toggle_pdes as a way of
119          * just toggling the PDE bit (GEN12_PDE_64K) for us, to enable the
120          * compact layout for each of these page-tables that fall within the
121          * [CHUNK_SZ, 3 * CHUNK_SZ) range.
122          *
123          * We lay the ppGTT out as:
124          *
125          * [0, CHUNK_SZ) -> first window/object, maps smem
126          * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src
127          * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst
128          *
129          * For the PTE window it's also quite different, since each PTE must
130          * point to some 64K page, one for each PT (since it's in lmem), and yet
131          * each is only <= 4096 bytes, but since the unused space within that PTE
132          * range is never touched, this should be fine.
133          *
134          * So basically each PT now needs 64K of virtual memory, instead of 4K,
135          * which looks like:
136          *
137          * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)) -> PTE
138          */
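        /*
         * A rough worked example of the sizing above (not part of the
         * original comment; it simply plugs CHUNK_SZ = SZ_8M into the
         * arithmetic below, with offsets relative to each engine's base):
         *
         * Without 64K pages: sz = 2 * CHUNK_SZ = 16M of VA for the two copy
         * windows, plus (16M >> 12) * sizeof(u64) = 32K for the PTE window,
         * i.e. the PTEs live in [16M, 16M + 32K).
         *
         * With 64K pages: sz = 3 * CHUNK_SZ = 24M for the three copy windows,
         * plus (24M / SZ_2M) * SZ_64K = 768K for the PTE window, i.e. the
         * PTEs live in [24M, 24M + 768K).
         */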
139
140         vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY);
141         if (IS_ERR(vm))
142                 return ERR_CAST(vm);
143
144         if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
145                 err = -ENODEV;
146                 goto err_vm;
147         }
148
149         if (HAS_64K_PAGES(gt->i915))
150                 stash.pt_sz = I915_GTT_PAGE_SIZE_64K;
151
152         /*
153          * Each engine instance is assigned its own chunk in the VM, so
154          * that we can run multiple instances concurrently
155          */
156         for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
157                 struct intel_engine_cs *engine;
158                 u64 base = (u64)i << 32;
159                 struct insert_pte_data d = {};
160                 struct i915_gem_ww_ctx ww;
161                 u64 sz;
162
163                 engine = gt->engine_class[COPY_ENGINE_CLASS][i];
164                 if (!engine_supports_migration(engine))
165                         continue;
166
167                 /*
168                  * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
169                  * 4x2 page directories for source/destination.
170                  */
171                 if (HAS_64K_PAGES(gt->i915))
172                         sz = 3 * CHUNK_SZ;
173                 else
174                         sz = 2 * CHUNK_SZ;
175                 d.offset = base + sz;
176
177                 /*
178                  * We need another page directory setup so that we can write
179                  * the 8x512 PTE in each chunk.
180                  */
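                /*
                 * Sanity check of the "8x512" above (a sketch, not an
                 * original comment): without 64K pages, 2 * CHUNK_SZ of
                 * mapped VA needs 2 * SZ_8M / SZ_4K = 4096 PTEs, i.e.
                 * 8 page tables of 512 entries, or 32K of PTE storage.
                 */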
181                 if (HAS_64K_PAGES(gt->i915))
182                         sz += (sz / SZ_2M) * SZ_64K;
183                 else
184                         sz += (sz >> 12) * sizeof(u64);
185
186                 err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
187                 if (err)
188                         goto err_vm;
189
190                 for_i915_gem_ww(&ww, err, true) {
191                         err = i915_vm_lock_objects(&vm->vm, &ww);
192                         if (err)
193                                 continue;
194                         err = i915_vm_map_pt_stash(&vm->vm, &stash);
195                         if (err)
196                                 continue;
197
198                         vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
199                 }
200                 i915_vm_free_pt_stash(&vm->vm, &stash);
201                 if (err)
202                         goto err_vm;
203
204                 /* Now allow the GPU to rewrite the PTE via its own ppGTT */
205                 if (HAS_64K_PAGES(gt->i915)) {
206                         vm->vm.foreach(&vm->vm, base, d.offset - base,
207                                        xehpsdv_insert_pte, &d);
208                         d.offset = base + CHUNK_SZ;
209                         vm->vm.foreach(&vm->vm,
210                                        d.offset,
211                                        2 * CHUNK_SZ,
212                                        xehpsdv_toggle_pdes, &d);
213                 } else {
214                         vm->vm.foreach(&vm->vm, base, d.offset - base,
215                                        insert_pte, &d);
216                 }
217         }
218
219         return &vm->vm;
220
221 err_vm:
222         i915_vm_put(&vm->vm);
223         return ERR_PTR(err);
224 }
225
226 static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
227 {
228         struct intel_engine_cs *engine;
229         int i;
230
231         for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
232                 engine = gt->engine_class[COPY_ENGINE_CLASS][i];
233                 if (engine_supports_migration(engine))
234                         return engine;
235         }
236
237         return NULL;
238 }
239
240 static struct intel_context *pinned_context(struct intel_gt *gt)
241 {
242         static struct lock_class_key key;
243         struct intel_engine_cs *engine;
244         struct i915_address_space *vm;
245         struct intel_context *ce;
246
247         engine = first_copy_engine(gt);
248         if (!engine)
249                 return ERR_PTR(-ENODEV);
250
251         vm = migrate_vm(gt);
252         if (IS_ERR(vm))
253                 return ERR_CAST(vm);
254
255         ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
256                                                 I915_GEM_HWS_MIGRATE,
257                                                 &key, "migrate");
258         i915_vm_put(vm);
259         return ce;
260 }
261
262 int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
263 {
264         struct intel_context *ce;
265
266         memset(m, 0, sizeof(*m));
267
268         ce = pinned_context(gt);
269         if (IS_ERR(ce))
270                 return PTR_ERR(ce);
271
272         m->context = ce;
273         return 0;
274 }
275
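/*
 * Pick an (approximately) uniformly distributed index in [0, max). A sketch
 * of the reasoning, not an original comment: mul_u32_u32() widens to a 64-bit
 * product, so (random * max) >> 32 scales the full 32-bit random range down
 * to [0, max) without needing a division or modulo.
 */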
276 static int random_index(unsigned int max)
277 {
278         return upper_32_bits(mul_u32_u32(get_random_u32(), max));
279 }
280
281 static struct intel_context *__migrate_engines(struct intel_gt *gt)
282 {
283         struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
284         struct intel_engine_cs *engine;
285         unsigned int count, i;
286
287         count = 0;
288         for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
289                 engine = gt->engine_class[COPY_ENGINE_CLASS][i];
290                 if (engine_supports_migration(engine))
291                         engines[count++] = engine;
292         }
293
294         return intel_context_create(engines[random_index(count)]);
295 }
296
297 struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
298 {
299         struct intel_context *ce;
300
301         /*
302          * We randomly distribute contexts across the engines upon construction,
303          * as they all share the same pinned vm, and so in order to allow
304          * multiple blits to run in parallel, we must construct each blit
305          * to use a different range of the vm for its GTT. This has to be
306          * known at construction, so we cannot use the late greedy load
307          * balancing of the virtual-engine.
308          */
309         ce = __migrate_engines(m->context->engine->gt);
310         if (IS_ERR(ce))
311                 return ce;
312
313         ce->ring = NULL;
314         ce->ring_size = SZ_256K;
315
316         i915_vm_put(ce->vm);
317         ce->vm = i915_vm_get(m->context->vm);
318
319         return ce;
320 }
321
322 static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
323 {
324         dma_addr_t addr = sg_dma_address(sg);
325
326         return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
327 }
328
329 static int emit_no_arbitration(struct i915_request *rq)
330 {
331         u32 *cs;
332
333         cs = intel_ring_begin(rq, 2);
334         if (IS_ERR(cs))
335                 return PTR_ERR(cs);
336
337         /* Explicitly disable preemption for this request. */
338         *cs++ = MI_ARB_ON_OFF;
339         *cs++ = MI_NOOP;
340         intel_ring_advance(rq, cs);
341
342         return 0;
343 }
344
345 static int max_pte_pkt_size(struct i915_request *rq, int pkt)
346 {
347         struct intel_ring *ring = rq->ring;
348
349         pkt = min_t(int, pkt, (ring->space - rq->reserved_space) / sizeof(u32) + 5);
350         pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
351
352         return pkt;
353 }
354
355 static int emit_pte(struct i915_request *rq,
356                     struct sgt_dma *it,
357                     enum i915_cache_level cache_level,
358                     bool is_lmem,
359                     u64 offset,
360                     int length)
361 {
362         bool has_64K_pages = HAS_64K_PAGES(rq->engine->i915);
363         const u64 encode = rq->context->vm->pte_encode(0, cache_level,
364                                                        is_lmem ? PTE_LM : 0);
365         struct intel_ring *ring = rq->ring;
366         int pkt, dword_length;
367         u32 total = 0;
368         u32 page_size;
369         u32 *hdr, *cs;
370
371         GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);
372
373         page_size = I915_GTT_PAGE_SIZE;
374         dword_length = 0x400;
375
376         /* Compute the page directory offset for the target address range */
377         if (has_64K_pages) {
378                 GEM_BUG_ON(!IS_ALIGNED(offset, SZ_2M));
379
380                 offset /= SZ_2M;
381                 offset *= SZ_64K;
382                 offset += 3 * CHUNK_SZ;
383
384                 if (is_lmem) {
385                         page_size = I915_GTT_PAGE_SIZE_64K;
386                         dword_length = 0x40;
387                 }
388         } else {
389                 offset >>= 12;
390                 offset *= sizeof(u64);
391                 offset += 2 * CHUNK_SZ;
392         }
393
394         offset += (u64)rq->engine->instance << 32;
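        /*
         * Worked example (a sketch, not an original comment): on the
         * non-64K path with offset = CHUNK_SZ (the destination window),
         * the transform above yields (SZ_8M >> 12) * sizeof(u64) +
         * 2 * CHUNK_SZ = 16M + 16K, i.e. the PTEs for the second window
         * start 16K into the PTE window laid out by migrate_vm(), on top
         * of the per-engine base added just above.
         */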
395
396         cs = intel_ring_begin(rq, 6);
397         if (IS_ERR(cs))
398                 return PTR_ERR(cs);
399
400         /* Pack as many PTE updates as possible into a single MI command */
401         pkt = max_pte_pkt_size(rq, dword_length);
402
403         hdr = cs;
404         *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
405         *cs++ = lower_32_bits(offset);
406         *cs++ = upper_32_bits(offset);
407
408         do {
409                 if (cs - hdr >= pkt) {
410                         int dword_rem;
411
412                         *hdr += cs - hdr - 2;
413                         *cs++ = MI_NOOP;
414
415                         ring->emit = (void *)cs - ring->vaddr;
416                         intel_ring_advance(rq, cs);
417                         intel_ring_update_space(ring);
418
419                         cs = intel_ring_begin(rq, 6);
420                         if (IS_ERR(cs))
421                                 return PTR_ERR(cs);
422
423                         dword_rem = dword_length;
424                         if (has_64K_pages) {
425                                 if (IS_ALIGNED(total, SZ_2M)) {
426                                         offset = round_up(offset, SZ_64K);
427                                 } else {
428                                         dword_rem = SZ_2M - (total & (SZ_2M - 1));
429                                         dword_rem /= page_size;
430                                         dword_rem *= 2;
431                                 }
432                         }
433
434                         pkt = max_pte_pkt_size(rq, dword_rem);
435
436                         hdr = cs;
437                         *cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
438                         *cs++ = lower_32_bits(offset);
439                         *cs++ = upper_32_bits(offset);
440                 }
441
442                 GEM_BUG_ON(!IS_ALIGNED(it->dma, page_size));
443
444                 *cs++ = lower_32_bits(encode | it->dma);
445                 *cs++ = upper_32_bits(encode | it->dma);
446
447                 offset += 8;
448                 total += page_size;
449
450                 it->dma += page_size;
451                 if (it->dma >= it->max) {
452                         it->sg = __sg_next(it->sg);
453                         if (!it->sg || sg_dma_len(it->sg) == 0)
454                                 break;
455
456                         it->dma = sg_dma_address(it->sg);
457                         it->max = it->dma + sg_dma_len(it->sg);
458                 }
459         } while (total < length);
460
461         *hdr += cs - hdr - 2;
462         *cs++ = MI_NOOP;
463
464         ring->emit = (void *)cs - ring->vaddr;
465         intel_ring_advance(rq, cs);
466         intel_ring_update_space(ring);
467
468         return total;
469 }
470
471 static bool wa_1209644611_applies(int ver, u32 size)
472 {
473         u32 height = size >> PAGE_SHIFT;
474
475         if (ver != 11)
476                 return false;
477
478         return height % 4 == 3 && height <= 8;
479 }
480
481 /**
482  * DOC: Flat-CCS - Memory compression for Local memory
483  *
484  * On Xe-HP and later devices, we use dedicated compression control state (CCS)
485  * stored in local memory for each surface, to support the 3D and media
486  * compression formats.
487  *
488  * The memory required for the CCS of the entire local memory is 1/256 of the
489  * local memory size. So before the kernel boots, the required memory is reserved
490  * for the CCS data and a secure register is programmed with the CCS base
491  * address.
492  *
493  * Flat-CCS data needs to be cleared when an lmem object is allocated.
494  * CCS data can be copied in and out of the CCS region through
495  * XY_CTRL_SURF_COPY_BLT. The CPU can't access the CCS data directly.
496  *
497  * I915 supports Flat-CCS only on lmem-only objects. When an object has smem in
498  * its preference list, then on memory pressure i915 needs to migrate the lmem
499  * content into smem. If the lmem object is Flat-CCS compressed by userspace,
500  * then i915 would need to decompress it, but i915 lacks the required information
501  * for such decompression. Hence i915 supports Flat-CCS only on lmem-only objects.
502  *
503  * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can
504  * be temporarily evicted to smem, along with the auxiliary CCS state, where
505  * it can be potentially swapped-out at a later point, if required.
506  * If userspace later touches the evicted pages, then we always move
507  * the backing memory back to lmem, which includes restoring the saved CCS state,
508  * and potentially performing any required swap-in.
509  *
510  * For the migration of lmem objects that have smem in their placement list,
511  * such as {lmem, smem}, the objects are treated as non Flat-CCS capable objects.
512  */
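/*
 * A worked example of the 1/256 ratio above (a sketch; it assumes
 * NUM_BYTES_PER_CCS_BYTE reflects that ratio): a 64 MiB lmem buffer carries
 * 64 MiB / 256 = 256 KiB of CCS data, and each CHUNK_SZ (8 MiB) pass of the
 * migration code below moves GET_CCS_BYTES() = 32 KiB of CCS alongside it.
 */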
513
514 static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
515 {
516         *cmd++ = MI_FLUSH_DW | flags;
517         *cmd++ = 0;
518         *cmd++ = 0;
519
520         return cmd;
521 }
522
523 static int emit_copy_ccs(struct i915_request *rq,
524                          u32 dst_offset, u8 dst_access,
525                          u32 src_offset, u8 src_access, int size)
526 {
527         struct drm_i915_private *i915 = rq->engine->i915;
528         int mocs = rq->engine->gt->mocs.uc_index << 1;
529         u32 num_ccs_blks;
530         u32 *cs;
531
532         cs = intel_ring_begin(rq, 12);
533         if (IS_ERR(cs))
534                 return PTR_ERR(cs);
535
536         num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
537                                     NUM_CCS_BYTES_PER_BLOCK);
538         GEM_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER);
539         cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
540
541         /*
542          * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS
543          * data in and out of the CCS region.
544          *
545          * We can copy at most 1024 blocks of 256 bytes using one
546          * XY_CTRL_SURF_COPY_BLT instruction.
547          *
548          * In case we need to copy more than 1024 blocks, we need to add
549          * another instruction to the same batch buffer.
550          *
551          * 1024 blocks of 256 bytes of CCS represent a total of 256KB of CCS.
552          *
553          * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
554          */
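        /*
         * For reference (a sketch, not an original comment): with at most
         * CHUNK_SZ = SZ_8M per pass, GET_CCS_BYTES() is at most 32K here,
         * i.e. 32K / 256 = 128 blocks, well below the 1024-block limit
         * enforced by the GEM_BUG_ON() above.
         */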
555         *cs++ = XY_CTRL_SURF_COPY_BLT |
556                 src_access << SRC_ACCESS_TYPE_SHIFT |
557                 dst_access << DST_ACCESS_TYPE_SHIFT |
558                 ((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
559         *cs++ = src_offset;
560         *cs++ = rq->engine->instance |
561                 FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
562         *cs++ = dst_offset;
563         *cs++ = rq->engine->instance |
564                 FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
565
566         cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
567         *cs++ = MI_NOOP;
568
569         intel_ring_advance(rq, cs);
570
571         return 0;
572 }
573
574 static int emit_copy(struct i915_request *rq,
575                      u32 dst_offset, u32 src_offset, int size)
576 {
577         const int ver = GRAPHICS_VER(rq->engine->i915);
578         u32 instance = rq->engine->instance;
579         u32 *cs;
580
581         cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
582         if (IS_ERR(cs))
583                 return PTR_ERR(cs);
584
585         if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
586                 *cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
587                 *cs++ = BLT_DEPTH_32 | PAGE_SIZE;
588                 *cs++ = 0;
589                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
590                 *cs++ = dst_offset;
591                 *cs++ = instance;
592                 *cs++ = 0;
593                 *cs++ = PAGE_SIZE;
594                 *cs++ = src_offset;
595                 *cs++ = instance;
596         } else if (ver >= 8) {
597                 *cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
598                 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
599                 *cs++ = 0;
600                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
601                 *cs++ = dst_offset;
602                 *cs++ = instance;
603                 *cs++ = 0;
604                 *cs++ = PAGE_SIZE;
605                 *cs++ = src_offset;
606                 *cs++ = instance;
607         } else {
608                 GEM_BUG_ON(instance);
609                 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
610                 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
611                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
612                 *cs++ = dst_offset;
613                 *cs++ = PAGE_SIZE;
614                 *cs++ = src_offset;
615         }
616
617         intel_ring_advance(rq, cs);
618         return 0;
619 }
620
621 static u64 scatter_list_length(struct scatterlist *sg)
622 {
623         u64 len = 0;
624
625         while (sg && sg_dma_len(sg)) {
626                 len += sg_dma_len(sg);
627                 sg = sg_next(sg);
628         }
629
630         return len;
631 }
632
633 static int
634 calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
635                    u64 bytes_to_cpy, u64 ccs_bytes_to_cpy)
636 {
637         if (ccs_bytes_to_cpy && !src_is_lmem)
638                 /*
639                  * When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
640                  * will be taken for the blit. On Flat-CCS capable
641                  * platforms the smem object will have more pages than
642                  * required for main memory, hence limit it to the
643                  * required size for main memory.
644                  */
645                 return min_t(u64, bytes_to_cpy, CHUNK_SZ);
646         else
647                 return CHUNK_SZ;
648 }
649
650 static void get_ccs_sg_sgt(struct sgt_dma *it, u64 bytes_to_cpy)
651 {
652         u64 len;
653
654         do {
655                 GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
656                 len = it->max - it->dma;
657                 if (len > bytes_to_cpy) {
658                         it->dma += bytes_to_cpy;
659                         break;
660                 }
661
662                 bytes_to_cpy -= len;
663
664                 it->sg = __sg_next(it->sg);
665                 it->dma = sg_dma_address(it->sg);
666                 it->max = it->dma + sg_dma_len(it->sg);
667         } while (bytes_to_cpy);
668 }
669
670 int
671 intel_context_migrate_copy(struct intel_context *ce,
672                            const struct i915_deps *deps,
673                            struct scatterlist *src,
674                            enum i915_cache_level src_cache_level,
675                            bool src_is_lmem,
676                            struct scatterlist *dst,
677                            enum i915_cache_level dst_cache_level,
678                            bool dst_is_lmem,
679                            struct i915_request **out)
680 {
681         struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
682         struct drm_i915_private *i915 = ce->engine->i915;
683         u64 ccs_bytes_to_cpy = 0, bytes_to_cpy;
684         enum i915_cache_level ccs_cache_level;
685         u32 src_offset, dst_offset;
686         u8 src_access, dst_access;
687         struct i915_request *rq;
688         u64 src_sz, dst_sz;
689         bool ccs_is_src, overwrite_ccs;
690         int err;
691
692         GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
693         GEM_BUG_ON(IS_DGFX(ce->engine->i915) && (!src_is_lmem && !dst_is_lmem));
694         *out = NULL;
695
696         GEM_BUG_ON(ce->ring->size < SZ_64K);
697
698         src_sz = scatter_list_length(src);
699         bytes_to_cpy = src_sz;
700
701         if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
702                 src_access = !src_is_lmem && dst_is_lmem;
703                 dst_access = !src_access;
704
705                 dst_sz = scatter_list_length(dst);
706                 if (src_is_lmem) {
707                         it_ccs = it_dst;
708                         ccs_cache_level = dst_cache_level;
709                         ccs_is_src = false;
710                 } else if (dst_is_lmem) {
711                         bytes_to_cpy = dst_sz;
712                         it_ccs = it_src;
713                         ccs_cache_level = src_cache_level;
714                         ccs_is_src = true;
715                 }
716
717                 /*
718                  * When eviction of the CCS data is needed, smem will have
719                  * the extra pages to hold that CCS data.
720                  *
721                  * TODO: Want to move the size mismatch check to a WARN_ON,
722                  * but we still see some smem->lmem requests with matching
723                  * sizes. Needs to be fixed.
724                  */
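                /*
                 * Illustrative sketch (not an original comment): a 64 MiB
                 * Flat-CCS lmem object evicted to smem is backed by
                 * 64 MiB + 256 KiB of smem pages, so src_sz != dst_sz below,
                 * and get_ccs_sg_sgt() then advances it_ccs past the
                 * main-memory bytes to the trailing CCS pages.
                 */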
725                 ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, bytes_to_cpy) : 0;
726                 if (ccs_bytes_to_cpy)
727                         get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
728         }
729
730         overwrite_ccs = HAS_FLAT_CCS(i915) && !ccs_bytes_to_cpy && dst_is_lmem;
731
732         src_offset = 0;
733         dst_offset = CHUNK_SZ;
734         if (HAS_64K_PAGES(ce->engine->i915)) {
735                 src_offset = 0;
736                 dst_offset = 0;
737                 if (src_is_lmem)
738                         src_offset = CHUNK_SZ;
739                 if (dst_is_lmem)
740                         dst_offset = 2 * CHUNK_SZ;
741         }
742
743         do {
744                 int len;
745
746                 rq = i915_request_create(ce);
747                 if (IS_ERR(rq)) {
748                         err = PTR_ERR(rq);
749                         goto out_ce;
750                 }
751
752                 if (deps) {
753                         err = i915_request_await_deps(rq, deps);
754                         if (err)
755                                 goto out_rq;
756
757                         if (rq->engine->emit_init_breadcrumb) {
758                                 err = rq->engine->emit_init_breadcrumb(rq);
759                                 if (err)
760                                         goto out_rq;
761                         }
762
763                         deps = NULL;
764                 }
765
766                 /* The PTE updates + copy must not be interrupted. */
767                 err = emit_no_arbitration(rq);
768                 if (err)
769                         goto out_rq;
770
771                 src_sz = calculate_chunk_sz(i915, src_is_lmem,
772                                             bytes_to_cpy, ccs_bytes_to_cpy);
773
774                 len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
775                                src_offset, src_sz);
776                 if (!len) {
777                         err = -EINVAL;
778                         goto out_rq;
779                 }
780                 if (len < 0) {
781                         err = len;
782                         goto out_rq;
783                 }
784
785                 err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
786                                dst_offset, len);
787                 if (err < 0)
788                         goto out_rq;
789                 if (err < len) {
790                         err = -EINVAL;
791                         goto out_rq;
792                 }
793
794                 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
795                 if (err)
796                         goto out_rq;
797
798                 err = emit_copy(rq, dst_offset, src_offset, len);
799                 if (err)
800                         goto out_rq;
801
802                 bytes_to_cpy -= len;
803
804                 if (ccs_bytes_to_cpy) {
805                         int ccs_sz;
806
807                         err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
808                         if (err)
809                                 goto out_rq;
810
811                         ccs_sz = GET_CCS_BYTES(i915, len);
812                         err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
813                                        ccs_is_src ? src_offset : dst_offset,
814                                        ccs_sz);
815                         if (err < 0)
816                                 goto out_rq;
817                         if (err < ccs_sz) {
818                                 err = -EINVAL;
819                                 goto out_rq;
820                         }
821
822                         err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
823                         if (err)
824                                 goto out_rq;
825
826                         err = emit_copy_ccs(rq, dst_offset, dst_access,
827                                             src_offset, src_access, len);
828                         if (err)
829                                 goto out_rq;
830
831                         err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
832                         if (err)
833                                 goto out_rq;
834                         ccs_bytes_to_cpy -= ccs_sz;
835                 } else if (overwrite_ccs) {
836                         err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
837                         if (err)
838                                 goto out_rq;
839
840                         if (src_is_lmem) {
841                                 /*
842                                  * If the src is already in lmem, then we must
843                                  * be doing an lmem -> lmem transfer, and so
844                                  * should be safe to directly copy the CCS
845                                  * state. In this case we have either
846                                  * initialised the CCS aux state when first
847                                  * clearing the pages (since it is already
848                                  * allocated in lmem), or the user has
849                                  * potentially populated it, in which case we
850                                  * need to copy the CCS state as-is.
851                                  */
852                                 err = emit_copy_ccs(rq,
853                                                     dst_offset, INDIRECT_ACCESS,
854                                                     src_offset, INDIRECT_ACCESS,
855                                                     len);
856                         } else {
857                                 /*
858                                  * While we can't always restore/manage the CCS
859                                  * state, we still need to ensure we don't leak
860                                  * the CCS state from the previous user, so make
861                                  * sure we overwrite it with something.
862                                  */
863                                 err = emit_copy_ccs(rq,
864                                                     dst_offset, INDIRECT_ACCESS,
865                                                     dst_offset, DIRECT_ACCESS,
866                                                     len);
867                         }
868
869                         if (err)
870                                 goto out_rq;
871
872                         err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
873                         if (err)
874                                 goto out_rq;
875                 }
876
877                 /* Arbitration is re-enabled between requests. */
878 out_rq:
879                 if (*out)
880                         i915_request_put(*out);
881                 *out = i915_request_get(rq);
882                 i915_request_add(rq);
883
884                 if (err)
885                         break;
886
887                 if (!bytes_to_cpy && !ccs_bytes_to_cpy) {
888                         if (src_is_lmem)
889                                 WARN_ON(it_src.sg && sg_dma_len(it_src.sg));
890                         else
891                                 WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg));
892                         break;
893                 }
894
895                 if (WARN_ON(!it_src.sg || !sg_dma_len(it_src.sg) ||
896                             !it_dst.sg || !sg_dma_len(it_dst.sg) ||
897                             (ccs_bytes_to_cpy && (!it_ccs.sg ||
898                                                   !sg_dma_len(it_ccs.sg))))) {
899                         err = -EINVAL;
900                         break;
901                 }
902
903                 cond_resched();
904         } while (1);
905
906 out_ce:
907         return err;
908 }
909
910 static int emit_clear(struct i915_request *rq, u32 offset, int size,
911                       u32 value, bool is_lmem)
912 {
913         struct drm_i915_private *i915 = rq->engine->i915;
914         int mocs = rq->engine->gt->mocs.uc_index << 1;
915         const int ver = GRAPHICS_VER(i915);
916         int ring_sz;
917         u32 *cs;
918
919         GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
920
921         if (HAS_FLAT_CCS(i915) && ver >= 12)
922                 ring_sz = XY_FAST_COLOR_BLT_DW;
923         else if (ver >= 8)
924                 ring_sz = 8;
925         else
926                 ring_sz = 6;
927
928         cs = intel_ring_begin(rq, ring_sz);
929         if (IS_ERR(cs))
930                 return PTR_ERR(cs);
931
932         if (HAS_FLAT_CCS(i915) && ver >= 12) {
933                 *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
934                         (XY_FAST_COLOR_BLT_DW - 2);
935                 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
936                         (PAGE_SIZE - 1);
937                 *cs++ = 0;
938                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
939                 *cs++ = offset;
940                 *cs++ = rq->engine->instance;
941                 *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
942                 /* BG7 */
943                 *cs++ = value;
944                 *cs++ = 0;
945                 *cs++ = 0;
946                 *cs++ = 0;
947                 /* BG11 */
948                 *cs++ = 0;
949                 *cs++ = 0;
950                 /* BG13 */
951                 *cs++ = 0;
952                 *cs++ = 0;
953                 *cs++ = 0;
954         } else if (ver >= 8) {
955                 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
956                 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
957                 *cs++ = 0;
958                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
959                 *cs++ = offset;
960                 *cs++ = rq->engine->instance;
961                 *cs++ = value;
962                 *cs++ = MI_NOOP;
963         } else {
964                 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
965                 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
966                 *cs++ = 0;
967                 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
968                 *cs++ = offset;
969                 *cs++ = value;
970         }
971
972         intel_ring_advance(rq, cs);
973         return 0;
974 }
975
976 int
977 intel_context_migrate_clear(struct intel_context *ce,
978                             const struct i915_deps *deps,
979                             struct scatterlist *sg,
980                             enum i915_cache_level cache_level,
981                             bool is_lmem,
982                             u32 value,
983                             struct i915_request **out)
984 {
985         struct drm_i915_private *i915 = ce->engine->i915;
986         struct sgt_dma it = sg_sgt(sg);
987         struct i915_request *rq;
988         u32 offset;
989         int err;
990
991         GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
992         *out = NULL;
993
994         GEM_BUG_ON(ce->ring->size < SZ_64K);
995
996         offset = 0;
997         if (HAS_64K_PAGES(i915) && is_lmem)
998                 offset = CHUNK_SZ;
999
1000         do {
1001                 int len;
1002
1003                 rq = i915_request_create(ce);
1004                 if (IS_ERR(rq)) {
1005                         err = PTR_ERR(rq);
1006                         goto out_ce;
1007                 }
1008
1009                 if (deps) {
1010                         err = i915_request_await_deps(rq, deps);
1011                         if (err)
1012                                 goto out_rq;
1013
1014                         if (rq->engine->emit_init_breadcrumb) {
1015                                 err = rq->engine->emit_init_breadcrumb(rq);
1016                                 if (err)
1017                                         goto out_rq;
1018                         }
1019
1020                         deps = NULL;
1021                 }
1022
1023                 /* The PTE updates + clear must not be interrupted. */
1024                 err = emit_no_arbitration(rq);
1025                 if (err)
1026                         goto out_rq;
1027
1028                 len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
1029                 if (len <= 0) {
1030                         err = len;
1031                         goto out_rq;
1032                 }
1033
1034                 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
1035                 if (err)
1036                         goto out_rq;
1037
1038                 err = emit_clear(rq, offset, len, value, is_lmem);
1039                 if (err)
1040                         goto out_rq;
1041
1042                 if (HAS_FLAT_CCS(i915) && is_lmem && !value) {
1043                         /*
1044                          * Copy the content of memory into the
1045                          * corresponding CCS surface.
1046                          */
1047                         err = emit_copy_ccs(rq, offset, INDIRECT_ACCESS, offset,
1048                                             DIRECT_ACCESS, len);
1049                         if (err)
1050                                 goto out_rq;
1051                 }
1052
1053                 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
1054
1055                 /* Arbitration is re-enabled between requests. */
1056 out_rq:
1057                 if (*out)
1058                         i915_request_put(*out);
1059                 *out = i915_request_get(rq);
1060                 i915_request_add(rq);
1061                 if (err || !it.sg || !sg_dma_len(it.sg))
1062                         break;
1063
1064                 cond_resched();
1065         } while (1);
1066
1067 out_ce:
1068         return err;
1069 }
1070
1071 int intel_migrate_copy(struct intel_migrate *m,
1072                        struct i915_gem_ww_ctx *ww,
1073                        const struct i915_deps *deps,
1074                        struct scatterlist *src,
1075                        enum i915_cache_level src_cache_level,
1076                        bool src_is_lmem,
1077                        struct scatterlist *dst,
1078                        enum i915_cache_level dst_cache_level,
1079                        bool dst_is_lmem,
1080                        struct i915_request **out)
1081 {
1082         struct intel_context *ce;
1083         int err;
1084
1085         *out = NULL;
1086         if (!m->context)
1087                 return -ENODEV;
1088
1089         ce = intel_migrate_create_context(m);
1090         if (IS_ERR(ce))
1091                 ce = intel_context_get(m->context);
1092         GEM_BUG_ON(IS_ERR(ce));
1093
1094         err = intel_context_pin_ww(ce, ww);
1095         if (err)
1096                 goto out;
1097
1098         err = intel_context_migrate_copy(ce, deps,
1099                                          src, src_cache_level, src_is_lmem,
1100                                          dst, dst_cache_level, dst_is_lmem,
1101                                          out);
1102
1103         intel_context_unpin(ce);
1104 out:
1105         intel_context_put(ce);
1106         return err;
1107 }
1108
1109 int
1110 intel_migrate_clear(struct intel_migrate *m,
1111                     struct i915_gem_ww_ctx *ww,
1112                     const struct i915_deps *deps,
1113                     struct scatterlist *sg,
1114                     enum i915_cache_level cache_level,
1115                     bool is_lmem,
1116                     u32 value,
1117                     struct i915_request **out)
1118 {
1119         struct intel_context *ce;
1120         int err;
1121
1122         *out = NULL;
1123         if (!m->context)
1124                 return -ENODEV;
1125
1126         ce = intel_migrate_create_context(m);
1127         if (IS_ERR(ce))
1128                 ce = intel_context_get(m->context);
1129         GEM_BUG_ON(IS_ERR(ce));
1130
1131         err = intel_context_pin_ww(ce, ww);
1132         if (err)
1133                 goto out;
1134
1135         err = intel_context_migrate_clear(ce, deps, sg, cache_level,
1136                                           is_lmem, value, out);
1137
1138         intel_context_unpin(ce);
1139 out:
1140         intel_context_put(ce);
1141         return err;
1142 }
1143
1144 void intel_migrate_fini(struct intel_migrate *m)
1145 {
1146         struct intel_context *ce;
1147
1148         ce = fetch_and_zero(&m->context);
1149         if (!ce)
1150                 return;
1151
1152         intel_engine_destroy_pinned_context(ce);
1153 }
1154
1155 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1156 #include "selftest_migrate.c"
1157 #endif