Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6-block.git] / drivers / gpu / drm / i915 / i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drm_vma_manager.h>
29 #include <drm/drm_pci.h>
30 #include <drm/i915_drm.h>
31 #include <linux/dma-fence-array.h>
32 #include <linux/kthread.h>
33 #include <linux/reservation.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/slab.h>
36 #include <linux/stop_machine.h>
37 #include <linux/swap.h>
38 #include <linux/pci.h>
39 #include <linux/dma-buf.h>
40 #include <linux/mman.h>
41
42 #include "i915_drv.h"
43 #include "i915_gem_clflush.h"
44 #include "i915_gemfs.h"
45 #include "i915_reset.h"
46 #include "i915_trace.h"
47 #include "i915_vgpu.h"
48
49 #include "intel_drv.h"
50 #include "intel_frontbuffer.h"
51 #include "intel_mocs.h"
52 #include "intel_workarounds.h"
53
54 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
55
56 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
57 {
58         if (obj->cache_dirty)
59                 return false;
60
61         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
62                 return true;
63
64         return obj->pin_global; /* currently in use by HW, keep flushed */
65 }
66
67 static int
68 insert_mappable_node(struct i915_ggtt *ggtt,
69                      struct drm_mm_node *node, u32 size)
70 {
71         memset(node, 0, sizeof(*node));
72         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
73                                            size, 0, I915_COLOR_UNEVICTABLE,
74                                            0, ggtt->mappable_end,
75                                            DRM_MM_INSERT_LOW);
76 }
77
/* Counterpart of insert_mappable_node(): remove @node from its drm_mm. */
static void remove_mappable_node(struct drm_mm_node *node)
{
	drm_mm_remove_node(node);
}
83
84 /* some bookkeeping */
85 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
86                                   u64 size)
87 {
88         spin_lock(&dev_priv->mm.object_stat_lock);
89         dev_priv->mm.object_count++;
90         dev_priv->mm.object_memory += size;
91         spin_unlock(&dev_priv->mm.object_stat_lock);
92 }
93
94 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
95                                      u64 size)
96 {
97         spin_lock(&dev_priv->mm.object_stat_lock);
98         dev_priv->mm.object_count--;
99         dev_priv->mm.object_memory -= size;
100         spin_unlock(&dev_priv->mm.object_stat_lock);
101 }
102
103 static int
104 i915_gem_wait_for_error(struct i915_gpu_error *error)
105 {
106         int ret;
107
108         might_sleep();
109
110         /*
111          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
112          * userspace. If it takes that long something really bad is going on and
113          * we should simply try to bail out and fail as gracefully as possible.
114          */
115         ret = wait_event_interruptible_timeout(error->reset_queue,
116                                                !i915_reset_backoff(error),
117                                                I915_RESET_TIMEOUT);
118         if (ret == 0) {
119                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
120                 return -EIO;
121         } else if (ret < 0) {
122                 return ret;
123         } else {
124                 return 0;
125         }
126 }
127
128 int i915_mutex_lock_interruptible(struct drm_device *dev)
129 {
130         struct drm_i915_private *dev_priv = to_i915(dev);
131         int ret;
132
133         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
134         if (ret)
135                 return ret;
136
137         ret = mutex_lock_interruptible(&dev->struct_mutex);
138         if (ret)
139                 return ret;
140
141         return 0;
142 }
143
144 static u32 __i915_gem_park(struct drm_i915_private *i915)
145 {
146         intel_wakeref_t wakeref;
147
148         GEM_TRACE("\n");
149
150         lockdep_assert_held(&i915->drm.struct_mutex);
151         GEM_BUG_ON(i915->gt.active_requests);
152         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
153
154         if (!i915->gt.awake)
155                 return I915_EPOCH_INVALID;
156
157         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
158
159         /*
160          * Be paranoid and flush a concurrent interrupt to make sure
161          * we don't reactivate any irq tasklets after parking.
162          *
163          * FIXME: Note that even though we have waited for execlists to be idle,
164          * there may still be an in-flight interrupt even though the CSB
165          * is now empty. synchronize_irq() makes sure that a residual interrupt
166          * is completed before we continue, but it doesn't prevent the HW from
167          * raising a spurious interrupt later. To complete the shield we should
168          * coordinate disabling the CS irq with flushing the interrupts.
169          */
170         synchronize_irq(i915->drm.irq);
171
172         intel_engines_park(i915);
173         i915_timelines_park(i915);
174
175         i915_pmu_gt_parked(i915);
176         i915_vma_parked(i915);
177
178         wakeref = fetch_and_zero(&i915->gt.awake);
179         GEM_BUG_ON(!wakeref);
180
181         if (INTEL_GEN(i915) >= 6)
182                 gen6_rps_idle(i915);
183
184         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);
185
186         return i915->gt.epoch;
187 }
188
189 void i915_gem_park(struct drm_i915_private *i915)
190 {
191         GEM_TRACE("\n");
192
193         lockdep_assert_held(&i915->drm.struct_mutex);
194         GEM_BUG_ON(i915->gt.active_requests);
195
196         if (!i915->gt.awake)
197                 return;
198
199         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
200         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
201 }
202
203 void i915_gem_unpark(struct drm_i915_private *i915)
204 {
205         GEM_TRACE("\n");
206
207         lockdep_assert_held(&i915->drm.struct_mutex);
208         GEM_BUG_ON(!i915->gt.active_requests);
209         assert_rpm_wakelock_held(i915);
210
211         if (i915->gt.awake)
212                 return;
213
214         /*
215          * It seems that the DMC likes to transition between the DC states a lot
216          * when there are no connected displays (no active power domains) during
217          * command submission.
218          *
219          * This activity has negative impact on the performance of the chip with
220          * huge latencies observed in the interrupt handler and elsewhere.
221          *
222          * Work around it by grabbing a GT IRQ power domain whilst there is any
223          * GT activity, preventing any DC state transitions.
224          */
225         i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
226         GEM_BUG_ON(!i915->gt.awake);
227
228         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
229                 i915->gt.epoch = 1;
230
231         intel_enable_gt_powersave(i915);
232         i915_update_gfx_val(i915);
233         if (INTEL_GEN(i915) >= 6)
234                 gen6_rps_busy(i915);
235         i915_pmu_gt_unparked(i915);
236
237         intel_engines_unpark(i915);
238
239         i915_queue_hangcheck(i915);
240
241         queue_delayed_work(i915->wq,
242                            &i915->gt.retire_work,
243                            round_jiffies_up_relative(HZ));
244 }
245
246 int
247 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
248                             struct drm_file *file)
249 {
250         struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
251         struct drm_i915_gem_get_aperture *args = data;
252         struct i915_vma *vma;
253         u64 pinned;
254
255         mutex_lock(&ggtt->vm.mutex);
256
257         pinned = ggtt->vm.reserved;
258         list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
259                 if (i915_vma_is_pinned(vma))
260                         pinned += vma->node.size;
261
262         mutex_unlock(&ggtt->vm.mutex);
263
264         args->aper_size = ggtt->vm.total;
265         args->aper_available_size = args->aper_size - pinned;
266
267         return 0;
268 }
269
270 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
271 {
272         struct address_space *mapping = obj->base.filp->f_mapping;
273         drm_dma_handle_t *phys;
274         struct sg_table *st;
275         struct scatterlist *sg;
276         char *vaddr;
277         int i;
278         int err;
279
280         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
281                 return -EINVAL;
282
283         /* Always aligning to the object size, allows a single allocation
284          * to handle all possible callers, and given typical object sizes,
285          * the alignment of the buddy allocation will naturally match.
286          */
287         phys = drm_pci_alloc(obj->base.dev,
288                              roundup_pow_of_two(obj->base.size),
289                              roundup_pow_of_two(obj->base.size));
290         if (!phys)
291                 return -ENOMEM;
292
293         vaddr = phys->vaddr;
294         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
295                 struct page *page;
296                 char *src;
297
298                 page = shmem_read_mapping_page(mapping, i);
299                 if (IS_ERR(page)) {
300                         err = PTR_ERR(page);
301                         goto err_phys;
302                 }
303
304                 src = kmap_atomic(page);
305                 memcpy(vaddr, src, PAGE_SIZE);
306                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
307                 kunmap_atomic(src);
308
309                 put_page(page);
310                 vaddr += PAGE_SIZE;
311         }
312
313         i915_gem_chipset_flush(to_i915(obj->base.dev));
314
315         st = kmalloc(sizeof(*st), GFP_KERNEL);
316         if (!st) {
317                 err = -ENOMEM;
318                 goto err_phys;
319         }
320
321         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
322                 kfree(st);
323                 err = -ENOMEM;
324                 goto err_phys;
325         }
326
327         sg = st->sgl;
328         sg->offset = 0;
329         sg->length = obj->base.size;
330
331         sg_dma_address(sg) = phys->busaddr;
332         sg_dma_len(sg) = obj->base.size;
333
334         obj->phys_handle = phys;
335
336         __i915_gem_object_set_pages(obj, st, sg->length);
337
338         return 0;
339
340 err_phys:
341         drm_pci_free(obj->base.dev, phys);
342
343         return err;
344 }
345
346 static void __start_cpu_write(struct drm_i915_gem_object *obj)
347 {
348         obj->read_domains = I915_GEM_DOMAIN_CPU;
349         obj->write_domain = I915_GEM_DOMAIN_CPU;
350         if (cpu_write_needs_clflush(obj))
351                 obj->cache_dirty = true;
352 }
353
354 static void
355 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
356                                 struct sg_table *pages,
357                                 bool needs_clflush)
358 {
359         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
360
361         if (obj->mm.madv == I915_MADV_DONTNEED)
362                 obj->mm.dirty = false;
363
364         if (needs_clflush &&
365             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
366             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
367                 drm_clflush_sg(pages);
368
369         __start_cpu_write(obj);
370 }
371
372 static void
373 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
374                                struct sg_table *pages)
375 {
376         __i915_gem_object_release_shmem(obj, pages, false);
377
378         if (obj->mm.dirty) {
379                 struct address_space *mapping = obj->base.filp->f_mapping;
380                 char *vaddr = obj->phys_handle->vaddr;
381                 int i;
382
383                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
384                         struct page *page;
385                         char *dst;
386
387                         page = shmem_read_mapping_page(mapping, i);
388                         if (IS_ERR(page))
389                                 continue;
390
391                         dst = kmap_atomic(page);
392                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
393                         memcpy(dst, vaddr, PAGE_SIZE);
394                         kunmap_atomic(dst);
395
396                         set_page_dirty(page);
397                         if (obj->mm.madv == I915_MADV_WILLNEED)
398                                 mark_page_accessed(page);
399                         put_page(page);
400                         vaddr += PAGE_SIZE;
401                 }
402                 obj->mm.dirty = false;
403         }
404
405         sg_free_table(pages);
406         kfree(pages);
407
408         drm_pci_free(obj->base.dev, obj->phys_handle);
409 }
410
/* .release hook for phys objects: drop one pin on the object's pages. */
static void i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
{
	i915_gem_object_unpin_pages(obj);
}
416
417 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
418         .get_pages = i915_gem_object_get_pages_phys,
419         .put_pages = i915_gem_object_put_pages_phys,
420         .release = i915_gem_object_release_phys,
421 };
422
423 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
424
425 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
426 {
427         struct i915_vma *vma;
428         LIST_HEAD(still_in_list);
429         int ret;
430
431         lockdep_assert_held(&obj->base.dev->struct_mutex);
432
433         /* Closed vma are removed from the obj->vma_list - but they may
434          * still have an active binding on the object. To remove those we
435          * must wait for all rendering to complete to the object (as unbinding
436          * must anyway), and retire the requests.
437          */
438         ret = i915_gem_object_set_to_cpu_domain(obj, false);
439         if (ret)
440                 return ret;
441
442         spin_lock(&obj->vma.lock);
443         while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
444                                                        struct i915_vma,
445                                                        obj_link))) {
446                 list_move_tail(&vma->obj_link, &still_in_list);
447                 spin_unlock(&obj->vma.lock);
448
449                 ret = i915_vma_unbind(vma);
450
451                 spin_lock(&obj->vma.lock);
452         }
453         list_splice(&still_in_list, &obj->vma.list);
454         spin_unlock(&obj->vma.lock);
455
456         return ret;
457 }
458
459 static long
460 i915_gem_object_wait_fence(struct dma_fence *fence,
461                            unsigned int flags,
462                            long timeout,
463                            struct intel_rps_client *rps_client)
464 {
465         struct i915_request *rq;
466
467         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
468
469         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
470                 return timeout;
471
472         if (!dma_fence_is_i915(fence))
473                 return dma_fence_wait_timeout(fence,
474                                               flags & I915_WAIT_INTERRUPTIBLE,
475                                               timeout);
476
477         rq = to_request(fence);
478         if (i915_request_completed(rq))
479                 goto out;
480
481         /*
482          * This client is about to stall waiting for the GPU. In many cases
483          * this is undesirable and limits the throughput of the system, as
484          * many clients cannot continue processing user input/output whilst
485          * blocked. RPS autotuning may take tens of milliseconds to respond
486          * to the GPU load and thus incurs additional latency for the client.
487          * We can circumvent that by promoting the GPU frequency to maximum
488          * before we wait. This makes the GPU throttle up much more quickly
489          * (good for benchmarks and user experience, e.g. window animations),
490          * but at a cost of spending more power processing the workload
491          * (bad for battery). Not all clients even want their results
492          * immediately and for them we should just let the GPU select its own
493          * frequency to maximise efficiency. To prevent a single client from
494          * forcing the clocks too high for the whole system, we only allow
495          * each client to waitboost once in a busy period.
496          */
497         if (rps_client && !i915_request_started(rq)) {
498                 if (INTEL_GEN(rq->i915) >= 6)
499                         gen6_rps_boost(rq, rps_client);
500         }
501
502         timeout = i915_request_wait(rq, flags, timeout);
503
504 out:
505         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
506                 i915_request_retire_upto(rq);
507
508         return timeout;
509 }
510
511 static long
512 i915_gem_object_wait_reservation(struct reservation_object *resv,
513                                  unsigned int flags,
514                                  long timeout,
515                                  struct intel_rps_client *rps_client)
516 {
517         unsigned int seq = __read_seqcount_begin(&resv->seq);
518         struct dma_fence *excl;
519         bool prune_fences = false;
520
521         if (flags & I915_WAIT_ALL) {
522                 struct dma_fence **shared;
523                 unsigned int count, i;
524                 int ret;
525
526                 ret = reservation_object_get_fences_rcu(resv,
527                                                         &excl, &count, &shared);
528                 if (ret)
529                         return ret;
530
531                 for (i = 0; i < count; i++) {
532                         timeout = i915_gem_object_wait_fence(shared[i],
533                                                              flags, timeout,
534                                                              rps_client);
535                         if (timeout < 0)
536                                 break;
537
538                         dma_fence_put(shared[i]);
539                 }
540
541                 for (; i < count; i++)
542                         dma_fence_put(shared[i]);
543                 kfree(shared);
544
545                 /*
546                  * If both shared fences and an exclusive fence exist,
547                  * then by construction the shared fences must be later
548                  * than the exclusive fence. If we successfully wait for
549                  * all the shared fences, we know that the exclusive fence
550                  * must all be signaled. If all the shared fences are
551                  * signaled, we can prune the array and recover the
552                  * floating references on the fences/requests.
553                  */
554                 prune_fences = count && timeout >= 0;
555         } else {
556                 excl = reservation_object_get_excl_rcu(resv);
557         }
558
559         if (excl && timeout >= 0)
560                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
561                                                      rps_client);
562
563         dma_fence_put(excl);
564
565         /*
566          * Opportunistically prune the fences iff we know they have *all* been
567          * signaled and that the reservation object has not been changed (i.e.
568          * no new fences have been added).
569          */
570         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
571                 if (reservation_object_trylock(resv)) {
572                         if (!__read_seqcount_retry(&resv->seq, seq))
573                                 reservation_object_add_excl_fence(resv, NULL);
574                         reservation_object_unlock(resv);
575                 }
576         }
577
578         return timeout;
579 }
580
581 static void __fence_set_priority(struct dma_fence *fence,
582                                  const struct i915_sched_attr *attr)
583 {
584         struct i915_request *rq;
585         struct intel_engine_cs *engine;
586
587         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
588                 return;
589
590         rq = to_request(fence);
591         engine = rq->engine;
592
593         local_bh_disable();
594         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
595         if (engine->schedule)
596                 engine->schedule(rq, attr);
597         rcu_read_unlock();
598         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
599 }
600
601 static void fence_set_priority(struct dma_fence *fence,
602                                const struct i915_sched_attr *attr)
603 {
604         /* Recurse once into a fence-array */
605         if (dma_fence_is_array(fence)) {
606                 struct dma_fence_array *array = to_dma_fence_array(fence);
607                 int i;
608
609                 for (i = 0; i < array->num_fences; i++)
610                         __fence_set_priority(array->fences[i], attr);
611         } else {
612                 __fence_set_priority(fence, attr);
613         }
614 }
615
616 int
617 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
618                               unsigned int flags,
619                               const struct i915_sched_attr *attr)
620 {
621         struct dma_fence *excl;
622
623         if (flags & I915_WAIT_ALL) {
624                 struct dma_fence **shared;
625                 unsigned int count, i;
626                 int ret;
627
628                 ret = reservation_object_get_fences_rcu(obj->resv,
629                                                         &excl, &count, &shared);
630                 if (ret)
631                         return ret;
632
633                 for (i = 0; i < count; i++) {
634                         fence_set_priority(shared[i], attr);
635                         dma_fence_put(shared[i]);
636                 }
637
638                 kfree(shared);
639         } else {
640                 excl = reservation_object_get_excl_rcu(obj->resv);
641         }
642
643         if (excl) {
644                 fence_set_priority(excl, attr);
645                 dma_fence_put(excl);
646         }
647         return 0;
648 }
649
650 /**
651  * Waits for rendering to the object to be completed
652  * @obj: i915 gem object
653  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
654  * @timeout: how long to wait
655  * @rps_client: client (user process) to charge for any waitboosting
656  */
657 int
658 i915_gem_object_wait(struct drm_i915_gem_object *obj,
659                      unsigned int flags,
660                      long timeout,
661                      struct intel_rps_client *rps_client)
662 {
663         might_sleep();
664         GEM_BUG_ON(timeout < 0);
665
666         timeout = i915_gem_object_wait_reservation(obj->resv,
667                                                    flags, timeout,
668                                                    rps_client);
669         return timeout < 0 ? timeout : 0;
670 }
671
672 static struct intel_rps_client *to_rps_client(struct drm_file *file)
673 {
674         struct drm_i915_file_private *fpriv = file->driver_priv;
675
676         return &fpriv->rps_client;
677 }
678
679 static int
680 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
681                      struct drm_i915_gem_pwrite *args,
682                      struct drm_file *file)
683 {
684         void *vaddr = obj->phys_handle->vaddr + args->offset;
685         char __user *user_data = u64_to_user_ptr(args->data_ptr);
686
687         /* We manually control the domain here and pretend that it
688          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
689          */
690         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
691         if (copy_from_user(vaddr, user_data, args->size))
692                 return -EFAULT;
693
694         drm_clflush_virt_range(vaddr, args->size);
695         i915_gem_chipset_flush(to_i915(obj->base.dev));
696
697         intel_fb_obj_flush(obj, ORIGIN_CPU);
698         return 0;
699 }
700
701 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
702 {
703         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
704 }
705
706 void i915_gem_object_free(struct drm_i915_gem_object *obj)
707 {
708         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
709         kmem_cache_free(dev_priv->objects, obj);
710 }
711
712 static int
713 i915_gem_create(struct drm_file *file,
714                 struct drm_i915_private *dev_priv,
715                 u64 size,
716                 u32 *handle_p)
717 {
718         struct drm_i915_gem_object *obj;
719         int ret;
720         u32 handle;
721
722         size = roundup(size, PAGE_SIZE);
723         if (size == 0)
724                 return -EINVAL;
725
726         /* Allocate the new object */
727         obj = i915_gem_object_create(dev_priv, size);
728         if (IS_ERR(obj))
729                 return PTR_ERR(obj);
730
731         ret = drm_gem_handle_create(file, &obj->base, &handle);
732         /* drop reference from allocate - handle holds it now */
733         i915_gem_object_put(obj);
734         if (ret)
735                 return ret;
736
737         *handle_p = handle;
738         return 0;
739 }
740
741 int
742 i915_gem_dumb_create(struct drm_file *file,
743                      struct drm_device *dev,
744                      struct drm_mode_create_dumb *args)
745 {
746         /* have to work out size/pitch and return them */
747         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
748         args->size = args->pitch * args->height;
749         return i915_gem_create(file, to_i915(dev),
750                                args->size, &args->handle);
751 }
752
753 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
754 {
755         return !(obj->cache_level == I915_CACHE_NONE ||
756                  obj->cache_level == I915_CACHE_WT);
757 }
758
759 /**
760  * Creates a new mm object and returns a handle to it.
761  * @dev: drm device pointer
762  * @data: ioctl data blob
763  * @file: drm file pointer
764  */
765 int
766 i915_gem_create_ioctl(struct drm_device *dev, void *data,
767                       struct drm_file *file)
768 {
769         struct drm_i915_private *dev_priv = to_i915(dev);
770         struct drm_i915_gem_create *args = data;
771
772         i915_gem_flush_free_objects(dev_priv);
773
774         return i915_gem_create(file, dev_priv,
775                                args->size, &args->handle);
776 }
777
778 static inline enum fb_op_origin
779 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
780 {
781         return (domain == I915_GEM_DOMAIN_GTT ?
782                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
783 }
784
785 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
786 {
787         intel_wakeref_t wakeref;
788
789         /*
790          * No actual flushing is required for the GTT write domain for reads
791          * from the GTT domain. Writes to it "immediately" go to main memory
792          * as far as we know, so there's no chipset flush. It also doesn't
793          * land in the GPU render cache.
794          *
795          * However, we do have to enforce the order so that all writes through
796          * the GTT land before any writes to the device, such as updates to
797          * the GATT itself.
798          *
799          * We also have to wait a bit for the writes to land from the GTT.
800          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
801          * timing. This issue has only been observed when switching quickly
802          * between GTT writes and CPU reads from inside the kernel on recent hw,
803          * and it appears to only affect discrete GTT blocks (i.e. on LLC
804          * system agents we cannot reproduce this behaviour, until Cannonlake
805          * that was!).
806          */
807
808         wmb();
809
810         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
811                 return;
812
813         i915_gem_chipset_flush(dev_priv);
814
815         with_intel_runtime_pm(dev_priv, wakeref) {
816                 spin_lock_irq(&dev_priv->uncore.lock);
817
818                 POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
819
820                 spin_unlock_irq(&dev_priv->uncore.lock);
821         }
822 }
823
824 static void
825 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
826 {
827         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
828         struct i915_vma *vma;
829
830         if (!(obj->write_domain & flush_domains))
831                 return;
832
833         switch (obj->write_domain) {
834         case I915_GEM_DOMAIN_GTT:
835                 i915_gem_flush_ggtt_writes(dev_priv);
836
837                 intel_fb_obj_flush(obj,
838                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
839
840                 for_each_ggtt_vma(vma, obj) {
841                         if (vma->iomap)
842                                 continue;
843
844                         i915_vma_unset_ggtt_write(vma);
845                 }
846                 break;
847
848         case I915_GEM_DOMAIN_WC:
849                 wmb();
850                 break;
851
852         case I915_GEM_DOMAIN_CPU:
853                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
854                 break;
855
856         case I915_GEM_DOMAIN_RENDER:
857                 if (gpu_write_needs_clflush(obj))
858                         obj->cache_dirty = true;
859                 break;
860         }
861
862         obj->write_domain = 0;
863 }
864
865 /*
866  * Pins the specified object's pages and synchronizes the object with
867  * GPU accesses. Sets needs_clflush to non-zero if the caller should
868  * flush the object from the CPU cache.
869  */
870 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
871                                     unsigned int *needs_clflush)
872 {
873         int ret;
874
875         lockdep_assert_held(&obj->base.dev->struct_mutex);
876
877         *needs_clflush = 0;
878         if (!i915_gem_object_has_struct_page(obj))
879                 return -ENODEV;
880
881         ret = i915_gem_object_wait(obj,
882                                    I915_WAIT_INTERRUPTIBLE |
883                                    I915_WAIT_LOCKED,
884                                    MAX_SCHEDULE_TIMEOUT,
885                                    NULL);
886         if (ret)
887                 return ret;
888
889         ret = i915_gem_object_pin_pages(obj);
890         if (ret)
891                 return ret;
892
893         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
894             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
895                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
896                 if (ret)
897                         goto err_unpin;
898                 else
899                         goto out;
900         }
901
902         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
903
904         /* If we're not in the cpu read domain, set ourself into the gtt
905          * read domain and manually flush cachelines (if required). This
906          * optimizes for the case when the gpu will dirty the data
907          * anyway again before the next pread happens.
908          */
909         if (!obj->cache_dirty &&
910             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
911                 *needs_clflush = CLFLUSH_BEFORE;
912
913 out:
914         /* return with the pages pinned */
915         return 0;
916
917 err_unpin:
918         i915_gem_object_unpin_pages(obj);
919         return ret;
920 }
921
922 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
923                                      unsigned int *needs_clflush)
924 {
925         int ret;
926
927         lockdep_assert_held(&obj->base.dev->struct_mutex);
928
929         *needs_clflush = 0;
930         if (!i915_gem_object_has_struct_page(obj))
931                 return -ENODEV;
932
933         ret = i915_gem_object_wait(obj,
934                                    I915_WAIT_INTERRUPTIBLE |
935                                    I915_WAIT_LOCKED |
936                                    I915_WAIT_ALL,
937                                    MAX_SCHEDULE_TIMEOUT,
938                                    NULL);
939         if (ret)
940                 return ret;
941
942         ret = i915_gem_object_pin_pages(obj);
943         if (ret)
944                 return ret;
945
946         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
947             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
948                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
949                 if (ret)
950                         goto err_unpin;
951                 else
952                         goto out;
953         }
954
955         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
956
957         /* If we're not in the cpu write domain, set ourself into the
958          * gtt write domain and manually flush cachelines (as required).
959          * This optimizes for the case when the gpu will use the data
960          * right away and we therefore have to clflush anyway.
961          */
962         if (!obj->cache_dirty) {
963                 *needs_clflush |= CLFLUSH_AFTER;
964
965                 /*
966                  * Same trick applies to invalidate partially written
967                  * cachelines read before writing.
968                  */
969                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
970                         *needs_clflush |= CLFLUSH_BEFORE;
971         }
972
973 out:
974         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
975         obj->mm.dirty = true;
976         /* return with the pages pinned */
977         return 0;
978
979 err_unpin:
980         i915_gem_object_unpin_pages(obj);
981         return ret;
982 }
983
984 static int
985 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
986             bool needs_clflush)
987 {
988         char *vaddr;
989         int ret;
990
991         vaddr = kmap(page);
992
993         if (needs_clflush)
994                 drm_clflush_virt_range(vaddr + offset, len);
995
996         ret = __copy_to_user(user_data, vaddr + offset, len);
997
998         kunmap(page);
999
1000         return ret ? -EFAULT : 0;
1001 }
1002
1003 static int
1004 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1005                      struct drm_i915_gem_pread *args)
1006 {
1007         char __user *user_data;
1008         u64 remain;
1009         unsigned int needs_clflush;
1010         unsigned int idx, offset;
1011         int ret;
1012
1013         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1014         if (ret)
1015                 return ret;
1016
1017         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1018         mutex_unlock(&obj->base.dev->struct_mutex);
1019         if (ret)
1020                 return ret;
1021
1022         remain = args->size;
1023         user_data = u64_to_user_ptr(args->data_ptr);
1024         offset = offset_in_page(args->offset);
1025         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1026                 struct page *page = i915_gem_object_get_page(obj, idx);
1027                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1028
1029                 ret = shmem_pread(page, offset, length, user_data,
1030                                   needs_clflush);
1031                 if (ret)
1032                         break;
1033
1034                 remain -= length;
1035                 user_data += length;
1036                 offset = 0;
1037         }
1038
1039         i915_gem_obj_finish_shmem_access(obj);
1040         return ret;
1041 }
1042
1043 static inline bool
1044 gtt_user_read(struct io_mapping *mapping,
1045               loff_t base, int offset,
1046               char __user *user_data, int length)
1047 {
1048         void __iomem *vaddr;
1049         unsigned long unwritten;
1050
1051         /* We can use the cpu mem copy function because this is X86. */
1052         vaddr = io_mapping_map_atomic_wc(mapping, base);
1053         unwritten = __copy_to_user_inatomic(user_data,
1054                                             (void __force *)vaddr + offset,
1055                                             length);
1056         io_mapping_unmap_atomic(vaddr);
1057         if (unwritten) {
1058                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1059                 unwritten = copy_to_user(user_data,
1060                                          (void __force *)vaddr + offset,
1061                                          length);
1062                 io_mapping_unmap(vaddr);
1063         }
1064         return unwritten;
1065 }
1066
1067 static int
1068 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1069                    const struct drm_i915_gem_pread *args)
1070 {
1071         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1072         struct i915_ggtt *ggtt = &i915->ggtt;
1073         intel_wakeref_t wakeref;
1074         struct drm_mm_node node;
1075         struct i915_vma *vma;
1076         void __user *user_data;
1077         u64 remain, offset;
1078         int ret;
1079
1080         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1081         if (ret)
1082                 return ret;
1083
1084         wakeref = intel_runtime_pm_get(i915);
1085         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1086                                        PIN_MAPPABLE |
1087                                        PIN_NONFAULT |
1088                                        PIN_NONBLOCK);
1089         if (!IS_ERR(vma)) {
1090                 node.start = i915_ggtt_offset(vma);
1091                 node.allocated = false;
1092                 ret = i915_vma_put_fence(vma);
1093                 if (ret) {
1094                         i915_vma_unpin(vma);
1095                         vma = ERR_PTR(ret);
1096                 }
1097         }
1098         if (IS_ERR(vma)) {
1099                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1100                 if (ret)
1101                         goto out_unlock;
1102                 GEM_BUG_ON(!node.allocated);
1103         }
1104
1105         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1106         if (ret)
1107                 goto out_unpin;
1108
1109         mutex_unlock(&i915->drm.struct_mutex);
1110
1111         user_data = u64_to_user_ptr(args->data_ptr);
1112         remain = args->size;
1113         offset = args->offset;
1114
1115         while (remain > 0) {
1116                 /* Operation in this page
1117                  *
1118                  * page_base = page offset within aperture
1119                  * page_offset = offset within page
1120                  * page_length = bytes to copy for this page
1121                  */
1122                 u32 page_base = node.start;
1123                 unsigned page_offset = offset_in_page(offset);
1124                 unsigned page_length = PAGE_SIZE - page_offset;
1125                 page_length = remain < page_length ? remain : page_length;
1126                 if (node.allocated) {
1127                         wmb();
1128                         ggtt->vm.insert_page(&ggtt->vm,
1129                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1130                                              node.start, I915_CACHE_NONE, 0);
1131                         wmb();
1132                 } else {
1133                         page_base += offset & PAGE_MASK;
1134                 }
1135
1136                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1137                                   user_data, page_length)) {
1138                         ret = -EFAULT;
1139                         break;
1140                 }
1141
1142                 remain -= page_length;
1143                 user_data += page_length;
1144                 offset += page_length;
1145         }
1146
1147         mutex_lock(&i915->drm.struct_mutex);
1148 out_unpin:
1149         if (node.allocated) {
1150                 wmb();
1151                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1152                 remove_mappable_node(&node);
1153         } else {
1154                 i915_vma_unpin(vma);
1155         }
1156 out_unlock:
1157         intel_runtime_pm_put(i915, wakeref);
1158         mutex_unlock(&i915->drm.struct_mutex);
1159
1160         return ret;
1161 }
1162
1163 /**
1164  * Reads data from the object referenced by handle.
1165  * @dev: drm device pointer
1166  * @data: ioctl data blob
1167  * @file: drm file pointer
1168  *
1169  * On error, the contents of *data are undefined.
1170  */
1171 int
1172 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1173                      struct drm_file *file)
1174 {
1175         struct drm_i915_gem_pread *args = data;
1176         struct drm_i915_gem_object *obj;
1177         int ret;
1178
1179         if (args->size == 0)
1180                 return 0;
1181
1182         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1183                        args->size))
1184                 return -EFAULT;
1185
1186         obj = i915_gem_object_lookup(file, args->handle);
1187         if (!obj)
1188                 return -ENOENT;
1189
1190         /* Bounds check source.  */
1191         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1192                 ret = -EINVAL;
1193                 goto out;
1194         }
1195
1196         trace_i915_gem_object_pread(obj, args->offset, args->size);
1197
1198         ret = i915_gem_object_wait(obj,
1199                                    I915_WAIT_INTERRUPTIBLE,
1200                                    MAX_SCHEDULE_TIMEOUT,
1201                                    to_rps_client(file));
1202         if (ret)
1203                 goto out;
1204
1205         ret = i915_gem_object_pin_pages(obj);
1206         if (ret)
1207                 goto out;
1208
1209         ret = i915_gem_shmem_pread(obj, args);
1210         if (ret == -EFAULT || ret == -ENODEV)
1211                 ret = i915_gem_gtt_pread(obj, args);
1212
1213         i915_gem_object_unpin_pages(obj);
1214 out:
1215         i915_gem_object_put(obj);
1216         return ret;
1217 }
1218
1219 /* This is the fast write path which cannot handle
1220  * page faults in the source data
1221  */
1222
1223 static inline bool
1224 ggtt_write(struct io_mapping *mapping,
1225            loff_t base, int offset,
1226            char __user *user_data, int length)
1227 {
1228         void __iomem *vaddr;
1229         unsigned long unwritten;
1230
1231         /* We can use the cpu mem copy function because this is X86. */
1232         vaddr = io_mapping_map_atomic_wc(mapping, base);
1233         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1234                                                       user_data, length);
1235         io_mapping_unmap_atomic(vaddr);
1236         if (unwritten) {
1237                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1238                 unwritten = copy_from_user((void __force *)vaddr + offset,
1239                                            user_data, length);
1240                 io_mapping_unmap(vaddr);
1241         }
1242
1243         return unwritten;
1244 }
1245
1246 /**
1247  * This is the fast pwrite path, where we copy the data directly from the
1248  * user into the GTT, uncached.
1249  * @obj: i915 GEM object
1250  * @args: pwrite arguments structure
1251  */
1252 static int
1253 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1254                          const struct drm_i915_gem_pwrite *args)
1255 {
1256         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1257         struct i915_ggtt *ggtt = &i915->ggtt;
1258         intel_wakeref_t wakeref;
1259         struct drm_mm_node node;
1260         struct i915_vma *vma;
1261         u64 remain, offset;
1262         void __user *user_data;
1263         int ret;
1264
1265         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1266         if (ret)
1267                 return ret;
1268
1269         if (i915_gem_object_has_struct_page(obj)) {
1270                 /*
1271                  * Avoid waking the device up if we can fallback, as
1272                  * waking/resuming is very slow (worst-case 10-100 ms
1273                  * depending on PCI sleeps and our own resume time).
1274                  * This easily dwarfs any performance advantage from
1275                  * using the cache bypass of indirect GGTT access.
1276                  */
1277                 wakeref = intel_runtime_pm_get_if_in_use(i915);
1278                 if (!wakeref) {
1279                         ret = -EFAULT;
1280                         goto out_unlock;
1281                 }
1282         } else {
1283                 /* No backing pages, no fallback, we must force GGTT access */
1284                 wakeref = intel_runtime_pm_get(i915);
1285         }
1286
1287         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1288                                        PIN_MAPPABLE |
1289                                        PIN_NONFAULT |
1290                                        PIN_NONBLOCK);
1291         if (!IS_ERR(vma)) {
1292                 node.start = i915_ggtt_offset(vma);
1293                 node.allocated = false;
1294                 ret = i915_vma_put_fence(vma);
1295                 if (ret) {
1296                         i915_vma_unpin(vma);
1297                         vma = ERR_PTR(ret);
1298                 }
1299         }
1300         if (IS_ERR(vma)) {
1301                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1302                 if (ret)
1303                         goto out_rpm;
1304                 GEM_BUG_ON(!node.allocated);
1305         }
1306
1307         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1308         if (ret)
1309                 goto out_unpin;
1310
1311         mutex_unlock(&i915->drm.struct_mutex);
1312
1313         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1314
1315         user_data = u64_to_user_ptr(args->data_ptr);
1316         offset = args->offset;
1317         remain = args->size;
1318         while (remain) {
1319                 /* Operation in this page
1320                  *
1321                  * page_base = page offset within aperture
1322                  * page_offset = offset within page
1323                  * page_length = bytes to copy for this page
1324                  */
1325                 u32 page_base = node.start;
1326                 unsigned int page_offset = offset_in_page(offset);
1327                 unsigned int page_length = PAGE_SIZE - page_offset;
1328                 page_length = remain < page_length ? remain : page_length;
1329                 if (node.allocated) {
1330                         wmb(); /* flush the write before we modify the GGTT */
1331                         ggtt->vm.insert_page(&ggtt->vm,
1332                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1333                                              node.start, I915_CACHE_NONE, 0);
1334                         wmb(); /* flush modifications to the GGTT (insert_page) */
1335                 } else {
1336                         page_base += offset & PAGE_MASK;
1337                 }
1338                 /* If we get a fault while copying data, then (presumably) our
1339                  * source page isn't available.  Return the error and we'll
1340                  * retry in the slow path.
1341                  * If the object is non-shmem backed, we retry again with the
1342                  * path that handles page fault.
1343                  */
1344                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1345                                user_data, page_length)) {
1346                         ret = -EFAULT;
1347                         break;
1348                 }
1349
1350                 remain -= page_length;
1351                 user_data += page_length;
1352                 offset += page_length;
1353         }
1354         intel_fb_obj_flush(obj, ORIGIN_CPU);
1355
1356         mutex_lock(&i915->drm.struct_mutex);
1357 out_unpin:
1358         if (node.allocated) {
1359                 wmb();
1360                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1361                 remove_mappable_node(&node);
1362         } else {
1363                 i915_vma_unpin(vma);
1364         }
1365 out_rpm:
1366         intel_runtime_pm_put(i915, wakeref);
1367 out_unlock:
1368         mutex_unlock(&i915->drm.struct_mutex);
1369         return ret;
1370 }
1371
1372 /* Per-page copy function for the shmem pwrite fastpath.
1373  * Flushes invalid cachelines before writing to the target if
1374  * needs_clflush_before is set and flushes out any written cachelines after
1375  * writing if needs_clflush is set.
1376  */
1377 static int
1378 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1379              bool needs_clflush_before,
1380              bool needs_clflush_after)
1381 {
1382         char *vaddr;
1383         int ret;
1384
1385         vaddr = kmap(page);
1386
1387         if (needs_clflush_before)
1388                 drm_clflush_virt_range(vaddr + offset, len);
1389
1390         ret = __copy_from_user(vaddr + offset, user_data, len);
1391         if (!ret && needs_clflush_after)
1392                 drm_clflush_virt_range(vaddr + offset, len);
1393
1394         kunmap(page);
1395
1396         return ret ? -EFAULT : 0;
1397 }
1398
1399 static int
1400 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1401                       const struct drm_i915_gem_pwrite *args)
1402 {
1403         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1404         void __user *user_data;
1405         u64 remain;
1406         unsigned int partial_cacheline_write;
1407         unsigned int needs_clflush;
1408         unsigned int offset, idx;
1409         int ret;
1410
1411         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1412         if (ret)
1413                 return ret;
1414
1415         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1416         mutex_unlock(&i915->drm.struct_mutex);
1417         if (ret)
1418                 return ret;
1419
1420         /* If we don't overwrite a cacheline completely we need to be
1421          * careful to have up-to-date data by first clflushing. Don't
1422          * overcomplicate things and flush the entire patch.
1423          */
1424         partial_cacheline_write = 0;
1425         if (needs_clflush & CLFLUSH_BEFORE)
1426                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1427
1428         user_data = u64_to_user_ptr(args->data_ptr);
1429         remain = args->size;
1430         offset = offset_in_page(args->offset);
1431         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1432                 struct page *page = i915_gem_object_get_page(obj, idx);
1433                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1434
1435                 ret = shmem_pwrite(page, offset, length, user_data,
1436                                    (offset | length) & partial_cacheline_write,
1437                                    needs_clflush & CLFLUSH_AFTER);
1438                 if (ret)
1439                         break;
1440
1441                 remain -= length;
1442                 user_data += length;
1443                 offset = 0;
1444         }
1445
1446         intel_fb_obj_flush(obj, ORIGIN_CPU);
1447         i915_gem_obj_finish_shmem_access(obj);
1448         return ret;
1449 }
1450
1451 /**
1452  * Writes data to the object referenced by handle.
1453  * @dev: drm device
1454  * @data: ioctl data blob
1455  * @file: drm file
1456  *
1457  * On error, the contents of the buffer that were to be modified are undefined.
1458  */
1459 int
1460 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1461                       struct drm_file *file)
1462 {
1463         struct drm_i915_gem_pwrite *args = data;
1464         struct drm_i915_gem_object *obj;
1465         int ret;
1466
1467         if (args->size == 0)
1468                 return 0;
1469
1470         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1471                 return -EFAULT;
1472
1473         obj = i915_gem_object_lookup(file, args->handle);
1474         if (!obj)
1475                 return -ENOENT;
1476
1477         /* Bounds check destination. */
1478         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1479                 ret = -EINVAL;
1480                 goto err;
1481         }
1482
1483         /* Writes not allowed into this read-only object */
1484         if (i915_gem_object_is_readonly(obj)) {
1485                 ret = -EINVAL;
1486                 goto err;
1487         }
1488
1489         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1490
1491         ret = -ENODEV;
1492         if (obj->ops->pwrite)
1493                 ret = obj->ops->pwrite(obj, args);
1494         if (ret != -ENODEV)
1495                 goto err;
1496
1497         ret = i915_gem_object_wait(obj,
1498                                    I915_WAIT_INTERRUPTIBLE |
1499                                    I915_WAIT_ALL,
1500                                    MAX_SCHEDULE_TIMEOUT,
1501                                    to_rps_client(file));
1502         if (ret)
1503                 goto err;
1504
1505         ret = i915_gem_object_pin_pages(obj);
1506         if (ret)
1507                 goto err;
1508
1509         ret = -EFAULT;
1510         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1511          * it would end up going through the fenced access, and we'll get
1512          * different detiling behavior between reading and writing.
1513          * pread/pwrite currently are reading and writing from the CPU
1514          * perspective, requiring manual detiling by the client.
1515          */
1516         if (!i915_gem_object_has_struct_page(obj) ||
1517             cpu_write_needs_clflush(obj))
1518                 /* Note that the gtt paths might fail with non-page-backed user
1519                  * pointers (e.g. gtt mappings when moving data between
1520                  * textures). Fallback to the shmem path in that case.
1521                  */
1522                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1523
1524         if (ret == -EFAULT || ret == -ENOSPC) {
1525                 if (obj->phys_handle)
1526                         ret = i915_gem_phys_pwrite(obj, args, file);
1527                 else
1528                         ret = i915_gem_shmem_pwrite(obj, args);
1529         }
1530
1531         i915_gem_object_unpin_pages(obj);
1532 err:
1533         i915_gem_object_put(obj);
1534         return ret;
1535 }
1536
1537 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1538 {
1539         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1540         struct list_head *list;
1541         struct i915_vma *vma;
1542
1543         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1544
1545         mutex_lock(&i915->ggtt.vm.mutex);
1546         for_each_ggtt_vma(vma, obj) {
1547                 if (!drm_mm_node_allocated(&vma->node))
1548                         continue;
1549
1550                 list_move_tail(&vma->vm_link, &vma->vm->bound_list);
1551         }
1552         mutex_unlock(&i915->ggtt.vm.mutex);
1553
1554         spin_lock(&i915->mm.obj_lock);
1555         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1556         list_move_tail(&obj->mm.link, list);
1557         spin_unlock(&i915->mm.obj_lock);
1558 }
1559
1560 /**
1561  * Called when user space prepares to use an object with the CPU, either
1562  * through the mmap ioctl's mapping or a GTT mapping.
1563  * @dev: drm device
1564  * @data: ioctl data blob
1565  * @file: drm file
1566  */
1567 int
1568 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1569                           struct drm_file *file)
1570 {
1571         struct drm_i915_gem_set_domain *args = data;
1572         struct drm_i915_gem_object *obj;
1573         u32 read_domains = args->read_domains;
1574         u32 write_domain = args->write_domain;
1575         int err;
1576
1577         /* Only handle setting domains to types used by the CPU. */
1578         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1579                 return -EINVAL;
1580
1581         /* Having something in the write domain implies it's in the read
1582          * domain, and only that read domain.  Enforce that in the request.
1583          */
1584         if (write_domain != 0 && read_domains != write_domain)
1585                 return -EINVAL;
1586
1587         obj = i915_gem_object_lookup(file, args->handle);
1588         if (!obj)
1589                 return -ENOENT;
1590
1591         /* Try to flush the object off the GPU without holding the lock.
1592          * We will repeat the flush holding the lock in the normal manner
1593          * to catch cases where we are gazumped.
1594          */
1595         err = i915_gem_object_wait(obj,
1596                                    I915_WAIT_INTERRUPTIBLE |
1597                                    I915_WAIT_PRIORITY |
1598                                    (write_domain ? I915_WAIT_ALL : 0),
1599                                    MAX_SCHEDULE_TIMEOUT,
1600                                    to_rps_client(file));
1601         if (err)
1602                 goto out;
1603
1604         /*
1605          * Proxy objects do not control access to the backing storage, ergo
1606          * they cannot be used as a means to manipulate the cache domain
1607          * tracking for that backing storage. The proxy object is always
1608          * considered to be outside of any cache domain.
1609          */
1610         if (i915_gem_object_is_proxy(obj)) {
1611                 err = -ENXIO;
1612                 goto out;
1613         }
1614
1615         /*
1616          * Flush and acquire obj->pages so that we are coherent through
1617          * direct access in memory with previous cached writes through
1618          * shmemfs and that our cache domain tracking remains valid.
1619          * For example, if the obj->filp was moved to swap without us
1620          * being notified and releasing the pages, we would mistakenly
1621          * continue to assume that the obj remained out of the CPU cached
1622          * domain.
1623          */
1624         err = i915_gem_object_pin_pages(obj);
1625         if (err)
1626                 goto out;
1627
1628         err = i915_mutex_lock_interruptible(dev);
1629         if (err)
1630                 goto out_unpin;
1631
1632         if (read_domains & I915_GEM_DOMAIN_WC)
1633                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1634         else if (read_domains & I915_GEM_DOMAIN_GTT)
1635                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1636         else
1637                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1638
1639         /* And bump the LRU for this access */
1640         i915_gem_object_bump_inactive_ggtt(obj);
1641
1642         mutex_unlock(&dev->struct_mutex);
1643
1644         if (write_domain != 0)
1645                 intel_fb_obj_invalidate(obj,
1646                                         fb_write_origin(obj, write_domain));
1647
1648 out_unpin:
1649         i915_gem_object_unpin_pages(obj);
1650 out:
1651         i915_gem_object_put(obj);
1652         return err;
1653 }
1654
1655 /**
1656  * Called when user space has done writes to this buffer
1657  * @dev: drm device
1658  * @data: ioctl data blob
1659  * @file: drm file
1660  */
1661 int
1662 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1663                          struct drm_file *file)
1664 {
1665         struct drm_i915_gem_sw_finish *args = data;
1666         struct drm_i915_gem_object *obj;
1667
1668         obj = i915_gem_object_lookup(file, args->handle);
1669         if (!obj)
1670                 return -ENOENT;
1671
1672         /*
1673          * Proxy objects are barred from CPU access, so there is no
1674          * need to ban sw_finish as it is a nop.
1675          */
1676
1677         /* Pinned buffers may be scanout, so flush the cache */
1678         i915_gem_object_flush_if_display(obj);
1679         i915_gem_object_put(obj);
1680
1681         return 0;
1682 }
1683
1684 static inline bool
1685 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1686               unsigned long addr, unsigned long size)
1687 {
1688         if (vma->vm_file != filp)
1689                 return false;
1690
1691         return vma->vm_start == addr &&
1692                (vma->vm_end - vma->vm_start) == PAGE_ALIGN(size);
1693 }
1694
1695 /**
1696  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1697  *                       it is mapped to.
1698  * @dev: drm device
1699  * @data: ioctl data blob
1700  * @file: drm file
1701  *
1702  * While the mapping holds a reference on the contents of the object, it doesn't
1703  * imply a ref on the object itself.
1704  *
1705  * IMPORTANT:
1706  *
1707  * DRM driver writers who look at this function as an example for how to do GEM
1708  * mmap support, please don't implement mmap support like here. The modern way
1709  * to implement DRM mmap support is with an mmap offset ioctl (like
1710  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1711  * That way debug tooling like valgrind will understand what's going on, hiding
1712  * the mmap call in a driver private ioctl will break that. The i915 driver only
1713  * does cpu mmaps this way because we didn't know better.
1714  */
1715 int
1716 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1717                     struct drm_file *file)
1718 {
1719         struct drm_i915_gem_mmap *args = data;
1720         struct drm_i915_gem_object *obj;
1721         unsigned long addr;
1722
1723         if (args->flags & ~(I915_MMAP_WC))
1724                 return -EINVAL;
1725
1726         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1727                 return -ENODEV;
1728
1729         obj = i915_gem_object_lookup(file, args->handle);
1730         if (!obj)
1731                 return -ENOENT;
1732
1733         /* prime objects have no backing filp to GEM mmap
1734          * pages from.
1735          */
1736         if (!obj->base.filp) {
1737                 i915_gem_object_put(obj);
1738                 return -ENXIO;
1739         }
1740
1741         addr = vm_mmap(obj->base.filp, 0, args->size,
1742                        PROT_READ | PROT_WRITE, MAP_SHARED,
1743                        args->offset);
1744         if (IS_ERR_VALUE(addr))
1745                 goto err;
1746
1747         if (args->flags & I915_MMAP_WC) {
1748                 struct mm_struct *mm = current->mm;
1749                 struct vm_area_struct *vma;
1750
1751                 if (down_write_killable(&mm->mmap_sem)) {
1752                         i915_gem_object_put(obj);
1753                         return -EINTR;
1754                 }
1755                 vma = find_vma(mm, addr);
1756                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1757                         vma->vm_page_prot =
1758                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1759                 else
1760                         addr = -ENOMEM;
1761                 up_write(&mm->mmap_sem);
1762                 if (IS_ERR_VALUE(addr))
1763                         goto err;
1764
1765                 /* This may race, but that's ok, it only gets set */
1766                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1767         }
1768         i915_gem_object_put(obj);
1769
1770         args->addr_ptr = (u64)addr;
1771
1772         return 0;
1773
1774 err:
1775         i915_gem_object_put(obj);
1776
1777         return addr;
1778 }
1779
1780 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1781 {
1782         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1783 }
1784
/**
 * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
 *
 * A history of the GTT mmap interface:
 *
 * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to be
 *     aligned and suitable for fencing, and still fit into the available
 *     mappable space left by the pinned display objects. A classic problem
 *     we called the page-fault-of-doom where we would ping-pong between
 *     two objects that could not fit inside the GTT and so the memcpy
 *     would page one object in at the expense of the other between every
 *     single byte.
 *
 * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
 *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
 *     object is too large for the available space (or simply too large
 *     for the mappable aperture!), a view is created instead and faulted
 *     into userspace. (This view is aligned and sized appropriately for
 *     fenced access.)
 *
 * 2 - Recognise WC as a separate cache domain so that we can flush the
 *     delayed writes via GTT before performing direct access via WC.
 *
 * Restrictions:
 *
 *  * snoopable objects cannot be accessed via the GTT. It can cause machine
 *    hangs on some architectures, corruption on others. An attempt to service
 *    a GTT page fault from a snoopable object will generate a SIGBUS.
 *
 *  * the object must be able to fit into RAM (physical memory, though not
 *    limited to the mappable aperture).
 *
 *
 * Caveats:
 *
 *  * a new GTT page fault will synchronize rendering from the GPU and flush
 *    all data to system memory. Subsequent access will not be synchronized.
 *
 *  * all mappings are revoked on runtime device suspend.
 *
 *  * there are only 8, 16 or 32 fence registers to share between all users
 *    (older machines require a fence register for display and blitter access
 *    as well). Contention of the fence registers will cause the previous users
 *    to be unmapped and any new access will generate new page faults.
 *
 *  * running out of memory while servicing a fault may generate a SIGBUS,
 *    rather than the expected SIGSEGV.
 *
 * Returns the version number of the most recent entry in the history above.
 */
int i915_gem_mmap_gtt_version(void)
{
	const int version = 2;

	return version;
}
1837
1838 static inline struct i915_ggtt_view
1839 compute_partial_view(const struct drm_i915_gem_object *obj,
1840                      pgoff_t page_offset,
1841                      unsigned int chunk)
1842 {
1843         struct i915_ggtt_view view;
1844
1845         if (i915_gem_object_is_tiled(obj))
1846                 chunk = roundup(chunk, tile_row_pages(obj));
1847
1848         view.type = I915_GGTT_VIEW_PARTIAL;
1849         view.partial.offset = rounddown(page_offset, chunk);
1850         view.partial.size =
1851                 min_t(unsigned int, chunk,
1852                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1853
1854         /* If the partial covers the entire object, just create a normal VMA. */
1855         if (chunk >= obj->base.size >> PAGE_SHIFT)
1856                 view.type = I915_GGTT_VIEW_NORMAL;
1857
1858         return view;
1859 }
1860
1861 /**
1862  * i915_gem_fault - fault a page into the GTT
1863  * @vmf: fault info
1864  *
1865  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1866  * from userspace.  The fault handler takes care of binding the object to
1867  * the GTT (if needed), allocating and programming a fence register (again,
1868  * only if needed based on whether the old reg is still valid or the object
1869  * is tiled) and inserting a new PTE into the faulting process.
1870  *
1871  * Note that the faulting process may involve evicting existing objects
1872  * from the GTT and/or fence registers to make room.  So performance may
1873  * suffer if the GTT working set is large or there are few fence registers
1874  * left.
1875  *
1876  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1877  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1878  */
1879 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1880 {
1881 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1882         struct vm_area_struct *area = vmf->vma;
1883         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1884         struct drm_device *dev = obj->base.dev;
1885         struct drm_i915_private *dev_priv = to_i915(dev);
1886         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1887         bool write = area->vm_flags & VM_WRITE;
1888         intel_wakeref_t wakeref;
1889         struct i915_vma *vma;
1890         pgoff_t page_offset;
1891         int ret;
1892
1893         /* Sanity check that we allow writing into this object */
1894         if (i915_gem_object_is_readonly(obj) && write)
1895                 return VM_FAULT_SIGBUS;
1896
1897         /* We don't use vmf->pgoff since that has the fake offset */
1898         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1899
1900         trace_i915_gem_object_fault(obj, page_offset, true, write);
1901
1902         /* Try to flush the object off the GPU first without holding the lock.
1903          * Upon acquiring the lock, we will perform our sanity checks and then
1904          * repeat the flush holding the lock in the normal manner to catch cases
1905          * where we are gazumped.
1906          */
1907         ret = i915_gem_object_wait(obj,
1908                                    I915_WAIT_INTERRUPTIBLE,
1909                                    MAX_SCHEDULE_TIMEOUT,
1910                                    NULL);
1911         if (ret)
1912                 goto err;
1913
1914         ret = i915_gem_object_pin_pages(obj);
1915         if (ret)
1916                 goto err;
1917
1918         wakeref = intel_runtime_pm_get(dev_priv);
1919
1920         ret = i915_mutex_lock_interruptible(dev);
1921         if (ret)
1922                 goto err_rpm;
1923
1924         /* Access to snoopable pages through the GTT is incoherent. */
1925         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1926                 ret = -EFAULT;
1927                 goto err_unlock;
1928         }
1929
1930
1931         /* Now pin it into the GTT as needed */
1932         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1933                                        PIN_MAPPABLE |
1934                                        PIN_NONBLOCK |
1935                                        PIN_NONFAULT);
1936         if (IS_ERR(vma)) {
1937                 /* Use a partial view if it is bigger than available space */
1938                 struct i915_ggtt_view view =
1939                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1940                 unsigned int flags;
1941
1942                 flags = PIN_MAPPABLE;
1943                 if (view.type == I915_GGTT_VIEW_NORMAL)
1944                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1945
1946                 /*
1947                  * Userspace is now writing through an untracked VMA, abandon
1948                  * all hope that the hardware is able to track future writes.
1949                  */
1950                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1951
1952                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1953                 if (IS_ERR(vma) && !view.type) {
1954                         flags = PIN_MAPPABLE;
1955                         view.type = I915_GGTT_VIEW_PARTIAL;
1956                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1957                 }
1958         }
1959         if (IS_ERR(vma)) {
1960                 ret = PTR_ERR(vma);
1961                 goto err_unlock;
1962         }
1963
1964         ret = i915_gem_object_set_to_gtt_domain(obj, write);
1965         if (ret)
1966                 goto err_unpin;
1967
1968         ret = i915_vma_pin_fence(vma);
1969         if (ret)
1970                 goto err_unpin;
1971
1972         /* Finally, remap it using the new GTT offset */
1973         ret = remap_io_mapping(area,
1974                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1975                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1976                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1977                                &ggtt->iomap);
1978         if (ret)
1979                 goto err_fence;
1980
1981         /* Mark as being mmapped into userspace for later revocation */
1982         assert_rpm_wakelock_held(dev_priv);
1983         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1984                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1985         GEM_BUG_ON(!obj->userfault_count);
1986
1987         i915_vma_set_ggtt_write(vma);
1988
1989 err_fence:
1990         i915_vma_unpin_fence(vma);
1991 err_unpin:
1992         __i915_vma_unpin(vma);
1993 err_unlock:
1994         mutex_unlock(&dev->struct_mutex);
1995 err_rpm:
1996         intel_runtime_pm_put(dev_priv, wakeref);
1997         i915_gem_object_unpin_pages(obj);
1998 err:
1999         switch (ret) {
2000         case -EIO:
2001                 /*
2002                  * We eat errors when the gpu is terminally wedged to avoid
2003                  * userspace unduly crashing (gl has no provisions for mmaps to
2004                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2005                  * and so needs to be reported.
2006                  */
2007                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2008                         return VM_FAULT_SIGBUS;
2009                 /* else: fall through */
2010         case -EAGAIN:
2011                 /*
2012                  * EAGAIN means the gpu is hung and we'll wait for the error
2013                  * handler to reset everything when re-faulting in
2014                  * i915_mutex_lock_interruptible.
2015                  */
2016         case 0:
2017         case -ERESTARTSYS:
2018         case -EINTR:
2019         case -EBUSY:
2020                 /*
2021                  * EBUSY is ok: this just means that another thread
2022                  * already did the job.
2023                  */
2024                 return VM_FAULT_NOPAGE;
2025         case -ENOMEM:
2026                 return VM_FAULT_OOM;
2027         case -ENOSPC:
2028         case -EFAULT:
2029                 return VM_FAULT_SIGBUS;
2030         default:
2031                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2032                 return VM_FAULT_SIGBUS;
2033         }
2034 }
2035
2036 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2037 {
2038         struct i915_vma *vma;
2039
2040         GEM_BUG_ON(!obj->userfault_count);
2041
2042         obj->userfault_count = 0;
2043         list_del(&obj->userfault_link);
2044         drm_vma_node_unmap(&obj->base.vma_node,
2045                            obj->base.dev->anon_inode->i_mapping);
2046
2047         for_each_ggtt_vma(vma, obj)
2048                 i915_vma_unset_userfault(vma);
2049 }
2050
2051 /**
2052  * i915_gem_release_mmap - remove physical page mappings
2053  * @obj: obj in question
2054  *
2055  * Preserve the reservation of the mmapping with the DRM core code, but
2056  * relinquish ownership of the pages back to the system.
2057  *
2058  * It is vital that we remove the page mapping if we have mapped a tiled
2059  * object through the GTT and then lose the fence register due to
2060  * resource pressure. Similarly if the object has been moved out of the
2061  * aperture, than pages mapped into userspace must be revoked. Removing the
2062  * mapping will then trigger a page fault on the next user access, allowing
2063  * fixup by i915_gem_fault().
2064  */
2065 void
2066 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2067 {
2068         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2069         intel_wakeref_t wakeref;
2070
2071         /* Serialisation between user GTT access and our code depends upon
2072          * revoking the CPU's PTE whilst the mutex is held. The next user
2073          * pagefault then has to wait until we release the mutex.
2074          *
2075          * Note that RPM complicates somewhat by adding an additional
2076          * requirement that operations to the GGTT be made holding the RPM
2077          * wakeref.
2078          */
2079         lockdep_assert_held(&i915->drm.struct_mutex);
2080         wakeref = intel_runtime_pm_get(i915);
2081
2082         if (!obj->userfault_count)
2083                 goto out;
2084
2085         __i915_gem_object_release_mmap(obj);
2086
2087         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2088          * memory transactions from userspace before we return. The TLB
2089          * flushing implied above by changing the PTE above *should* be
2090          * sufficient, an extra barrier here just provides us with a bit
2091          * of paranoid documentation about our requirement to serialise
2092          * memory writes before touching registers / GSM.
2093          */
2094         wmb();
2095
2096 out:
2097         intel_runtime_pm_put(i915, wakeref);
2098 }
2099
2100 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2101 {
2102         struct drm_i915_gem_object *obj, *on;
2103         int i;
2104
2105         /*
2106          * Only called during RPM suspend. All users of the userfault_list
2107          * must be holding an RPM wakeref to ensure that this can not
2108          * run concurrently with themselves (and use the struct_mutex for
2109          * protection between themselves).
2110          */
2111
2112         list_for_each_entry_safe(obj, on,
2113                                  &dev_priv->mm.userfault_list, userfault_link)
2114                 __i915_gem_object_release_mmap(obj);
2115
2116         /* The fence will be lost when the device powers down. If any were
2117          * in use by hardware (i.e. they are pinned), we should not be powering
2118          * down! All other fences will be reacquired by the user upon waking.
2119          */
2120         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2121                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2122
2123                 /* Ideally we want to assert that the fence register is not
2124                  * live at this point (i.e. that no piece of code will be
2125                  * trying to write through fence + GTT, as that both violates
2126                  * our tracking of activity and associated locking/barriers,
2127                  * but also is illegal given that the hw is powered down).
2128                  *
2129                  * Previously we used reg->pin_count as a "liveness" indicator.
2130                  * That is not sufficient, and we need a more fine-grained
2131                  * tool if we want to have a sanity check here.
2132                  */
2133
2134                 if (!reg->vma)
2135                         continue;
2136
2137                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2138                 reg->dirty = true;
2139         }
2140 }
2141
2142 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2143 {
2144         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2145         int err;
2146
2147         err = drm_gem_create_mmap_offset(&obj->base);
2148         if (likely(!err))
2149                 return 0;
2150
2151         /* Attempt to reap some mmap space from dead objects */
2152         do {
2153                 err = i915_gem_wait_for_idle(dev_priv,
2154                                              I915_WAIT_INTERRUPTIBLE,
2155                                              MAX_SCHEDULE_TIMEOUT);
2156                 if (err)
2157                         break;
2158
2159                 i915_gem_drain_freed_objects(dev_priv);
2160                 err = drm_gem_create_mmap_offset(&obj->base);
2161                 if (!err)
2162                         break;
2163
2164         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2165
2166         return err;
2167 }
2168
2169 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2170 {
2171         drm_gem_free_mmap_offset(&obj->base);
2172 }
2173
2174 int
2175 i915_gem_mmap_gtt(struct drm_file *file,
2176                   struct drm_device *dev,
2177                   u32 handle,
2178                   u64 *offset)
2179 {
2180         struct drm_i915_gem_object *obj;
2181         int ret;
2182
2183         obj = i915_gem_object_lookup(file, handle);
2184         if (!obj)
2185                 return -ENOENT;
2186
2187         ret = i915_gem_object_create_mmap_offset(obj);
2188         if (ret == 0)
2189                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2190
2191         i915_gem_object_put(obj);
2192         return ret;
2193 }
2194
2195 /**
2196  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2197  * @dev: DRM device
2198  * @data: GTT mapping ioctl data
2199  * @file: GEM object info
2200  *
2201  * Simply returns the fake offset to userspace so it can mmap it.
2202  * The mmap call will end up in drm_gem_mmap(), which will set things
2203  * up so we can get faults in the handler above.
2204  *
2205  * The fault handler will take care of binding the object into the GTT
2206  * (since it may have been evicted to make room for something), allocating
2207  * a fence register, and mapping the appropriate aperture address into
2208  * userspace.
2209  */
2210 int
2211 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2212                         struct drm_file *file)
2213 {
2214         struct drm_i915_gem_mmap_gtt *args = data;
2215
2216         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2217 }
2218
2219 /* Immediately discard the backing storage */
2220 static void
2221 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2222 {
2223         i915_gem_object_free_mmap_offset(obj);
2224
2225         if (obj->base.filp == NULL)
2226                 return;
2227
2228         /* Our goal here is to return as much of the memory as
2229          * is possible back to the system as we are called from OOM.
2230          * To do this we must instruct the shmfs to drop all of its
2231          * backing pages, *now*.
2232          */
2233         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2234         obj->mm.madv = __I915_MADV_PURGED;
2235         obj->mm.pages = ERR_PTR(-EFAULT);
2236 }
2237
2238 /* Try to discard unwanted pages */
2239 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2240 {
2241         struct address_space *mapping;
2242
2243         lockdep_assert_held(&obj->mm.lock);
2244         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2245
2246         switch (obj->mm.madv) {
2247         case I915_MADV_DONTNEED:
2248                 i915_gem_object_truncate(obj);
2249         case __I915_MADV_PURGED:
2250                 return;
2251         }
2252
2253         if (obj->base.filp == NULL)
2254                 return;
2255
2256         mapping = obj->base.filp->f_mapping,
2257         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2258 }
2259
/*
 * Flush one batch of pages: run the unevictable check over the pagevec,
 * release it, and then offer to reschedule before the caller continues
 * with the next batch.
 */
static void check_release_pagevec(struct pagevec *pvec)
{
	check_move_unevictable_pages(pvec);
	__pagevec_release(pvec);

	cond_resched();
}
2270
2271 static void
2272 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2273                               struct sg_table *pages)
2274 {
2275         struct sgt_iter sgt_iter;
2276         struct pagevec pvec;
2277         struct page *page;
2278
2279         __i915_gem_object_release_shmem(obj, pages, true);
2280
2281         i915_gem_gtt_finish_pages(obj, pages);
2282
2283         if (i915_gem_object_needs_bit17_swizzle(obj))
2284                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2285
2286         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2287
2288         pagevec_init(&pvec);
2289         for_each_sgt_page(page, sgt_iter, pages) {
2290                 if (obj->mm.dirty)
2291                         set_page_dirty(page);
2292
2293                 if (obj->mm.madv == I915_MADV_WILLNEED)
2294                         mark_page_accessed(page);
2295
2296                 if (!pagevec_add(&pvec, page))
2297                         check_release_pagevec(&pvec);
2298         }
2299         if (pagevec_count(&pvec))
2300                 check_release_pagevec(&pvec);
2301         obj->mm.dirty = false;
2302
2303         sg_free_table(pages);
2304         kfree(pages);
2305 }
2306
2307 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2308 {
2309         struct radix_tree_iter iter;
2310         void __rcu **slot;
2311
2312         rcu_read_lock();
2313         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2314                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2315         rcu_read_unlock();
2316 }
2317
2318 static struct sg_table *
2319 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2320 {
2321         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2322         struct sg_table *pages;
2323
2324         pages = fetch_and_zero(&obj->mm.pages);
2325         if (IS_ERR_OR_NULL(pages))
2326                 return pages;
2327
2328         spin_lock(&i915->mm.obj_lock);
2329         list_del(&obj->mm.link);
2330         spin_unlock(&i915->mm.obj_lock);
2331
2332         if (obj->mm.mapping) {
2333                 void *ptr;
2334
2335                 ptr = page_mask_bits(obj->mm.mapping);
2336                 if (is_vmalloc_addr(ptr))
2337                         vunmap(ptr);
2338                 else
2339                         kunmap(kmap_to_page(ptr));
2340
2341                 obj->mm.mapping = NULL;
2342         }
2343
2344         __i915_gem_object_reset_page_iter(obj);
2345         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2346
2347         return pages;
2348 }
2349
2350 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2351                                 enum i915_mm_subclass subclass)
2352 {
2353         struct sg_table *pages;
2354         int ret;
2355
2356         if (i915_gem_object_has_pinned_pages(obj))
2357                 return -EBUSY;
2358
2359         GEM_BUG_ON(obj->bind_count);
2360
2361         /* May be called by shrinker from within get_pages() (on another bo) */
2362         mutex_lock_nested(&obj->mm.lock, subclass);
2363         if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
2364                 ret = -EBUSY;
2365                 goto unlock;
2366         }
2367
2368         /*
2369          * ->put_pages might need to allocate memory for the bit17 swizzle
2370          * array, hence protect them from being reaped by removing them from gtt
2371          * lists early.
2372          */
2373         pages = __i915_gem_object_unset_pages(obj);
2374
2375         /*
2376          * XXX Temporary hijinx to avoid updating all backends to handle
2377          * NULL pages. In the future, when we have more asynchronous
2378          * get_pages backends we should be better able to handle the
2379          * cancellation of the async task in a more uniform manner.
2380          */
2381         if (!pages && !i915_gem_object_needs_async_cancel(obj))
2382                 pages = ERR_PTR(-EINVAL);
2383
2384         if (!IS_ERR(pages))
2385                 obj->ops->put_pages(obj, pages);
2386
2387         ret = 0;
2388 unlock:
2389         mutex_unlock(&obj->mm.lock);
2390
2391         return ret;
2392 }
2393
2394 bool i915_sg_trim(struct sg_table *orig_st)
2395 {
2396         struct sg_table new_st;
2397         struct scatterlist *sg, *new_sg;
2398         unsigned int i;
2399
2400         if (orig_st->nents == orig_st->orig_nents)
2401                 return false;
2402
2403         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2404                 return false;
2405
2406         new_sg = new_st.sgl;
2407         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2408                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2409                 sg_dma_address(new_sg) = sg_dma_address(sg);
2410                 sg_dma_len(new_sg) = sg_dma_len(sg);
2411
2412                 new_sg = sg_next(new_sg);
2413         }
2414         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2415
2416         sg_free_table(orig_st);
2417
2418         *orig_st = new_st;
2419         return true;
2420 }
2421
2422 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2423 {
2424         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2425         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2426         unsigned long i;
2427         struct address_space *mapping;
2428         struct sg_table *st;
2429         struct scatterlist *sg;
2430         struct sgt_iter sgt_iter;
2431         struct page *page;
2432         unsigned long last_pfn = 0;     /* suppress gcc warning */
2433         unsigned int max_segment = i915_sg_segment_size();
2434         unsigned int sg_page_sizes;
2435         struct pagevec pvec;
2436         gfp_t noreclaim;
2437         int ret;
2438
2439         /*
2440          * Assert that the object is not currently in any GPU domain. As it
2441          * wasn't in the GTT, there shouldn't be any way it could have been in
2442          * a GPU cache
2443          */
2444         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2445         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2446
2447         /*
2448          * If there's no chance of allocating enough pages for the whole
2449          * object, bail early.
2450          */
2451         if (page_count > totalram_pages())
2452                 return -ENOMEM;
2453
2454         st = kmalloc(sizeof(*st), GFP_KERNEL);
2455         if (st == NULL)
2456                 return -ENOMEM;
2457
2458 rebuild_st:
2459         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2460                 kfree(st);
2461                 return -ENOMEM;
2462         }
2463
2464         /*
2465          * Get the list of pages out of our struct file.  They'll be pinned
2466          * at this point until we release them.
2467          *
2468          * Fail silently without starting the shrinker
2469          */
2470         mapping = obj->base.filp->f_mapping;
2471         mapping_set_unevictable(mapping);
2472         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2473         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2474
2475         sg = st->sgl;
2476         st->nents = 0;
2477         sg_page_sizes = 0;
2478         for (i = 0; i < page_count; i++) {
2479                 const unsigned int shrink[] = {
2480                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2481                         0,
2482                 }, *s = shrink;
2483                 gfp_t gfp = noreclaim;
2484
2485                 do {
2486                         cond_resched();
2487                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2488                         if (likely(!IS_ERR(page)))
2489                                 break;
2490
2491                         if (!*s) {
2492                                 ret = PTR_ERR(page);
2493                                 goto err_sg;
2494                         }
2495
2496                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2497
2498                         /*
2499                          * We've tried hard to allocate the memory by reaping
2500                          * our own buffer, now let the real VM do its job and
2501                          * go down in flames if truly OOM.
2502                          *
2503                          * However, since graphics tend to be disposable,
2504                          * defer the oom here by reporting the ENOMEM back
2505                          * to userspace.
2506                          */
2507                         if (!*s) {
2508                                 /* reclaim and warn, but no oom */
2509                                 gfp = mapping_gfp_mask(mapping);
2510
2511                                 /*
2512                                  * Our bo are always dirty and so we require
2513                                  * kswapd to reclaim our pages (direct reclaim
2514                                  * does not effectively begin pageout of our
2515                                  * buffers on its own). However, direct reclaim
2516                                  * only waits for kswapd when under allocation
2517                                  * congestion. So as a result __GFP_RECLAIM is
2518                                  * unreliable and fails to actually reclaim our
2519                                  * dirty pages -- unless you try over and over
2520                                  * again with !__GFP_NORETRY. However, we still
2521                                  * want to fail this allocation rather than
2522                                  * trigger the out-of-memory killer and for
2523                                  * this we want __GFP_RETRY_MAYFAIL.
2524                                  */
2525                                 gfp |= __GFP_RETRY_MAYFAIL;
2526                         }
2527                 } while (1);
2528
2529                 if (!i ||
2530                     sg->length >= max_segment ||
2531                     page_to_pfn(page) != last_pfn + 1) {
2532                         if (i) {
2533                                 sg_page_sizes |= sg->length;
2534                                 sg = sg_next(sg);
2535                         }
2536                         st->nents++;
2537                         sg_set_page(sg, page, PAGE_SIZE, 0);
2538                 } else {
2539                         sg->length += PAGE_SIZE;
2540                 }
2541                 last_pfn = page_to_pfn(page);
2542
2543                 /* Check that the i965g/gm workaround works. */
2544                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2545         }
2546         if (sg) { /* loop terminated early; short sg table */
2547                 sg_page_sizes |= sg->length;
2548                 sg_mark_end(sg);
2549         }
2550
2551         /* Trim unused sg entries to avoid wasting memory. */
2552         i915_sg_trim(st);
2553
2554         ret = i915_gem_gtt_prepare_pages(obj, st);
2555         if (ret) {
2556                 /*
2557                  * DMA remapping failed? One possible cause is that
2558                  * it could not reserve enough large entries, asking
2559                  * for PAGE_SIZE chunks instead may be helpful.
2560                  */
2561                 if (max_segment > PAGE_SIZE) {
2562                         for_each_sgt_page(page, sgt_iter, st)
2563                                 put_page(page);
2564                         sg_free_table(st);
2565
2566                         max_segment = PAGE_SIZE;
2567                         goto rebuild_st;
2568                 } else {
2569                         dev_warn(&dev_priv->drm.pdev->dev,
2570                                  "Failed to DMA remap %lu pages\n",
2571                                  page_count);
2572                         goto err_pages;
2573                 }
2574         }
2575
2576         if (i915_gem_object_needs_bit17_swizzle(obj))
2577                 i915_gem_object_do_bit_17_swizzle(obj, st);
2578
2579         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2580
2581         return 0;
2582
2583 err_sg:
2584         sg_mark_end(sg);
2585 err_pages:
2586         mapping_clear_unevictable(mapping);
2587         pagevec_init(&pvec);
2588         for_each_sgt_page(page, sgt_iter, st) {
2589                 if (!pagevec_add(&pvec, page))
2590                         check_release_pagevec(&pvec);
2591         }
2592         if (pagevec_count(&pvec))
2593                 check_release_pagevec(&pvec);
2594         sg_free_table(st);
2595         kfree(st);
2596
2597         /*
2598          * shmemfs first checks if there is enough memory to allocate the page
2599          * and reports ENOSPC should there be insufficient, along with the usual
2600          * ENOMEM for a genuine allocation failure.
2601          *
2602          * We use ENOSPC in our driver to mean that we have run out of aperture
2603          * space and so want to translate the error from shmemfs back to our
2604          * usual understanding of ENOMEM.
2605          */
2606         if (ret == -ENOSPC)
2607                 ret = -ENOMEM;
2608
2609         return ret;
2610 }
2611
2612 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2613                                  struct sg_table *pages,
2614                                  unsigned int sg_page_sizes)
2615 {
2616         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2617         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2618         int i;
2619
2620         lockdep_assert_held(&obj->mm.lock);
2621
2622         obj->mm.get_page.sg_pos = pages->sgl;
2623         obj->mm.get_page.sg_idx = 0;
2624
2625         obj->mm.pages = pages;
2626
2627         if (i915_gem_object_is_tiled(obj) &&
2628             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2629                 GEM_BUG_ON(obj->mm.quirked);
2630                 __i915_gem_object_pin_pages(obj);
2631                 obj->mm.quirked = true;
2632         }
2633
2634         GEM_BUG_ON(!sg_page_sizes);
2635         obj->mm.page_sizes.phys = sg_page_sizes;
2636
2637         /*
2638          * Calculate the supported page-sizes which fit into the given
2639          * sg_page_sizes. This will give us the page-sizes which we may be able
2640          * to use opportunistically when later inserting into the GTT. For
2641          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2642          * 64K or 4K pages, although in practice this will depend on a number of
2643          * other factors.
2644          */
2645         obj->mm.page_sizes.sg = 0;
2646         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2647                 if (obj->mm.page_sizes.phys & ~0u << i)
2648                         obj->mm.page_sizes.sg |= BIT(i);
2649         }
2650         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2651
2652         spin_lock(&i915->mm.obj_lock);
2653         list_add(&obj->mm.link, &i915->mm.unbound_list);
2654         spin_unlock(&i915->mm.obj_lock);
2655 }
2656
2657 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2658 {
2659         int err;
2660
2661         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2662                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2663                 return -EFAULT;
2664         }
2665
2666         err = obj->ops->get_pages(obj);
2667         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2668
2669         return err;
2670 }
2671
2672 /* Ensure that the associated pages are gathered from the backing storage
2673  * and pinned into our object. i915_gem_object_pin_pages() may be called
2674  * multiple times before they are released by a single call to
2675  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2676  * either as a result of memory pressure (reaping pages under the shrinker)
2677  * or as the object is itself released.
2678  */
2679 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2680 {
2681         int err;
2682
2683         err = mutex_lock_interruptible(&obj->mm.lock);
2684         if (err)
2685                 return err;
2686
2687         if (unlikely(!i915_gem_object_has_pages(obj))) {
2688                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2689
2690                 err = ____i915_gem_object_get_pages(obj);
2691                 if (err)
2692                         goto unlock;
2693
2694                 smp_mb__before_atomic();
2695         }
2696         atomic_inc(&obj->mm.pages_pin_count);
2697
2698 unlock:
2699         mutex_unlock(&obj->mm.lock);
2700         return err;
2701 }
2702
2703 /* The 'mapping' part of i915_gem_object_pin_map() below */
2704 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2705                                  enum i915_map_type type)
2706 {
2707         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2708         struct sg_table *sgt = obj->mm.pages;
2709         struct sgt_iter sgt_iter;
2710         struct page *page;
2711         struct page *stack_pages[32];
2712         struct page **pages = stack_pages;
2713         unsigned long i = 0;
2714         pgprot_t pgprot;
2715         void *addr;
2716
2717         /* A single page can always be kmapped */
2718         if (n_pages == 1 && type == I915_MAP_WB)
2719                 return kmap(sg_page(sgt->sgl));
2720
2721         if (n_pages > ARRAY_SIZE(stack_pages)) {
2722                 /* Too big for stack -- allocate temporary array instead */
2723                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2724                 if (!pages)
2725                         return NULL;
2726         }
2727
2728         for_each_sgt_page(page, sgt_iter, sgt)
2729                 pages[i++] = page;
2730
2731         /* Check that we have the expected number of pages */
2732         GEM_BUG_ON(i != n_pages);
2733
2734         switch (type) {
2735         default:
2736                 MISSING_CASE(type);
2737                 /* fallthrough to use PAGE_KERNEL anyway */
2738         case I915_MAP_WB:
2739                 pgprot = PAGE_KERNEL;
2740                 break;
2741         case I915_MAP_WC:
2742                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2743                 break;
2744         }
2745         addr = vmap(pages, n_pages, 0, pgprot);
2746
2747         if (pages != stack_pages)
2748                 kvfree(pages);
2749
2750         return addr;
2751 }
2752
2753 /* get, pin, and map the pages of the object into kernel space */
2754 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2755                               enum i915_map_type type)
2756 {
2757         enum i915_map_type has_type;
2758         bool pinned;
2759         void *ptr;
2760         int ret;
2761
2762         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2763                 return ERR_PTR(-ENXIO);
2764
2765         ret = mutex_lock_interruptible(&obj->mm.lock);
2766         if (ret)
2767                 return ERR_PTR(ret);
2768
2769         pinned = !(type & I915_MAP_OVERRIDE);
2770         type &= ~I915_MAP_OVERRIDE;
2771
2772         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2773                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2774                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2775
2776                         ret = ____i915_gem_object_get_pages(obj);
2777                         if (ret)
2778                                 goto err_unlock;
2779
2780                         smp_mb__before_atomic();
2781                 }
2782                 atomic_inc(&obj->mm.pages_pin_count);
2783                 pinned = false;
2784         }
2785         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2786
2787         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2788         if (ptr && has_type != type) {
2789                 if (pinned) {
2790                         ret = -EBUSY;
2791                         goto err_unpin;
2792                 }
2793
2794                 if (is_vmalloc_addr(ptr))
2795                         vunmap(ptr);
2796                 else
2797                         kunmap(kmap_to_page(ptr));
2798
2799                 ptr = obj->mm.mapping = NULL;
2800         }
2801
2802         if (!ptr) {
2803                 ptr = i915_gem_object_map(obj, type);
2804                 if (!ptr) {
2805                         ret = -ENOMEM;
2806                         goto err_unpin;
2807                 }
2808
2809                 obj->mm.mapping = page_pack_bits(ptr, type);
2810         }
2811
2812 out_unlock:
2813         mutex_unlock(&obj->mm.lock);
2814         return ptr;
2815
2816 err_unpin:
2817         atomic_dec(&obj->mm.pages_pin_count);
2818 err_unlock:
2819         ptr = ERR_PTR(ret);
2820         goto out_unlock;
2821 }
2822
2823 static int
2824 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2825                            const struct drm_i915_gem_pwrite *arg)
2826 {
2827         struct address_space *mapping = obj->base.filp->f_mapping;
2828         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2829         u64 remain, offset;
2830         unsigned int pg;
2831
2832         /* Before we instantiate/pin the backing store for our use, we
2833          * can prepopulate the shmemfs filp efficiently using a write into
2834          * the pagecache. We avoid the penalty of instantiating all the
2835          * pages, important if the user is just writing to a few and never
2836          * uses the object on the GPU, and using a direct write into shmemfs
2837          * allows it to avoid the cost of retrieving a page (either swapin
2838          * or clearing-before-use) before it is overwritten.
2839          */
2840         if (i915_gem_object_has_pages(obj))
2841                 return -ENODEV;
2842
2843         if (obj->mm.madv != I915_MADV_WILLNEED)
2844                 return -EFAULT;
2845
2846         /* Before the pages are instantiated the object is treated as being
2847          * in the CPU domain. The pages will be clflushed as required before
2848          * use, and we can freely write into the pages directly. If userspace
2849          * races pwrite with any other operation; corruption will ensue -
2850          * that is userspace's prerogative!
2851          */
2852
2853         remain = arg->size;
2854         offset = arg->offset;
2855         pg = offset_in_page(offset);
2856
2857         do {
2858                 unsigned int len, unwritten;
2859                 struct page *page;
2860                 void *data, *vaddr;
2861                 int err;
2862
2863                 len = PAGE_SIZE - pg;
2864                 if (len > remain)
2865                         len = remain;
2866
2867                 err = pagecache_write_begin(obj->base.filp, mapping,
2868                                             offset, len, 0,
2869                                             &page, &data);
2870                 if (err < 0)
2871                         return err;
2872
2873                 vaddr = kmap(page);
2874                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2875                 kunmap(page);
2876
2877                 err = pagecache_write_end(obj->base.filp, mapping,
2878                                           offset, len, len - unwritten,
2879                                           page, data);
2880                 if (err < 0)
2881                         return err;
2882
2883                 if (unwritten)
2884                         return -EFAULT;
2885
2886                 remain -= len;
2887                 user_data += len;
2888                 offset += len;
2889                 pg = 0;
2890         } while (remain);
2891
2892         return 0;
2893 }
2894
2895 static bool match_ring(struct i915_request *rq)
2896 {
2897         struct drm_i915_private *dev_priv = rq->i915;
2898         u32 ring = I915_READ(RING_START(rq->engine->mmio_base));
2899
2900         return ring == i915_ggtt_offset(rq->ring->vma);
2901 }
2902
2903 struct i915_request *
2904 i915_gem_find_active_request(struct intel_engine_cs *engine)
2905 {
2906         struct i915_request *request, *active = NULL;
2907         unsigned long flags;
2908
2909         /*
2910          * We are called by the error capture, reset and to dump engine
2911          * state at random points in time. In particular, note that neither is
2912          * crucially ordered with an interrupt. After a hang, the GPU is dead
2913          * and we assume that no more writes can happen (we waited long enough
2914          * for all writes that were in transaction to be flushed) - adding an
2915          * extra delay for a recent interrupt is pointless. Hence, we do
2916          * not need an engine->irq_seqno_barrier() before the seqno reads.
2917          * At all other times, we must assume the GPU is still running, but
2918          * we only care about the snapshot of this moment.
2919          */
2920         spin_lock_irqsave(&engine->timeline.lock, flags);
2921         list_for_each_entry(request, &engine->timeline.requests, link) {
2922                 if (i915_request_completed(request))
2923                         continue;
2924
2925                 if (!i915_request_started(request))
2926                         break;
2927
2928                 /* More than one preemptible request may match! */
2929                 if (!match_ring(request))
2930                         break;
2931
2932                 active = request;
2933                 break;
2934         }
2935         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2936
2937         return active;
2938 }
2939
2940 static void
2941 i915_gem_retire_work_handler(struct work_struct *work)
2942 {
2943         struct drm_i915_private *dev_priv =
2944                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
2945         struct drm_device *dev = &dev_priv->drm;
2946
2947         /* Come back later if the device is busy... */
2948         if (mutex_trylock(&dev->struct_mutex)) {
2949                 i915_retire_requests(dev_priv);
2950                 mutex_unlock(&dev->struct_mutex);
2951         }
2952
2953         /*
2954          * Keep the retire handler running until we are finally idle.
2955          * We do not need to do this test under locking as in the worst-case
2956          * we queue the retire worker once too often.
2957          */
2958         if (READ_ONCE(dev_priv->gt.awake))
2959                 queue_delayed_work(dev_priv->wq,
2960                                    &dev_priv->gt.retire_work,
2961                                    round_jiffies_up_relative(HZ));
2962 }
2963
2964 static void shrink_caches(struct drm_i915_private *i915)
2965 {
2966         /*
2967          * kmem_cache_shrink() discards empty slabs and reorders partially
2968          * filled slabs to prioritise allocating from the mostly full slabs,
2969          * with the aim of reducing fragmentation.
2970          */
2971         kmem_cache_shrink(i915->priorities);
2972         kmem_cache_shrink(i915->dependencies);
2973         kmem_cache_shrink(i915->requests);
2974         kmem_cache_shrink(i915->luts);
2975         kmem_cache_shrink(i915->vmas);
2976         kmem_cache_shrink(i915->objects);
2977 }
2978
/*
 * Bookkeeping for a deferred cache shrink. The request is first handed to
 * call_rcu() and, from the RCU callback, re-initialised as a work item; the
 * two stages are never in use at the same time, hence the union.
 */
2979 struct sleep_rcu_work {
2980         union {
2981                 struct rcu_head rcu;		/* used while waiting in call_rcu() */
2982                 struct work_struct work;	/* used once queued on i915->wq */
2983         };
2984         struct drm_i915_private *i915;	/* device whose caches are to be shrunk */
2985         unsigned int epoch;		/* value later compared with i915->gt.epoch */
2986 };
2987
2988 static inline bool
2989 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
2990 {
2991         /*
2992          * There is a small chance that the epoch wrapped since we started
2993          * sleeping. If we assume that epoch is at least a u32, then it will
2994          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
2995          */
2996         return epoch == READ_ONCE(i915->gt.epoch);
2997 }
2998
2999 static void __sleep_work(struct work_struct *work)
3000 {
3001         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
3002         struct drm_i915_private *i915 = s->i915;
3003         unsigned int epoch = s->epoch;
3004
3005         kfree(s);
3006         if (same_epoch(i915, epoch))
3007                 shrink_caches(i915);
3008 }
3009
3010 static void __sleep_rcu(struct rcu_head *rcu)
3011 {
3012         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3013         struct drm_i915_private *i915 = s->i915;
3014
3015         destroy_rcu_head(&s->rcu);
3016
3017         if (same_epoch(i915, s->epoch)) {
3018                 INIT_WORK(&s->work, __sleep_work);
3019                 queue_work(i915->wq, &s->work);
3020         } else {
3021                 kfree(s);
3022         }
3023 }
3024
3025 static inline bool
3026 new_requests_since_last_retire(const struct drm_i915_private *i915)
3027 {
3028         return (READ_ONCE(i915->gt.active_requests) ||
3029                 work_pending(&i915->gt.idle_work.work));
3030 }
3031
3032 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3033 {
3034         struct intel_engine_cs *engine;
3035         enum intel_engine_id id;
3036
3037         if (i915_terminally_wedged(&i915->gpu_error))
3038                 return;
3039
3040         GEM_BUG_ON(i915->gt.active_requests);
3041         for_each_engine(engine, i915, id) {
3042                 GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
3043                 GEM_BUG_ON(engine->last_retired_context !=
3044                            to_intel_context(i915->kernel_context, engine));
3045         }
3046 }
3047
3048 static void
3049 i915_gem_idle_work_handler(struct work_struct *work)
3050 {
3051         struct drm_i915_private *dev_priv =
3052                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3053         unsigned int epoch = I915_EPOCH_INVALID;
3054         bool rearm_hangcheck;
3055
3056         if (!READ_ONCE(dev_priv->gt.awake))
3057                 return;
3058
3059         if (READ_ONCE(dev_priv->gt.active_requests))
3060                 return;
3061
3062         /*
3063          * Flush out the last user context, leaving only the pinned
3064          * kernel context resident. When we are idling on the kernel_context,
3065          * no more new requests (with a context switch) are emitted and we
3066          * can finally rest. A consequence is that the idle work handler is
3067          * always called at least twice before idling (and if the system is
3068          * idle that implies a round trip through the retire worker).
3069          */
3070         mutex_lock(&dev_priv->drm.struct_mutex);
3071         i915_gem_switch_to_kernel_context(dev_priv);
3072         mutex_unlock(&dev_priv->drm.struct_mutex);
3073
3074         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3075                   READ_ONCE(dev_priv->gt.active_requests));
3076
3077         /*
3078          * Wait for last execlists context complete, but bail out in case a
3079          * new request is submitted. As we don't trust the hardware, we
3080          * continue on if the wait times out. This is necessary to allow
3081          * the machine to suspend even if the hardware dies, and we will
3082          * try to recover in resume (after depriving the hardware of power,
3083          * it may be in a better mmod).
3084          */
3085         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3086                    intel_engines_are_idle(dev_priv),
3087                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3088                    10, 500);
3089
3090         rearm_hangcheck =
3091                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3092
3093         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3094                 /* Currently busy, come back later */
3095                 mod_delayed_work(dev_priv->wq,
3096                                  &dev_priv->gt.idle_work,
3097                                  msecs_to_jiffies(50));
3098                 goto out_rearm;
3099         }
3100
3101         /*
3102          * New request retired after this work handler started, extend active
3103          * period until next instance of the work.
3104          */
3105         if (new_requests_since_last_retire(dev_priv))
3106                 goto out_unlock;
3107
3108         epoch = __i915_gem_park(dev_priv);
3109
3110         assert_kernel_context_is_current(dev_priv);
3111
3112         rearm_hangcheck = false;
3113 out_unlock:
3114         mutex_unlock(&dev_priv->drm.struct_mutex);
3115
3116 out_rearm:
3117         if (rearm_hangcheck) {
3118                 GEM_BUG_ON(!dev_priv->gt.awake);
3119                 i915_queue_hangcheck(dev_priv);
3120         }
3121
3122         /*
3123          * When we are idle, it is an opportune time to reap our caches.
3124          * However, we have many objects that utilise RCU and the ordered
3125          * i915->wq that this work is executing on. To try and flush any
3126          * pending frees now we are idle, we first wait for an RCU grace
3127          * period, and then queue a task (that will run last on the wq) to
3128          * shrink and re-optimize the caches.
3129          */
3130         if (same_epoch(dev_priv, epoch)) {
3131                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3132                 if (s) {
3133                         init_rcu_head(&s->rcu);
3134                         s->i915 = dev_priv;
3135                         s->epoch = epoch;
3136                         call_rcu(&s->rcu, __sleep_rcu);
3137                 }
3138         }
3139 }
3140
3141 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3142 {
3143         struct drm_i915_private *i915 = to_i915(gem->dev);
3144         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3145         struct drm_i915_file_private *fpriv = file->driver_priv;
3146         struct i915_lut_handle *lut, *ln;
3147
3148         mutex_lock(&i915->drm.struct_mutex);
3149
3150         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3151                 struct i915_gem_context *ctx = lut->ctx;
3152                 struct i915_vma *vma;
3153
3154                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3155                 if (ctx->file_priv != fpriv)
3156                         continue;
3157
3158                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3159                 GEM_BUG_ON(vma->obj != obj);
3160
3161                 /* We allow the process to have multiple handles to the same
3162                  * vma, in the same fd namespace, by virtue of flink/open.
3163                  */
3164                 GEM_BUG_ON(!vma->open_count);
3165                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3166                         i915_vma_close(vma);
3167
3168                 list_del(&lut->obj_link);
3169                 list_del(&lut->ctx_link);
3170
3171                 kmem_cache_free(i915->luts, lut);
3172                 __i915_gem_object_release_unless_active(obj);
3173         }
3174
3175         mutex_unlock(&i915->drm.struct_mutex);
3176 }
3177
3178 static unsigned long to_wait_timeout(s64 timeout_ns)
3179 {
3180         if (timeout_ns < 0)
3181                 return MAX_SCHEDULE_TIMEOUT;
3182
3183         if (timeout_ns == 0)
3184                 return 0;
3185
3186         return nsecs_to_jiffies_timeout(timeout_ns);
3187 }
3188
3189 /**
3190  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3191  * @dev: drm device pointer
3192  * @data: ioctl data blob
3193  * @file: drm file pointer
3194  *
3195  * Returns 0 if successful, else an error is returned with the remaining time in
3196  * the timeout parameter.
3197  *  -ETIME: object is still busy after timeout
3198  *  -ERESTARTSYS: signal interrupted the wait
3199  *  -ENONENT: object doesn't exist
3200  * Also possible, but rare:
3201  *  -EAGAIN: incomplete, restart syscall
3202  *  -ENOMEM: damn
3203  *  -ENODEV: Internal IRQ fail
3204  *  -E?: The add request failed
3205  *
3206  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3207  * non-zero timeout parameter the wait ioctl will wait for the given number of
3208  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3209  * without holding struct_mutex the object may become re-busied before this
3210  * function completes. A similar but shorter * race condition exists in the busy
3211  * ioctl
3212  */
3213 int
3214 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3215 {
3216         struct drm_i915_gem_wait *args = data;
3217         struct drm_i915_gem_object *obj;
3218         ktime_t start;
3219         long ret;
3220
3221         if (args->flags != 0)
3222                 return -EINVAL;
3223
3224         obj = i915_gem_object_lookup(file, args->bo_handle);
3225         if (!obj)
3226                 return -ENOENT;
3227
3228         start = ktime_get();
3229
3230         ret = i915_gem_object_wait(obj,
3231                                    I915_WAIT_INTERRUPTIBLE |
3232                                    I915_WAIT_PRIORITY |
3233                                    I915_WAIT_ALL,
3234                                    to_wait_timeout(args->timeout_ns),
3235                                    to_rps_client(file));
3236
3237         if (args->timeout_ns > 0) {
3238                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3239                 if (args->timeout_ns < 0)
3240                         args->timeout_ns = 0;
3241
3242                 /*
3243                  * Apparently ktime isn't accurate enough and occasionally has a
3244                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3245                  * things up to make the test happy. We allow up to 1 jiffy.
3246                  *
3247                  * This is a regression from the timespec->ktime conversion.
3248                  */
3249                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3250                         args->timeout_ns = 0;
3251
3252                 /* Asked to wait beyond the jiffie/scheduler precision? */
3253                 if (ret == -ETIME && args->timeout_ns)
3254                         ret = -EAGAIN;
3255         }
3256
3257         i915_gem_object_put(obj);
3258         return ret;
3259 }
3260
3261 static int wait_for_engines(struct drm_i915_private *i915)
3262 {
3263         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3264                 dev_err(i915->drm.dev,
3265                         "Failed to idle engines, declaring wedged!\n");
3266                 GEM_TRACE_DUMP();
3267                 i915_gem_set_wedged(i915);
3268                 return -EIO;
3269         }
3270
3271         return 0;
3272 }
3273
3274 static long
3275 wait_for_timelines(struct drm_i915_private *i915,
3276                    unsigned int flags, long timeout)
3277 {
3278         struct i915_gt_timelines *gt = &i915->gt.timelines;
3279         struct i915_timeline *tl;
3280
3281         if (!READ_ONCE(i915->gt.active_requests))
3282                 return timeout;
3283
3284         mutex_lock(&gt->mutex);
3285         list_for_each_entry(tl, &gt->active_list, link) {
3286                 struct i915_request *rq;
3287
3288                 rq = i915_active_request_get_unlocked(&tl->last_request);
3289                 if (!rq)
3290                         continue;
3291
3292                 mutex_unlock(&gt->mutex);
3293
3294                 /*
3295                  * "Race-to-idle".
3296                  *
3297                  * Switching to the kernel context is often used a synchronous
3298                  * step prior to idling, e.g. in suspend for flushing all
3299                  * current operations to memory before sleeping. These we
3300                  * want to complete as quickly as possible to avoid prolonged
3301                  * stalls, so allow the gpu to boost to maximum clocks.
3302                  */
3303                 if (flags & I915_WAIT_FOR_IDLE_BOOST)
3304                         gen6_rps_boost(rq, NULL);
3305
3306                 timeout = i915_request_wait(rq, flags, timeout);
3307                 i915_request_put(rq);
3308                 if (timeout < 0)
3309                         return timeout;
3310
3311                 /* restart after reacquiring the lock */
3312                 mutex_lock(&gt->mutex);
3313                 tl = list_entry(&gt->active_list, typeof(*tl), link);
3314         }
3315         mutex_unlock(&gt->mutex);
3316
3317         return timeout;
3318 }
3319
3320 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3321                            unsigned int flags, long timeout)
3322 {
3323         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3324                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3325                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3326
3327         /* If the device is asleep, we have no requests outstanding */
3328         if (!READ_ONCE(i915->gt.awake))
3329                 return 0;
3330
3331         timeout = wait_for_timelines(i915, flags, timeout);
3332         if (timeout < 0)
3333                 return timeout;
3334
3335         if (flags & I915_WAIT_LOCKED) {
3336                 int err;
3337
3338                 lockdep_assert_held(&i915->drm.struct_mutex);
3339
3340                 if (GEM_SHOW_DEBUG() && !timeout) {
3341                         /* Presume that timeout was non-zero to begin with! */
3342                         dev_warn(&i915->drm.pdev->dev,
3343                                  "Missed idle-completion interrupt!\n");
3344                         GEM_TRACE_DUMP();
3345                 }
3346
3347                 err = wait_for_engines(i915);
3348                 if (err)
3349                         return err;
3350
3351                 i915_retire_requests(i915);
3352                 GEM_BUG_ON(i915->gt.active_requests);
3353         }
3354
3355         return 0;
3356 }
3357
3358 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3359 {
3360         /*
3361          * We manually flush the CPU domain so that we can override and
3362          * force the flush for the display, and perform it asyncrhonously.
3363          */
3364         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3365         if (obj->cache_dirty)
3366                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3367         obj->write_domain = 0;
3368 }
3369
3370 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3371 {
3372         if (!READ_ONCE(obj->pin_global))
3373                 return;
3374
3375         mutex_lock(&obj->base.dev->struct_mutex);
3376         __i915_gem_object_flush_for_display(obj);
3377         mutex_unlock(&obj->base.dev->struct_mutex);
3378 }
3379
3380 /**
3381  * Moves a single object to the WC read, and possibly write domain.
3382  * @obj: object to act on
3383  * @write: ask for write access or read only
3384  *
3385  * This function returns when the move is complete, including waiting on
3386  * flushes to occur.
3387  */
3388 int
3389 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3390 {
3391         int ret;
3392
3393         lockdep_assert_held(&obj->base.dev->struct_mutex);
3394
3395         ret = i915_gem_object_wait(obj,
3396                                    I915_WAIT_INTERRUPTIBLE |
3397                                    I915_WAIT_LOCKED |
3398                                    (write ? I915_WAIT_ALL : 0),
3399                                    MAX_SCHEDULE_TIMEOUT,
3400                                    NULL);
3401         if (ret)
3402                 return ret;
3403
3404         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3405                 return 0;
3406
3407         /* Flush and acquire obj->pages so that we are coherent through
3408          * direct access in memory with previous cached writes through
3409          * shmemfs and that our cache domain tracking remains valid.
3410          * For example, if the obj->filp was moved to swap without us
3411          * being notified and releasing the pages, we would mistakenly
3412          * continue to assume that the obj remained out of the CPU cached
3413          * domain.
3414          */
3415         ret = i915_gem_object_pin_pages(obj);
3416         if (ret)
3417                 return ret;
3418
3419         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3420
3421         /* Serialise direct access to this object with the barriers for
3422          * coherent writes from the GPU, by effectively invalidating the
3423          * WC domain upon first access.
3424          */
3425         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3426                 mb();
3427
3428         /* It should now be out of any other write domains, and we can update
3429          * the domain values for our changes.
3430          */
3431         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3432         obj->read_domains |= I915_GEM_DOMAIN_WC;
3433         if (write) {
3434                 obj->read_domains = I915_GEM_DOMAIN_WC;
3435                 obj->write_domain = I915_GEM_DOMAIN_WC;
3436                 obj->mm.dirty = true;
3437         }
3438
3439         i915_gem_object_unpin_pages(obj);
3440         return 0;
3441 }
3442
3443 /**
3444  * Moves a single object to the GTT read, and possibly write domain.
3445  * @obj: object to act on
3446  * @write: ask for write access or read only
3447  *
3448  * This function returns when the move is complete, including waiting on
3449  * flushes to occur.
3450  */
3451 int
3452 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3453 {
3454         int ret;
3455
3456         lockdep_assert_held(&obj->base.dev->struct_mutex);
3457
3458         ret = i915_gem_object_wait(obj,
3459                                    I915_WAIT_INTERRUPTIBLE |
3460                                    I915_WAIT_LOCKED |
3461                                    (write ? I915_WAIT_ALL : 0),
3462                                    MAX_SCHEDULE_TIMEOUT,
3463                                    NULL);
3464         if (ret)
3465                 return ret;
3466
3467         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3468                 return 0;
3469
3470         /* Flush and acquire obj->pages so that we are coherent through
3471          * direct access in memory with previous cached writes through
3472          * shmemfs and that our cache domain tracking remains valid.
3473          * For example, if the obj->filp was moved to swap without us
3474          * being notified and releasing the pages, we would mistakenly
3475          * continue to assume that the obj remained out of the CPU cached
3476          * domain.
3477          */
3478         ret = i915_gem_object_pin_pages(obj);
3479         if (ret)
3480                 return ret;
3481
3482         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3483
3484         /* Serialise direct access to this object with the barriers for
3485          * coherent writes from the GPU, by effectively invalidating the
3486          * GTT domain upon first access.
3487          */
3488         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3489                 mb();
3490
3491         /* It should now be out of any other write domains, and we can update
3492          * the domain values for our changes.
3493          */
3494         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3495         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3496         if (write) {
3497                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3498                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3499                 obj->mm.dirty = true;
3500         }
3501
3502         i915_gem_object_unpin_pages(obj);
3503         return 0;
3504 }
3505
3506 /**
3507  * Changes the cache-level of an object across all VMA.
3508  * @obj: object to act on
3509  * @cache_level: new cache level to set for the object
3510  *
3511  * After this function returns, the object will be in the new cache-level
3512  * across all GTT and the contents of the backing storage will be coherent,
3513  * with respect to the new cache-level. In order to keep the backing storage
3514  * coherent for all users, we only allow a single cache level to be set
3515  * globally on the object and prevent it from being changed whilst the
3516  * hardware is reading from the object. That is if the object is currently
3517  * on the scanout it will be set to uncached (or equivalent display
3518  * cache coherency) and all non-MOCS GPU access will also be uncached so
3519  * that all direct access to the scanout remains coherent.
3520  */
3521 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3522                                     enum i915_cache_level cache_level)
3523 {
3524         struct i915_vma *vma;
3525         int ret;
3526
3527         lockdep_assert_held(&obj->base.dev->struct_mutex);
3528
3529         if (obj->cache_level == cache_level)
3530                 return 0;
3531
3532         /* Inspect the list of currently bound VMA and unbind any that would
3533          * be invalid given the new cache-level. This is principally to
3534          * catch the issue of the CS prefetch crossing page boundaries and
3535          * reading an invalid PTE on older architectures.
3536          */
3537 restart:
3538         list_for_each_entry(vma, &obj->vma.list, obj_link) {
3539                 if (!drm_mm_node_allocated(&vma->node))
3540                         continue;
3541
3542                 if (i915_vma_is_pinned(vma)) {
3543                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3544                         return -EBUSY;
3545                 }
3546
3547                 if (!i915_vma_is_closed(vma) &&
3548                     i915_gem_valid_gtt_space(vma, cache_level))
3549                         continue;
3550
3551                 ret = i915_vma_unbind(vma);
3552                 if (ret)
3553                         return ret;
3554
3555                 /* As unbinding may affect other elements in the
3556                  * obj->vma_list (due to side-effects from retiring
3557                  * an active vma), play safe and restart the iterator.
3558                  */
3559                 goto restart;
3560         }
3561
3562         /* We can reuse the existing drm_mm nodes but need to change the
3563          * cache-level on the PTE. We could simply unbind them all and
3564          * rebind with the correct cache-level on next use. However since
3565          * we already have a valid slot, dma mapping, pages etc, we may as
3566          * rewrite the PTE in the belief that doing so tramples upon less
3567          * state and so involves less work.
3568          */
3569         if (obj->bind_count) {
3570                 /* Before we change the PTE, the GPU must not be accessing it.
3571                  * If we wait upon the object, we know that all the bound
3572                  * VMA are no longer active.
3573                  */
3574                 ret = i915_gem_object_wait(obj,
3575                                            I915_WAIT_INTERRUPTIBLE |
3576                                            I915_WAIT_LOCKED |
3577                                            I915_WAIT_ALL,
3578                                            MAX_SCHEDULE_TIMEOUT,
3579                                            NULL);
3580                 if (ret)
3581                         return ret;
3582
3583                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3584                     cache_level != I915_CACHE_NONE) {
3585                         /* Access to snoopable pages through the GTT is
3586                          * incoherent and on some machines causes a hard
3587                          * lockup. Relinquish the CPU mmaping to force
3588                          * userspace to refault in the pages and we can
3589                          * then double check if the GTT mapping is still
3590                          * valid for that pointer access.
3591                          */
3592                         i915_gem_release_mmap(obj);
3593
3594                         /* As we no longer need a fence for GTT access,
3595                          * we can relinquish it now (and so prevent having
3596                          * to steal a fence from someone else on the next
3597                          * fence request). Note GPU activity would have
3598                          * dropped the fence as all snoopable access is
3599                          * supposed to be linear.
3600                          */
3601                         for_each_ggtt_vma(vma, obj) {
3602                                 ret = i915_vma_put_fence(vma);
3603                                 if (ret)
3604                                         return ret;
3605                         }
3606                 } else {
3607                         /* We either have incoherent backing store and
3608                          * so no GTT access or the architecture is fully
3609                          * coherent. In such cases, existing GTT mmaps
3610                          * ignore the cache bit in the PTE and we can
3611                          * rewrite it without confusing the GPU or having
3612                          * to force userspace to fault back in its mmaps.
3613                          */
3614                 }
3615
3616                 list_for_each_entry(vma, &obj->vma.list, obj_link) {
3617                         if (!drm_mm_node_allocated(&vma->node))
3618                                 continue;
3619
3620                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3621                         if (ret)
3622                                 return ret;
3623                 }
3624         }
3625
3626         list_for_each_entry(vma, &obj->vma.list, obj_link)
3627                 vma->node.color = cache_level;
3628         i915_gem_object_set_cache_coherency(obj, cache_level);
3629         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3630
3631         return 0;
3632 }
3633
3634 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3635                                struct drm_file *file)
3636 {
3637         struct drm_i915_gem_caching *args = data;
3638         struct drm_i915_gem_object *obj;
3639         int err = 0;
3640
3641         rcu_read_lock();
3642         obj = i915_gem_object_lookup_rcu(file, args->handle);
3643         if (!obj) {
3644                 err = -ENOENT;
3645                 goto out;
3646         }
3647
3648         switch (obj->cache_level) {
3649         case I915_CACHE_LLC:
3650         case I915_CACHE_L3_LLC:
3651                 args->caching = I915_CACHING_CACHED;
3652                 break;
3653
3654         case I915_CACHE_WT:
3655                 args->caching = I915_CACHING_DISPLAY;
3656                 break;
3657
3658         default:
3659                 args->caching = I915_CACHING_NONE;
3660                 break;
3661         }
3662 out:
3663         rcu_read_unlock();
3664         return err;
3665 }
3666
3667 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3668                                struct drm_file *file)
3669 {
3670         struct drm_i915_private *i915 = to_i915(dev);
3671         struct drm_i915_gem_caching *args = data;
3672         struct drm_i915_gem_object *obj;
3673         enum i915_cache_level level;
3674         int ret = 0;
3675
3676         switch (args->caching) {
3677         case I915_CACHING_NONE:
3678                 level = I915_CACHE_NONE;
3679                 break;
3680         case I915_CACHING_CACHED:
3681                 /*
3682                  * Due to a HW issue on BXT A stepping, GPU stores via a
3683                  * snooped mapping may leave stale data in a corresponding CPU
3684                  * cacheline, whereas normally such cachelines would get
3685                  * invalidated.
3686                  */
3687                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3688                         return -ENODEV;
3689
3690                 level = I915_CACHE_LLC;
3691                 break;
3692         case I915_CACHING_DISPLAY:
3693                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3694                 break;
3695         default:
3696                 return -EINVAL;
3697         }
3698
3699         obj = i915_gem_object_lookup(file, args->handle);
3700         if (!obj)
3701                 return -ENOENT;
3702
3703         /*
3704          * The caching mode of proxy object is handled by its generator, and
3705          * not allowed to be changed by userspace.
3706          */
3707         if (i915_gem_object_is_proxy(obj)) {
3708                 ret = -ENXIO;
3709                 goto out;
3710         }
3711
3712         if (obj->cache_level == level)
3713                 goto out;
3714
3715         ret = i915_gem_object_wait(obj,
3716                                    I915_WAIT_INTERRUPTIBLE,
3717                                    MAX_SCHEDULE_TIMEOUT,
3718                                    to_rps_client(file));
3719         if (ret)
3720                 goto out;
3721
3722         ret = i915_mutex_lock_interruptible(dev);
3723         if (ret)
3724                 goto out;
3725
3726         ret = i915_gem_object_set_cache_level(obj, level);
3727         mutex_unlock(&dev->struct_mutex);
3728
3729 out:
3730         i915_gem_object_put(obj);
3731         return ret;
3732 }
3733
3734 /*
3735  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
3736  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
3737  * (for pageflips). We only flush the caches while preparing the buffer for
3738  * display, the callers are responsible for frontbuffer flush.
3739  */
3740 struct i915_vma *
3741 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3742                                      u32 alignment,
3743                                      const struct i915_ggtt_view *view,
3744                                      unsigned int flags)
3745 {
3746         struct i915_vma *vma;
3747         int ret;
3748
3749         lockdep_assert_held(&obj->base.dev->struct_mutex);
3750
3751         /* Mark the global pin early so that we account for the
3752          * display coherency whilst setting up the cache domains.
3753          */
3754         obj->pin_global++;
3755
3756         /* The display engine is not coherent with the LLC cache on gen6.  As
3757          * a result, we make sure that the pinning that is about to occur is
3758          * done with uncached PTEs. This is lowest common denominator for all
3759          * chipsets.
3760          *
3761          * However for gen6+, we could do better by using the GFDT bit instead
3762          * of uncaching, which would allow us to flush all the LLC-cached data
3763          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3764          */
3765         ret = i915_gem_object_set_cache_level(obj,
3766                                               HAS_WT(to_i915(obj->base.dev)) ?
3767                                               I915_CACHE_WT : I915_CACHE_NONE);
3768         if (ret) {
3769                 vma = ERR_PTR(ret);
3770                 goto err_unpin_global;
3771         }
3772
3773         /* As the user may map the buffer once pinned in the display plane
3774          * (e.g. libkms for the bootup splash), we have to ensure that we
3775          * always use map_and_fenceable for all scanout buffers. However,
3776          * it may simply be too big to fit into mappable, in which case
3777          * put it anyway and hope that userspace can cope (but always first
3778          * try to preserve the existing ABI).
3779          */
3780         vma = ERR_PTR(-ENOSPC);
3781         if ((flags & PIN_MAPPABLE) == 0 &&
3782             (!view || view->type == I915_GGTT_VIEW_NORMAL))
3783                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3784                                                flags |
3785                                                PIN_MAPPABLE |
3786                                                PIN_NONBLOCK);
3787         if (IS_ERR(vma))
3788                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3789         if (IS_ERR(vma))
3790                 goto err_unpin_global;
3791
3792         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3793
3794         __i915_gem_object_flush_for_display(obj);
3795
3796         /* It should now be out of any other write domains, and we can update
3797          * the domain values for our changes.
3798          */
3799         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3800
3801         return vma;
3802
3803 err_unpin_global:
3804         obj->pin_global--;
3805         return vma;
3806 }
3807
3808 void
3809 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3810 {
3811         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3812
3813         if (WARN_ON(vma->obj->pin_global == 0))
3814                 return;
3815
3816         if (--vma->obj->pin_global == 0)
3817                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3818
3819         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
3820         i915_gem_object_bump_inactive_ggtt(vma->obj);
3821
3822         i915_vma_unpin(vma);
3823 }
3824
3825 /**
3826  * Moves a single object to the CPU read, and possibly write domain.
3827  * @obj: object to act on
3828  * @write: requesting write or read-only access
3829  *
3830  * This function returns when the move is complete, including waiting on
3831  * flushes to occur.
3832  */
3833 int
3834 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3835 {
3836         int ret;
3837
3838         lockdep_assert_held(&obj->base.dev->struct_mutex);
3839
3840         ret = i915_gem_object_wait(obj,
3841                                    I915_WAIT_INTERRUPTIBLE |
3842                                    I915_WAIT_LOCKED |
3843                                    (write ? I915_WAIT_ALL : 0),
3844                                    MAX_SCHEDULE_TIMEOUT,
3845                                    NULL);
3846         if (ret)
3847                 return ret;
3848
3849         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3850
3851         /* Flush the CPU cache if it's still invalid. */
3852         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3853                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3854                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
3855         }
3856
3857         /* It should now be out of any other write domains, and we can update
3858          * the domain values for our changes.
3859          */
3860         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
3861
3862         /* If we're writing through the CPU, then the GPU read domains will
3863          * need to be invalidated at next use.
3864          */
3865         if (write)
3866                 __start_cpu_write(obj);
3867
3868         return 0;
3869 }
3870
3871 /* Throttle our rendering by waiting until the ring has completed our requests
3872  * emitted over 20 msec ago.
3873  *
3874  * Note that if we were to use the current jiffies each time around the loop,
3875  * we wouldn't escape the function with any frames outstanding if the time to
3876  * render a frame was over 20ms.
3877  *
3878  * This should get us reasonable parallelism between CPU and GPU but also
3879  * relatively low latency when blocking on a particular request to finish.
3880  */
3881 static int
3882 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3883 {
3884         struct drm_i915_private *dev_priv = to_i915(dev);
3885         struct drm_i915_file_private *file_priv = file->driver_priv;
3886         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3887         struct i915_request *request, *target = NULL;
3888         long ret;
3889
3890         /* ABI: return -EIO if already wedged */
3891         if (i915_terminally_wedged(&dev_priv->gpu_error))
3892                 return -EIO;
3893
3894         spin_lock(&file_priv->mm.lock);
3895         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3896                 if (time_after_eq(request->emitted_jiffies, recent_enough))
3897                         break;
3898
3899                 if (target) {
3900                         list_del(&target->client_link);
3901                         target->file_priv = NULL;
3902                 }
3903
3904                 target = request;
3905         }
3906         if (target)
3907                 i915_request_get(target);
3908         spin_unlock(&file_priv->mm.lock);
3909
3910         if (target == NULL)
3911                 return 0;
3912
3913         ret = i915_request_wait(target,
3914                                 I915_WAIT_INTERRUPTIBLE,
3915                                 MAX_SCHEDULE_TIMEOUT);
3916         i915_request_put(target);
3917
3918         return ret < 0 ? ret : 0;
3919 }
3920
3921 struct i915_vma *
3922 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
3923                          const struct i915_ggtt_view *view,
3924                          u64 size,
3925                          u64 alignment,
3926                          u64 flags)
3927 {
3928         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
3929         struct i915_address_space *vm = &dev_priv->ggtt.vm;
3930         struct i915_vma *vma;
3931         int ret;
3932
3933         lockdep_assert_held(&obj->base.dev->struct_mutex);
3934
3935         if (flags & PIN_MAPPABLE &&
3936             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
3937                 /* If the required space is larger than the available
3938                  * aperture, we will not able to find a slot for the
3939                  * object and unbinding the object now will be in
3940                  * vain. Worse, doing so may cause us to ping-pong
3941                  * the object in and out of the Global GTT and
3942                  * waste a lot of cycles under the mutex.
3943                  */
3944                 if (obj->base.size > dev_priv->ggtt.mappable_end)
3945                         return ERR_PTR(-E2BIG);
3946
3947                 /* If NONBLOCK is set the caller is optimistically
3948                  * trying to cache the full object within the mappable
3949                  * aperture, and *must* have a fallback in place for
3950                  * situations where we cannot bind the object. We
3951                  * can be a little more lax here and use the fallback
3952                  * more often to avoid costly migrations of ourselves
3953                  * and other objects within the aperture.
3954                  *
3955                  * Half-the-aperture is used as a simple heuristic.
3956                  * More interesting would to do search for a free
3957                  * block prior to making the commitment to unbind.
3958                  * That caters for the self-harm case, and with a
3959                  * little more heuristics (e.g. NOFAULT, NOEVICT)
3960                  * we could try to minimise harm to others.
3961                  */
3962                 if (flags & PIN_NONBLOCK &&
3963                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
3964                         return ERR_PTR(-ENOSPC);
3965         }
3966
3967         vma = i915_vma_instance(obj, vm, view);
3968         if (unlikely(IS_ERR(vma)))
3969                 return vma;
3970
3971         if (i915_vma_misplaced(vma, size, alignment, flags)) {
3972                 if (flags & PIN_NONBLOCK) {
3973                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
3974                                 return ERR_PTR(-ENOSPC);
3975
3976                         if (flags & PIN_MAPPABLE &&
3977                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
3978                                 return ERR_PTR(-ENOSPC);
3979                 }
3980
3981                 WARN(i915_vma_is_pinned(vma),
3982                      "bo is already pinned in ggtt with incorrect alignment:"
3983                      " offset=%08x, req.alignment=%llx,"
3984                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
3985                      i915_ggtt_offset(vma), alignment,
3986                      !!(flags & PIN_MAPPABLE),
3987                      i915_vma_is_map_and_fenceable(vma));
3988                 ret = i915_vma_unbind(vma);
3989                 if (ret)
3990                         return ERR_PTR(ret);
3991         }
3992
3993         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
3994         if (ret)
3995                 return ERR_PTR(ret);
3996
3997         return vma;
3998 }
3999
4000 static __always_inline unsigned int __busy_read_flag(unsigned int id)
4001 {
4002         /* Note that we could alias engines in the execbuf API, but
4003          * that would be very unwise as it prevents userspace from
4004          * fine control over engine selection. Ahem.
4005          *
4006          * This should be something like EXEC_MAX_ENGINE instead of
4007          * I915_NUM_ENGINES.
4008          */
4009         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4010         return 0x10000 << id;
4011 }
4012
4013 static __always_inline unsigned int __busy_write_id(unsigned int id)
4014 {
4015         /* The uABI guarantees an active writer is also amongst the read
4016          * engines. This would be true if we accessed the activity tracking
4017          * under the lock, but as we perform the lookup of the object and
4018          * its activity locklessly we can not guarantee that the last_write
4019          * being active implies that we have set the same engine flag from
4020          * last_read - hence we always set both read and write busy for
4021          * last_write.
4022          */
4023         return id | __busy_read_flag(id);
4024 }
4025
4026 static __always_inline unsigned int
4027 __busy_set_if_active(const struct dma_fence *fence,
4028                      unsigned int (*flag)(unsigned int id))
4029 {
4030         struct i915_request *rq;
4031
4032         /* We have to check the current hw status of the fence as the uABI
4033          * guarantees forward progress. We could rely on the idle worker
4034          * to eventually flush us, but to minimise latency just ask the
4035          * hardware.
4036          *
4037          * Note we only report on the status of native fences.
4038          */
4039         if (!dma_fence_is_i915(fence))
4040                 return 0;
4041
4042         /* opencode to_request() in order to avoid const warnings */
4043         rq = container_of(fence, struct i915_request, fence);
4044         if (i915_request_completed(rq))
4045                 return 0;
4046
4047         return flag(rq->engine->uabi_id);
4048 }
4049
4050 static __always_inline unsigned int
4051 busy_check_reader(const struct dma_fence *fence)
4052 {
4053         return __busy_set_if_active(fence, __busy_read_flag);
4054 }
4055
4056 static __always_inline unsigned int
4057 busy_check_writer(const struct dma_fence *fence)
4058 {
4059         if (!fence)
4060                 return 0;
4061
4062         return __busy_set_if_active(fence, __busy_write_id);
4063 }
4064
4065 int
4066 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4067                     struct drm_file *file)
4068 {
4069         struct drm_i915_gem_busy *args = data;
4070         struct drm_i915_gem_object *obj;
4071         struct reservation_object_list *list;
4072         unsigned int seq;
4073         int err;
4074
4075         err = -ENOENT;
4076         rcu_read_lock();
4077         obj = i915_gem_object_lookup_rcu(file, args->handle);
4078         if (!obj)
4079                 goto out;
4080
4081         /* A discrepancy here is that we do not report the status of
4082          * non-i915 fences, i.e. even though we may report the object as idle,
4083          * a call to set-domain may still stall waiting for foreign rendering.
4084          * This also means that wait-ioctl may report an object as busy,
4085          * where busy-ioctl considers it idle.
4086          *
4087          * We trade the ability to warn of foreign fences to report on which
4088          * i915 engines are active for the object.
4089          *
4090          * Alternatively, we can trade that extra information on read/write
4091          * activity with
4092          *      args->busy =
4093          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4094          * to report the overall busyness. This is what the wait-ioctl does.
4095          *
4096          */
4097 retry:
4098         seq = raw_read_seqcount(&obj->resv->seq);
4099
4100         /* Translate the exclusive fence to the READ *and* WRITE engine */
4101         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4102
4103         /* Translate shared fences to READ set of engines */
4104         list = rcu_dereference(obj->resv->fence);
4105         if (list) {
4106                 unsigned int shared_count = list->shared_count, i;
4107
4108                 for (i = 0; i < shared_count; ++i) {
4109                         struct dma_fence *fence =
4110                                 rcu_dereference(list->shared[i]);
4111
4112                         args->busy |= busy_check_reader(fence);
4113                 }
4114         }
4115
4116         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4117                 goto retry;
4118
4119         err = 0;
4120 out:
4121         rcu_read_unlock();
4122         return err;
4123 }
4124
/*
 * Throttle ioctl entry point. The ioctl payload is not consulted here;
 * the call is handed straight to i915_gem_ring_throttle() and its return
 * value is passed back to the caller.
 */
int
i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
			struct drm_file *file_priv)
{
	int ret;

	ret = i915_gem_ring_throttle(dev, file_priv);

	return ret;
}
4131
4132 int
4133 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4134                        struct drm_file *file_priv)
4135 {
4136         struct drm_i915_private *dev_priv = to_i915(dev);
4137         struct drm_i915_gem_madvise *args = data;
4138         struct drm_i915_gem_object *obj;
4139         int err;
4140
4141         switch (args->madv) {
4142         case I915_MADV_DONTNEED:
4143         case I915_MADV_WILLNEED:
4144             break;
4145         default:
4146             return -EINVAL;
4147         }
4148
4149         obj = i915_gem_object_lookup(file_priv, args->handle);
4150         if (!obj)
4151                 return -ENOENT;
4152
4153         err = mutex_lock_interruptible(&obj->mm.lock);
4154         if (err)
4155                 goto out;
4156
4157         if (i915_gem_object_has_pages(obj) &&
4158             i915_gem_object_is_tiled(obj) &&
4159             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4160                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4161                         GEM_BUG_ON(!obj->mm.quirked);
4162                         __i915_gem_object_unpin_pages(obj);
4163                         obj->mm.quirked = false;
4164                 }
4165                 if (args->madv == I915_MADV_WILLNEED) {
4166                         GEM_BUG_ON(obj->mm.quirked);
4167                         __i915_gem_object_pin_pages(obj);
4168                         obj->mm.quirked = true;
4169                 }
4170         }
4171
4172         if (obj->mm.madv != __I915_MADV_PURGED)
4173                 obj->mm.madv = args->madv;
4174
4175         /* if the object is no longer attached, discard its backing storage */
4176         if (obj->mm.madv == I915_MADV_DONTNEED &&
4177             !i915_gem_object_has_pages(obj))
4178                 i915_gem_object_truncate(obj);
4179
4180         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4181         mutex_unlock(&obj->mm.lock);
4182
4183 out:
4184         i915_gem_object_put(obj);
4185         return err;
4186 }
4187
4188 static void
4189 frontbuffer_retire(struct i915_active_request *active,
4190                    struct i915_request *request)
4191 {
4192         struct drm_i915_gem_object *obj =
4193                 container_of(active, typeof(*obj), frontbuffer_write);
4194
4195         intel_fb_obj_flush(obj, ORIGIN_CS);
4196 }
4197
4198 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4199                           const struct drm_i915_gem_object_ops *ops)
4200 {
4201         mutex_init(&obj->mm.lock);
4202
4203         spin_lock_init(&obj->vma.lock);
4204         INIT_LIST_HEAD(&obj->vma.list);
4205
4206         INIT_LIST_HEAD(&obj->lut_list);
4207         INIT_LIST_HEAD(&obj->batch_pool_link);
4208
4209         init_rcu_head(&obj->rcu);
4210
4211         obj->ops = ops;
4212
4213         reservation_object_init(&obj->__builtin_resv);
4214         obj->resv = &obj->__builtin_resv;
4215
4216         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4217         i915_active_request_init(&obj->frontbuffer_write,
4218                                  NULL, frontbuffer_retire);
4219
4220         obj->mm.madv = I915_MADV_WILLNEED;
4221         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4222         mutex_init(&obj->mm.get_page.lock);
4223
4224         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4225 }
4226
4227 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4228         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4229                  I915_GEM_OBJECT_IS_SHRINKABLE,
4230
4231         .get_pages = i915_gem_object_get_pages_gtt,
4232         .put_pages = i915_gem_object_put_pages_gtt,
4233
4234         .pwrite = i915_gem_object_pwrite_gtt,
4235 };
4236
4237 static int i915_gem_object_create_shmem(struct drm_device *dev,
4238                                         struct drm_gem_object *obj,
4239                                         size_t size)
4240 {
4241         struct drm_i915_private *i915 = to_i915(dev);
4242         unsigned long flags = VM_NORESERVE;
4243         struct file *filp;
4244
4245         drm_gem_private_object_init(dev, obj, size);
4246
4247         if (i915->mm.gemfs)
4248                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4249                                                  flags);
4250         else
4251                 filp = shmem_file_setup("i915", size, flags);
4252
4253         if (IS_ERR(filp))
4254                 return PTR_ERR(filp);
4255
4256         obj->filp = filp;
4257
4258         return 0;
4259 }
4260
4261 struct drm_i915_gem_object *
4262 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4263 {
4264         struct drm_i915_gem_object *obj;
4265         struct address_space *mapping;
4266         unsigned int cache_level;
4267         gfp_t mask;
4268         int ret;
4269
4270         /* There is a prevalence of the assumption that we fit the object's
4271          * page count inside a 32bit _signed_ variable. Let's document this and
4272          * catch if we ever need to fix it. In the meantime, if you do spot
4273          * such a local variable, please consider fixing!
4274          */
4275         if (size >> PAGE_SHIFT > INT_MAX)
4276                 return ERR_PTR(-E2BIG);
4277
4278         if (overflows_type(size, obj->base.size))
4279                 return ERR_PTR(-E2BIG);
4280
4281         obj = i915_gem_object_alloc(dev_priv);
4282         if (obj == NULL)
4283                 return ERR_PTR(-ENOMEM);
4284
4285         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4286         if (ret)
4287                 goto fail;
4288
4289         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4290         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4291                 /* 965gm cannot relocate objects above 4GiB. */
4292                 mask &= ~__GFP_HIGHMEM;
4293                 mask |= __GFP_DMA32;
4294         }
4295
4296         mapping = obj->base.filp->f_mapping;
4297         mapping_set_gfp_mask(mapping, mask);
4298         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4299
4300         i915_gem_object_init(obj, &i915_gem_object_ops);
4301
4302         obj->write_domain = I915_GEM_DOMAIN_CPU;
4303         obj->read_domains = I915_GEM_DOMAIN_CPU;
4304
4305         if (HAS_LLC(dev_priv))
4306                 /* On some devices, we can have the GPU use the LLC (the CPU
4307                  * cache) for about a 10% performance improvement
4308                  * compared to uncached.  Graphics requests other than
4309                  * display scanout are coherent with the CPU in
4310                  * accessing this cache.  This means in this mode we
4311                  * don't need to clflush on the CPU side, and on the
4312                  * GPU side we only need to flush internal caches to
4313                  * get data visible to the CPU.
4314                  *
4315                  * However, we maintain the display planes as UC, and so
4316                  * need to rebind when first used as such.
4317                  */
4318                 cache_level = I915_CACHE_LLC;
4319         else
4320                 cache_level = I915_CACHE_NONE;
4321
4322         i915_gem_object_set_cache_coherency(obj, cache_level);
4323
4324         trace_i915_gem_object_create(obj);
4325
4326         return obj;
4327
4328 fail:
4329         i915_gem_object_free(obj);
4330         return ERR_PTR(ret);
4331 }
4332
4333 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4334 {
4335         /* If we are the last user of the backing storage (be it shmemfs
4336          * pages or stolen etc), we know that the pages are going to be
4337          * immediately released. In this case, we can then skip copying
4338          * back the contents from the GPU.
4339          */
4340
4341         if (obj->mm.madv != I915_MADV_WILLNEED)
4342                 return false;
4343
4344         if (obj->base.filp == NULL)
4345                 return true;
4346
4347         /* At first glance, this looks racy, but then again so would be
4348          * userspace racing mmap against close. However, the first external
4349          * reference to the filp can only be obtained through the
4350          * i915_gem_mmap_ioctl() which safeguards us against the user
4351          * acquiring such a reference whilst we are in the middle of
4352          * freeing the object.
4353          */
4354         return atomic_long_read(&obj->base.filp->f_count) == 1;
4355 }
4356
4357 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4358                                     struct llist_node *freed)
4359 {
4360         struct drm_i915_gem_object *obj, *on;
4361         intel_wakeref_t wakeref;
4362
4363         wakeref = intel_runtime_pm_get(i915);
4364         llist_for_each_entry_safe(obj, on, freed, freed) {
4365                 struct i915_vma *vma, *vn;
4366
4367                 trace_i915_gem_object_destroy(obj);
4368
4369                 mutex_lock(&i915->drm.struct_mutex);
4370
4371                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4372                 list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
4373                         GEM_BUG_ON(i915_vma_is_active(vma));
4374                         vma->flags &= ~I915_VMA_PIN_MASK;
4375                         i915_vma_destroy(vma);
4376                 }
4377                 GEM_BUG_ON(!list_empty(&obj->vma.list));
4378                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
4379
4380                 /* This serializes freeing with the shrinker. Since the free
4381                  * is delayed, first by RCU then by the workqueue, we want the
4382                  * shrinker to be able to free pages of unreferenced objects,
4383                  * or else we may oom whilst there are plenty of deferred
4384                  * freed objects.
4385                  */
4386                 if (i915_gem_object_has_pages(obj)) {
4387                         spin_lock(&i915->mm.obj_lock);
4388                         list_del_init(&obj->mm.link);
4389                         spin_unlock(&i915->mm.obj_lock);
4390                 }
4391
4392                 mutex_unlock(&i915->drm.struct_mutex);
4393
4394                 GEM_BUG_ON(obj->bind_count);
4395                 GEM_BUG_ON(obj->userfault_count);
4396                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4397                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4398
4399                 if (obj->ops->release)
4400                         obj->ops->release(obj);
4401
4402                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4403                         atomic_set(&obj->mm.pages_pin_count, 0);
4404                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4405                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4406
4407                 if (obj->base.import_attach)
4408                         drm_prime_gem_destroy(&obj->base, NULL);
4409
4410                 reservation_object_fini(&obj->__builtin_resv);
4411                 drm_gem_object_release(&obj->base);
4412                 i915_gem_info_remove_obj(i915, obj->base.size);
4413
4414                 kfree(obj->bit_17);
4415                 i915_gem_object_free(obj);
4416
4417                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4418                 atomic_dec(&i915->mm.free_count);
4419
4420                 if (on)
4421                         cond_resched();
4422         }
4423         intel_runtime_pm_put(i915, wakeref);
4424 }
4425
4426 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4427 {
4428         struct llist_node *freed;
4429
4430         /* Free the oldest, most stale object to keep the free_list short */
4431         freed = NULL;
4432         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4433                 /* Only one consumer of llist_del_first() allowed */
4434                 spin_lock(&i915->mm.free_lock);
4435                 freed = llist_del_first(&i915->mm.free_list);
4436                 spin_unlock(&i915->mm.free_lock);
4437         }
4438         if (unlikely(freed)) {
4439                 freed->next = NULL;
4440                 __i915_gem_free_objects(i915, freed);
4441         }
4442 }
4443
4444 static void __i915_gem_free_work(struct work_struct *work)
4445 {
4446         struct drm_i915_private *i915 =
4447                 container_of(work, struct drm_i915_private, mm.free_work);
4448         struct llist_node *freed;
4449
4450         /*
4451          * All file-owned VMA should have been released by this point through
4452          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4453          * However, the object may also be bound into the global GTT (e.g.
4454          * older GPUs without per-process support, or for direct access through
4455          * the GTT either for the user or for scanout). Those VMA still need to
4456          * unbound now.
4457          */
4458
4459         spin_lock(&i915->mm.free_lock);
4460         while ((freed = llist_del_all(&i915->mm.free_list))) {
4461                 spin_unlock(&i915->mm.free_lock);
4462
4463                 __i915_gem_free_objects(i915, freed);
4464                 if (need_resched())
4465                         return;
4466
4467                 spin_lock(&i915->mm.free_lock);
4468         }
4469         spin_unlock(&i915->mm.free_lock);
4470 }
4471
4472 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4473 {
4474         struct drm_i915_gem_object *obj =
4475                 container_of(head, typeof(*obj), rcu);
4476         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4477
4478         /*
4479          * We reuse obj->rcu for the freed list, so we had better not treat
4480          * it like a rcu_head from this point forwards. And we expect all
4481          * objects to be freed via this path.
4482          */
4483         destroy_rcu_head(&obj->rcu);
4484
4485         /*
4486          * Since we require blocking on struct_mutex to unbind the freed
4487          * object from the GPU before releasing resources back to the
4488          * system, we can not do that directly from the RCU callback (which may
4489          * be a softirq context), but must instead then defer that work onto a
4490          * kthread. We use the RCU callback rather than move the freed object
4491          * directly onto the work queue so that we can mix between using the
4492          * worker and performing frees directly from subsequent allocations for
4493          * crude but effective memory throttling.
4494          */
4495         if (llist_add(&obj->freed, &i915->mm.free_list))
4496                 queue_work(i915->wq, &i915->mm.free_work);
4497 }
4498
4499 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4500 {
4501         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4502
4503         if (obj->mm.quirked)
4504                 __i915_gem_object_unpin_pages(obj);
4505
4506         if (discard_backing_storage(obj))
4507                 obj->mm.madv = I915_MADV_DONTNEED;
4508
4509         /*
4510          * Before we free the object, make sure any pure RCU-only
4511          * read-side critical sections are complete, e.g.
4512          * i915_gem_busy_ioctl(). For the corresponding synchronized
4513          * lookup see i915_gem_object_lookup_rcu().
4514          */
4515         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4516         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4517 }
4518
4519 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4520 {
4521         lockdep_assert_held(&obj->base.dev->struct_mutex);
4522
4523         if (!i915_gem_object_has_active_reference(obj) &&
4524             i915_gem_object_is_active(obj))
4525                 i915_gem_object_set_active_reference(obj);
4526         else
4527                 i915_gem_object_put(obj);
4528 }
4529
4530 void i915_gem_sanitize(struct drm_i915_private *i915)
4531 {
4532         intel_wakeref_t wakeref;
4533
4534         GEM_TRACE("\n");
4535
4536         wakeref = intel_runtime_pm_get(i915);
4537         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4538
4539         /*
4540          * As we have just resumed the machine and woken the device up from
4541          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4542          * back to defaults, recovering from whatever wedged state we left it
4543          * in and so worth trying to use the device once more.
4544          */
4545         if (i915_terminally_wedged(&i915->gpu_error))
4546                 i915_gem_unset_wedged(i915);
4547
4548         /*
4549          * If we inherit context state from the BIOS or earlier occupants
4550          * of the GPU, the GPU may be in an inconsistent state when we
4551          * try to take over. The only way to remove the earlier state
4552          * is by resetting. However, resetting on earlier gen is tricky as
4553          * it may impact the display and we are uncertain about the stability
4554          * of the reset, so this could be applied to even earlier gen.
4555          */
4556         intel_engines_sanitize(i915, false);
4557
4558         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4559         intel_runtime_pm_put(i915, wakeref);
4560
4561         mutex_lock(&i915->drm.struct_mutex);
4562         i915_gem_contexts_lost(i915);
4563         mutex_unlock(&i915->drm.struct_mutex);
4564 }
4565
4566 int i915_gem_suspend(struct drm_i915_private *i915)
4567 {
4568         intel_wakeref_t wakeref;
4569         int ret;
4570
4571         GEM_TRACE("\n");
4572
4573         wakeref = intel_runtime_pm_get(i915);
4574         intel_suspend_gt_powersave(i915);
4575
4576         flush_workqueue(i915->wq);
4577
4578         mutex_lock(&i915->drm.struct_mutex);
4579
4580         /*
4581          * We have to flush all the executing contexts to main memory so
4582          * that they can saved in the hibernation image. To ensure the last
4583          * context image is coherent, we have to switch away from it. That
4584          * leaves the i915->kernel_context still active when
4585          * we actually suspend, and its image in memory may not match the GPU
4586          * state. Fortunately, the kernel_context is disposable and we do
4587          * not rely on its state.
4588          */
4589         if (!i915_terminally_wedged(&i915->gpu_error)) {
4590                 ret = i915_gem_switch_to_kernel_context(i915);
4591                 if (ret)
4592                         goto err_unlock;
4593
4594                 ret = i915_gem_wait_for_idle(i915,
4595                                              I915_WAIT_INTERRUPTIBLE |
4596                                              I915_WAIT_LOCKED |
4597                                              I915_WAIT_FOR_IDLE_BOOST,
4598                                              MAX_SCHEDULE_TIMEOUT);
4599                 if (ret && ret != -EIO)
4600                         goto err_unlock;
4601
4602                 assert_kernel_context_is_current(i915);
4603         }
4604         i915_retire_requests(i915); /* ensure we flush after wedging */
4605
4606         mutex_unlock(&i915->drm.struct_mutex);
4607         i915_reset_flush(i915);
4608
4609         drain_delayed_work(&i915->gt.retire_work);
4610
4611         /*
4612          * As the idle_work is rearming if it detects a race, play safe and
4613          * repeat the flush until it is definitely idle.
4614          */
4615         drain_delayed_work(&i915->gt.idle_work);
4616
4617         intel_uc_suspend(i915);
4618
4619         /*
4620          * Assert that we successfully flushed all the work and
4621          * reset the GPU back to its idle, low power state.
4622          */
4623         WARN_ON(i915->gt.awake);
4624         if (WARN_ON(!intel_engines_are_idle(i915)))
4625                 i915_gem_set_wedged(i915); /* no hope, discard everything */
4626
4627         intel_runtime_pm_put(i915, wakeref);
4628         return 0;
4629
4630 err_unlock:
4631         mutex_unlock(&i915->drm.struct_mutex);
4632         intel_runtime_pm_put(i915, wakeref);
4633         return ret;
4634 }
4635
4636 void i915_gem_suspend_late(struct drm_i915_private *i915)
4637 {
4638         struct drm_i915_gem_object *obj;
4639         struct list_head *phases[] = {
4640                 &i915->mm.unbound_list,
4641                 &i915->mm.bound_list,
4642                 NULL
4643         }, **phase;
4644
4645         /*
4646          * Neither the BIOS, ourselves or any other kernel
4647          * expects the system to be in execlists mode on startup,
4648          * so we need to reset the GPU back to legacy mode. And the only
4649          * known way to disable logical contexts is through a GPU reset.
4650          *
4651          * So in order to leave the system in a known default configuration,
4652          * always reset the GPU upon unload and suspend. Afterwards we then
4653          * clean up the GEM state tracking, flushing off the requests and
4654          * leaving the system in a known idle state.
4655          *
4656          * Note that is of the upmost importance that the GPU is idle and
4657          * all stray writes are flushed *before* we dismantle the backing
4658          * storage for the pinned objects.
4659          *
4660          * However, since we are uncertain that resetting the GPU on older
4661          * machines is a good idea, we don't - just in case it leaves the
4662          * machine in an unusable condition.
4663          */
4664
4665         mutex_lock(&i915->drm.struct_mutex);
4666         for (phase = phases; *phase; phase++) {
4667                 list_for_each_entry(obj, *phase, mm.link)
4668                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
4669         }
4670         mutex_unlock(&i915->drm.struct_mutex);
4671
4672         intel_uc_sanitize(i915);
4673         i915_gem_sanitize(i915);
4674 }
4675
4676 void i915_gem_resume(struct drm_i915_private *i915)
4677 {
4678         GEM_TRACE("\n");
4679
4680         WARN_ON(i915->gt.awake);
4681
4682         mutex_lock(&i915->drm.struct_mutex);
4683         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4684
4685         i915_gem_restore_gtt_mappings(i915);
4686         i915_gem_restore_fences(i915);
4687
4688         /*
4689          * As we didn't flush the kernel context before suspend, we cannot
4690          * guarantee that the context image is complete. So let's just reset
4691          * it and start again.
4692          */
4693         i915->gt.resume(i915);
4694
4695         if (i915_gem_init_hw(i915))
4696                 goto err_wedged;
4697
4698         intel_uc_resume(i915);
4699
4700         /* Always reload a context for powersaving. */
4701         if (i915_gem_switch_to_kernel_context(i915))
4702                 goto err_wedged;
4703
4704 out_unlock:
4705         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4706         mutex_unlock(&i915->drm.struct_mutex);
4707         return;
4708
4709 err_wedged:
4710         if (!i915_terminally_wedged(&i915->gpu_error)) {
4711                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
4712                 i915_gem_set_wedged(i915);
4713         }
4714         goto out_unlock;
4715 }
4716
4717 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4718 {
4719         if (INTEL_GEN(dev_priv) < 5 ||
4720             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4721                 return;
4722
4723         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4724                                  DISP_TILE_SURFACE_SWIZZLING);
4725
4726         if (IS_GEN(dev_priv, 5))
4727                 return;
4728
4729         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
4730         if (IS_GEN(dev_priv, 6))
4731                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4732         else if (IS_GEN(dev_priv, 7))
4733                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4734         else if (IS_GEN(dev_priv, 8))
4735                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4736         else
4737                 BUG();
4738 }
4739
4740 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
4741 {
4742         I915_WRITE(RING_CTL(base), 0);
4743         I915_WRITE(RING_HEAD(base), 0);
4744         I915_WRITE(RING_TAIL(base), 0);
4745         I915_WRITE(RING_START(base), 0);
4746 }
4747
4748 static void init_unused_rings(struct drm_i915_private *dev_priv)
4749 {
4750         if (IS_I830(dev_priv)) {
4751                 init_unused_ring(dev_priv, PRB1_BASE);
4752                 init_unused_ring(dev_priv, SRB0_BASE);
4753                 init_unused_ring(dev_priv, SRB1_BASE);
4754                 init_unused_ring(dev_priv, SRB2_BASE);
4755                 init_unused_ring(dev_priv, SRB3_BASE);
4756         } else if (IS_GEN(dev_priv, 2)) {
4757                 init_unused_ring(dev_priv, SRB0_BASE);
4758                 init_unused_ring(dev_priv, SRB1_BASE);
4759         } else if (IS_GEN(dev_priv, 3)) {
4760                 init_unused_ring(dev_priv, PRB1_BASE);
4761                 init_unused_ring(dev_priv, PRB2_BASE);
4762         }
4763 }
4764
4765 static int __i915_gem_restart_engines(void *data)
4766 {
4767         struct drm_i915_private *i915 = data;
4768         struct intel_engine_cs *engine;
4769         enum intel_engine_id id;
4770         int err;
4771
4772         for_each_engine(engine, i915, id) {
4773                 err = engine->init_hw(engine);
4774                 if (err) {
4775                         DRM_ERROR("Failed to restart %s (%d)\n",
4776                                   engine->name, err);
4777                         return err;
4778                 }
4779         }
4780
4781         return 0;
4782 }
4783
4784 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4785 {
4786         int ret;
4787
4788         dev_priv->gt.last_init_time = ktime_get();
4789
4790         /* Double layer security blanket, see i915_gem_init() */
4791         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
4792
4793         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4794                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4795
4796         if (IS_HASWELL(dev_priv))
4797                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4798                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4799
4800         /* Apply the GT workarounds... */
4801         intel_gt_apply_workarounds(dev_priv);
4802         /* ...and determine whether they are sticking. */
4803         intel_gt_verify_workarounds(dev_priv, "init");
4804
4805         i915_gem_init_swizzling(dev_priv);
4806
4807         /*
4808          * At least 830 can leave some of the unused rings
4809          * "active" (ie. head != tail) after resume which
4810          * will prevent c3 entry. Makes sure all unused rings
4811          * are totally idle.
4812          */
4813         init_unused_rings(dev_priv);
4814
4815         BUG_ON(!dev_priv->kernel_context);
4816         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
4817                 ret = -EIO;
4818                 goto out;
4819         }
4820
4821         ret = i915_ppgtt_init_hw(dev_priv);
4822         if (ret) {
4823                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
4824                 goto out;
4825         }
4826
4827         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
4828         if (ret) {
4829                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
4830                 goto out;
4831         }
4832
4833         /* We can't enable contexts until all firmware is loaded */
4834         ret = intel_uc_init_hw(dev_priv);
4835         if (ret) {
4836                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
4837                 goto out;
4838         }
4839
4840         intel_mocs_init_l3cc_table(dev_priv);
4841
4842         /* Only when the HW is re-initialised, can we replay the requests */
4843         ret = __i915_gem_restart_engines(dev_priv);
4844         if (ret)
4845                 goto cleanup_uc;
4846
4847         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4848
4849         return 0;
4850
4851 cleanup_uc:
4852         intel_uc_fini_hw(dev_priv);
4853 out:
4854         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4855
4856         return ret;
4857 }
4858
/*
 * __intel_engines_record_defaults - capture a default context image per engine
 * @i915: device private
 *
 * Creates a temporary kernel context and submits one request per engine on
 * it (running engine->init_context() when the engine provides one), then
 * switches to the kernel context and waits up to HZ / 5 for the GPU to idle.
 * For every engine whose context has a state vma, that vma is unbound, its
 * backing object is moved to the CPU domain and a reference to the object is
 * stored in engine->default_state.
 *
 * Returns 0 on success or a negative error code. If the idle wait fails the
 * device is marked wedged and -EIO is returned. Once created, the temporary
 * context is closed and released on every exit path.
 */
4859 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
4860 {
4861         struct i915_gem_context *ctx;
4862         struct intel_engine_cs *engine;
4863         enum intel_engine_id id;
4864         int err;
4865
4866         /*
4867          * As we reset the gpu during very early sanitisation, the current
4868          * register state on the GPU should reflect its default values.
4869          * We load a context onto the hw (with restore-inhibit), then switch
4870          * over to a second context to save that default register state. We
4871          * can then prime every new context with that state so they all start
4872          * from the same default HW values.
4873          */
4874
4875         ctx = i915_gem_context_create_kernel(i915, 0);
4876         if (IS_ERR(ctx))
4877                 return PTR_ERR(ctx);
4878
4879         for_each_engine(engine, i915, id) {
4880                 struct i915_request *rq;
4881
4882                 rq = i915_request_alloc(engine, ctx);
4883                 if (IS_ERR(rq)) {
4884                         err = PTR_ERR(rq);
4885                         goto out_ctx;
4886                 }
4887
4888                 err = 0;
4889                 if (engine->init_context)
4890                         err = engine->init_context(rq);
4891
             /*
              * The request is added even when init_context() failed; the
              * error is only acted upon afterwards.
              */
4892                 i915_request_add(rq);
4893                 if (err)
4894                         goto err_active;
4895         }
4896
4897         err = i915_gem_switch_to_kernel_context(i915);
4898         if (err)
4899                 goto err_active;
4900
4901         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
4902                 i915_gem_set_wedged(i915);
4903                 err = -EIO; /* Caller will declare us wedged */
4904                 goto err_active;
4905         }
4906
4907         assert_kernel_context_is_current(i915);
4908
4909         /*
4910          * Immediately park the GPU so that we enable powersaving and
4911          * treat it as idle. The next time we issue a request, we will
4912          * unpark and start using the engine->pinned_default_state, otherwise
4913          * it is in limbo and an early reset may fail.
4914          */
4915         __i915_gem_park(i915);
4916
4917         for_each_engine(engine, i915, id) {
4918                 struct i915_vma *state;
4919                 void *vaddr;
4920
4921                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
4922
             /* Engines without a context state vma have nothing to record. */
4923                 state = to_intel_context(ctx, engine)->state;
4924                 if (!state)
4925                         continue;
4926
4927                 /*
4928                  * As we will hold a reference to the logical state, it will
4929                  * not be torn down with the context, and importantly the
4930                  * object will hold onto its vma (making it possible for a
4931                  * stray GTT write to corrupt our defaults). Unmap the vma
4932                  * from the GTT to prevent such accidents and reclaim the
4933                  * space.
4934                  */
4935                 err = i915_vma_unbind(state);
4936                 if (err)
4937                         goto err_active;
4938
4939                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
4940                 if (err)
4941                         goto err_active;
4942
4943                 engine->default_state = i915_gem_object_get(state->obj);
4944
4945                 /* Check we can acquire the image of the context state */
4946                 vaddr = i915_gem_object_pin_map(engine->default_state,
4947                                                 I915_MAP_FORCE_WB);
4948                 if (IS_ERR(vaddr)) {
4949                         err = PTR_ERR(vaddr);
4950                         goto err_active;
4951                 }
4952
4953                 i915_gem_object_unpin_map(engine->default_state);
4954         }
4955
4956         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
4957                 unsigned int found = intel_engines_has_context_isolation(i915);
4958
4959                 /*
4960                  * Make sure that classes with multiple engine instances all
4961                  * share the same basic configuration.
4962                  */
4963                 for_each_engine(engine, i915, id) {
4964                         unsigned int bit = BIT(engine->uabi_class);
4965                         unsigned int expected = engine->default_state ? bit : 0;
4966
4967                         if ((found & bit) != expected) {
4968                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
4969                                           engine->uabi_class, engine->name);
4970                         }
4971                 }
4972         }
4973
     /* Common exit: reached on success and, via goto, from every error path. */
4974 out_ctx:
4975         i915_gem_context_set_closed(ctx);
4976         i915_gem_context_put(ctx);
4977         return err;
4978
4979 err_active:
4980         /*
4981          * If we have to abandon now, we expect the engines to be idle
4982          * and ready to be torn-down. First try to flush any remaining
4983          * request, ensure we are pointing at the kernel context and
4984          * then remove it.
4985          */
4986         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
4987                 goto out_ctx;
4988
4989         if (WARN_ON(i915_gem_wait_for_idle(i915,
4990                                            I915_WAIT_LOCKED,
4991                                            MAX_SCHEDULE_TIMEOUT)))
4992                 goto out_ctx;
4993
4994         i915_gem_contexts_lost(i915);
4995         goto out_ctx;
4996 }
4997
4998 static int
4999 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
5000 {
5001         struct drm_i915_gem_object *obj;
5002         struct i915_vma *vma;
5003         int ret;
5004
5005         obj = i915_gem_object_create_stolen(i915, size);
5006         if (!obj)
5007                 obj = i915_gem_object_create_internal(i915, size);
5008         if (IS_ERR(obj)) {
5009                 DRM_ERROR("Failed to allocate scratch page\n");
5010                 return PTR_ERR(obj);
5011         }
5012
5013         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5014         if (IS_ERR(vma)) {
5015                 ret = PTR_ERR(vma);
5016                 goto err_unref;
5017         }
5018
5019         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5020         if (ret)
5021                 goto err_unref;
5022
5023         i915->gt.scratch = vma;
5024         return 0;
5025
5026 err_unref:
5027         i915_gem_object_put(obj);
5028         return ret;
5029 }
5030
/*
 * Release the scratch vma published in i915->gt.scratch by handing it to
 * i915_vma_unpin_and_release() with no flags.
 */
5031 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5032 {
5033         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5034 }
5035
5036 int i915_gem_init(struct drm_i915_private *dev_priv)
5037 {
5038         int ret;
5039
5040         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5041         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5042                 mkwrite_device_info(dev_priv)->page_sizes =
5043                         I915_GTT_PAGE_SIZE_4K;
5044
5045         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5046
5047         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5048                 dev_priv->gt.resume = intel_lr_context_resume;
5049                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5050         } else {
5051                 dev_priv->gt.resume = intel_legacy_submission_resume;
5052                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5053         }
5054
5055         i915_timelines_init(dev_priv);
5056
5057         ret = i915_gem_init_userptr(dev_priv);
5058         if (ret)
5059                 return ret;
5060
5061         ret = intel_uc_init_misc(dev_priv);
5062         if (ret)
5063                 return ret;
5064
5065         ret = intel_wopcm_init(&dev_priv->wopcm);
5066         if (ret)
5067                 goto err_uc_misc;
5068
5069         /* This is just a security blanket to placate dragons.
5070          * On some systems, we very sporadically observe that the first TLBs
5071          * used by the CS may be stale, despite us poking the TLB reset. If
5072          * we hold the forcewake during initialisation these problems
5073          * just magically go away.
5074          */
5075         mutex_lock(&dev_priv->drm.struct_mutex);
5076         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5077
5078         ret = i915_gem_init_ggtt(dev_priv);
5079         if (ret) {
5080                 GEM_BUG_ON(ret == -EIO);
5081                 goto err_unlock;
5082         }
5083
5084         ret = i915_gem_init_scratch(dev_priv,
5085                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
5086         if (ret) {
5087                 GEM_BUG_ON(ret == -EIO);
5088                 goto err_ggtt;
5089         }
5090
5091         ret = i915_gem_contexts_init(dev_priv);
5092         if (ret) {
5093                 GEM_BUG_ON(ret == -EIO);
5094                 goto err_scratch;
5095         }
5096
5097         ret = intel_engines_init(dev_priv);
5098         if (ret) {
5099                 GEM_BUG_ON(ret == -EIO);
5100                 goto err_context;
5101         }
5102
5103         intel_init_gt_powersave(dev_priv);
5104
5105         ret = intel_uc_init(dev_priv);
5106         if (ret)
5107                 goto err_pm;
5108
5109         ret = i915_gem_init_hw(dev_priv);
5110         if (ret)
5111                 goto err_uc_init;
5112
5113         /*
5114          * Despite its name intel_init_clock_gating applies both display
5115          * clock gating workarounds; GT mmio workarounds and the occasional
5116          * GT power context workaround. Worse, sometimes it includes a context
5117          * register workaround which we need to apply before we record the
5118          * default HW state for all contexts.
5119          *
5120          * FIXME: break up the workarounds and apply them at the right time!
5121          */
5122         intel_init_clock_gating(dev_priv);
5123
5124         ret = __intel_engines_record_defaults(dev_priv);
5125         if (ret)
5126                 goto err_init_hw;
5127
5128         if (i915_inject_load_failure()) {
5129                 ret = -ENODEV;
5130                 goto err_init_hw;
5131         }
5132
5133         if (i915_inject_load_failure()) {
5134                 ret = -EIO;
5135                 goto err_init_hw;
5136         }
5137
5138         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5139         mutex_unlock(&dev_priv->drm.struct_mutex);
5140
5141         return 0;
5142
5143         /*
5144          * Unwinding is complicated by that we want to handle -EIO to mean
5145          * disable GPU submission but keep KMS alive. We want to mark the
5146          * HW as irrevisibly wedged, but keep enough state around that the
5147          * driver doesn't explode during runtime.
5148          */
5149 err_init_hw:
5150         mutex_unlock(&dev_priv->drm.struct_mutex);
5151
5152         WARN_ON(i915_gem_suspend(dev_priv));
5153         i915_gem_suspend_late(dev_priv);
5154
5155         i915_gem_drain_workqueue(dev_priv);
5156
5157         mutex_lock(&dev_priv->drm.struct_mutex);
5158         intel_uc_fini_hw(dev_priv);
5159 err_uc_init:
5160         intel_uc_fini(dev_priv);
5161 err_pm:
5162         if (ret != -EIO) {
5163                 intel_cleanup_gt_powersave(dev_priv);
5164                 i915_gem_cleanup_engines(dev_priv);
5165         }
5166 err_context:
5167         if (ret != -EIO)
5168                 i915_gem_contexts_fini(dev_priv);
5169 err_scratch:
5170         i915_gem_fini_scratch(dev_priv);
5171 err_ggtt:
5172 err_unlock:
5173         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5174         mutex_unlock(&dev_priv->drm.struct_mutex);
5175
5176 err_uc_misc:
5177         intel_uc_fini_misc(dev_priv);
5178
5179         if (ret != -EIO) {
5180                 i915_gem_cleanup_userptr(dev_priv);
5181                 i915_timelines_fini(dev_priv);
5182         }
5183
5184         if (ret == -EIO) {
5185                 mutex_lock(&dev_priv->drm.struct_mutex);
5186
5187                 /*
5188                  * Allow engine initialisation to fail by marking the GPU as
5189                  * wedged. But we only want to do this where the GPU is angry,
5190                  * for all other failure, such as an allocation failure, bail.
5191                  */
5192                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5193                         i915_load_error(dev_priv,
5194                                         "Failed to initialize GPU, declaring it wedged!\n");
5195                         i915_gem_set_wedged(dev_priv);
5196                 }
5197
5198                 /* Minimal basic recovery for KMS */
5199                 ret = i915_ggtt_enable_hw(dev_priv);
5200                 i915_gem_restore_gtt_mappings(dev_priv);
5201                 i915_gem_restore_fences(dev_priv);
5202                 intel_init_clock_gating(dev_priv);
5203
5204                 mutex_unlock(&dev_priv->drm.struct_mutex);
5205         }
5206
5207         i915_gem_drain_freed_objects(dev_priv);
5208         return ret;
5209 }
5210
/*
 * i915_gem_fini - tear down GEM state at driver unload
 * @dev_priv: i915 device private
 *
 * Runs the late suspend step and disables GT powersave, drains the GEM
 * workqueue, then under struct_mutex finalises the uC (HW and software
 * state), the engines, the contexts and the scratch buffer. Afterwards the
 * GT workaround list, GT powersave state, uC misc state, userptr state and
 * timelines are released and freed objects are drained. Warns if any
 * context is still on dev_priv->contexts.list at the end.
 */
5211 void i915_gem_fini(struct drm_i915_private *dev_priv)
5212 {
5213         i915_gem_suspend_late(dev_priv);
5214         intel_disable_gt_powersave(dev_priv);
5215
5216         /* Flush any outstanding unpin_work. */
5217         i915_gem_drain_workqueue(dev_priv);
5218
5219         mutex_lock(&dev_priv->drm.struct_mutex);
5220         intel_uc_fini_hw(dev_priv);
5221         intel_uc_fini(dev_priv);
5222         i915_gem_cleanup_engines(dev_priv);
5223         i915_gem_contexts_fini(dev_priv);
5224         i915_gem_fini_scratch(dev_priv);
5225         mutex_unlock(&dev_priv->drm.struct_mutex);
5226
5227         intel_wa_list_free(&dev_priv->gt_wa_list);
5228
5229         intel_cleanup_gt_powersave(dev_priv);
5230
5231         intel_uc_fini_misc(dev_priv);
5232         i915_gem_cleanup_userptr(dev_priv);
5233         i915_timelines_fini(dev_priv);
5234
5235         i915_gem_drain_freed_objects(dev_priv);
5236
     /* Every context should have been released by now. */
5237         WARN_ON(!list_empty(&dev_priv->contexts.list));
5238 }
5239
/*
 * i915_gem_init_mmio - GEM setup step for the MMIO init stage
 * @i915: device private
 *
 * Currently this only runs i915_gem_sanitize().
 */
5240 void i915_gem_init_mmio(struct drm_i915_private *i915)
5241 {
5242         i915_gem_sanitize(i915);
5243 }
5244
5245 void
5246 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5247 {
5248         struct intel_engine_cs *engine;
5249         enum intel_engine_id id;
5250
5251         for_each_engine(engine, dev_priv, id)
5252                 dev_priv->gt.cleanup_engine(engine);
5253 }
5254
5255 void
5256 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5257 {
5258         int i;
5259
5260         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5261             !IS_CHERRYVIEW(dev_priv))
5262                 dev_priv->num_fence_regs = 32;
5263         else if (INTEL_GEN(dev_priv) >= 4 ||
5264                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5265                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5266                 dev_priv->num_fence_regs = 16;
5267         else
5268                 dev_priv->num_fence_regs = 8;
5269
5270         if (intel_vgpu_active(dev_priv))
5271                 dev_priv->num_fence_regs =
5272                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5273
5274         /* Initialize fence registers to zero */
5275         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5276                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5277
5278                 fence->i915 = dev_priv;
5279                 fence->id = i;
5280                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5281         }
5282         i915_gem_restore_fences(dev_priv);
5283
5284         i915_gem_detect_bit_6_swizzle(dev_priv);
5285 }
5286
5287 static void i915_gem_init__mm(struct drm_i915_private *i915)
5288 {
5289         spin_lock_init(&i915->mm.object_stat_lock);
5290         spin_lock_init(&i915->mm.obj_lock);
5291         spin_lock_init(&i915->mm.free_lock);
5292
5293         init_llist_head(&i915->mm.free_list);
5294
5295         INIT_LIST_HEAD(&i915->mm.unbound_list);
5296         INIT_LIST_HEAD(&i915->mm.bound_list);
5297         INIT_LIST_HEAD(&i915->mm.fence_list);
5298         INIT_LIST_HEAD(&i915->mm.userfault_list);
5299
5300         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5301 }
5302
5303 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5304 {
5305         int err = -ENOMEM;
5306
5307         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5308         if (!dev_priv->objects)
5309                 goto err_out;
5310
5311         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5312         if (!dev_priv->vmas)
5313                 goto err_objects;
5314
5315         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5316         if (!dev_priv->luts)
5317                 goto err_vmas;
5318
5319         dev_priv->requests = KMEM_CACHE(i915_request,
5320                                         SLAB_HWCACHE_ALIGN |
5321                                         SLAB_RECLAIM_ACCOUNT |
5322                                         SLAB_TYPESAFE_BY_RCU);
5323         if (!dev_priv->requests)
5324                 goto err_luts;
5325
5326         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5327                                             SLAB_HWCACHE_ALIGN |
5328                                             SLAB_RECLAIM_ACCOUNT);
5329         if (!dev_priv->dependencies)
5330                 goto err_requests;
5331
5332         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5333         if (!dev_priv->priorities)
5334                 goto err_dependencies;
5335
5336         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5337         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5338
5339         i915_gem_init__mm(dev_priv);
5340
5341         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5342                           i915_gem_retire_work_handler);
5343         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5344                           i915_gem_idle_work_handler);
5345         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5346         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5347         mutex_init(&dev_priv->gpu_error.wedge_mutex);
5348
5349         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5350
5351         spin_lock_init(&dev_priv->fb_tracking.lock);
5352
5353         err = i915_gemfs_init(dev_priv);
5354         if (err)
5355                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5356
5357         return 0;
5358
5359 err_dependencies:
5360         kmem_cache_destroy(dev_priv->dependencies);
5361 err_requests:
5362         kmem_cache_destroy(dev_priv->requests);
5363 err_luts:
5364         kmem_cache_destroy(dev_priv->luts);
5365 err_vmas:
5366         kmem_cache_destroy(dev_priv->vmas);
5367 err_objects:
5368         kmem_cache_destroy(dev_priv->objects);
5369 err_out:
5370         return err;
5371 }
5372
/*
 * i915_gem_cleanup_early - undo i915_gem_init_early()
 * @dev_priv: i915 device private
 *
 * Drains any objects still queued for freeing, asserts that the free list
 * and free count are empty (and warns if objects remain accounted), then
 * destroys the GEM slab caches, waits for an RCU barrier and tears down
 * gemfs.
 */
5373 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5374 {
5375         i915_gem_drain_freed_objects(dev_priv);
5376         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5377         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5378         WARN_ON(dev_priv->mm.object_count);
5379
     /* Destroy the caches in the reverse order of their creation. */
5380         kmem_cache_destroy(dev_priv->priorities);
5381         kmem_cache_destroy(dev_priv->dependencies);
5382         kmem_cache_destroy(dev_priv->requests);
5383         kmem_cache_destroy(dev_priv->luts);
5384         kmem_cache_destroy(dev_priv->vmas);
5385         kmem_cache_destroy(dev_priv->objects);
5386
5387         /* And ensure that our SLAB_TYPESAFE_BY_RCU slabs are truly destroyed */
5388         rcu_barrier();
5389
5390         i915_gemfs_fini(dev_priv);
5391 }
5392
/*
 * i915_gem_freeze - shrink GEM memory ahead of hibernation
 * @dev_priv: i915 device private
 *
 * Calls i915_gem_shrink_all() and always returns 0.
 */
5393 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5394 {
5395         /* Discard all purgeable objects, let userspace recover those as
5396          * required after resuming.
5397          */
5398         i915_gem_shrink_all(dev_priv);
5399
5400         return 0;
5401 }
5402
5403 int i915_gem_freeze_late(struct drm_i915_private *i915)
5404 {
5405         struct drm_i915_gem_object *obj;
5406         struct list_head *phases[] = {
5407                 &i915->mm.unbound_list,
5408                 &i915->mm.bound_list,
5409                 NULL
5410         }, **phase;
5411
5412         /*
5413          * Called just before we write the hibernation image.
5414          *
5415          * We need to update the domain tracking to reflect that the CPU
5416          * will be accessing all the pages to create and restore from the
5417          * hibernation, and so upon restoration those pages will be in the
5418          * CPU domain.
5419          *
5420          * To make sure the hibernation image contains the latest state,
5421          * we update that state just before writing out the image.
5422          *
5423          * To try and reduce the hibernation image, we manually shrink
5424          * the objects as well, see i915_gem_freeze()
5425          */
5426
5427         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5428         i915_gem_drain_freed_objects(i915);
5429
5430         mutex_lock(&i915->drm.struct_mutex);
5431         for (phase = phases; *phase; phase++) {
5432                 list_for_each_entry(obj, *phase, mm.link)
5433                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5434         }
5435         mutex_unlock(&i915->drm.struct_mutex);
5436
5437         return 0;
5438 }
5439
5440 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5441 {
5442         struct drm_i915_file_private *file_priv = file->driver_priv;
5443         struct i915_request *request;
5444
5445         /* Clean up our request list when the client is going away, so that
5446          * later retire_requests won't dereference our soon-to-be-gone
5447          * file_priv.
5448          */
5449         spin_lock(&file_priv->mm.lock);
5450         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5451                 request->file_priv = NULL;
5452         spin_unlock(&file_priv->mm.lock);
5453 }
5454
5455 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5456 {
5457         struct drm_i915_file_private *file_priv;
5458         int ret;
5459
5460         DRM_DEBUG("\n");
5461
5462         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5463         if (!file_priv)
5464                 return -ENOMEM;
5465
5466         file->driver_priv = file_priv;
5467         file_priv->dev_priv = i915;
5468         file_priv->file = file;
5469
5470         spin_lock_init(&file_priv->mm.lock);
5471         INIT_LIST_HEAD(&file_priv->mm.request_list);
5472
5473         file_priv->bsd_engine = -1;
5474         file_priv->hang_timestamp = jiffies;
5475
5476         ret = i915_gem_context_open(i915, file);
5477         if (ret)
5478                 kfree(file_priv);
5479
5480         return ret;
5481 }
5482
5483 /**
5484  * i915_gem_track_fb - update frontbuffer tracking
5485  * @old: current GEM buffer for the frontbuffer slots
5486  * @new: new GEM buffer for the frontbuffer slots
5487  * @frontbuffer_bits: bitmask of frontbuffer slots
5488  *
5489  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5490  * from @old and setting them in @new. Both @old and @new can be NULL.
5491  */
5492 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5493                        struct drm_i915_gem_object *new,
5494                        unsigned frontbuffer_bits)
5495 {
5496         /* Control of individual bits within the mask are guarded by
5497          * the owning plane->mutex, i.e. we can never see concurrent
5498          * manipulation of individual bits. But since the bitfield as a whole
5499          * is updated using RMW, we need to use atomics in order to update
5500          * the bits.
5501          */
5502         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5503                      BITS_PER_TYPE(atomic_t));
5504
5505         if (old) {
5506                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5507                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5508         }
5509
5510         if (new) {
5511                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5512                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5513         }
5514 }
5515
5516 /* Allocate a new GEM object and fill it with the supplied data */
5517 struct drm_i915_gem_object *
5518 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5519                                  const void *data, size_t size)
5520 {
5521         struct drm_i915_gem_object *obj;
5522         struct file *file;
5523         size_t offset;
5524         int err;
5525
5526         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5527         if (IS_ERR(obj))
5528                 return obj;
5529
5530         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5531
5532         file = obj->base.filp;
5533         offset = 0;
5534         do {
5535                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5536                 struct page *page;
5537                 void *pgdata, *vaddr;
5538
5539                 err = pagecache_write_begin(file, file->f_mapping,
5540                                             offset, len, 0,
5541                                             &page, &pgdata);
5542                 if (err < 0)
5543                         goto fail;
5544
5545                 vaddr = kmap(page);
5546                 memcpy(vaddr, data, len);
5547                 kunmap(page);
5548
5549                 err = pagecache_write_end(file, file->f_mapping,
5550                                           offset, len, len,
5551                                           page, pgdata);
5552                 if (err < 0)
5553                         goto fail;
5554
5555                 size -= len;
5556                 data += len;
5557                 offset += len;
5558         } while (size);
5559
5560         return obj;
5561
5562 fail:
5563         i915_gem_object_put(obj);
5564         return ERR_PTR(err);
5565 }
5566
/*
 * i915_gem_object_get_sg - find the scatterlist entry covering page @n
 * @obj: object to query; its pages must be pinned (asserted below)
 * @n: page index within the object; must be below
 *     obj->base.size >> PAGE_SHIFT (asserted below)
 * @offset: out parameter, set to the index of page @n within the returned
 *          scatterlist entry
 *
 * Returns the scatterlist entry that contains page @n. Progress through the
 * sg list is cached in obj->mm.get_page (current position, its page index
 * and a radix tree of already visited indices), so indices below the cached
 * position are resolved by a radix-tree lookup instead of walking the list
 * again.
 *
 * May sleep: calls might_sleep() and takes iter->lock when the cache has to
 * be extended.
 */
5567 struct scatterlist *
5568 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5569                        unsigned int n,
5570                        unsigned int *offset)
5571 {
5572         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5573         struct scatterlist *sg;
5574         unsigned int idx, count;
5575
5576         might_sleep();
5577         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5578         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5579
5580         /* As we iterate forward through the sg, we record each entry in a
5581          * radixtree for quick repeated (backwards) lookups. If we have seen
5582          * this index previously, we will have an entry for it.
5583          *
5584          * Initial lookup is O(N), but this is amortized to O(1) for
5585          * sequential page access (where each new request is consecutive
5586          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5587          * i.e. O(1) with a large constant!
5588          */
5589         if (n < READ_ONCE(iter->sg_idx))
5590                 goto lookup;
5591
5592         mutex_lock(&iter->lock);
5593
5594         /* We prefer to reuse the last sg so that repeated lookup of this
5595          * (or the subsequent) sg are fast - comparing against the last
5596          * sg is faster than going through the radixtree.
5597          */
5598
5599         sg = iter->sg_pos;
5600         idx = iter->sg_idx;
5601         count = __sg_page_count(sg);
5602
5603         while (idx + count <= n) {
5604                 void *entry;
5605                 unsigned long i;
5606                 int ret;
5607
5608                 /* If we cannot allocate and insert this entry, or the
5609                  * individual pages from this range, cancel updating the
5610                  * sg_idx so that on this lookup we are forced to linearly
5611                  * scan onwards, but on future lookups we will try the
5612                  * insertion again (in which case we need to be careful of
5613                  * the error return reporting that we have already inserted
5614                  * this index).
5615                  */
5616                 ret = radix_tree_insert(&iter->radix, idx, sg);
5617                 if (ret && ret != -EEXIST)
5618                         goto scan;
5619
             /*
              * The remaining pages of this sg entry are stored as value
              * entries that encode the index of the entry's first page.
              */
5620                 entry = xa_mk_value(idx);
5621                 for (i = 1; i < count; i++) {
5622                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5623                         if (ret && ret != -EEXIST)
5624                                 goto scan;
5625                 }
5626
5627                 idx += count;
5628                 sg = ____sg_next(sg);
5629                 count = __sg_page_count(sg);
5630         }
5631
5632 scan:
     /* Publish how far the cache now reaches before dropping the lock. */
5633         iter->sg_pos = sg;
5634         iter->sg_idx = idx;
5635
5636         mutex_unlock(&iter->lock);
5637
5638         if (unlikely(n < idx)) /* insertion completed by another thread */
5639                 goto lookup;
5640
5641         /* In case we failed to insert the entry into the radixtree, we need
5642          * to look beyond the current sg.
5643          */
5644         while (idx + count <= n) {
5645                 idx += count;
5646                 sg = ____sg_next(sg);
5647                 count = __sg_page_count(sg);
5648         }
5649
5650         *offset = n - idx;
5651         return sg;
5652
5653 lookup:
5654         rcu_read_lock();
5655
5656         sg = radix_tree_lookup(&iter->radix, n);
5657         GEM_BUG_ON(!sg);
5658
5659         /* If this index is in the middle of multi-page sg entry,
5660          * the radix tree will contain a value entry that points
5661          * to the start of that range. We will return the pointer to
5662          * the base page and the offset of this page within the
5663          * sg entry's range.
5664          */
5665         *offset = 0;
5666         if (unlikely(xa_is_value(sg))) {
5667                 unsigned long base = xa_to_value(sg);
5668
5669                 sg = radix_tree_lookup(&iter->radix, base);
5670                 GEM_BUG_ON(!sg);
5671
5672                 *offset = n - base;
5673         }
5674
5675         rcu_read_unlock();
5676
5677         return sg;
5678 }
5679
/*
 * Return the struct page backing page index @n of @obj. The object must be
 * backed by struct pages (asserted); the scatterlist entry is located with
 * i915_gem_object_get_sg() and the page is then picked within that entry.
 */
struct page *
i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
{
	unsigned int page_in_sg;
	struct scatterlist *sg;

	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));

	sg = i915_gem_object_get_sg(obj, n, &page_in_sg);

	return nth_page(sg_page(sg), page_in_sg);
}
5691
5692 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5693 struct page *
5694 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5695                                unsigned int n)
5696 {
5697         struct page *page;
5698
5699         page = i915_gem_object_get_page(obj, n);
5700         if (!obj->mm.dirty)
5701                 set_page_dirty(page);
5702
5703         return page;
5704 }
5705
5706 dma_addr_t
5707 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5708                                 unsigned long n)
5709 {
5710         struct scatterlist *sg;
5711         unsigned int offset;
5712
5713         sg = i915_gem_object_get_sg(obj, n, &offset);
5714         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5715 }
5716
5717 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5718 {
5719         struct sg_table *pages;
5720         int err;
5721
5722         if (align > obj->base.size)
5723                 return -EINVAL;
5724
5725         if (obj->ops == &i915_gem_phys_ops)
5726                 return 0;
5727
5728         if (obj->ops != &i915_gem_object_ops)
5729                 return -EINVAL;
5730
5731         err = i915_gem_object_unbind(obj);
5732         if (err)
5733                 return err;
5734
5735         mutex_lock(&obj->mm.lock);
5736
5737         if (obj->mm.madv != I915_MADV_WILLNEED) {
5738                 err = -EFAULT;
5739                 goto err_unlock;
5740         }
5741
5742         if (obj->mm.quirked) {
5743                 err = -EFAULT;
5744                 goto err_unlock;
5745         }
5746
5747         if (obj->mm.mapping) {
5748                 err = -EBUSY;
5749                 goto err_unlock;
5750         }
5751
5752         pages = __i915_gem_object_unset_pages(obj);
5753
5754         obj->ops = &i915_gem_phys_ops;
5755
5756         err = ____i915_gem_object_get_pages(obj);
5757         if (err)
5758                 goto err_xfer;
5759
5760         /* Perma-pin (until release) the physical set of pages */
5761         __i915_gem_object_pin_pages(obj);
5762
5763         if (!IS_ERR_OR_NULL(pages))
5764                 i915_gem_object_ops.put_pages(obj, pages);
5765         mutex_unlock(&obj->mm.lock);
5766         return 0;
5767
5768 err_xfer:
5769         obj->ops = &i915_gem_object_ops;
5770         if (!IS_ERR_OR_NULL(pages)) {
5771                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
5772
5773                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
5774         }
5775 err_unlock:
5776         mutex_unlock(&obj->mm.lock);
5777         return err;
5778 }
5779
5780 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5781 #include "selftests/scatterlist.c"
5782 #include "selftests/mock_gem_device.c"
5783 #include "selftests/huge_gem_object.c"
5784 #include "selftests/huge_pages.c"
5785 #include "selftests/i915_gem_object.c"
5786 #include "selftests/i915_gem_coherency.c"
5787 #include "selftests/i915_gem.c"
5788 #endif