drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/slab.h>
#include <drm/drm_print.h>

#include "amdgpu_ring_mux.h"
#include "amdgpu_ring.h"
#include "amdgpu.h"

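/*
 * Period of the fallback resubmit timer, and the unsignaled-fence latency
 * (in microseconds) beyond which a low-priority ring is considered stalled
 * enough to be worth preempting for.
 */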
#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
#define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000

static const struct ring_info {
        unsigned int hw_prio;
        const char *ring_name;
} sw_ring_info[] = {
        { AMDGPU_RING_PRIO_DEFAULT, "gfx_low"},
        { AMDGPU_RING_PRIO_2, "gfx_high"},
};

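/* Slab cache for the per-IB chunk records tracked by the multiplexer. */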
static struct kmem_cache *amdgpu_mux_chunk_slab;

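/*
 * Return the multiplexer entry backing a software ring, or NULL if the
 * ring's entry index is out of range.
 */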
static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
                                                                struct amdgpu_ring *ring)
{
        return ring->entry_index < mux->ring_entry_size ?
                        &mux->ring_entry[ring->entry_index] : NULL;
}

/* Copy packets from the software ring in the range [s_start, s_end). */
static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
                                                  struct amdgpu_ring *ring,
                                                  u64 s_start, u64 s_end)
{
        u64 start, end;
        struct amdgpu_ring *real_ring = mux->real_ring;

        start = s_start & ring->buf_mask;
        end = s_end & ring->buf_mask;

        if (start == end) {
                DRM_ERROR("no more data copied from sw ring\n");
                return;
        }
        if (start > end) {
                /* The range wraps past the end of the ring buffer: copy the
                 * tail and then the head of the software ring in two steps.
                 */
                amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
                amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
                                           (ring->ring_size >> 2) - start);
                amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
        } else {
                amdgpu_ring_alloc(real_ring, end - start);
                amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
        }
}

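/*
 * Resubmit chunks that were preempted before their fences signaled.
 * Called with mux->lock held: copies every chunk with a fence in
 * (last_seq, seqno_to_resubmit] back onto the real ring and commits it.
 */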
static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
{
        struct amdgpu_mux_entry *e = NULL;
        struct amdgpu_mux_chunk *chunk;
        uint32_t seq, last_seq;
        int i;

        if (!mux->s_resubmit)
                return;

        /* Find the low-priority entry. */
        for (i = 0; i < mux->num_ring_entries; i++) {
                if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
                        e = &mux->ring_entry[i];
                        break;
                }
        }

        if (!e) {
                DRM_ERROR("%s no low priority ring found\n", __func__);
                return;
        }

        last_seq = atomic_read(&e->ring->fence_drv.last_seq);
        seq = mux->seqno_to_resubmit;
        if (last_seq < seq) {
                /* Resubmit all the fences in (last_seq, seq]. */
                list_for_each_entry(chunk, &e->list, entry) {
                        if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
                                amdgpu_fence_update_start_timestamp(e->ring,
                                                                    chunk->sync_seq,
                                                                    ktime_get());
                                amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
                                                                      chunk->start,
                                                                      chunk->end);
                                mux->wptr_resubmit = chunk->end;
                                amdgpu_ring_commit(mux->real_ring);
                        }
                }
        }

        del_timer(&mux->resubmit_timer);
        mux->s_resubmit = false;
}

static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
{
        mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
}

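/*
 * Fallback timer callback: retry the resubmission. If mux->lock is
 * contended, back off and reschedule the timer instead of spinning.
 */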
static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
{
        struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);

        if (!spin_trylock(&mux->lock)) {
                amdgpu_ring_mux_schedule_resubmit(mux);
                DRM_ERROR("reschedule resubmit\n");
                return;
        }
        amdgpu_mux_resubmit_chunks(mux);
        spin_unlock(&mux->lock);
}

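/*
 * Set up the multiplexer on top of the real ring: allocate the entry table
 * for the software rings, create the chunk slab cache and initialize the
 * fallback resubmit timer.
 */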
int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
                         unsigned int entry_size)
{
        mux->real_ring = ring;
        mux->num_ring_entries = 0;

        mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
        if (!mux->ring_entry)
                return -ENOMEM;

        mux->ring_entry_size = entry_size;
        mux->s_resubmit = false;

        amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
                                                  sizeof(struct amdgpu_mux_chunk), 0,
                                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!amdgpu_mux_chunk_slab) {
                DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
                /* Don't leak the entry table on failure. */
                kfree(mux->ring_entry);
                mux->ring_entry = NULL;
                return -ENOMEM;
        }

        spin_lock_init(&mux->lock);
        timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);

        return 0;
}

void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
{
        struct amdgpu_mux_entry *e;
        struct amdgpu_mux_chunk *chunk, *chunk2;
        int i;

        for (i = 0; i < mux->num_ring_entries; i++) {
                e = &mux->ring_entry[i];
                list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
                        list_del(&chunk->entry);
                        kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
                }
        }
        kmem_cache_destroy(amdgpu_mux_chunk_slab);
        kfree(mux->ring_entry);
        mux->ring_entry = NULL;
        mux->num_ring_entries = 0;
        mux->ring_entry_size = 0;
}

int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
        struct amdgpu_mux_entry *e;

        if (mux->num_ring_entries >= mux->ring_entry_size) {
                DRM_ERROR("add sw ring exceeding max entry size\n");
                return -ENOENT;
        }

        e = &mux->ring_entry[mux->num_ring_entries];
        ring->entry_index = mux->num_ring_entries;
        e->ring = ring;

        INIT_LIST_HEAD(&e->list);
        mux->num_ring_entries += 1;
        return 0;
}

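/*
 * Advance the write pointer of a software ring: copy the newly written
 * packets onto the real ring and record where they landed so the read
 * pointer can be mapped back later.
 */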
void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
{
        struct amdgpu_mux_entry *e;

        spin_lock(&mux->lock);

        if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
                amdgpu_mux_resubmit_chunks(mux);

        e = amdgpu_ring_mux_sw_entry(mux, ring);
        if (!e) {
                DRM_ERROR("cannot find entry for sw ring\n");
                spin_unlock(&mux->lock);
                return;
        }

        /* Skip this wptr update while a preemption is in progress. */
        if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
                spin_unlock(&mux->lock);
                return;
        }

        e->sw_cptr = e->sw_wptr;
        /* Advance cptr past packets already copied by the resubmit path. */
        if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
                e->sw_cptr = mux->wptr_resubmit;
        e->sw_wptr = wptr;
        e->start_ptr_in_hw_ring = mux->real_ring->wptr;

        /* Skip copying packets that have already been resubmitted. */
        if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
                amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
                e->end_ptr_in_hw_ring = mux->real_ring->wptr;
                amdgpu_ring_commit(mux->real_ring);
        } else {
                e->end_ptr_in_hw_ring = mux->real_ring->wptr;
        }
        spin_unlock(&mux->lock);
}

u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
        struct amdgpu_mux_entry *e;

        e = amdgpu_ring_mux_sw_entry(mux, ring);
        if (!e) {
                DRM_ERROR("cannot find entry for sw ring\n");
                return 0;
        }

        return e->sw_wptr;
}

/**
 * amdgpu_ring_mux_get_rptr - get the readptr of the software ring
 * @mux: the multiplexer the software rings attach to
 * @ring: the software ring of which we calculate the readptr
 *
 * The returned readptr is not precise while other rings could write data
 * onto the real ring buffer. After an overwrite on the real ring, we cannot
 * decide whether our packets have been executed or not even read yet.
 * However, this function is only called by tools such as umr to collect the
 * latest packets for hang analysis, and we assume the hang happened near
 * our latest submission. Thus we use the following logic to give a clue:
 * If the readptr is between start and end, return the copy pointer plus the
 * distance from start to readptr. If the readptr is before start, return
 * the copy pointer. Lastly, if the readptr is past end, return the write
 * pointer.
 */
u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
        struct amdgpu_mux_entry *e;
        u64 readp, offset, start, end;

        e = amdgpu_ring_mux_sw_entry(mux, ring);
        if (!e) {
                DRM_ERROR("no sw entry found!\n");
                return 0;
        }

        readp = amdgpu_ring_get_rptr(mux->real_ring);

        start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
        end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
        if (start > end) {
                /* The copied range wraps around the real ring: unwrap readp
                 * and end so the interval comparison below stays linear.
                 */
                if (readp <= end)
                        readp += mux->real_ring->ring_size >> 2;
                end += mux->real_ring->ring_size >> 2;
        }

        if (start <= readp && readp <= end) {
                offset = readp - start;
                e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
        } else if (readp < start) {
                e->sw_rptr = e->sw_cptr;
        } else {
                /* end < readptr */
                e->sw_rptr = e->sw_wptr;
        }

        return e->sw_rptr;
}

u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
{
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

        WARN_ON(!ring->is_sw_ring);
        return amdgpu_ring_mux_get_rptr(mux, ring);
}

u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
{
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

        WARN_ON(!ring->is_sw_ring);
        return amdgpu_ring_mux_get_wptr(mux, ring);
}

void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
{
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

        WARN_ON(!ring->is_sw_ring);
        amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
}

/* Override insert_nop to prevent emitting nops to the software rings */
void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
        WARN_ON(!ring->is_sw_ring);
}

const char *amdgpu_sw_ring_name(int idx)
{
        return idx < ARRAY_SIZE(sw_ring_info) ?
                sw_ring_info[idx].ring_name : NULL;
}

unsigned int amdgpu_sw_ring_priority(int idx)
{
        return idx < ARRAY_SIZE(sw_ring_info) ?
                sw_ring_info[idx].hw_prio : AMDGPU_RING_PRIO_DEFAULT;
}

/*
 * Check whether preemption is warranted: the high-priority ring must have
 * no fences outstanding, while a low-priority ring has had a fence
 * unsignaled for longer than the threshold.
 */
static int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
{
        struct amdgpu_ring *ring;
        int i, need_preempt;

        need_preempt = 0;
        for (i = 0; i < mux->num_ring_entries; i++) {
                ring = mux->ring_entry[i].ring;
                if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
                    amdgpu_fence_count_emitted(ring) > 0)
                        return 0;
                if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
                    amdgpu_fence_last_unsignaled_time_us(ring) >
                    AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US)
                        need_preempt = 1;
        }
        return need_preempt && !mux->s_resubmit;
}

/* Trigger Mid-Command Buffer Preemption (MCBP) on the real ring; the
 * trailing-fence interrupt handler decides whether a resubmit is needed.
 */
static int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
{
        int r;

        spin_lock(&mux->lock);
        mux->pending_trailing_fence_signaled = true;
        r = amdgpu_ring_preempt_ib(mux->real_ring);
        spin_unlock(&mux->lock);
        return r;
}

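/*
 * IB begin/end hooks for the software rings. A high-priority submission
 * checks whether the low-priority ring should be preempted; a low-priority
 * submission opens a chunk so the IB can be resubmitted if it gets
 * preempted later.
 */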
void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
{
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

        WARN_ON(!ring->is_sw_ring);
        if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
                if (amdgpu_mcbp_scan(mux) > 0)
                        amdgpu_mcbp_trigger_preempt(mux);
                return;
        }

        amdgpu_ring_mux_start_ib(mux, ring);
}

void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
{
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

        WARN_ON(!ring->is_sw_ring);
        if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
                return;
        amdgpu_ring_mux_end_ib(mux, ring);
}

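/*
 * Open a chunk for the IB being submitted, recording the current software
 * ring wptr as its start. Any pending resubmission is flushed first.
 */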
void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
        struct amdgpu_mux_entry *e;
        struct amdgpu_mux_chunk *chunk;

        spin_lock(&mux->lock);
        amdgpu_mux_resubmit_chunks(mux);
        spin_unlock(&mux->lock);

        e = amdgpu_ring_mux_sw_entry(mux, ring);
        if (!e) {
                DRM_ERROR("cannot find entry!\n");
                return;
        }

        chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
        if (!chunk) {
                DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
                return;
        }

        chunk->start = ring->wptr;
        list_add_tail(&chunk->entry, &e->list);
}

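/* Release the chunks whose fences have already signaled. */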
static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
        uint32_t last_seq = 0;
        struct amdgpu_mux_entry *e;
        struct amdgpu_mux_chunk *chunk, *tmp;

        e = amdgpu_ring_mux_sw_entry(mux, ring);
        if (!e) {
                DRM_ERROR("cannot find entry!\n");
                return;
        }

        last_seq = atomic_read(&ring->fence_drv.last_seq);

        list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
                if (chunk->sync_seq <= last_seq) {
                        list_del(&chunk->entry);
                        kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
                }
        }
}

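/*
 * Close the chunk opened by amdgpu_ring_mux_start_ib: record the end
 * offset and the fence sequence number the chunk must wait on.
 */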
void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
        struct amdgpu_mux_entry *e;
        struct amdgpu_mux_chunk *chunk;

        e = amdgpu_ring_mux_sw_entry(mux, ring);
        if (!e) {
                DRM_ERROR("cannot find entry!\n");
                return;
        }

        /* list_last_entry() never returns NULL, so check for an empty list
         * before dereferencing its result.
         */
        if (list_empty(&e->list)) {
                DRM_ERROR("cannot find chunk!\n");
                return;
        }

        chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
        chunk->end = ring->wptr;
        chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);

        scan_and_remove_signaled_chunk(mux, ring);
}

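/*
 * Called from the fence interrupt path once the trailing fence written by
 * the preemption request has signaled: process the low-priority ring's
 * fences and, if any are still outstanding, schedule a resubmission of the
 * preempted chunks.
 */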
bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
{
        struct amdgpu_mux_entry *e;
        struct amdgpu_ring *ring = NULL;
        int i;

        if (!mux->pending_trailing_fence_signaled)
                return false;

        if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
                return false;

        for (i = 0; i < mux->num_ring_entries; i++) {
                e = &mux->ring_entry[i];
                if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
                        ring = e->ring;
                        break;
                }
        }

        if (!ring) {
                DRM_ERROR("cannot find low priority ring\n");
                return false;
        }

        amdgpu_fence_process(ring);
        if (amdgpu_fence_count_emitted(ring) > 0) {
                mux->s_resubmit = true;
                mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
                amdgpu_ring_mux_schedule_resubmit(mux);
        }

        mux->pending_trailing_fence_signaled = false;
        return true;
}