From: Ben Skeggs Date: Wed, 1 Jun 2022 10:48:17 +0000 (+1000) Subject: drm/nouveau/gr/gv100-: port smid mapping code from nvgpu X-Git-Tag: io_uring-6.2-2022-12-19~36^2~20^2~12 X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=3ffa6f329b610029b44ebd7bc2320a92468a0e42;p=linux-block.git drm/nouveau/gr/gv100-: port smid mapping code from nvgpu Essentially ripped verbatim from NVGPU, comments and all, and adapted to nvkm's structs and style. - maybe fixes an nvgpu bug though, a small tweak was needed to match RM v2: - remove unnecessary WARN_ON Signed-off-by: Ben Skeggs Reviewed-by: Lyude Paul --- diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c index 9975a4d5c058..d68741a81a58 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgv100.c @@ -157,6 +157,9 @@ static void gv100_grctx_generate_sm_id(struct gf100_gr *gr, int gpc, int tpc, int sm) { struct nvkm_device *device = gr->base.engine.subdev.device; + + tpc = gv100_gr_nonpes_aware_tpc(gr, gpc, tpc); + nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x608), sm); nvkm_wr32(device, GPC_UNIT(gpc, 0x0c10 + tpc * 4), sm); nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x088), sm); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c index 1a151e89ea05..da1bac3963a9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxtu102.c @@ -34,6 +34,9 @@ static void tu102_grctx_generate_sm_id(struct gf100_gr *gr, int gpc, int tpc, int sm) { struct nvkm_device *device = gr->base.engine.subdev.device; + + tpc = gv100_gr_nonpes_aware_tpc(gr, gpc, tpc); + nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x608), sm); nvkm_wr32(device, TPC_UNIT(gpc, tpc, 0x088), sm); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c index ebbd7476e0fe..5ce83b915ebb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c @@ -1889,10 +1889,11 @@ gf100_gr_init_ctxctl(struct gf100_gr *gr) return ret; } -void +int gf100_gr_oneinit_sm_id(struct gf100_gr *gr) { int tpc, gpc; + for (tpc = 0; tpc < gr->tpc_max; tpc++) { for (gpc = 0; gpc < gr->gpc_nr; gpc++) { if (tpc < gr->tpc_nr[gpc]) { @@ -1902,6 +1903,8 @@ gf100_gr_oneinit_sm_id(struct gf100_gr *gr) } } } + + return 0; } void @@ -2019,6 +2022,8 @@ gf100_gr_oneinit(struct nvkm_gr *base) if (gr->ppc_tpc_max < gr->ppc_tpc_nr[i][j]) gr->ppc_tpc_max = gr->ppc_tpc_nr[i][j]; } + + gr->ppc_total += gr->ppc_nr[i]; } /* Allocate global context buffers. */ @@ -2046,8 +2051,8 @@ gf100_gr_oneinit(struct nvkm_gr *base) memset(gr->tile, 0xff, sizeof(gr->tile)); gr->func->oneinit_tiles(gr); - gr->func->oneinit_sm_id(gr); - return 0; + + return gr->func->oneinit_sm_id(gr); } static int diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h index 9c6823672d2e..a9ba9af25728 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h @@ -107,6 +107,7 @@ struct gf100_gr { u8 ppc_tpc_nr[GPC_MAX][4]; u8 ppc_tpc_min; u8 ppc_tpc_max; + u8 ppc_total; struct nvkm_memory *pagepool; struct nvkm_memory *bundle_cb; @@ -141,7 +142,7 @@ struct gf100_gr_func_zbc { struct gf100_gr_func { void (*oneinit_tiles)(struct gf100_gr *); - void (*oneinit_sm_id)(struct gf100_gr *); + int (*oneinit_sm_id)(struct gf100_gr *); int (*init)(struct gf100_gr *); void (*init_419bd8)(struct gf100_gr *); void (*init_gpc_mmu)(struct gf100_gr *); @@ -191,7 +192,7 @@ struct gf100_gr_func { int gf100_gr_rops(struct gf100_gr *); void gf100_gr_oneinit_tiles(struct gf100_gr *); -void gf100_gr_oneinit_sm_id(struct gf100_gr *); +int gf100_gr_oneinit_sm_id(struct gf100_gr *); int gf100_gr_init(struct gf100_gr *); void gf100_gr_init_vsc_stream_master(struct gf100_gr *); void gf100_gr_init_zcull(struct gf100_gr *); @@ -228,7 +229,7 @@ int gk20a_gr_aiv_to_init(struct nvkm_blob *, struct gf100_gr_pack **); int gk20a_gr_av_to_method(struct nvkm_blob *, struct gf100_gr_pack **); void gm200_gr_oneinit_tiles(struct gf100_gr *); -void gm200_gr_oneinit_sm_id(struct gf100_gr *); +int gm200_gr_oneinit_sm_id(struct gf100_gr *); int gm200_gr_rops(struct gf100_gr *); void gm200_gr_init_num_active_ltcs(struct gf100_gr *); void gm200_gr_init_ds_hww_esr_2(struct gf100_gr *); @@ -245,6 +246,8 @@ extern const struct gf100_gr_func_zbc gp102_gr_zbc; extern const struct gf100_gr_func gp107_gr; +int gv100_gr_oneinit_sm_id(struct gf100_gr *); +u32 gv100_gr_nonpes_aware_tpc(struct gf100_gr *gr, u32 gpc, u32 tpc); void gv100_gr_init_419bd8(struct gf100_gr *); void gv100_gr_init_504430(struct gf100_gr *, int, int); void gv100_gr_init_shader_exceptions(struct gf100_gr *, int, int); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c index b9d74d65038d..b5210b31c1b2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gm200.c @@ -148,11 +148,11 @@ gm200_gr_tile_map_2_8[] = { 0, 1, 1, 0, 0, 1, 1, 0, }; -void +int gm200_gr_oneinit_sm_id(struct gf100_gr *gr) { /*XXX: There's a different algorithm here I've not yet figured out. */ - gf100_gr_oneinit_sm_id(gr); + return gf100_gr_oneinit_sm_id(gr); } void diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c index 25228d09a30b..aeb767e582c8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gv100.c @@ -85,10 +85,202 @@ gv100_gr_init_419bd8(struct gf100_gr *gr) nvkm_mask(device, 0x419bd8, 0x00000700, 0x00000000); } +u32 +gv100_gr_nonpes_aware_tpc(struct gf100_gr *gr, u32 gpc, u32 tpc) +{ + u32 pes, temp, tpc_new = 0; + + for (pes = 0; pes < gr->ppc_nr[gpc]; pes++) { + if (gr->ppc_tpc_mask[gpc][pes] & BIT(tpc)) + break; + + tpc_new += gr->ppc_tpc_nr[gpc][pes]; + } + + temp = (BIT(tpc) - 1) & gr->ppc_tpc_mask[gpc][pes]; + temp = hweight32(temp); + return tpc_new + temp; +} + +static int +gv100_gr_scg_estimate_perf(struct gf100_gr *gr, unsigned long *gpc_tpc_mask, + u32 disable_gpc, u32 disable_tpc, int *perf) +{ + const u32 scale_factor = 512UL; /* Use fx23.9 */ + const u32 pix_scale = 1024*1024UL; /* Pix perf in [29:20] */ + const u32 world_scale = 1024UL; /* World performance in [19:10] */ + const u32 tpc_scale = 1; /* TPC balancing in [9:0] */ + u32 scg_num_pes = 0; + u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */ + u32 average_tpcs = 0; /* Average of # of TPCs per GPC */ + u32 deviation; /* absolute diff between TPC# and average_tpcs, averaged across GPCs */ + u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */ + u32 tpc_balance; + u32 scg_gpc_pix_perf; + u32 scg_world_perf; + u32 gpc; + u32 pes; + int diff; + bool tpc_removed_gpc = false; + bool tpc_removed_pes = false; + u32 max_tpc_gpc = 0; + u32 num_tpc_mask; + u32 *num_tpc_gpc; + int ret = -EINVAL; + + if (!(num_tpc_gpc = kcalloc(gr->gpc_nr, sizeof(*num_tpc_gpc), GFP_KERNEL))) + return -ENOMEM; + + /* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */ + for (gpc = 0; gpc < gr->gpc_nr; gpc++) { + num_tpc_mask = gpc_tpc_mask[gpc]; + + if ((gpc == disable_gpc) && num_tpc_mask & BIT(disable_tpc)) { + /* Safety check if a TPC is removed twice */ + if (WARN_ON(tpc_removed_gpc)) + goto done; + + /* Remove logical TPC from set */ + num_tpc_mask &= ~BIT(disable_tpc); + tpc_removed_gpc = true; + } + + /* track balancing of tpcs across gpcs */ + num_tpc_gpc[gpc] = hweight32(num_tpc_mask); + average_tpcs += num_tpc_gpc[gpc]; + + /* save the maximum numer of gpcs */ + max_tpc_gpc = num_tpc_gpc[gpc] > max_tpc_gpc ? num_tpc_gpc[gpc] : max_tpc_gpc; + + /* + * Calculate ratio between TPC count and post-FS and post-SCG + * + * ratio represents relative throughput of the GPC + */ + scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc] / gr->tpc_nr[gpc]; + if (min_scg_gpc_pix_perf > scg_gpc_pix_perf) + min_scg_gpc_pix_perf = scg_gpc_pix_perf; + + /* Calculate # of surviving PES */ + for (pes = 0; pes < gr->ppc_nr[gpc]; pes++) { + /* Count the number of TPC on the set */ + num_tpc_mask = gr->ppc_tpc_mask[gpc][pes] & gpc_tpc_mask[gpc]; + + if ((gpc == disable_gpc) && (num_tpc_mask & BIT(disable_tpc))) { + if (WARN_ON(tpc_removed_pes)) + goto done; + + num_tpc_mask &= ~BIT(disable_tpc); + tpc_removed_pes = true; + } + + if (hweight32(num_tpc_mask)) + scg_num_pes++; + } + } + + if (WARN_ON(!tpc_removed_gpc || !tpc_removed_pes)) + goto done; + + if (max_tpc_gpc == 0) { + *perf = 0; + goto done_ok; + } + + /* Now calculate perf */ + scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_total; + deviation = 0; + average_tpcs = scale_factor * average_tpcs / gr->gpc_nr; + for (gpc = 0; gpc < gr->gpc_nr; gpc++) { + diff = average_tpcs - scale_factor * num_tpc_gpc[gpc]; + if (diff < 0) + diff = -diff; + + deviation += diff; + } + + deviation /= gr->gpc_nr; + + norm_tpc_deviation = deviation / max_tpc_gpc; + + tpc_balance = scale_factor - norm_tpc_deviation; + + if ((tpc_balance > scale_factor) || + (scg_world_perf > scale_factor) || + (min_scg_gpc_pix_perf > scale_factor) || + (norm_tpc_deviation > scale_factor)) { + WARN_ON(1); + goto done; + } + + *perf = (pix_scale * min_scg_gpc_pix_perf) + + (world_scale * scg_world_perf) + + (tpc_scale * tpc_balance); +done_ok: + ret = 0; +done: + kfree(num_tpc_gpc); + return ret; +} + +int +gv100_gr_oneinit_sm_id(struct gf100_gr *gr) +{ + unsigned long *gpc_tpc_mask; + u32 *tpc_table, *gpc_table; + u32 gpc, tpc, pes, gtpc; + int perf, maxperf, ret = 0; + + gpc_tpc_mask = kcalloc(gr->gpc_nr, sizeof(*gpc_tpc_mask), GFP_KERNEL); + gpc_table = kcalloc(gr->tpc_total, sizeof(*gpc_table), GFP_KERNEL); + tpc_table = kcalloc(gr->tpc_total, sizeof(*tpc_table), GFP_KERNEL); + if (!gpc_table || !tpc_table || !gpc_tpc_mask) { + ret = -ENOMEM; + goto done; + } + + for (gpc = 0; gpc < gr->gpc_nr; gpc++) { + for (pes = 0; pes < gr->ppc_nr[gpc]; pes++) + gpc_tpc_mask[gpc] |= gr->ppc_tpc_mask[gpc][pes]; + } + + for (gtpc = 0; gtpc < gr->tpc_total; gtpc++) { + for (maxperf = -1, gpc = 0; gpc < gr->gpc_nr; gpc++) { + for_each_set_bit(tpc, &gpc_tpc_mask[gpc], gr->tpc_nr[gpc]) { + ret = gv100_gr_scg_estimate_perf(gr, gpc_tpc_mask, gpc, tpc, &perf); + if (ret) + goto done; + + /* nvgpu does ">=" here, but this gets us RM's numbers. */ + if (perf > maxperf) { + maxperf = perf; + gpc_table[gtpc] = gpc; + tpc_table[gtpc] = tpc; + } + } + } + + gpc_tpc_mask[gpc_table[gtpc]] &= ~BIT(tpc_table[gtpc]); + } + + /*TODO: build table for sm_per_tpc != 1, don't use yet, but might need later? */ + for (gtpc = 0; gtpc < gr->tpc_total; gtpc++) { + gr->sm[gtpc].gpc = gpc_table[gtpc]; + gr->sm[gtpc].tpc = tpc_table[gtpc]; + gr->sm_nr++; + } + +done: + kfree(gpc_table); + kfree(tpc_table); + kfree(gpc_tpc_mask); + return ret; +} + static const struct gf100_gr_func gv100_gr = { .oneinit_tiles = gm200_gr_oneinit_tiles, - .oneinit_sm_id = gm200_gr_oneinit_sm_id, + .oneinit_sm_id = gv100_gr_oneinit_sm_id, .init = gf100_gr_init, .init_419bd8 = gv100_gr_init_419bd8, .init_gpc_mmu = gm200_gr_init_gpc_mmu, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c index 3491005cf5d7..1b1c6b219fc2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c @@ -40,8 +40,9 @@ tu102_gr_init_fs(struct gf100_gr *gr) gk104_grctx_generate_gpc_tpc_nr(gr); for (sm = 0; sm < gr->sm_nr; sm++) { - nvkm_wr32(device, GPC_UNIT(gr->sm[sm].gpc, 0x0c10 + - gr->sm[sm].tpc * 4), sm); + int tpc = gv100_gr_nonpes_aware_tpc(gr, gr->sm[sm].gpc, gr->sm[sm].tpc); + + nvkm_wr32(device, GPC_UNIT(gr->sm[sm].gpc, 0x0c10 + tpc * 4), sm); } gm200_grctx_generate_dist_skip_table(gr); @@ -93,7 +94,7 @@ tu102_gr_init_gpc_mmu(struct gf100_gr *gr) static const struct gf100_gr_func tu102_gr = { .oneinit_tiles = gm200_gr_oneinit_tiles, - .oneinit_sm_id = gm200_gr_oneinit_sm_id, + .oneinit_sm_id = gv100_gr_oneinit_sm_id, .init = gf100_gr_init, .init_419bd8 = gv100_gr_init_419bd8, .init_gpc_mmu = tu102_gr_init_gpc_mmu,