Merge tag 'drm-vc4-fixes-2016-09-14' of https://github.com/anholt/linux into drm...
[linux-2.6-block.git] / drivers / gpu / drm / vc4 / vc4_validate_shaders.c
index f67124b4c5348706136e4c0f9fe06ea10b77b230..2543cf5b8b51869d51b72a5db5017dded38761be 100644 (file)
 #include "vc4_drv.h"
 #include "vc4_qpu_defines.h"
 
+#define LIVE_REG_COUNT (32 + 32 + 4)
+
 struct vc4_shader_validation_state {
+       /* Current IP being validated. */
+       uint32_t ip;
+
+       /* IP at the end of the BO, do not read shader[max_ip] */
+       uint32_t max_ip;
+
+       uint64_t *shader;
+
        struct vc4_texture_sample_info tmu_setup[2];
        int tmu_write_count[2];
 
@@ -49,8 +59,30 @@ struct vc4_shader_validation_state {
         *
         * This is used for the validation of direct address memory reads.
         */
-       uint32_t live_min_clamp_offsets[32 + 32 + 4];
-       bool live_max_clamp_regs[32 + 32 + 4];
+       uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
+       bool live_max_clamp_regs[LIVE_REG_COUNT];
+       uint32_t live_immediates[LIVE_REG_COUNT];
+
+       /* Bitfield of which IPs are used as branch targets.
+        *
+        * Used for validation that the uniform stream is updated at the right
+        * points and clearing the texturing/clamping state.
+        */
+       unsigned long *branch_targets;
+
+       /* Set when entering a basic block, and cleared when the uniform
+        * address update is found.  This is used to make sure that we don't
+        * read uniforms when the address is undefined.
+        */
+       bool needs_uniform_address_update;
+
+       /* Set when we find a backwards branch.  If the branch is backwards,
+        * the target is probably doing an address reset to read uniforms,
+        * and so we need to be sure that a uniforms address is present in the
+        * stream, even if the shader didn't need to read uniforms in later
+        * basic blocks.
+        */
+       bool needs_uniform_address_for_loop;
 };
 
 static uint32_t
@@ -129,11 +161,11 @@ record_texture_sample(struct vc4_validated_shader_info *validated_shader,
 }
 
 static bool
-check_tmu_write(uint64_t inst,
-               struct vc4_validated_shader_info *validated_shader,
+check_tmu_write(struct vc4_validated_shader_info *validated_shader,
                struct vc4_shader_validation_state *validation_state,
                bool is_mul)
 {
+       uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t waddr = (is_mul ?
                          QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
                          QPU_GET_FIELD(inst, QPU_WADDR_ADD));
@@ -162,7 +194,7 @@ check_tmu_write(uint64_t inst,
                        return false;
                }
 
-               /* We assert that the the clamped address is the first
+               /* We assert that the clamped address is the first
                 * argument, and the UBO base address is the second argument.
                 * This is arbitrary, but simpler than supporting flipping the
                 * two either way.
@@ -212,8 +244,14 @@ check_tmu_write(uint64_t inst,
        /* Since direct uses a RADDR uniform reference, it will get counted in
         * check_instruction_reads()
         */
-       if (!is_direct)
+       if (!is_direct) {
+               if (validation_state->needs_uniform_address_update) {
+                       DRM_ERROR("Texturing with undefined uniform address\n");
+                       return false;
+               }
+
                validated_shader->uniforms_size += 4;
+       }
 
        if (submit) {
                if (!record_texture_sample(validated_shader,
@@ -227,23 +265,144 @@ check_tmu_write(uint64_t inst,
        return true;
 }
 
+/* Records that the next uniform in the shader's uniform stream has to be a
+ * uniforms address, by appending the current uniform index
+ * (uniforms_size / 4) to validated_shader->uniform_addr_offsets.
+ *
+ * Returns false if the offsets array could not be grown.  In that case the
+ * existing array and num_uniform_addr_offsets are left exactly as they
+ * were, so the owner of validated_shader can still free the array.
+ */
+static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
+{
+       uint32_t o = validated_shader->num_uniform_addr_offsets;
+       uint32_t num_uniforms = validated_shader->uniforms_size / 4;
+       void *new_offsets;
+
+       /* krealloc() leaves the old allocation intact when it fails, so the
+        * stored pointer must only be replaced once the resize succeeded.
+        * Assigning the result directly would drop the only reference to
+        * the old array on failure.
+        */
+       new_offsets = krealloc(validated_shader->uniform_addr_offsets,
+                              (o + 1) *
+                              sizeof(*validated_shader->uniform_addr_offsets),
+                              GFP_KERNEL);
+       if (!new_offsets)
+               return false;
+
+       validated_shader->uniform_addr_offsets = new_offsets;
+       validated_shader->uniform_addr_offsets[o] = num_uniforms;
+       validated_shader->num_uniform_addr_offsets++;
+
+       return true;
+}
+
 static bool
-check_reg_write(uint64_t inst,
-               struct vc4_validated_shader_info *validated_shader,
+validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
+                              struct vc4_shader_validation_state *validation_state,
+                              bool is_mul)
+{
+       uint64_t inst = validation_state->shader[validation_state->ip];
+       u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+       u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+       u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+       u32 add_lri = raddr_add_a_to_live_reg_index(inst);
+       /* We want our reset to be pointing at whatever uniform follows the
+        * uniforms base address.  The expected value is therefore the
+        * number of uniform bytes counted so far (uniforms_size) plus 4.
+        */
+       u32 expected_offset = validated_shader->uniforms_size + 4;
+
+       /* We only support absolute uniform address changes, and we
+        * require that they be in the current basic block before any
+        * of its uniform reads.
+        *
+        * One could potentially emit more efficient QPU code, by
+        * noticing that (say) an if statement does uniform control
+        * flow for all threads and that the if reads the same number
+        * of uniforms on each side.  However, this scheme is easy to
+        * validate so it's all we allow for now.
+        *
+        * Concretely, the instruction at validation_state->ip is accepted
+        * only if all of the following hold, checked in this order:
+        *  - its signal is one of NONE, SCOREBOARD_UNLOCK, COLOR_LOAD,
+        *    LOAD_TMU0 or LOAD_TMU1;
+        *  - the write comes from the add operation (not is_mul) and that
+        *    operation is QPU_A_ADD;
+        *  - the add condition is QPU_COND_ALWAYS;
+        *  - it is not the case that the pack field is something other
+        *    than QPU_PACK_A_NOP while QPU_PM is clear;
+        *  - the first add operand maps to a live register index whose
+        *    tracked immediate (live_immediates) equals expected_offset;
+        *  - the second add operand is a read of QPU_R_UNIF through
+        *    either mux A/raddr_a or mux B/raddr_b.
+        *
+        * Any failure logs a DRM_ERROR and returns false.  On success both
+        * needs_uniform_address_update and needs_uniform_address_for_loop
+        * are cleared and the result of require_uniform_address_uniform()
+        * is returned.
+        */
+       switch (QPU_GET_FIELD(inst, QPU_SIG)) {
+       case QPU_SIG_NONE:
+       case QPU_SIG_SCOREBOARD_UNLOCK:
+       case QPU_SIG_COLOR_LOAD:
+       case QPU_SIG_LOAD_TMU0:
+       case QPU_SIG_LOAD_TMU1:
+               break;
+       default:
+               DRM_ERROR("uniforms address change must be "
+                         "normal math\n");
+               return false;
+       }
+
+       if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
+               DRM_ERROR("Uniform address reset must be an ADD.\n");
+               return false;
+       }
+
+       if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
+               DRM_ERROR("Uniform address reset must be unconditional.\n");
+               return false;
+       }
+
+       if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
+           !(inst & QPU_PM)) {
+               DRM_ERROR("No packing allowed on uniforms reset\n");
+               return false;
+       }
+
+       if (add_lri == -1) {
+               DRM_ERROR("First argument of uniform address write must be "
+                         "an immediate value.\n");
+               return false;
+       }
+
+       if (validation_state->live_immediates[add_lri] != expected_offset) {
+               DRM_ERROR("Resetting uniforms with offset %db instead of %db\n",
+                         validation_state->live_immediates[add_lri],
+                         expected_offset);
+               return false;
+       }
+
+       if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+           !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+               DRM_ERROR("Second argument of uniform address write must be "
+                         "a uniform.\n");
+               return false;
+       }
+
+       validation_state->needs_uniform_address_update = false;
+       validation_state->needs_uniform_address_for_loop = false;
+       return require_uniform_address_uniform(validated_shader);
+}
+
+static bool
+check_reg_write(struct vc4_validated_shader_info *validated_shader,
                struct vc4_shader_validation_state *validation_state,
                bool is_mul)
 {
+       uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t waddr = (is_mul ?
                          QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
                          QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+       uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+       bool ws = inst & QPU_WS;
+       bool is_b = is_mul ^ ws;
+       u32 lri = waddr_to_live_reg_index(waddr, is_b);
+
+       if (lri != -1) {
+               uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+               uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);
+
+               if (sig == QPU_SIG_LOAD_IMM &&
+                   QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
+                   ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
+                    (!is_mul && cond_add == QPU_COND_ALWAYS))) {
+                       validation_state->live_immediates[lri] =
+                               QPU_GET_FIELD(inst, QPU_LOAD_IMM);
+               } else {
+                       validation_state->live_immediates[lri] = ~0;
+               }
+       }
 
        switch (waddr) {
        case QPU_W_UNIFORMS_ADDRESS:
-               /* XXX: We'll probably need to support this for reladdr, but
-                * it's definitely a security-related one.
-                */
-               DRM_ERROR("uniforms address load unsupported\n");
-               return false;
+               if (is_b) {
+                       DRM_ERROR("relative uniforms address change "
+                                 "unsupported\n");
+                       return false;
+               }
+
+               return validate_uniform_address_write(validated_shader,
+                                                     validation_state,
+                                                     is_mul);
 
        case QPU_W_TLB_COLOR_MS:
        case QPU_W_TLB_COLOR_ALL:
@@ -261,7 +420,7 @@ check_reg_write(uint64_t inst,
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
-               return check_tmu_write(inst, validated_shader, validation_state,
+               return check_tmu_write(validated_shader, validation_state,
                                       is_mul);
 
        case QPU_W_HOST_INT:
@@ -294,10 +453,10 @@ check_reg_write(uint64_t inst,
 }
 
 static void
-track_live_clamps(uint64_t inst,
-                 struct vc4_validated_shader_info *validated_shader,
+track_live_clamps(struct vc4_validated_shader_info *validated_shader,
                  struct vc4_shader_validation_state *validation_state)
 {
+       uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
@@ -369,10 +528,10 @@ track_live_clamps(uint64_t inst,
 }
 
 static bool
-check_instruction_writes(uint64_t inst,
-                        struct vc4_validated_shader_info *validated_shader,
+check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
                         struct vc4_shader_validation_state *validation_state)
 {
+       uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        bool ok;
@@ -382,20 +541,44 @@ check_instruction_writes(uint64_t inst,
                return false;
        }
 
-       ok = (check_reg_write(inst, validated_shader, validation_state,
-                             false) &&
-             check_reg_write(inst, validated_shader, validation_state,
-                             true));
+       ok = (check_reg_write(validated_shader, validation_state, false) &&
+             check_reg_write(validated_shader, validation_state, true));
 
-       track_live_clamps(inst, validated_shader, validation_state);
+       track_live_clamps(validated_shader, validation_state);
 
        return ok;
 }
 
 static bool
-check_instruction_reads(uint64_t inst,
-                       struct vc4_validated_shader_info *validated_shader)
+check_branch(uint64_t inst,
+            struct vc4_validated_shader_info *validated_shader,
+            struct vc4_shader_validation_state *validation_state,
+            int ip)
+{
+       int32_t target_offset = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
+       bool writes_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD) != QPU_W_NOP;
+       bool writes_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL) != QPU_W_NOP;
+
+       /* A negative branch offset means a backwards branch.  Remember it
+        * so that the end-of-shader check can require a uniforms address
+        * in the stream.  The flag is set even when the branch is rejected
+        * below.
+        */
+       if (target_offset < 0)
+               validation_state->needs_uniform_address_for_loop = true;
+
+       /* Branches may not write any register: both write addresses must
+        * be NOP, which keeps us from having to validate those writes.
+        */
+       if (!writes_add && !writes_mul)
+               return true;
+
+       DRM_ERROR("branch instruction at %d wrote a register.\n",
+                 validation_state->ip);
+       return false;
+}
+
+static bool
+check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
+                       struct vc4_shader_validation_state *validation_state)
 {
+       uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
@@ -407,40 +590,204 @@ check_instruction_reads(uint64_t inst,
                 * already be OOM.
                 */
                validated_shader->uniforms_size += 4;
+
+               if (validation_state->needs_uniform_address_update) {
+                       DRM_ERROR("Uniform read with undefined uniform "
+                                 "address\n");
+                       return false;
+               }
        }
 
        return true;
 }
 
+/* Make sure that all branches are relative and point within the shader, and
+ * note their targets for later.
+ *
+ * Each branch instruction must not sit within the delay slots of a previous
+ * branch, must not take its offset from a register (QPU_BRANCH_REG), must be
+ * relative (QPU_BRANCH_REL), and must have an offset that is a multiple of
+ * the instruction size.  Both its target and its fall-through instruction
+ * (the one after the delay slots) must lie before max_ip; both IPs are
+ * recorded in validation_state->branch_targets.
+ *
+ * Scanning stops after the two delay slots that follow QPU_SIG_PROG_END.
+ *
+ * Returns false, after logging a DRM_ERROR, if any branch breaks these rules
+ * or if a branch's fall-through lands after QPU_SIG_PROG_END.
+ */
+static bool
+vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
+{
+       uint32_t max_branch_target = 0;
+       bool found_shader_end = false;
+       int ip;
+       int shader_end_ip = 0;
+       int last_branch = -2;
+
+       for (ip = 0; ip < validation_state->max_ip; ip++) {
+               uint64_t inst = validation_state->shader[ip];
+               int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
+               uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+               uint32_t after_delay_ip = ip + 4;
+               uint32_t branch_target_ip;
+
+               /* There are two delay slots after program end is signaled
+                * that are still executed, then we're finished.
+                *
+                * This has to be tested before the early continues below so
+                * that it applies to every instruction and not just to
+                * branches.  Otherwise whatever follows the program in the
+                * BO would be scanned as if it were code, and a stray
+                * PROG_END in there could move shader_end_ip.
+                */
+               if (found_shader_end && ip > shader_end_ip + 2)
+                       break;
+
+               if (sig == QPU_SIG_PROG_END) {
+                       shader_end_ip = ip;
+                       found_shader_end = true;
+                       continue;
+               }
+
+               if (sig != QPU_SIG_BRANCH)
+                       continue;
+
+               if (ip - last_branch < 4) {
+                       DRM_ERROR("Branch at %d during delay slots\n", ip);
+                       return false;
+               }
+               last_branch = ip;
+
+               if (inst & QPU_BRANCH_REG) {
+                       DRM_ERROR("branching from register relative "
+                                 "not supported\n");
+                       return false;
+               }
+
+               if (!(inst & QPU_BRANCH_REL)) {
+                       DRM_ERROR("relative branching required\n");
+                       return false;
+               }
+
+               /* The actual branch target is the instruction after the delay
+                * slots, plus whatever byte offset is in the low 32 bits of
+                * the instruction.  Make sure we're not branching beyond the
+                * end of the shader object.
+                */
+               if (branch_imm % sizeof(inst) != 0) {
+                       DRM_ERROR("branch target not aligned\n");
+                       return false;
+               }
+
+               branch_target_ip = after_delay_ip + (branch_imm >> 3);
+               if (branch_target_ip >= validation_state->max_ip) {
+                       DRM_ERROR("Branch at %d outside of shader (ip %d/%d)\n",
+                                 ip, branch_target_ip,
+                                 validation_state->max_ip);
+                       return false;
+               }
+               set_bit(branch_target_ip, validation_state->branch_targets);
+
+               /* Make sure that the non-branching path is also not outside
+                * the shader.
+                */
+               if (after_delay_ip >= validation_state->max_ip) {
+                       DRM_ERROR("Branch at %d continues past shader end "
+                                 "(%d/%d)\n",
+                                 ip, after_delay_ip, validation_state->max_ip);
+                       return false;
+               }
+               set_bit(after_delay_ip, validation_state->branch_targets);
+               max_branch_target = max(max_branch_target, after_delay_ip);
+       }
+
+       if (max_branch_target > shader_end_ip) {
+               DRM_ERROR("Branch landed after QPU_SIG_PROG_END\n");
+               return false;
+       }
+
+       return true;
+}
+
+/* Forgets everything tracked about register and texture-setup contents.
+ * Used at shader start and whenever the current instruction may be reached
+ * from more than one place in the program.
+ *
+ * All four p_offset entries of both TMU setups, and every live min-clamp
+ * offset and live immediate, are set to ~0; every live max-clamp flag is
+ * cleared.
+ */
+static void
+reset_validation_state(struct vc4_shader_validation_state *validation_state)
+{
+       int tmu, param, reg;
+
+       for (tmu = 0; tmu < 2; tmu++) {
+               for (param = 0; param < 4; param++)
+                       validation_state->tmu_setup[tmu].p_offset[param] = ~0;
+       }
+
+       for (reg = 0; reg < LIVE_REG_COUNT; reg++) {
+               validation_state->live_immediates[reg] = ~0;
+               validation_state->live_max_clamp_regs[reg] = false;
+               validation_state->live_min_clamp_offsets[reg] = ~0;
+       }
+}
+
+/* Returns true if either TMU currently has a nonzero tmu_write_count. */
+static bool
+texturing_in_progress(struct vc4_shader_validation_state *validation_state)
+{
+       int tmu;
+
+       for (tmu = 0; tmu < 2; tmu++) {
+               if (validation_state->tmu_write_count[tmu] != 0)
+                       return true;
+       }
+
+       return false;
+}
+
+/* Called for each instruction before it is validated.  If the current IP
+ * was marked as a branch target, this starts a new basic block.
+ *
+ * Returns false if the target lands while a TMU setup is in progress, and
+ * true otherwise (including when the IP is not a branch target at all).
+ */
+static bool
+vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
+{
+       bool is_target = test_bit(validation_state->ip,
+                                 validation_state->branch_targets);
+
+       if (is_target) {
+               if (texturing_in_progress(validation_state)) {
+                       DRM_ERROR("Branch target landed during TMU setup\n");
+                       return false;
+               }
+
+               /* This instruction may have several predecessors, so
+                * nothing we tracked about live values can be trusted any
+                * more.  We could analyze whether all predecessors agree
+                * (say, on a live max clamp in the same register), but we
+                * don't bother.
+                */
+               reset_validation_state(validation_state);
+
+               /* For the same reason the uniforms address has to be
+                * updated before any uniforms are read in this block: after
+                * any branch point the next uniform loaded must be a
+                * uniform address offset.  That uniform's offset gets
+                * marked by the uniform address register write validation,
+                * or by the one-off end-of-program check.
+                */
+               validation_state->needs_uniform_address_update = true;
+       }
+
+       return true;
+}
+
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
        bool found_shader_end = false;
        int shader_end_ip = 0;
-       uint32_t ip, max_ip;
-       uint64_t *shader;
-       struct vc4_validated_shader_info *validated_shader;
+       uint32_t ip;
+       struct vc4_validated_shader_info *validated_shader = NULL;
        struct vc4_shader_validation_state validation_state;
-       int i;
 
        memset(&validation_state, 0, sizeof(validation_state));
+       validation_state.shader = shader_obj->vaddr;
+       validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
 
-       for (i = 0; i < 8; i++)
-               validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
-       for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
-               validation_state.live_min_clamp_offsets[i] = ~0;
+       reset_validation_state(&validation_state);
 
-       shader = shader_obj->vaddr;
-       max_ip = shader_obj->base.size / sizeof(uint64_t);
+       validation_state.branch_targets =
+               kcalloc(BITS_TO_LONGS(validation_state.max_ip),
+                       sizeof(unsigned long), GFP_KERNEL);
+       if (!validation_state.branch_targets)
+               goto fail;
 
        validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
        if (!validated_shader)
-               return NULL;
+               goto fail;
+
+       if (!vc4_validate_branches(&validation_state))
+               goto fail;
 
-       for (ip = 0; ip < max_ip; ip++) {
-               uint64_t inst = shader[ip];
+       for (ip = 0; ip < validation_state.max_ip; ip++) {
+               uint64_t inst = validation_state.shader[ip];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
+               validation_state.ip = ip;
+
+               if (!vc4_handle_branch_target(&validation_state))
+                       goto fail;
+
                switch (sig) {
                case QPU_SIG_NONE:
                case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -450,13 +797,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                case QPU_SIG_LOAD_TMU1:
                case QPU_SIG_PROG_END:
                case QPU_SIG_SMALL_IMM:
-                       if (!check_instruction_writes(inst, validated_shader,
+                       if (!check_instruction_writes(validated_shader,
                                                      &validation_state)) {
                                DRM_ERROR("Bad write at ip %d\n", ip);
                                goto fail;
                        }
 
-                       if (!check_instruction_reads(inst, validated_shader))
+                       if (!check_instruction_reads(validated_shader,
+                                                    &validation_state))
                                goto fail;
 
                        if (sig == QPU_SIG_PROG_END) {
@@ -467,13 +815,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                        break;
 
                case QPU_SIG_LOAD_IMM:
-                       if (!check_instruction_writes(inst, validated_shader,
+                       if (!check_instruction_writes(validated_shader,
                                                      &validation_state)) {
                                DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
                                goto fail;
                        }
                        break;
 
+               case QPU_SIG_BRANCH:
+                       if (!check_branch(inst, validated_shader,
+                                         &validation_state, ip))
+                               goto fail;
+                       break;
                default:
                        DRM_ERROR("Unsupported QPU signal %d at "
                                  "instruction %d\n", sig, ip);
@@ -487,13 +840,28 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                        break;
        }
 
-       if (ip == max_ip) {
+       if (ip == validation_state.max_ip) {
                DRM_ERROR("shader failed to terminate before "
                          "shader BO end at %zd\n",
                          shader_obj->base.size);
                goto fail;
        }
 
+       /* If we did a backwards branch and we haven't emitted a uniforms
+        * reset since then, we still need the uniforms stream to have the
+        * uniforms address available so that the backwards branch can do its
+        * uniforms reset.
+        *
+        * We could potentially prove that the backwards branch doesn't
+        * contain any uses of uniforms until program exit, but that doesn't
+        * seem to be worth the trouble.
+        */
+       if (validation_state.needs_uniform_address_for_loop) {
+               if (!require_uniform_address_uniform(validated_shader))
+                       goto fail;
+               validated_shader->uniforms_size += 4;
+       }
+
        /* Again, no chance of integer overflow here because the worst case
         * scenario is 8 bytes of uniforms plus handles per 8-byte
         * instruction.
@@ -502,9 +870,12 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                (validated_shader->uniforms_size +
                 4 * validated_shader->num_texture_samples);
 
+       kfree(validation_state.branch_targets);
+
        return validated_shader;
 
 fail:
+       kfree(validation_state.branch_targets);
        if (validated_shader) {
                kfree(validated_shader->texture_samples);
                kfree(validated_shader);