From: Dave Airlie <[email protected]>

It appears that R600s (except rv670) handle AR differently, using a different
opcode. This patch fixes up r600g to work properly on r600.

This fixes ~100 piglit tests here (in GLSL1.30 mode) on rv610.

v3: add index_mode as per the docs.

This still fails all dst-relative tests for some reason I can't quite see yet,
but it passes a lot more tests than without the patch.

v4: add a nop after dst.rel. This could be improved using a second pass,
where we only insert nops if two instructions are sure to collide.

Signed-off-by: Dave Airlie <[email protected]>
---
 src/gallium/drivers/r600/r600_asm.c    |   82 ++++++++++++++++++++++++++++++--
 src/gallium/drivers/r600/r600_asm.h    |    9 +++-
 src/gallium/drivers/r600/r600_shader.c |    2 +-
 src/gallium/drivers/r600/r600_sq.h     |    7 +++
 4 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c
index 8234744..16c1143 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -94,6 +94,7 @@ static inline unsigned int 
r600_bytecode_get_num_operands(struct r600_bytecode *
                case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
                case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
                case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
+               case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
                case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
                case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
                case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
@@ -249,8 +250,18 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
        return tex;
 }
 
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class)
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, 
enum radeon_family family)
 {
+       if ((chip_class == R600) && (family != CHIP_RV670))
+               bc->ar_handling = AR_HANDLE_RV6XX;
+       else
+               bc->ar_handling = AR_HANDLE_NORMAL;
+
+       if ((chip_class == R600) && (family != CHIP_RV670 && family != 
CHIP_RS780 &&
+                                          family != CHIP_RS880))
+               bc->r6xx_nop_after_rel_dst = 1;
+       else
+               bc->r6xx_nop_after_rel_dst = 0;
        LIST_INITHEAD(&bc->cf);
        bc->chip_class = chip_class;
 }
@@ -441,7 +452,8 @@ static int is_alu_mova_inst(struct r600_bytecode *bc, 
struct r600_bytecode_alu *
                return !alu->is_op3 && (
                        alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
                        alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR 
||
-                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+                       alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
+                       alu->inst == 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
        case EVERGREEN:
        case CAYMAN:
        default:
@@ -457,7 +469,8 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, 
struct r600_bytecode_a
        case R600:
        case R700:
                return is_alu_reduction_inst(bc, alu) ||
-                       is_alu_mova_inst(bc, alu);
+                       (is_alu_mova_inst(bc, alu) && 
+                        (alu->inst != 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT));
        case EVERGREEN:
        case CAYMAN:
        default:
@@ -478,6 +491,7 @@ static int is_alu_trans_unit_inst(struct r600_bytecode *bc, 
struct r600_bytecode
        case R700:
                if (!alu->is_op3)
                        return alu->inst == 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
+                               alu->inst == 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT ||
                                alu->inst == 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
                                alu->inst == 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT ||
                                alu->inst == 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
@@ -1048,6 +1062,10 @@ static int merge_inst_groups(struct r600_bytecode *bc, 
struct r600_bytecode_alu
                alu = slots[i];
                num_once_inst += is_alu_once_inst(bc, alu);
 
+               /* don't reschedule NOPs */
+               if (alu->inst == BC_INST(bc, 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP))
+                       return 0;
+
                /* Let's check dst gpr. */
                if (alu->dst.rel) {
                        if (have_mova)
@@ -1236,12 +1254,60 @@ static int r600_bytecode_alloc_kcache_lines(struct 
r600_bytecode *bc, struct r60
        return 0;
 }
 
+static int insert_nop_r6xx(struct r600_bytecode *bc)
+{
+       struct r600_bytecode_alu alu;
+       int r, i;
+
+       for (i = 0; i < 4; i++) {
+               memset(&alu, 0, sizeof(alu));
+               alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+               alu.src[0].chan = i;
+               alu.dst.chan = i;
+               alu.last = (i == 3);
+               r = r600_bytecode_add_alu(bc, &alu);
+               if (r)
+                       return r;
+       }
+       return 0;
+}
+
+/* load AR register from gpr (bc->ar_reg) with MOVA_GPR_INT */
+static int load_ar_r6xx(struct r600_bytecode *bc)
+{
+       struct r600_bytecode_alu alu;
+       int r;
+
+       if (bc->ar_loaded)
+               return 0;
+
+       /* hack to avoid making MOVA the last instruction in the clause */
+       if ((bc->cf_last->ndw>>1) >= 110)
+               bc->force_add_cf = 1;
+
+       memset(&alu, 0, sizeof(alu));
+       alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
+       alu.src[0].sel = bc->ar_reg;
+       alu.last = 1;
+       alu.index_mode = INDEX_MODE_LOOP;
+       r = r600_bytecode_add_alu(bc, &alu);
+       if (r)
+               return r;
+
+       /* no requirement to set uses waterfall on MOVA_GPR_INT */
+       bc->ar_loaded = 1;
+       return 0;
+}
+
 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
 static int load_ar(struct r600_bytecode *bc)
 {
        struct r600_bytecode_alu alu;
        int r;
 
+       if (bc->ar_handling)
+               return load_ar_r6xx(bc);
+
        if (bc->ar_loaded)
                return 0;
 
@@ -1376,6 +1442,10 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, 
const struct r600_bytec
                bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
                bc->cf_last->curr_bs_head = NULL;
        }
+
+       if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
+               insert_nop_r6xx(bc);
+
        return 0;
 }
 
@@ -1599,6 +1669,7 @@ static int r600_bytecode_alu_build(struct r600_bytecode 
*bc, struct r600_bytecod
                                S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
                                S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
                                S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+                               S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
                                S_SQ_ALU_WORD0_LAST(alu->last);
 
        if (alu->is_op3) {
@@ -2286,7 +2357,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
                        fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
                        fprintf(stderr, "REL:%d ", alu->src[1].rel);
                        fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
-                       fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
+                       fprintf(stderr, "NEG:%d ", alu->src[1].neg);
+                       fprintf(stderr, "IM:%d) ", alu->index_mode);
                        fprintf(stderr, "LAST:%d)\n", alu->last);
                        id++;
                        fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], 
alu->last ? '*' : ' ');
@@ -2565,7 +2637,7 @@ int r600_vertex_elements_build_fetch_shader(struct 
r600_pipe_context *rctx, stru
        }
 
        memset(&bc, 0, sizeof(bc));
-       r600_bytecode_init(&bc, rctx->chip_class);
+       r600_bytecode_init(&bc, rctx->chip_class, rctx->family);
 
        for (i = 0; i < ve->count; i++) {
                if (elements[i].instance_divisor > 1) {
diff --git a/src/gallium/drivers/r600/r600_asm.h 
b/src/gallium/drivers/r600/r600_asm.h
index d0ff75d..00f7e59 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -54,6 +54,7 @@ struct r600_bytecode_alu {
        unsigned                        bank_swizzle;
        unsigned                        bank_swizzle_force;
        unsigned                        omod;
+       unsigned                        index_mode;
 };
 
 struct r600_bytecode_tex {
@@ -176,6 +177,10 @@ struct r600_cf_callstack {
        int                             max;
 };
 
+#define AR_HANDLE_NORMAL 0
+#define AR_HANDLE_RV6XX 1 /* except RV670 */
+
+
 struct r600_bytecode {
        enum chip_class                 chip_class;
        int                             type;
@@ -194,13 +199,15 @@ struct r600_bytecode {
        struct r600_cf_callstack        callstack[SQ_MAX_CALL_DEPTH];
        unsigned        ar_loaded;
        unsigned        ar_reg;
+       unsigned        ar_handling;
+       unsigned        r6xx_nop_after_rel_dst;
 };
 
 /* eg_asm.c */
 int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf 
*cf);
 
 /* r600_asm.c */
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class);
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, 
enum radeon_family family);
 void r600_bytecode_clear(struct r600_bytecode *bc);
 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct 
r600_bytecode_alu *alu);
 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct 
r600_bytecode_vtx *vtx);
diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 59d41cf..5819c2b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -807,7 +807,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * 
rctx, struct r600_pi
 
        ctx.bc = &shader->bc;
        ctx.shader = shader;
-       r600_bytecode_init(ctx.bc, rctx->chip_class);
+       r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
        ctx.tokens = tokens;
        tgsi_scan_shader(tokens, &ctx.info);
        tgsi_parse_init(&ctx.parse, tokens);
diff --git a/src/gallium/drivers/r600/r600_sq.h 
b/src/gallium/drivers/r600/r600_sq.h
index b9c4126..4b2a19a 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -471,4 +471,11 @@
 #define SQ_ALU_SCL_122                           0x00000001
 #define SQ_ALU_SCL_212                           0x00000002
 #define SQ_ALU_SCL_221                           0x00000003
+
+#define   INDEX_MODE_AR_X 0
+#define   INDEX_MODE_AR_Y 1
+#define   INDEX_MODE_AR_Z 2
+#define   INDEX_MODE_AR_W 3
+#define   INDEX_MODE_LOOP 4
+
 #endif
-- 
1.7.7.4

_______________________________________________
mesa-dev mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to