Jerome Glisse wrote:
Okay, I finally got over a stupid bug (as all bugs are...).
Thus I committed the table to r300, and here is what swizzle
and the modified emit_arith look like (there is some debug
code to test swizzling)...
Note that I changed pfs_reg_t, so swizzling is now done
in emit_arith and not in t_src. This way we can have
multiple constants as args for emit_arith, and swizzling then
allocates & copies the consts for us (I have to add 7 native cases to
the table for that).
The reason I was doing swizzling in t_src is that some ARB_f_p opcodes
aren't
native on r300 and we need to emit multiple instructions to emulate them
(see LRP).
If one of the sources used a non-native swizzle, we'd waste alu
instructions re-doing
the swizzle at each emit. A case where this may be very important is
the SIN/COS
instructions, a document in the Radeon SDK says that COS is 11
instructions..
Also, TEX sources can be swizzled. So putting swizzling/negation into
t_src made sense
in my mind.
If you think that I removed an important field in
pfs_reg, tell me. I am wondering if we can drop
the valid field?
The most important thing missing is the v_cross/s_cross fields. These
are used to
say that the source swizzle depends on the result of the other
instruction stream.
ie. WZYW (v_cross=1), colour instruction depends on result of alpha
instruction,
XYZX (s_cross=1), alpha insn depends on result of colour instruction.
WZYX (v_cross=1,
s_cross=1), both depend on opposite stream.
This allows for an extremely primitive form of instruction reordering so
that we make
use of the split xyz/w units, instead of leaving a whole load of NOPS
when an ARB_f_p
instruction only writes xyz or w.
The valid field comes in useful occasionally when testing some things.
The has_w field
was only used by my swizzling code to say whether or not the W coord had
to be copied
over to the resulting swizzle, so you could probably drop that if you
don't need it for
your code.
I haven't yet done individual or global neg, but as I said,
I think that the best solution is to first swizzle and then
do a
MAD t, -t, 1, 0 with the appropriate write mask.
Anyway, once Keith has committed your patch and you
have committed your change in r300, I will commit the change
to use the table with individual neg support...
Cool. I'll have a closer look at your code when I get home again in 12
or so hours.
Cheers,
Ben Skeggs.
Jerome Glisse
/*
 * Compact description of one fragment-program register operand, packed
 * into bitfields.  Passed by value to swizzle() and emit_arith().
 */
typedef struct _pfs_reg_t {
/* Which register file the operand lives in. */
enum {
REG_TYPE_INPUT,
REG_TYPE_OUTPUT,
REG_TYPE_TEMP,
REG_TYPE_CONST
} type:2;
/* Register number; used to index rp->inputs[] / rp->temps[], or taken
 * as-is for constants. */
GLuint index:6;
/* Swizzle selectors: bits 0-8 select xyz (read with "& 511"), bits 9-11
 * select w (read with ">> 9 & 7"), i.e. four 3-bit fields. */
GLuint xyzw:12;
/* presumably one negate bit per component; not read by swizzle() or
 * emit_arith() as shown here - TODO confirm */
GLuint negate:4;
/* Not referenced by the swizzle()/emit_arith() code shown here. */
GLboolean has_w:1;
/* Not referenced by the swizzle()/emit_arith() code shown here. */
GLboolean valid:1;
} pfs_reg_t;
/*
 * Emit the ALU instructions that copy swz_src, with its swizzle applied,
 * into a newly allocated temporary register.
 *
 * The xyz part of the swizzle (low 9 bits of swz_src.xyzw) selects an entry
 * of the r300_swizzle[] table; that entry supplies `length` inst0/inst1
 * pairs which are stored at rp->v_pos.  The w part (bits 9-11) is handled by
 * a single inst2/inst3 pair that is stored at rp->s_pos on every pass.
 * rp->v_pos, rp->s_pos, rp->alu.length and the current node's alu_end are
 * advanced as instructions are written.
 *
 * rp       fragment program being assembled.
 * swz_src  register to read; must be INPUT, TEMP or CONST.
 *
 * Returns the index of the temporary that receives the swizzled value, or 0
 * after reporting an error when swz_src.type is not a readable type.
 * NOTE(review): callers cannot tell that error value apart from temporary 0
 * if get_temp_reg() can hand out index 0 - confirm.
 */
GLuint swizzle( struct r300_fragment_program *rp,
		pfs_reg_t swz_src )
{
	GLuint src[3] = { 0, 0, 0 };
	GLuint inst[4] = { 0, 0, 0, 0 };
	GLuint i, xyz, w, j;
	pfs_reg_t tmp;

	/* src[0]: register number of the value being swizzled */
	switch (swz_src.type) {
	case REG_TYPE_INPUT:
		src[0] = rp->inputs[swz_src.index];
		break;
	case REG_TYPE_TEMP:
		/* The original code first loaded rp->temps[swz_src.index] and
		 * then immediately overwrote it with the raw index; the dead
		 * lookup has been dropped and the effective behaviour kept.
		 * NOTE(review): emit_arith() maps temps through rp->temps[] -
		 * confirm which of the two is intended here. */
		src[0] = swz_src.index;
		rp->used_in_node |= (1 << src[0]);
		break;
	case REG_TYPE_CONST:
		/* NOTE(review): emit_arith() additionally sets a (1<<5) flag
		 * for constant sources; no such flag is set here - confirm. */
		src[0] = swz_src.index;
		break;
	default:
		ERROR("invalid source reg\n");
		return 0;
	}

	/* Allocate temp reg for swizzling; src[1] is both the destination
	 * and a possible source for later table passes. */
	tmp = get_temp_reg(rp);
	src[1] = tmp.index;

	xyz = swz_src.xyzw & 511;
	w = (swz_src.xyzw >> 9) & 7;
	printf("w : %d\n",w);

	/* w channel: argument 0 uses the requested w selector, arguments 1
	 * and 2 are ONE and ZERO.
	 * NOTE(review): an FPI0 opcode constant is ORed into the inst2 word
	 * here - confirm it has the intended value for this word. */
	inst[2] = r300_swz_srca_mask[0][w] |
		(R300_FPI2_ARGA_ONE << R300_FPI2_ARG1A_SHIFT) |
		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG2A_SHIFT) |
		R300_FPI0_OUTC_MAD;
	inst[3] = src[0] |
		R300_FPI3_SRC1A_CONST |
		R300_FPI3_SRC2A_CONST |
		(src[1] << R300_FPI3_DSTA_SHIFT);
	inst[3] |= R300_FPI3_DSTA_REG;

	/* xyz channels: one instruction per pass listed in the table entry */
	for (i = 0; i < r300_swizzle[xyz].length; i++) {
		inst[0] = r300_swizzle[xyz].inst[(i << 1)];
		inst[1] = r300_swizzle[xyz].inst[(i << 1) + 1];
		/* the table says which of src[] this pass reads from */
		inst[1] |= src[r300_swizzle[xyz].src[i]];
		inst[1] |= src[1] << R300_FPI1_DSTC_SHIFT;

		rp->alu.inst[rp->v_pos].inst0 = inst[0];
		rp->alu.inst[rp->v_pos].inst1 = inst[1];
		rp->alu.inst[rp->s_pos].inst2 = inst[2];
		rp->alu.inst[rp->s_pos].inst3 = inst[3];
		rp->v_pos += 1;
		rp->s_pos += 1;

		/* grow the program when either stream moved past its end */
		j = rp->v_pos > rp->s_pos ? rp->v_pos : rp->s_pos;
		if (j > rp->alu.length) {
			rp->alu.length++;
			rp->node[rp->cur_node].alu_end++;
		}
	}

	return src[1];
}
/*
 * Emit one arithmetic operation into the fragment program.
 *
 * rp     fragment program being assembled.
 * op     index into r300_fpop[]; rejected when outside 0..MAX_PFS_OP.
 * dest   destination register (TEMP or OUTPUT).
 * mask   write mask: bits 0-2 (mask & 7) enable the xyz instruction,
 *        bit 3 (mask & 8) enables the w instruction.
 * src0..src2  source operands; only the first r300_fpop[op].argc are read,
 *        the remaining argument slots are filled with ZERO.
 * flags  extra bits ORed into the inst0 and inst2 words.
 *
 * Sources whose xyz swizzle is flagged as non-native in r300_swz_srcc_mask
 * (bit 32 set) are first copied through swizzle() into a temporary.
 *
 * With the "#if 1" debug paths enabled, the result is written to a scratch
 * temporary, that temporary is run through swizzle() with a Z,Y,X,W swizzle,
 * and a second instruction then moves the swizzled value into dest.  The
 * "#else" branches hold the direct-to-dest encoding.
 *
 * On an invalid opcode, source or destination an error is reported and
 * nothing is emitted.
 */
static void emit_arith( struct r300_fragment_program *rp,
			int op,
			pfs_reg_t dest,
			int mask,
			pfs_reg_t src0,
			pfs_reg_t src1,
			pfs_reg_t src2,
			int flags )
{
	pfs_reg_t src[3] = { src0, src1, src2 };
	int hwdest, hwsrc[3];
	int argc;
	int v_idx, s_idx;
	GLuint inst[4] = { 0, 0, 0, 0 };
	GLuint srcc_mask, srca_mask;
	int i;
	pfs_reg_t tt_reg = get_temp_reg(rp);
	GLuint tt_id = tt_reg.index;

	/* check opcode; op is signed, so guard the low end as well before it
	 * is used to index r300_fpop[] */
	if (op < 0 || op > MAX_PFS_OP) {
		ERROR("unknown opcode!\n");
		return;
	}
	argc = r300_fpop[op].argc;

	/* grab hwregs of sources */
	for (i=0;i<argc;i++) {
		switch (src[i].type) {
		case REG_TYPE_INPUT:
			hwsrc[i] = rp->inputs[src[i].index];
			break;
		case REG_TYPE_TEMP:
			hwsrc[i] = rp->temps[src[i].index];
			rp->used_in_node |= (1 << hwsrc[i]);
			break;
		case REG_TYPE_CONST:
			hwsrc[i] = src[i].index;
			break;
		default:
			ERROR("invalid source reg\n");
			return;
		}
	}

	/* grab hwregs of dest */
	switch (dest.type) {
	case REG_TYPE_TEMP:
		hwdest = rp->temps[dest.index];
		rp->used_in_node |= (1 << hwdest);
		break;
	case REG_TYPE_OUTPUT:
		hwdest = 0;
		break;
	default:
		ERROR("invalid dest reg type %d\n", dest.type);
		return;
	}

	/* build the argument-select (inst0/inst2) and source-address
	 * (inst1/inst3) fields for all three argument slots */
	for (i=0;i<3;i++) {
		if (i < argc) {
#define GET_XYZ(u) ((u) & 511)
#define GET_W(u) (((u) >> 9) & 7)
			srcc_mask=r300_swz_srcc_mask[i][GET_XYZ(src[i].xyzw)];
			srca_mask=r300_swz_srca_mask[i][GET_W(src[i].xyzw)];
			if (srcc_mask & 32) {
				/* swizzle: copy into a temp first, then read
				 * that temp through table entries 136 / 3
				 * (presumably the identity selectors -
				 * confirm against the tables) */
				hwsrc[i] = swizzle(rp, src[i]);
				inst[0] |= r300_swz_srcc_mask[i][136] << (i*7);
				inst[2] |= r300_swz_srca_mask[i][3] << (i*7);
			} else {
				/* native format lucky :) */
				inst[0] |= srcc_mask << (i*7);
				inst[2] |= srca_mask << (i*7);
				if (src[i].type == REG_TYPE_CONST) {
					inst[1] |= (1<<5) << (i*6);
					inst[3] |= (1<<5) << (i*6);
				}
			}
			inst[1] |= hwsrc[i] << (i*6);
			inst[3] |= hwsrc[i] << (i*6);
		} else {
			/* read constant zero, may aswell use a ZERO swizzle
			   aswell.. */
			inst[0] |= R300_FPI0_ARGC_ZERO << (i*7);
			inst[2] |= R300_FPI2_ARGA_ZERO << (i*7);
			/* The constant-source flag belongs in the address
			 * words inst[1]/inst[3], as at every other site in
			 * this function; the original set it in inst[2]. */
			inst[1] |= (1<<5) << (i*6);
			inst[3] |= (1<<5) << (i*6);
		}
	}

	/* swizzle() above may have emitted instructions and advanced the
	 * stream positions, so sample them only now; using positions taken
	 * on entry would overwrite those instructions. */
	v_idx = rp->v_pos;
	s_idx = rp->s_pos;

	if (mask & 7) {
		rp->alu.inst[v_idx].inst0 = inst[0] | r300_fpop[op].v_op |flags;
#if 1
		rp->alu.inst[v_idx].inst1 = inst[1] |
			(tt_id << R300_FPI1_DSTC_SHIFT) |
			((mask & WRITEMASK_XYZ) << 23);
#else
		rp->alu.inst[v_idx].inst1 = inst[1] |
			(hwdest << R300_FPI1_DSTC_SHIFT) |
			((mask & WRITEMASK_XYZ) << (dest.type ==
			REG_TYPE_OUTPUT ? 26 : 23));
#endif
		rp->v_pos = v_idx + 1;
	}
	if (mask & 8) {
		rp->alu.inst[s_idx].inst2 = inst[2] | r300_fpop[op].s_op |flags;
#if 1
		rp->alu.inst[s_idx].inst3 = inst[3] |
			(tt_id << R300_FPI3_DSTA_SHIFT) |
			(1 << 23);
#else
		rp->alu.inst[s_idx].inst3 = inst[3] |
			(hwdest << R300_FPI3_DSTA_SHIFT) |
			(1 << (dest.type == REG_TYPE_OUTPUT ? 24 : 23));
#endif
		rp->s_pos = s_idx + 1;
	}

	/* grow the program when either stream moved past its end */
	i = rp->v_pos > rp->s_pos ? rp->v_pos : rp->s_pos;
	if (i > rp->alu.length) {
		rp->alu.length++;
		rp->node[rp->cur_node].alu_end++;
	}

#if 1
	/* Debug path: swizzle the scratch result (Z,Y,X,W) and then move the
	 * swizzled temporary into the real destination. */
	tt_reg.xyzw = (SWIZZLE_Z) |
		(SWIZZLE_Y << 3)|
		(SWIZZLE_X << 6)|
		(SWIZZLE_W << 9);
	tt_id = swizzle(rp, tt_reg);
	/* swizzle() advanced the stream positions */
	v_idx = rp->v_pos;
	s_idx = rp->s_pos;
	printf("reg : %d\n",tt_id);

	/* Argument selectors for slots 0, 1, 2 - presumably identity, ONE
	 * and ZERO, i.e. dest = tmp * 1 + 0; no opcode bits are ORed in, so
	 * this relies on the default opcode - TODO confirm. */
	inst[0] = r300_swz_srcc_mask[0][136] << (0*7);
	inst[2] = r300_swz_srca_mask[0][3] << (0*7);
	inst[0] |= r300_swz_srcc_mask[0][365] << (1*7);
	inst[2] |= r300_swz_srca_mask[0][5] << (1*7);
	inst[0] |= r300_swz_srcc_mask[0][292] << (2*7);
	inst[2] |= r300_swz_srca_mask[0][4] << (2*7);

	/* slot 0 reads the swizzled temp; slots 1 and 2 carry the
	 * constant-source flag */
	inst[1] = tt_id;
	inst[3] = tt_id;
	inst[1] |= (1<<5) << (1*6);
	inst[1] |= (1<<5) << (2*6);
	inst[3] |= (1<<5) << (1*6);
	inst[3] |= (1<<5) << (2*6);

	if (mask & 7) {
		inst[1] |= (hwdest << R300_FPI1_DSTC_SHIFT) |
			((mask & WRITEMASK_XYZ) << (dest.type ==
			REG_TYPE_OUTPUT ? 26 : 23));
		rp->alu.inst[v_idx].inst0 = inst[0];
		rp->alu.inst[v_idx].inst1 = inst[1];
		rp->v_pos = v_idx + 1;
	}
	if (mask & 8) {
		inst[3] |= (hwdest << R300_FPI3_DSTA_SHIFT) |
			(1 << (dest.type == REG_TYPE_OUTPUT ? 24 : 23));
		rp->alu.inst[s_idx].inst2 = inst[2];
		rp->alu.inst[s_idx].inst3 = inst[3];
		rp->s_pos = s_idx + 1;
	}

	i = rp->v_pos > rp->s_pos ? rp->v_pos : rp->s_pos;
	if (i > rp->alu.length) {
		rp->alu.length++;
		rp->node[rp->cur_node].alu_end++;
	}
#endif
	return;
}
-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7412&alloc_id=16344&op=click
--
_______________________________________________
Dri-devel mailing list
Dri-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dri-devel
-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7412&alloc_id=16344&op=click
--
_______________________________________________
Dri-devel mailing list
Dri-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/dri-devel