On Tue, 8 Feb 2005 15:31:59 -0500, Daniel Phillips <[EMAIL PROTECTED]> wrote:
> Hi Timothy,
>
> On Sunday 06 February 2005 10:39, Timothy Miller wrote:
> > Well, say it took 4 cycles to compute one sum. Then what you need is
> > dZ, dZ*2, dZ*3, and dZ*4, all of which are either trivial or easy to
> > compute. You use dZ*4 to get to the next loop, and send Z,0; Z,dZ;
> > Z,dZ*2; and Z,dZ*3 down the pipeline.
>
> OK, I just want to tie this one off. It's clear how this will work with
> floating point adders: say the adder requires 4 clocks, and delivers
> one result every clock. For some interpolant T, four steps of dTdx can
> be in the pipeline, and because we compute two pixels on each step, we
> need two adders. The increment will always be 8*dTdx.
>
> We've got something like 17 interpolants, so that's 34 simplified fp
> adders. Interpolating vertically between scan lines will additionally
> use twice as many FP adders as interpolants, because two edges have to
> be interpolated. The edge setup requires a multiply and add per
> interpolant. Should we worry about this number of components, or is it
> no sweat? The vertical rasterization doesn't necessarily have to
> deliver a span per clock but it would be nice if it did, to keep up
> with one and two pixel-wide triangles.
We only interpolate the left edge vertically, but you do have a good
point that the vertical will double the number of adders. While a
generalized adder takes up a whopping 1% of the design, the simplified
ones won't be nearly so bad. There may be grounds for combining
horizontal and vertical and sharing some of the logic.
>
> Finally, have we clawed our way back to 200 MHz yet?
Yes and no. I'd taken true-zero and over/underflow detection out.
Like that, I was able to get the generalized adder down to 5ns. But
when I added true-zero and over/underflow back in, and then did more
tweaking, I'm back up to about 6ns. As is the case with modern VLSI,
routing/wiring delay is the dominant performance factor.
Normally, 3 levels of logic aren't a big deal, but when the routing for
each level of logic exceeds 1ns, because the router just can't seem to
place and route the whole design very well, you get into trouble. There's a certain
aspect of the adder I've been tinkering with (on and off here) that is
purely academic, because pipelined adders can be any length and looped
ones will be simplified.
Here's the current design. Mere mortals, bow at my feet because of my
great chip design m4d 5ki11z. (Har har)
// float_add: pipelined adder for a 25-bit floating-point format.
//
// Operand layout (as used by the logic below):
//   [24:17]  exponent (8 bits); an exponent field of 0 means the value is zero
//   [16]     sign
//   [15:0]   mantissa fraction; a leading 1 is implied for non-zero values
//
// Ports:
//   clock        - all registers are clocked on the rising edge
//   Ainx, Binx   - operands
//   Coutx        - Ainx + Binx, available 8 clocks after the operands are
//                  presented (input register, six stages, output register);
//                  a new operand pair can be accepted every clock
//
// Result handling:
//   * No rounding: bits shifted out during alignment are discarded.
//   * Exponent overflow clamps the exponent field to 255 (the mantissa is
//     passed through unchanged).
//   * Exponent underflow, a result exponent of exactly 0, and exact
//     cancellation all produce a canonical zero: exponent 0 AND mantissa 0.
//   * Operands whose exponent field is 0 are treated as exactly zero; their
//     mantissa bits are ignored.  (Previously only the hidden bit was
//     cleared, so stray mantissa bits of a "zero" operand - which this adder
//     could itself emit on underflow - leaked into the sum.)
module float_add(
    clock,
    Ainx, Binx,
    Coutx);

input         clock;
input  [24:0] Ainx, Binx;
output [24:0] Coutx;

reg [24:0] Coutx, Cout;
reg [24:0] Ain, Bin;

// Decouple the inputs from the inferred IOBs,
// otherwise we get bad (and misleading) routing delays.
always @(posedge clock) begin
    Ain <= Ainx;
    Bin <= Binx;
end


/************** STAGE 1 **************/
// Compute both possible alignment shifts, zero flags and a split mantissa
// comparison.  Bit 8 of each difference is the borrow, i.e. it is set when
// the subtrahend's exponent is the larger one.

reg [8:0]  Bshift1, Ashift1;
reg        BmantGT1u, BmantGT1l, BmantEQ1u;
reg [24:0] A1, B1;
reg        Anzero1, Bnzero1;

always @(posedge clock) begin
    // For if A is greater: how far B must be shifted right
    Bshift1 <= {1'b0, Ain[24:17]} - {1'b0, Bin[24:17]};
    // For if B is greater: how far A must be shifted right
    Ashift1 <= {1'b0, Bin[24:17]} - {1'b0, Ain[24:17]};

    // Check for (non)zeros: a value is zero iff its exponent field is 0
    Anzero1 <= |Ain[24:17];
    Bnzero1 <= |Bin[24:17];

    // Compare mantissas in two 8-bit halves to keep the logic shallow
    BmantGT1u <= Bin[15:8] >  Ain[15:8];
    BmantGT1l <= Bin[7:0]  >  Ain[7:0];
    BmantEQ1u <= Bin[15:8] == Ain[15:8];

    // Forward A and B
    {A1, B1} <= {Ain, Bin};
end


/************** STAGE 2 **************/
// Align the smaller operand for both hypotheses (A larger / B larger) and
// decide which hypothesis is true.

reg [16:0] Ashifted2, Bshifted2;
reg        sub2, PickB2;
reg [24:0] A2, B2;

// Combine the two half comparisons from stage 1
wire BmantGT2 = BmantGT1u || (BmantEQ1u && BmantGT1l);

// A zero operand contributes nothing: clear its mantissa as well as its
// hidden bit, so non-canonical zeros (exponent 0, mantissa != 0) are inert.
wire [15:0] Amant2i = Anzero1 ? A1[15:0] : 16'd0;
wire [15:0] Bmant2i = Bnzero1 ? B1[15:0] : 16'd0;

// Hidden bit is 1 only for non-zero operands
wire [16:0] Btoshift2 = {Bnzero1, Bmant2i};
wire [16:0] Atoshift2 = {Anzero1, Amant2i};

always @(posedge clock) begin
    // For if A is greater
    Bshifted2 <= Btoshift2 >> Bshift1[7:0];
    // For if B is greater
    Ashifted2 <= Atoshift2 >> Ashift1[7:0];

    // Subtract magnitudes if signs are different
    sub2 <= A1[16] ^ B1[16];

    // B is the larger magnitude if its exponent is larger, or the
    // exponents are equal and its mantissa is larger
    PickB2 <= Bshift1[8] || (!Ashift1[8] && BmantGT2);

    // Forward, with the mantissa of a zero operand cleared so that stage 3
    // cannot generate a carry out of two zero operands
    A2 <= {A1[24:16], Amant2i};
    B2 <= {B1[24:16], Bmant2i};
end


/************** STAGE 3 **************/
// Perform the add/subtract for both hypotheses.  The result of the wrong
// hypothesis may wrap around; it is discarded in stage 4.

// For if A is greater, addsub: A,B
wire [17:0] A_as0, B_as0, C_as0;
assign A_as0 = {2'b01, A2[15:0]};      // 0, hidden 1, fraction
assign B_as0 = {1'b0, Bshifted2};
assign C_as0 = sub2 ? (A_as0 - B_as0) : (A_as0 + B_as0);

// For if B is greater, addsub: B,A
wire [17:0] A_as1, B_as1, C_as1;
assign B_as1 = {2'b01, B2[15:0]};
assign A_as1 = {1'b0, Ashifted2};
assign C_as1 = sub2 ? (B_as1 - A_as1) : (B_as1 + A_as1);

reg [17:0] Amant3, Bmant3;
reg        Asign3, Bsign3, PickB3;
reg [7:0]  Aexp3, Bexp3;

always @(posedge clock) begin
    // For if A is greater
    Amant3 <= C_as0;
    Asign3 <= A2[16];
    Aexp3  <= A2[24:17];

    // For if B is greater
    Bmant3 <= C_as1;
    Bsign3 <= B2[16];
    Bexp3  <= B2[24:17];

    // Forward
    PickB3 <= PickB2;
end


/************** STAGE 4 **************/
// Select the valid hypothesis and start leading-one detection.  The
// mantissa bits [16:1] are examined as four nibbles: hasbits4[n] says the
// nibble contains a 1, extra_shift[n] is the position of its leading 1
// counted from the nibble's top bit.

wire [17:0] mant4i = PickB3 ? Bmant3 : Amant3;
wire [7:0]  exp4i  = PickB3 ? Bexp3  : Aexp3;
wire        sign4i = PickB3 ? Bsign3 : Asign3;

reg        sign4;
reg [17:0] mant4;
reg [7:0]  exp4;
reg [3:0]  hasbits4;
reg [1:0]  extra_shift [0:3];

always @(posedge clock) begin
    sign4 <= sign4i;
    mant4 <= mant4i;
    exp4  <= exp4i;

    // Priority encoders: the first matching item (highest bit) wins.
    // The preceding default covers the "no bit set" case.
    extra_shift[0] <= 2'd0;
    case (1'b1)
        mant4i[16]: extra_shift[0] <= 2'd0;
        mant4i[15]: extra_shift[0] <= 2'd1;
        mant4i[14]: extra_shift[0] <= 2'd2;
        mant4i[13]: extra_shift[0] <= 2'd3;
    endcase

    extra_shift[1] <= 2'd0;
    case (1'b1)
        mant4i[12]: extra_shift[1] <= 2'd0;
        mant4i[11]: extra_shift[1] <= 2'd1;
        mant4i[10]: extra_shift[1] <= 2'd2;
        mant4i[ 9]: extra_shift[1] <= 2'd3;
    endcase

    extra_shift[2] <= 2'd0;
    case (1'b1)
        mant4i[ 8]: extra_shift[2] <= 2'd0;
        mant4i[ 7]: extra_shift[2] <= 2'd1;
        mant4i[ 6]: extra_shift[2] <= 2'd2;
        mant4i[ 5]: extra_shift[2] <= 2'd3;
    endcase

    extra_shift[3] <= 2'd0;
    case (1'b1)
        mant4i[ 4]: extra_shift[3] <= 2'd0;
        mant4i[ 3]: extra_shift[3] <= 2'd1;
        mant4i[ 2]: extra_shift[3] <= 2'd2;
        mant4i[ 1]: extra_shift[3] <= 2'd3;
    endcase

    // Bit 17 (carry out) is included here but is tested separately, and
    // first, in stage 5.
    hasbits4[3] <= |mant4i[17:13];
    hasbits4[2] <= |mant4i[12:9];
    hasbits4[1] <= |mant4i[8:5];
    hasbits4[0] <= |mant4i[4:1];
end


/************** STAGE 5 **************/
// Coarse normalisation: shift left by a multiple of 4 (selected by the
// first non-empty nibble) and record the remaining 0..3 bit shift (lsh5)
// plus the total exponent adjustment (exp_diff5) for stage 6.

reg [8:0]  exp5;
reg [4:0]  exp_diff5;
reg        sign5;
reg [15:0] mant5;
reg [1:0]  lsh5;

always @(posedge clock) begin
    sign5 <= sign4;
    exp5  <= {1'b0, exp4};      // default; overridden below where needed

    if (mant4[17]) begin
        // Carry out of the add: shift right by one, bump the exponent.
        // exp5 is 9 bits wide so that 255+1 is visible as overflow.
        mant5     <= mant4[16:1];
        lsh5      <= 2'd0;
        exp5      <= {1'b0, exp4} + 9'd1;
        exp_diff5 <= 5'd0;
    end else if (hasbits4[3]) begin
        mant5     <= mant4[15:0];
        lsh5      <= extra_shift[0];
        exp_diff5 <= {3'd0, extra_shift[0]};           // 0..3
    end else if (hasbits4[2]) begin
        mant5     <= {mant4[11:0], 4'b0};
        lsh5      <= extra_shift[1];
        exp_diff5 <= {3'd1, extra_shift[1]};           // 4..7
    end else if (hasbits4[1]) begin
        mant5     <= {mant4[7:0], 8'b0};
        lsh5      <= extra_shift[2];
        exp_diff5 <= {3'd2, extra_shift[2]};           // 8..11
    end else if (hasbits4[0]) begin
        mant5     <= {mant4[3:0], 12'b0};
        lsh5      <= extra_shift[3];
        exp_diff5 <= {3'd3, extra_shift[3]};           // 12..15
    end else begin
        mant5 <= 16'd0;
        lsh5  <= 2'd0;
        if (mant4[0]) begin
            // Only the LSB survived: it becomes the hidden bit
            exp_diff5 <= 5'd16;
        end else begin
            // Exact cancellation: true zero
            exp5      <= 9'd0;
            exp_diff5 <= 5'd0;
        end
    end
end


/************** STAGE 6 **************/
// Apply the exponent adjustment, detect over/underflow and finish the
// normalising shift.

// 10 bits: bit 9 set means the subtraction went negative (underflow),
// bit 8 set (alone) means the exponent reached 256 (overflow).
wire [9:0] exp6 = {1'b0, exp5} - {5'd0, exp_diff5};

// The output exponent field will be 0 (underflow or exactly 0), which
// readers of this format interpret as zero: emit a zero mantissa too so the
// result is a canonical zero.
// NOTE(review): this adds a zero-detect and mux after the subtract; confirm
// it does not hurt stage 6 timing.
wire flush6 = exp6[9] || (exp6[8:0] == 9'd0);

always @(posedge clock) begin
    Cout[16] <= sign5;

    case (exp6[9:8])
        2'd0:    Cout[24:17] <= exp6[7:0];  // in bounds
        2'd1:    Cout[24:17] <= 8'd255;     // overflow
        default: Cout[24:17] <= 8'd0;       // underflow (2, 3)
    endcase

    Cout[15:0] <= flush6 ? 16'd0 : (mant5 << lsh5);
end


// Decouple outputs from inferred IOBs.
always @(posedge clock) begin
    Coutx <= Cout;
end

endmodule
_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)