On 2007-08-31, Mark wrote:
> I've posted a run-down of the multipliers so far (any important ones
> missing?) at http://jarvin.net/opengraphics/. This includes photos of the
> most critical path post-PAR.
Nice overview of the syntheses; I've had some trouble getting the
synthesis tools working, so I appreciate your effort. I suspect my
version did not synthesise as I intended. In the attached version I
made the LUT4s explicit by putting them in a separate module. Not sure
if it would have made a difference. Well, I think we can go with the
radix-4 version unless there is a compelling reason to optimise further
*and* it is technically feasible to use a 4x clock for the multiplier
(which I don't know).
So, let's consider integrating Farhan's version in the nanocontroller.
Given that the VGA code will use 16 bit, would it be better to reduce
the multiplier to 16x16->32? Will this be insufficient for the DMA
code? (Does DMA require multiply at all, other than powers of 2?)
Conversely, is a 33-cycle multiply too slow for the VGA code, and would 17
cycles be fast enough?
I'd go with the non-blocking out-of-band approach. That is, the
programmer will count instructions before fetching the result. One
instruction takes a reg and a reg/imm and issues the multiply, and
another writes the result back to a reg. The ALU stage can be the
point of transit. The issue-multiply instruction transfers the ALU
operands to the multiplier and initiates the multiply. The multiplier
holds the result after finishing as long as no new multiply is issued.
A fetch-product instruction moves the result to the ALU output, thus
allowing it to take part in register forwarding.
As a slight variant, we can hard-code the multiplication result to r31
and drop the fetch-product instruction. That's just as easy to
implement, and it saves one cycle, since it means the product can be
directly used as an operand to the ALU.
The introduction of interrupts, if needed, will not cause problems as
long as interrupt handlers don't use the multiplier. Moreover, if an
interrupt handler needs to use the multiplier, this is also possible:
When the interrupt handler is sure any pending multiplication is
finished, it can save the result R. Then it can do its own
multiplication. Before returning to normal code, it must perform a
multiply R*1 and wait long enough for the result to be available.
// Registered one-bit, three-input adder cell.
//
// On every rising edge of `clock` the two-bit sum x + y + z is captured
// into {c, s} (c = carry / weight 2, s = sum / weight 1).  When `clear`
// is high at the clock edge both outputs are forced to 0 instead
// (synchronous clear).
//
// Ports:
//   clock   - clock; outputs update on the rising edge only
//   clear   - synchronous clear, active high
//   x, y, z - the three one-bit addends
//   s       - registered sum bit
//   c       - registered carry bit
module add_1x1x1_to_2(input clock,
                      input clear, input x, input y, input z,
                      output reg s, output reg c);
    // This should reduce to two parallel LUT4s, one for each output.
    //
    // A non-blocking assignment is required here: the registered outputs
    // of one cell are wired to the inputs of other clocked logic, so all
    // registers must sample their inputs before any of them updates.
    // With a blocking assignment the simulation result would depend on
    // the order in which the simulator happens to run the always blocks.
    always @(posedge clock)
        {c, s} <= (x + y + z) & ~{2{clear}};
endmodule
// Bit-serial 32x32 multiplier core built from a row of 32 registered
// adder cells (add_1x1x1_to_2) operating in carry-save fashion.
//
// Operation, as implemented below:
//   * A clock edge with `start` high latches the multiplicand `x`, loads
//     the shift register with y[31:1], forms the first partial-product
//     row (x AND y[0]) and synchronously clears every adder cell.
//   * On each following edge (start low) every cell adds its
//     partial-product bit, its own previous carry and the previous sum
//     bit of the cell one position above it.  The sum bit leaving cell 0
//     is shifted into the top of the shift register, and the bit leaving
//     the bottom of the shift register selects the next partial-product
//     row.
//
// Outputs:
//   za_o - {0, sum bits of cells 31..0, shift register} (bit 63 is always 0)
//   zb_o - the 32 carry bits held in the adder row
//
// NOTE(review): the sum and carry rows are not added together in this
// module.  Presumably the instantiating module forms the final product as
// za_o + (zb_o << 32) once 32 clock edges have passed after the start
// edge -- confirm against the caller.  Nothing here freezes the state, so
// the outputs keep changing if clocking continues.
module mul32x32ser_helper(clock, start, x, y, za_o, zb_o);
    input         clock, start;
    input  [31:0] x;
    input  [31:0] y;
    output [63:0] za_o;
    output [31:0] zb_o;

    reg  [31:0] mcand;   // multiplicand, held for the whole multiplication
    reg  [31:0] pp_row;  // current partial-product row fed to the adders
    reg  [30:0] shreg;   // multiplier bits still to use / product bits so far

    // sum[i] is the registered sum output of adder cell i.  Cell i consumes
    // sum[i+1], which realises the one-bit right shift per cycle; sum[32]
    // is the constant zero entering at the top of the row.
    wire [32:0] sum;
    wire [31:0] carry;

    assign sum[32] = 1'b0;

    always @(posedge clock) begin
        if (start) begin
            mcand  <= x;
            shreg  <= y[31:1];
            pp_row <= x & {32{y[0]}};
        end else begin
            shreg  <= {sum[0], shreg[30:1]};
            pp_row <= mcand & {32{shreg[0]}};
        end
    end

    // One adder cell per bit position; `start` doubles as the cells'
    // synchronous clear.  Each cell feeds its carry output back into its
    // own y input.
    genvar i;
    generate
        for (i = 0; i < 32; i = i + 1) begin : slice
            add_1x1x1_to_2 m(clock, start,
                             pp_row[i], carry[i], sum[i + 1],
                             sum[i], carry[i]);
        end
    endgenerate

    // 1 + 32 + 31 = 64 bits; the leading zero makes the padding explicit.
    assign za_o = {1'b0, sum[31:0], shreg};
    assign zb_o = carry;
endmodule
_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)