[Open-graphics] More code: Floating point portion of the pipeline

Timothy Normand Miller Tue, 08 Jan 2013 13:25:04 -0800

I had to make changes to the other code I posted earlier, but here's the
module that glues all that together.  I'm sure I've made buckets of
mistakes, and there's at least one edge case not handled.






`include "defines.v"


/*
    TODO:
    Enables for clock gating
*/


// Stages after fetch and decode.

// oga2_compute_pipeline
//
// Floating-point compute portion of the pipeline: glues together the
// exponent-compare, denormalize, swap, multiply, right-shift, add/sub,
// normalize, round and renormalize sub-modules and carries the control
// flags alongside the data from stage to stage.
//
// Ports:
//   clock          - clock for every stage register and sub-module.
//   reset          - declared but not referenced anywhere in this module.
//   operandA/B     - 32-bit input operands.
//   operation_in   - per-operation flag bits (indexed by FP_MUL, FP_SUB).
//   result         - {sign, exponent, mantissa} from stage 9.
//   flags          - {product underflow, overflow, 6'b0}.
//   operation_out  - operation_in after nine register stages.  Note that
//                    the FP_SUB bit is rewritten in stage 3 (see below),
//                    so it is not a pure delayed copy of the input.
//
// NOTE(review): the width/field names (NUM_OP_FLAGS, EXPONENT, DEN_*,
// GRS_*, FP_MUL, FP_SUB, ...) are used without a leading backtick.  If
// "defines.v" provides them as `define macros they must be written as
// `NAME; confirm how they are brought into scope.
module oga2_compute_pipeline(
    input clock,
    input reset,
    input [31:0] operandA,
    input [31:0] operandB,
    input [NUM_OP_FLAGS-1:0] operation_in,
    output [31:0] result,
    output [7:0] flags,
    output [NUM_OP_FLAGS-1:0] operation_out);


/**** STAGE 1:  comparison, denormalize ****/

// Exponent differences in both directions plus the two comparison flags.
wire [EXPONENT_SIZE-1:0] expA_m_expB_s1;
wire [EXPONENT_SIZE-1:0] expB_m_expA_s1;
wire A_lt_B_s1;
wire B_lt_A_s1;

sub_exponent subex(
    .clock(clock),
    .expA(operandA[EXPONENT]),
    .expB(operandB[EXPONENT]),
    .expA_m_expB(expA_m_expB_s1),
    .expB_m_expA(expB_m_expA_s1),
    .A_lt_B(A_lt_B_s1),
    .B_lt_A(B_lt_A_s1));

// Mantissa comparison, consumed by the swap mux in stage 2.
wire A_lt_B_mant_s1;

lt_mantissa ltm(
    .clock(clock),
    .mantA(operandA[MANTISSA]),
    .mantB(operandB[MANTISSA]),
    .AltB(A_lt_B_mant_s1));

// Both operands unpacked into the DEN_* sign/exponent/mantissa layout.
wire [DEN_FP_SIZE-1:0] operandA_s1;
wire [DEN_FP_SIZE-1:0] operandB_s1;

denormalize denA(
    .clock(clock),
    .A(operandA),
    .signA(operandA_s1[DEN_SIGN]),
    .expA(operandA_s1[DEN_EXPONENT]),
    .mantA(operandA_s1[DEN_MANTISSA]));

denormalize denB(
    .clock(clock),
    .A(operandB),
    .signA(operandB_s1[DEN_SIGN]),
    .expA(operandB_s1[DEN_EXPONENT]),
    .mantA(operandB_s1[DEN_MANTISSA]));

// Insert integer stages
// one-hot for left shift
// sign extend for multiply and ASR


// Propagate controls
reg [NUM_OP_FLAGS-1:0] operation_s1;
always @(posedge clock) begin
    // Capture the module input.  (The port is named operation_in; there
    // is no signal called "operation" in this module.)
    operation_s1 <= operation_in;
end



/**** STAGE 2:  swap, mux, product exponent&sign ****/

wire [32:0] mantA_s2, mantB_s2;
wire signA_s2, signB_s2;
wire [DEN_EXPONENT_SIZE-1:0] expA_s2, expB_s2;

// Integer path is tied off for now (integer_op = 0, intA = intB = 0).
swap_mux swp(
    .clock(clock),
    .inA(operandA_s1),
    .inB(operandB_s1),
    .expA_lt_expB(A_lt_B_s1),
    .expB_lt_expA(B_lt_A_s1),
    .mantA_lt_mantB(A_lt_B_mant_s1),
    .integer_op(1'b0),
    .intA(0),
    .intB(0),
    .mantA(mantA_s2),
    .mantB(mantB_s2),
    .signA(signA_s2),
    .signB(signB_s2),
    .expA(expA_s2),
    .expB(expB_s2));

wire [DEN_EXPONENT_SIZE-1:0] prod_exponent_s2;
wire prod_sign_s2;
wire prod_underflow_s2;

// Sign and exponent of the product, computed from the un-swapped operands.
product_exponents prodex(
    .clock(clock),
    .expA(operandA_s1[DEN_EXPONENT]),
    .expB(operandB_s1[DEN_EXPONENT]),
    .signA(operandA_s1[DEN_SIGN]),
    .signB(operandB_s1[DEN_SIGN]),
    .expC(prod_exponent_s2),
    .signC(prod_sign_s2),
    .underflow(prod_underflow_s2));

/*
Corner case not handled:
The exponent here is -1, but the mantissas are large enough that a right
shift of 1 will be required at the end.
*/

// Keep track of right shift: pick whichever exponent difference goes with
// the comparison result from stage 1.
reg [EXPONENT_SIZE-1:0] rtshift_s2;
always @(posedge clock) begin
    rtshift_s2 <= A_lt_B_s1 ? expB_m_expA_s1 : expA_m_expB_s1;
end

// Propagate other signals
reg [NUM_OP_FLAGS-1:0] operation_s2;
always @(posedge clock) begin
    operation_s2 <= operation_s1;
end



/**** STAGE 3:  First stage of mul, rsh for fp add ****/

// Last stage of product is 6, and it will be valid as input to stage 7
wire [69:0] product_s6;

// First stage of multiply happens here
// NOTE(review): mantA_s2/mantB_s2 are 33 bits wide while the multiplier is
// named 35x35 with a 70-bit product; confirm the intended port widths.
four_stage_signed_35x35_multiply fssmul(
    .clock(clock),
    .A(mantA_s2),
    .B(mantB_s2),
    .P(product_s6));

wire [GRS_MANTISSA_SIZE-1:0] mantB_s3;
wire signB_s3;

right_shift_mantissa rsm(
    .clock(clock),
    .inB({signB_s2, expB_s2, mantB_s2[DEN_MANTISSA_SIZE-1:0]}),
    .shift_amt(rtshift_s2),
    .expA(expA_s2),
    .mantB(mantB_s3),
    .signB(signB_s3));

// Propagate A, mux in product sign and exponent
reg [GRS_FP_SIZE-1:0] operandA_s3;
always @(posedge clock) begin
    if (operation_s2[FP_MUL]) begin
        operandA_s3[GRS_SIGN] <= prod_sign_s2;
        operandA_s3[GRS_EXPONENT] <= prod_exponent_s2;
    end else begin
        operandA_s3[GRS_SIGN] <= signA_s2;
        operandA_s3[GRS_EXPONENT] <= expA_s2;
    end
    // A's mantissa comes from the stage-2 swap output, padded with three
    // zero low-order bits.  The slice mirrors the one used for B into rsm
    // above.
    // NOTE(review): assumes GRS_MANTISSA_SIZE == DEN_MANTISSA_SIZE + 3;
    // confirm against defines.v.
    operandA_s3[GRS_MANTISSA] <= {mantA_s2[DEN_MANTISSA_SIZE-1:0], 3'b0};
end

// Propagate other signals
reg prod_underflow_s3;
reg [NUM_OP_FLAGS-1:0] operation_s3;
always @(posedge clock) begin
    operation_s3 <= operation_s2;
    prod_underflow_s3 <= prod_underflow_s2;

    // At this point, we can throw away the mantissa and sign for B,
    // except that we need to use B's sign to know if we're adding
    // or subtracting.  Right here, we alter the operation appropriately,
    // depending on the original operation and the signs of A and B.
    // (This later non-blocking assignment overrides the FP_SUB bit of the
    // whole-vector copy above.)
    operation_s3[FP_SUB] <= operation_s2[FP_SUB] ^ signA_s2 ^ signB_s2;
end



/**** STAGE 4:  add/sub mantissas ****/

reg signC_s4;
wire [GRS_MANTISSA_SIZE-1:0] mantC_s4;
reg [GRS_EXPONENT_SIZE-1:0] expC_s4;

addsub_mantissa asm(
    .clock(clock),
    .mantA(operandA_s3[GRS_MANTISSA]),
    .mantB(mantB_s3),
    .sub(operation_s3[FP_SUB]),
    .sum(mantC_s4));

// Sign and exponent of the result follow operand A.
always @(posedge clock) begin
    signC_s4 <= operandA_s3[GRS_SIGN];
    expC_s4 <= operandA_s3[GRS_EXPONENT];
end

// Propagate other signals
reg prod_underflow_s4;
reg [NUM_OP_FLAGS-1:0] operation_s4;
always @(posedge clock) begin
    operation_s4 <= operation_s3;
    prod_underflow_s4 <= prod_underflow_s3;
end



/**** STAGE 5:  Compute the shift for renormalizing the result  ****/

wire [4:0] left_shift_s5;
wire right_shift_s5;

compute_shift comsh(
    .clock(clock),
    .inA(mantC_s4),
    .max_shift(expC_s4),
    .left(left_shift_s5),
    .right(right_shift_s5));

// Carry the unshifted value along so stage 6 can apply the shift.
reg [GRS_FP_SIZE-1:0] resultC_s5;
always @(posedge clock) begin
    resultC_s5[GRS_SIGN] <= signC_s4;
    resultC_s5[GRS_EXPONENT] <= expC_s4;
    resultC_s5[GRS_MANTISSA] <= mantC_s4;
end

// Propagate other signals
reg prod_underflow_s5;
reg [NUM_OP_FLAGS-1:0] operation_s5;
always @(posedge clock) begin
    operation_s5 <= operation_s4;
    prod_underflow_s5 <= prod_underflow_s4;
end



/**** STAGE 6:  perform normalization shift ****/

reg signC_s6;
wire [GRS_MANTISSA_SIZE-1:0] mantC_s6;
wire [GRS_EXPONENT_SIZE-1:0] expC_s6;

perform_shift persh(
    .clock(clock),
    .mantAin(resultC_s5[GRS_MANTISSA]),
    .expAin(resultC_s5[GRS_EXPONENT]),
    .left(left_shift_s5),
    .right(right_shift_s5),
    .mantAout(mantC_s6),
    .expAout(expC_s6));

always @(posedge clock) begin
    signC_s6 <= resultC_s5[GRS_SIGN];
end

/* last stage of multiply */

// Propagate other signals
reg prod_underflow_s6;
reg [NUM_OP_FLAGS-1:0] operation_s6;
always @(posedge clock) begin
    operation_s6 <= operation_s5;
    prod_underflow_s6 <= prod_underflow_s5;
end



/**** STAGE 7:  mux in product and round result ****/

/*
How to interpret the product:
We fed in two 24-bit numbers into the multiplier, in 1.23 format.
This gives us a 48-bit result in 2.46 format.  We'll shift this right
by 20 to get the 2.26 format that the rounder wants.  Since the
largest product is 0xFFFFFE000001, it is not possible for rounding
to overflow by yet another bit.  The final stage will right-shift by
one if necessary.
*/

// Input to the stage-7 rounder: the multiplier output for FP_MUL,
// otherwise the normalized add/sub mantissa.
wire [GRS_MANTISSA_SIZE-1:0] mant_into_s7;
assign mant_into_s7 =
    operation_s6[FP_MUL] ?
        product_s6[47:20] :
        mantC_s6;

wire [GRS_MANTISSA_SIZE-1:0] mantC_s7;
reg signC_s7;
reg [GRS_EXPONENT_SIZE-1:0] expC_s7;

round rnd(
    .clock(clock),
    .inA(mant_into_s7),
    .outA(mantC_s7));

always @(posedge clock) begin
    signC_s7 <= signC_s6;
    // Exponent is carried from the stage-6 shifter output.
    expC_s7 <= expC_s6;
end

// Propagate other signals
reg prod_underflow_s7;
reg [NUM_OP_FLAGS-1:0] operation_s7;
always @(posedge clock) begin
    operation_s7 <= operation_s6;
    prod_underflow_s7 <= prod_underflow_s6;
end



/**** STAGE 8:  shift right if rounding caused an overflow ****/

wire [DEN_MANTISSA_SIZE-1:0] mantC_s8;
reg signC_s8;
wire [DEN_EXPONENT_SIZE-1:0] expC_s8;

round_overflow_shift rof(
    .clock(clock),
    .mantAin(mantC_s7),
    .expAin(expC_s7),
    .mantAout(mantC_s8),
    .expAout(expC_s8));

always @(posedge clock) begin
    signC_s8 <= signC_s7;
end

// Propagate other signals
reg prod_underflow_s8;
reg [NUM_OP_FLAGS-1:0] operation_s8;
always @(posedge clock) begin
    operation_s8 <= operation_s7;
    prod_underflow_s8 <= prod_underflow_s7;
end



/**** STAGE 9:  renormalize ****/

wire [MANTISSA_SIZE-1:0] mantC_s9;
reg signC_s9;
wire [EXPONENT_SIZE-1:0] expC_s9;
wire overflow_s9;

renormalize ren(
    .clock(clock),
    .mantAin(mantC_s8),
    .expAin(expC_s8),
    .underflow(prod_underflow_s8),
    .mantAout(mantC_s9),
    .expAout(expC_s9),
    .overflow(overflow_s9));

// Propagate other signals
reg prod_underflow_s9;
reg [NUM_OP_FLAGS-1:0] operation_s9;
always @(posedge clock) begin
    operation_s9 <= operation_s8;
    prod_underflow_s9 <= prod_underflow_s8;
    // Carry the sign through the final stage so the sign bit of result
    // is actually driven.
    signC_s9 <= signC_s8;
end


/**** Output ****/

assign result = {signC_s9, expC_s9, mantC_s9};
assign flags  = {prod_underflow_s9, overflow_s9, 6'b0};
assign operation_out = operation_s9;

endmodule




-- 
Timothy Normand Miller, PhD
Assistant Professor of Computer Science, Binghamton University
http://www.cs.binghamton.edu/~millerti/<http://www.cse.ohio-state.edu/~millerti>
Open Graphics Project

_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)

[Open-graphics] More code: Floating point portion of the pipeline

Reply via email to