[Open-graphics] Combined integer/FP compute backend

Timothy Normand Miller Sun, 13 Jan 2013 12:53:50 -0800

Got all the int and FP stuff squished together into one pipeline.  THIS is
what we need to write tests for.  This version doesn't handle denormals
properly, but we can write the tests anyway.  What you're missing is the
4-stage multiplier block; I've made some tweaks to that, so I'll just post
it to the list for now (next email).




`include "defines.v"


/*
    TODO:
    Enables for clock gating
*/


// Stages after fetch and decode.

/*
 * oga2_compute_pipeline
 *
 * Combined integer / floating-point execute pipeline (the stages after
 * fetch and decode).  Ten register stages separate the inputs from the
 * registered outputs.
 *
 * Ports:
 *   clock          - all pipeline registers are clocked on the rising edge.
 *   reset          - not referenced anywhere in this module body.
 *   operandA/B     - 32-bit source operands (integer or FP bit pattern).
 *   operation_in   - per-operation control flags.  Bits referenced here:
 *                    INT_SIGN_EXT, INT_MUL, INT_SHL, INT_SHR, INT_SUB,
 *                    INT_CARRY_IN, INT_ADDSUB, INT_LOGICAL, PRED_VALUE,
 *                    FP_MUL, FP_SUB, FP_ADDSUB.
 *   result         - 32-bit result, valid together with operation_out.
 *   flags          - FLAG_CARRY / NEGATIVE / ZERO / OVERFLOW / UNDERFLOW;
 *                    all other bits are driven to 0.
 *   operation_out  - operation_in delayed through the pipeline.  Note that
 *                    the FP_SUB bit is rewritten in stage 3 to the
 *                    effective add/sub after the operand signs are folded
 *                    in, so it may differ from the FP_SUB bit supplied.
 *
 * Signal naming: a "_sN" suffix means the signal is the registered output
 * of stage N, i.e. it is valid as an input to stage N+1.
 *
 * NOTE(review): EXPONENT_SIZE, NUM_OP_FLAGS, etc. are used without a
 * leading backtick.  If "defines.v" provides them as `define macros they
 * must be written `NAME -- confirm against defines.v.
 *
 * NOTE(review): every sub-module instantiated here takes .clock, and the
 * stage bookkeeping below presumes each contributes the register delay
 * stated in the comments -- confirm against the sub-module sources.
 */
module oga2_compute_pipeline(
    input clock,
    input reset,
    input [31:0] operandA,
    input [31:0] operandB,
    input [NUM_OP_FLAGS-1:0] operation_in,
    output reg [31:0] result,
    output reg [7:0] flags,
    output reg [NUM_OP_FLAGS-1:0] operation_out);


/**** STAGE 1:  comparison, denormalize ****/

// Exponent differences in both directions, plus which exponent is smaller.
wire [EXPONENT_SIZE-1:0] expA_m_expB_s1;
wire [EXPONENT_SIZE-1:0] expB_m_expA_s1;
wire A_lt_B_s1;
wire B_lt_A_s1;

sub_exponent subex(
    .clock(clock),
    .expA(operandA[EXPONENT]),
    .expB(operandB[EXPONENT]),
    .expA_m_expB(expA_m_expB_s1),
    .expB_m_expA(expB_m_expA_s1),
    .A_lt_B(A_lt_B_s1),
    .B_lt_A(B_lt_A_s1));

// Mantissa comparison, used by the swap when the exponents do not decide.
wire A_lt_B_mant_s1;

lt_mantissa ltm(
    .clock(clock),
    .mantA(operandA[MANTISSA]),
    .mantB(operandB[MANTISSA]),
    .AltB(A_lt_B_mant_s1));

// Operands unpacked into the internal {sign, exponent, mantissa} format.
wire [DEN_FP_SIZE-1:0] operandA_s1;
wire [DEN_FP_SIZE-1:0] operandB_s1;

denormalize denA(
    .clock(clock),
    .A(operandA),
    .signA(operandA_s1[DEN_SIGN]),
    .expA(operandA_s1[DEN_EXPONENT]),
    .mantA(operandA_s1[DEN_MANTISSA]));

denormalize denB(
    .clock(clock),
    .A(operandB),
    .signA(operandB_s1[DEN_SIGN]),
    .expA(operandB_s1[DEN_EXPONENT]),
    .mantA(operandB_s1[DEN_MANTISSA]));


// Integer left shift: operandB is converted to a multiplier so that the
// shift can be carried out by the multiplier block.
wire [31:0] shift_multiplier_s1;
shift_one_hot soh(
    .clock(clock),
    .in(operandB),
    .out(shift_multiplier_s1));

// Propagate integer operands down with partial sign ext.
// Bit 32 is the sign bit when INT_SIGN_EXT is requested, otherwise 0.
reg [32:0] integerA_s1, integerB_s1;
always @(posedge clock) begin
    integerA_s1[31:0] <= operandA;
    integerA_s1[32] <= operandA[31] && operation_in[INT_SIGN_EXT];
    integerB_s1[31:0] <= operandB;
    integerB_s1[32] <= operandB[31] && operation_in[INT_SIGN_EXT];
end


// Propagate controls
reg [NUM_OP_FLAGS-1:0] operation_s1;
always @(posedge clock) begin
    operation_s1 <= operation_in;
end



/**** STAGE 2:  swap, mux, product exponent&sign ****/

wire [32:0] mantA_s2, mantB_s2;
wire signA_s2, signB_s2;
wire [DEN_EXPONENT_SIZE-1:0] expA_s2, expB_s2;

// Selects between the FP operands and the integer operands
// (for INT_MUL / INT_SHL) as inputs to the multiplier / adder paths.
// For INT_SHL the B input is the shift multiplier from shift_one_hot.
swap_mux swp(
    .clock(clock),
    .inA(operandA_s1),
    .inB(operandB_s1),
    .expA_lt_expB(A_lt_B_s1),
    .expB_lt_expA(B_lt_A_s1),
    .mantA_lt_mantB(A_lt_B_mant_s1),
    .integer_op(operation_s1[INT_MUL] || operation_s1[INT_SHL]),
    .intA(integerA_s1),
    .intB(operation_s1[INT_SHL] ? shift_multiplier_s1 : integerB_s1),
    .mantA(mantA_s2),
    .mantB(mantB_s2),
    .signA(signA_s2),
    .signB(signB_s2),
    .expA(expA_s2),
    .expB(expB_s2));

wire [DEN_EXPONENT_SIZE-1:0] prod_exponent_s2;
wire prod_sign_s2;
wire prod_underflow_s2;

product_exponents prodex(
    .clock(clock),
    .expA(operandA_s1[DEN_EXPONENT]),
    .expB(operandB_s1[DEN_EXPONENT]),
    .signA(operandA_s1[DEN_SIGN]),
    .signB(operandB_s1[DEN_SIGN]),
    .expC(prod_exponent_s2),
    .signC(prod_sign_s2),
    .underflow(prod_underflow_s2));

/*
Corner case not handled:
The exponent here is -1, but the mantissas are large enough that a right
shift of 1 will be required at the end.
*/

// Keep track of right shift: the (non-negative) exponent difference.
reg [EXPONENT_SIZE-1:0] rtshift_s2;
always @(posedge clock) begin
    rtshift_s2 <= A_lt_B_s1 ? expB_m_expA_s1 : expA_m_expB_s1;
end



// Integer addsub takes 6 stages, so result will be available for use in
// stage 8.  (The last stage is s7.)
wire [31:0] int_addsub_result_s7;
wire int_addsub_C_s7, int_addsub_V_s7, int_addsub_Z_s7;

// First stage of integer addsub
integer_addsub ias(
    .clock(clock),
    .A(integerA_s1[31:0]),
    .B(integerB_s1[31:0]),
    .sub(operation_s1[INT_SUB]),
    .carry_in(operation_s1[INT_CARRY_IN]),
    .out(int_addsub_result_s7),
    .C_out(int_addsub_C_s7),
    .V_out(int_addsub_V_s7),
    .Z_out(int_addsub_Z_s7));

// Integer right shift takes 5 stages, so result will be available for use
// in stage 7.  (The last stage is s6.)
wire [31:0] int_rsh_result_s6;
wire int_rsh_C_s6;

// First stage of integer right shift
int_right_shift irs(
    .clock(clock),
    .in(integerA_s1[31:0]),
    .shift_amt(integerB_s1[31:0]),
    .arith_shift(operation_s1[INT_SIGN_EXT]),
    .out(int_rsh_result_s6),
    .C_out(int_rsh_C_s6));


// Integer logical ops
wire [31:0] int_logical_result_s2;
logical_ops lops(
    .clock(clock),
    .predicate(operation_s1[PRED_VALUE]),
    .logical_op(operation_s1[INT_LOGICAL]),
    .A(integerA_s1[31:0]),
    .B(integerB_s1[31:0]),
    .C(int_logical_result_s2));


// Propagate other signals
reg [NUM_OP_FLAGS-1:0] operation_s2;
always @(posedge clock) begin
    operation_s2 <= operation_s1;
end


/**** STAGE 3:  First stage of mul, rsh for fp add ****/

// Finish sign extension: 33-bit multiplier inputs widened to 35 bits.
wire [34:0] mantA_s2_ext, mantB_s2_ext;
assign mantA_s2_ext[32:0] = mantA_s2;
assign mantA_s2_ext[34:33] = {2{mantA_s2[32]}};
assign mantB_s2_ext[32:0] = mantB_s2;
assign mantB_s2_ext[34:33] = {2{mantB_s2[32]}};

// Last stage of product is 6, and it will be valid as input to stage 7
wire [69:0] product_s6;

// First stage of multiply happens here
four_stage_signed_35x35_multiply fssmul(
    .clock(clock),
    .A(mantA_s2_ext),
    .B(mantB_s2_ext),
    .P(product_s6));

wire [GRS_MANTISSA_SIZE-1:0] mantB_s3;
wire signB_s3;

// Align B's mantissa to A's exponent for the FP add/sub path.
right_shift_mantissa rsm(
    .clock(clock),
    .inB({signB_s2, expB_s2, mantB_s2[DEN_MANTISSA_SIZE-1:0]}),
    .shift_amt(rtshift_s2),
    .expA(expA_s2),
    .mantB(mantB_s3),
    .signB(signB_s3));

// Propagate A, mux in product sign and exponent
reg [GRS_FP_SIZE-1:0] operandA_s3;
always @(posedge clock) begin
    if (operation_s2[FP_MUL]) begin
        operandA_s3[GRS_SIGN] <= prod_sign_s2;
        operandA_s3[GRS_EXPONENT] <= prod_exponent_s2;
    end else begin
        operandA_s3[GRS_SIGN] <= signA_s2;
        operandA_s3[GRS_EXPONENT] <= expA_s2;
    end
    // A is not shifted, so its three low-order extension bits are zero.
    // NOTE(review): the original referenced an undeclared "mantA_s3"; the
    // stage-2 mantissa is used here, sliced the same way as B's above.
    // Assumes GRS_MANTISSA_SIZE == DEN_MANTISSA_SIZE + 3 -- confirm.
    operandA_s3[GRS_MANTISSA] <= {mantA_s2[DEN_MANTISSA_SIZE-1:0], 3'b0};
end

// Propagate other signals
reg prod_underflow_s3;
reg [NUM_OP_FLAGS-1:0] operation_s3;
reg [31:0] int_logical_result_s3;
always @(posedge clock) begin
    operation_s3 <= operation_s2;
    prod_underflow_s3 <= prod_underflow_s2;
    int_logical_result_s3 <= int_logical_result_s2;

    // At this point, we can throw away the mantissa and sign for B,
    // except that we need to use B's sign to know if we're adding
    // or subtracting.  Right here, we alter the operation appropriately,
    // depending on the original operation and the signs of A and B.
    // (This later nonblocking assignment overrides the FP_SUB bit of the
    // whole-vector assignment above.)
    operation_s3[FP_SUB] <= operation_s2[FP_SUB] ^ signA_s2 ^ signB_s2;
end



/**** STAGE 4:  add/sub mantissas ****/

reg signC_s4;
wire [GRS_MANTISSA_SIZE-1:0] mantC_s4;
reg [GRS_EXPONENT_SIZE-1:0] expC_s4;

addsub_mantissa asm(
    .clock(clock),
    .mantA(operandA_s3[GRS_MANTISSA]),
    .mantB(mantB_s3),
    .sub(operation_s3[FP_SUB]),
    .sum(mantC_s4));

always @(posedge clock) begin
    signC_s4 <= operandA_s3[GRS_SIGN];
    expC_s4 <= operandA_s3[GRS_EXPONENT];
end

// Propagate other signals
reg prod_underflow_s4;
reg [NUM_OP_FLAGS-1:0] operation_s4;
reg [31:0] int_logical_result_s4;
always @(posedge clock) begin
    operation_s4 <= operation_s3;
    prod_underflow_s4 <= prod_underflow_s3;
    int_logical_result_s4 <= int_logical_result_s3;
end



/**** STAGE 5:  Compute the shift for renormalizing the result  ****/

wire [4:0] left_shift_s5;
wire right_shift_s5;

compute_shift comsh(
    .clock(clock),
    .inA(mantC_s4),
    .max_shift(expC_s4),
    .left(left_shift_s5),
    .right(right_shift_s5));

reg [GRS_FP_SIZE-1:0] resultC_s5;
always @(posedge clock) begin
    resultC_s5[GRS_SIGN] <= signC_s4;
    resultC_s5[GRS_EXPONENT] <= expC_s4;
    resultC_s5[GRS_MANTISSA] <= mantC_s4;
end

// Propagate other signals
reg prod_underflow_s5;
reg [NUM_OP_FLAGS-1:0] operation_s5;
reg [31:0] int_logical_result_s5;
always @(posedge clock) begin
    operation_s5 <= operation_s4;
    prod_underflow_s5 <= prod_underflow_s4;
    int_logical_result_s5 <= int_logical_result_s4;
end



/**** STAGE 6:  perform normalization shift ****/

reg signC_s6;
wire [GRS_MANTISSA_SIZE-1:0] mantC_s6;
wire [GRS_EXPONENT_SIZE-1:0] expC_s6;

perform_shift persh(
    .clock(clock),
    .mantAin(resultC_s5[GRS_MANTISSA]),
    .expAin(resultC_s5[GRS_EXPONENT]),
    .left(left_shift_s5),
    .right(right_shift_s5),
    .mantAout(mantC_s6),
    .expAout(expC_s6));

always @(posedge clock) begin
    signC_s6 <= resultC_s5[GRS_SIGN];
end

/* last stage of multiply */
/* last stage of integer rsh */

// Propagate other signals
reg prod_underflow_s6;
reg [NUM_OP_FLAGS-1:0] operation_s6;
reg [31:0] int_logical_result_s6;
always @(posedge clock) begin
    operation_s6 <= operation_s5;
    prod_underflow_s6 <= prod_underflow_s5;
    int_logical_result_s6 <= int_logical_result_s5;
end



/**** STAGE 7:  mux in product and round result ****/

/*
How to interpret the product:
We fed in two 24-bit numbers into the multiplier, in 1.23 format.
This gives us a 48-bit result in 2.46 format.  We'll shift this right
by 20 to get the 2.26 format that the rounder wants.  Since the
largest product is 0xFFFFFE000001, it is not possible for rounding
to overflow by yet another bit.  The final stage will right-shift by
one if necessary.
*/

// Input to the rounder: the FP product for FP_MUL, otherwise the
// normalized add/sub mantissa.
wire [GRS_MANTISSA_SIZE-1:0] mant_into_s7;
assign mant_into_s7 =
    operation_s6[FP_MUL] ?
        product_s6[47:20] :
        mantC_s6;

wire [GRS_MANTISSA_SIZE-1:0] mantC_s7;
reg signC_s7;
reg [GRS_EXPONENT_SIZE-1:0] expC_s7;

round rnd(
    .clock(clock),
    .inA(mant_into_s7),
    .outA(mantC_s7));

always @(posedge clock) begin
    signC_s7 <= signC_s6;
    expC_s7 <= expC_s6;
end


/* Last stage of integer addsub */

// Integer product overflow: the upper half of the 64-bit product is not
// a pure sign extension of bit 31.
wire int_mult_overflow;
assign int_mult_overflow = product_s6[63:32] != {32{product_s6[31]}};

// The multiplier carries both integer multiply and integer left shift.
wire int_mul_or_shl_s6 = operation_s6[INT_MUL] || operation_s6[INT_SHL];

// Mux integer results (AND-OR mux; the addsub result joins in stage 8).
reg [31:0] int_result_s7;
reg int_C_s7;
reg int_V_s7;
always @(posedge clock) begin
    int_result_s7 <=
        ( int_rsh_result_s6 & {32{operation_s6[INT_SHR]}} ) |
        ( product_s6[31:0] & {32{int_mul_or_shl_s6}} ) |
        ( int_logical_result_s6 & {32{(0 != operation_s6[INT_LOGICAL])}} );
    int_C_s7 <=
        ( int_rsh_C_s6 & operation_s6[INT_SHR] ) |
        ( product_s6[32] & int_mul_or_shl_s6 );
    // NOTE(review): unlike the carry, V is not gated by the operation, so
    // it follows the multiplier output for SHR and logical ops as well --
    // confirm whether that is intended.
    int_V_s7 <= int_mult_overflow;
end


// Propagate other signals
reg prod_underflow_s7;
reg [NUM_OP_FLAGS-1:0] operation_s7;
always @(posedge clock) begin
    operation_s7 <= operation_s6;
    prod_underflow_s7 <= prod_underflow_s6;
end



/**** STAGE 8:  shift right if rounding caused an overflow ****/

wire [DEN_MANTISSA_SIZE-1:0] mantC_s8;
reg signC_s8;
wire [DEN_EXPONENT_SIZE-1:0] expC_s8;

round_overflow_shift rof(
    .clock(clock),
    .mantAin(mantC_s7),
    .expAin(expC_s7),
    .mantAout(mantC_s8),
    .expAout(expC_s8));

always @(posedge clock) begin
    signC_s8 <= signC_s7;
end


// Mux integer results: addsub (valid from s7) versus everything else.
reg [31:0] int_result_s8;
reg int_C_s8;
reg int_V_s8;
reg int_Z_s8;
always @(posedge clock) begin
    if (operation_s7[INT_ADDSUB]) begin
        int_result_s8 <= int_addsub_result_s7;
        int_C_s8 <= int_addsub_C_s7;
        int_V_s8 <= int_addsub_V_s7;
        int_Z_s8 <= int_addsub_Z_s7;
    end else begin
        int_result_s8 <= int_result_s7;
        int_C_s8 <= int_C_s7;
        int_V_s8 <= int_V_s7;
        int_Z_s8 <= (int_result_s7 == 0);
    end
end


// Propagate other signals
reg prod_underflow_s8;
reg [NUM_OP_FLAGS-1:0] operation_s8;
always @(posedge clock) begin
    operation_s8 <= operation_s7;
    prod_underflow_s8 <= prod_underflow_s7;
end



/**** STAGE 9:  renormalize ****/

wire [MANTISSA_SIZE-1:0] mantC_s9;
reg signC_s9;
wire [EXPONENT_SIZE-1:0] expC_s9;
wire overflow_s9, zero_s9;

renormalize ren(
    .clock(clock),
    .mantAin(mantC_s8),
    .expAin(expC_s8),
    .underflow(prod_underflow_s8),
    .mantAout(mantC_s9),
    .expAout(expC_s9),
    .overflow(overflow_s9),
    .zero(zero_s9));

// Carry the sign alongside the renormalized exponent/mantissa.  Without
// this register signC_s9 is never driven and the FP result sign and the
// NEGATIVE flag would be undefined.
always @(posedge clock) begin
    signC_s9 <= signC_s8;
end


reg [31:0] int_result_s9;
reg int_C_s9;
reg int_V_s9;
reg int_Z_s9;
always @(posedge clock) begin
    int_result_s9 <= int_result_s8;
    int_C_s9 <= int_C_s8;
    int_V_s9 <= int_V_s8;
    int_Z_s9 <= int_Z_s8;
end


// Propagate other signals
reg prod_underflow_s9;
reg [NUM_OP_FLAGS-1:0] operation_s9;
always @(posedge clock) begin
    operation_s9 <= operation_s8;
    prod_underflow_s9 <= prod_underflow_s8;
end



/**** STAGE 10:  mux integer and fp results ****/

wire fp_op  = operation_s9[FP_MUL] || operation_s9[FP_ADDSUB];
wire [31:0] fp_result = {signC_s9, expC_s9, mantC_s9};

always @(posedge clock) begin
    result <= fp_op ? fp_result : int_result_s9;

    // Default every flag bit to 0 so the bits that are not named below
    // are never left undriven; the bit assignments that follow override
    // this for the named flags.
    flags <= 8'b0;

    if (fp_op) begin
        flags[FLAG_CARRY] <= 0;
        flags[FLAG_NEGATIVE] <= signC_s9;
        flags[FLAG_ZERO] <= zero_s9;
        flags[FLAG_OVERFLOW] <= overflow_s9;
        flags[FLAG_UNDERFLOW] <= prod_underflow_s9;
    end else begin
        flags[FLAG_CARRY] <= int_C_s9;
        // Integer sign is the MSB of the result (int_C_s9 is a single
        // bit and cannot be indexed).
        flags[FLAG_NEGATIVE] <= int_result_s9[31];
        flags[FLAG_ZERO] <= int_Z_s9;
        flags[FLAG_OVERFLOW] <= int_V_s9;
        flags[FLAG_UNDERFLOW] <= 0;
    end

    operation_out <= operation_s9;
end


endmodule


-- 
Timothy Normand Miller, PhD
Assistant Professor of Computer Science, Binghamton University
http://www.cs.binghamton.edu/~millerti/
Open Graphics Project

_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)

[Open-graphics] Combined integer/FP compute backend

Reply via email to