[Open-graphics] How to do floating point math

Timothy Normand Miller Fri, 01 Jun 2012 10:29:15 -0700

Below is a C program that implements FP multiply and add/sub.  If
we're optimizing for graphics, we can reduce some of the precision.
If we're optimizing for GPGPU, we want to do it properly.  Since this
is meant to be a configurable architecture, I recommend that we
actually do both.  Proper rounding is done for add/sub.  However, I
haven't quite finished the multiplier, so it's all sorts of broken.
(For instance, it isn't appropriate to just perform the multiply and
throw away the lower bits.  0xFF * 0xFF is 0xFE01, so we can do that.
But 0xFF * 0x1 is 0xFF, and we don't want to just throw the number
away.)




#include <stdio.h>


struct float32 {
    unsigned int mant : 23;
    unsigned int expt : 8;
    unsigned int sign : 1;
};


void float_mult(float32& a, float32& b, float32& out)
{
    // When multiplying, the new exponent is the sum of the operand exponents
    int expt; // 9 bits
    expt = a.expt + b.expt - 127;

    // Denormalize.  In hardware, denormalized numbers would be optimized away,
    // leaving only normalized numbers and zero as the only supported denorm.
    // Note that we may nevertheless not be able to completely
optimize this out.
    // This gives us two 24-bit numbers
    int am = a.mant;
    if (a.expt) am |= (1<<23);
    int bm = b.mant;
    if (b.expt) bm |= (1<<23);

    /**** Pipeline register here ***/

    // Perform the actual multiply of the mantissas.
    // 24 bits times 24 bits gives us 48 bits.  Then we throw away the lower 22.
    // In hardware, a shift requires no logic.
    // Note that this multiplier will definitely need to be pipelined
    // into at least two stages.
    unsigned long long mant; // 48 bits then 26
    mant = ((unsigned long long)am * (unsigned long long)bm) >> 22;

    int sign = a.sign ^ b.sign;

    // We're multiplying two numbers that are in the format of "1.23".
    // This will give us a result that is in the format "2.46".
    // We shift off the lower 22 bits, giving us "2.24", and then we may
    // have to shift right by one.

    /**** Pipeline register here ***/

    // Round here.  Although the result will always be "2.24", rounding
    // May increment the value to the left of the radix
    // This leaves us with a 2.23 format
    mant = (mant>>1) + (mant&1);

    /**** Pipeline register here ***/

    // See if we have to shift right by 1
    if (mant & (1<<24)) {
        printf("Right shift\n");
        mant = (mant>>1) + (mant&1);
        expt++;
    }

    // mant now in 1.23 format.

    /**** Pipeline register here ***/

    // Now, if the exponent is too small, or the mantissa doesn't have
    // 1 in bit position 23, we underflow to zero
    if (!(mant & (1<<23)) || expt < 1) {
        expt = 0;
        mant = 0;
    }

    // If the exponent is too large, overflow to infinity
    // No support for NaN
    if (expt >= 255) {
        expt = 255;
        mant = 0;
    }

    out.sign = sign;
    out.expt = expt;
    out.mant = mant;

    /**** Always register your outputs ***/
}


void float_addsub(float32 a, float32 b, float32& out, int sub)
{
    // Compute whether or not this is a subtraction.
    // Subtracting a negative is an add, etc.
    if (sub) b.sign ^= 1;
    sub = a.sign ^ b.sign;

    /*** Pipeline register here (when the differences are put above) ***/

    // We want B to be smaller than A, so we're going to perform
    // a swap if this isn't true.
    // Some facts impacting the hardware design:
    // To compute A < B, we need to subtract and detect negative.
    // To compare A == B, we need to subtract and detect zero.
    // Later, we need to know the difference between A and B to
    // know how much to shift.
    // Therefore, the best solution is to computer both A-B and B-A
    // in the previous pipeline stage.  Then A<B is when A-B is negative,
    // A==B is when neither is negative, and then we can pass whichever
    // difference is positive down the pipeline for the shift.
    if (a.expt < b.expt || (a.expt == b.expt && a.mant < b.mant)) {
        float32 x;
        x = a;
        a = b;
        b = x;
    }

    /**** Pipeline register here ***/

    // Denormalize.  We actually keep three extra bits of precision.
    // I added this just to make this match IEEE, but we'll optimize it out
    // for GPU hardware.
    int am, bm;
    am = a.mant<<3;
    if (a.expt) am |= 1<<26;
    bm = b.mant<<3;
    if (b.expt) bm |= 1<<26;
    printf("am=%x\n", am);
    printf("bm=%x\n", bm);

    /**** Pipeline register here ***/

    int ax, bx;
    ax = a.expt;
    bx = b.expt;

    // Shift dn_b to have same exp as dn_a
    // In hardware, we'd pre-compute the shift and do a barrel shift.
    while (bx < ax) {
        bx++;
        // Right shift with sticky bit
        bm = (bm>>1) | (bm&1);
        printf("Shift bm right\n");
    }
    printf("bm=%x\n", bm);

    /**** Pipeline register here ***/

    // Sums of positive numbers may yield a result with one bit more on
    // the left.
    // But subtractions of two very close numbers can leave LOTS of zeroes
    // on the left.  We could detect which is which and make these two
    // bits of logic parallel, but it would probably be better to make
    // them two consecutive pipeline stages.
    // So this may be pipelined to at least two stages.
    int mant;
    int expt = ax;
    if (sub) {
        mant = am - bm;
        printf("mant=%x\n", mant);

        // Can compute the actual shift in a prior pipeline stage.

        // For a difference, may require any number of left shifts
        while (mant && expt && !(mant & (1<<26))) {
            printf("Left shift\n");
            mant <<= 1;
            expt--;
        }
        if (!expt) {
            mant = 0;
        } else
        if (!mant) { // parallel
            expt = 0;
        }
    } else {
        mant = am + bm;
        printf("mant=%x\n", mant);

        // For a sum, may need a shift right by 1
        if (mant & (1<<27)) {
            if (expt >= 255) {
                expt = 255;
                mant = 0;
            } else {
                printf("Right shift\n");
                mant >>= 1;
                expt++;
            }
        }
    }

    /**** Pipeline register here ***/

    // Rounding:
    // Round up if grf bits are greater than half
    // Round down if grf bits are less than half
    // Half way case:
    //   Round up if LSB is set
    //   Round down if LSB is clear
    // In hardware, we may simplify this.
    int round = ((mant&7)>4) || (((mant>>2)&1) && ((mant>>3)&1));
    mant = (mant>>3) + round;
    printf("mant=%x\n", mant);

    /**** Pipeline register here ***/

    // The act of rounding may propagate an additional 1 bit on the left.
    // This correction can only occur if the mantissa was all 1's
    // from the round bit all the way up.  Therefore, it will only be a
    // shift.
    while (mant & (1<<24)) {
        printf("rounding shift right\n");
        mant >>= 1;
        expt++;
    }

    out.sign = a.sign;
    out.expt = expt;
    out.mant = mant;

    /**** Always register your outputs ***/
}

#define TO_FLOAT(x) (*(float*)&(x))
#define FROM_FLOAT(x) (*(float32*)&(x))


void print_float32(float32& x)
{
    printf("%d %d %d\n", x.sign, x.expt, x.mant);
}

void print_binary(int x)
{
    for (int i=0; i<32; i++) {
        printf("%d", x<0);
        x<<=1;
    }
    printf("\n");
}

int main()
{
    float a, b, d;
    float32 c;

    a = 1;
//    printf("%f: ", a); print_binary(*(int*)&a);
//    b = 0.99999994;
    b = 1;
//    printf("%f: ", b); print_binary(*(int*)&b);

    float32 a32, b32;
    a32 = FROM_FLOAT(a);
    a32.mant = -1;
//    a32.expt = 0;
    a = TO_FLOAT(a32);
    printf("%f: ", TO_FLOAT(a32)); print_binary(*(int*)&a32);
    b32 = FROM_FLOAT(b);
//    b32.mant = 1;
//    b32.expt = 0;
    b32.expt = 127-24;
    b = TO_FLOAT(b32);
    printf("%f: ", TO_FLOAT(b32)); print_binary(*(int*)&b32);


    float_addsub(FROM_FLOAT(a), FROM_FLOAT(b), c, 0);
    //float_mult(a32, b32, c);

    printf("\nComputed %f\n", TO_FLOAT(c));
    print_float32(c);
    print_binary(*(int*)&c);

    d = a+b;
    printf("\nExpecting %f\n", d);
    print_float32(FROM_FLOAT(d));
    print_binary(*(int*)&d);

    return 0;
}


-- 
Timothy Normand Miller
http://www.cse.ohio-state.edu/~millerti
Open Graphics Project
_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)

[Open-graphics] How to do floating point math

Reply via email to