Below is a C program that implements FP multiply and add/sub. If
we're optimizing for graphics, we can reduce some of the precision.
If we're optimizing for GPGPU, we want to do it properly. Since this
is meant to be a configurable architecture, I recommend that we
actually do both. Proper rounding is done for add/sub. However, I
haven't quite finished the multiplier, so it's all sorts of broken.
(For instance, it isn't appropriate to just perform the multiply and
throw away the lower bits. 0xFF * 0xFF is 0xFE01, so we can do that.
But 0xFF * 0x1 is 0xFF, and we don't want to just throw the number
away.)
#include <stdio.h>
struct float32 {
unsigned int mant : 23;
unsigned int expt : 8;
unsigned int sign : 1;
};
void float_mult(float32& a, float32& b, float32& out)
{
// When multiplying, the new exponent is the sum of the operand exponents
int expt; // 9 bits
expt = a.expt + b.expt - 127;
// Denormalize. In hardware, denormalized numbers would be optimized away,
// leaving only normalized numbers and zero as the only supported denorm.
// Note that we may nevertheless not be able to completely
optimize this out.
// This gives us two 24-bit numbers
int am = a.mant;
if (a.expt) am |= (1<<23);
int bm = b.mant;
if (b.expt) bm |= (1<<23);
/**** Pipeline register here ***/
// Perform the actual multiply of the mantissas.
// 24 bits times 24 bits gives us 48 bits. Then we throw away the lower 22.
// In hardware, a shift requires no logic.
// Note that this multiplier will definitely need to be pipelined
// into at least two stages.
unsigned long long mant; // 48 bits then 26
mant = ((unsigned long long)am * (unsigned long long)bm) >> 22;
int sign = a.sign ^ b.sign;
// We're multiplying two numbers that are in the format of "1.23".
// This will give us a result that is in the format "2.46".
// We shift off the lower 22 bits, giving us "2.24", and then we may
// have to shift right by one.
/**** Pipeline register here ***/
// Round here. Although the result will always be "2.24", rounding
// May increment the value to the left of the radix
// This leaves us with a 2.23 format
mant = (mant>>1) + (mant&1);
/**** Pipeline register here ***/
// See if we have to shift right by 1
if (mant & (1<<24)) {
printf("Right shift\n");
mant = (mant>>1) + (mant&1);
expt++;
}
// mant now in 1.23 format.
/**** Pipeline register here ***/
// Now, if the exponent is too small, or the mantissa doesn't have
// 1 in bit position 23, we underflow to zero
if (!(mant & (1<<23)) || expt < 1) {
expt = 0;
mant = 0;
}
// If the exponent is too large, overflow to infinity
// No support for NaN
if (expt >= 255) {
expt = 255;
mant = 0;
}
out.sign = sign;
out.expt = expt;
out.mant = mant;
/**** Always register your outputs ***/
}
void float_addsub(float32 a, float32 b, float32& out, int sub)
{
// Compute whether or not this is a subtraction.
// Subtracting a negative is an add, etc.
if (sub) b.sign ^= 1;
sub = a.sign ^ b.sign;
/*** Pipeline register here (when the differences are put above) ***/
// We want B to be smaller than A, so we're going to perform
// a swap if this isn't true.
// Some facts impacting the hardware design:
// To compute A < B, we need to subtract and detect negative.
// To compare A == B, we need to subtract and detect zero.
// Later, we need to know the difference between A and B to
// know how much to shift.
// Therefore, the best solution is to computer both A-B and B-A
// in the previous pipeline stage. Then A<B is when A-B is negative,
// A==B is when neither is negative, and then we can pass whichever
// difference is positive down the pipeline for the shift.
if (a.expt < b.expt || (a.expt == b.expt && a.mant < b.mant)) {
float32 x;
x = a;
a = b;
b = x;
}
/**** Pipeline register here ***/
// Denormalize. We actually keep three extra bits of precision.
// I added this just to make this match IEEE, but we'll optimize it out
// for GPU hardware.
int am, bm;
am = a.mant<<3;
if (a.expt) am |= 1<<26;
bm = b.mant<<3;
if (b.expt) bm |= 1<<26;
printf("am=%x\n", am);
printf("bm=%x\n", bm);
/**** Pipeline register here ***/
int ax, bx;
ax = a.expt;
bx = b.expt;
// Shift dn_b to have same exp as dn_a
// In hardware, we'd pre-compute the shift and do a barrel shift.
while (bx < ax) {
bx++;
// Right shift with sticky bit
bm = (bm>>1) | (bm&1);
printf("Shift bm right\n");
}
printf("bm=%x\n", bm);
/**** Pipeline register here ***/
// Sums of positive numbers may yield a result with one bit more on
// the left.
// But subtractions of two very close numbers can leave LOTS of zeroes
// on the left. We could detect which is which and make these two
// bits of logic parallel, but it would probably be better to make
// them two consecutive pipeline stages.
// So this may be pipelined to at least two stages.
int mant;
int expt = ax;
if (sub) {
mant = am - bm;
printf("mant=%x\n", mant);
// Can compute the actual shift in a prior pipeline stage.
// For a difference, may require any number of left shifts
while (mant && expt && !(mant & (1<<26))) {
printf("Left shift\n");
mant <<= 1;
expt--;
}
if (!expt) {
mant = 0;
} else
if (!mant) { // parallel
expt = 0;
}
} else {
mant = am + bm;
printf("mant=%x\n", mant);
// For a sum, may need a shift right by 1
if (mant & (1<<27)) {
if (expt >= 255) {
expt = 255;
mant = 0;
} else {
printf("Right shift\n");
mant >>= 1;
expt++;
}
}
}
/**** Pipeline register here ***/
// Rounding:
// Round up if grf bits are greater than half
// Round down if grf bits are less than half
// Half way case:
// Round up if LSB is set
// Round down if LSB is clear
// In hardware, we may simplify this.
int round = ((mant&7)>4) || (((mant>>2)&1) && ((mant>>3)&1));
mant = (mant>>3) + round;
printf("mant=%x\n", mant);
/**** Pipeline register here ***/
// The act of rounding may propagate an additional 1 bit on the left.
// This correction can only occur if the mantissa was all 1's
// from the round bit all the way up. Therefore, it will only be a
// shift.
while (mant & (1<<24)) {
printf("rounding shift right\n");
mant >>= 1;
expt++;
}
out.sign = a.sign;
out.expt = expt;
out.mant = mant;
/**** Always register your outputs ***/
}
#define TO_FLOAT(x) (*(float*)&(x))
#define FROM_FLOAT(x) (*(float32*)&(x))
void print_float32(float32& x)
{
printf("%d %d %d\n", x.sign, x.expt, x.mant);
}
void print_binary(int x)
{
for (int i=0; i<32; i++) {
printf("%d", x<0);
x<<=1;
}
printf("\n");
}
int main()
{
float a, b, d;
float32 c;
a = 1;
// printf("%f: ", a); print_binary(*(int*)&a);
// b = 0.99999994;
b = 1;
// printf("%f: ", b); print_binary(*(int*)&b);
float32 a32, b32;
a32 = FROM_FLOAT(a);
a32.mant = -1;
// a32.expt = 0;
a = TO_FLOAT(a32);
printf("%f: ", TO_FLOAT(a32)); print_binary(*(int*)&a32);
b32 = FROM_FLOAT(b);
// b32.mant = 1;
// b32.expt = 0;
b32.expt = 127-24;
b = TO_FLOAT(b32);
printf("%f: ", TO_FLOAT(b32)); print_binary(*(int*)&b32);
float_addsub(FROM_FLOAT(a), FROM_FLOAT(b), c, 0);
//float_mult(a32, b32, c);
printf("\nComputed %f\n", TO_FLOAT(c));
print_float32(c);
print_binary(*(int*)&c);
d = a+b;
printf("\nExpecting %f\n", d);
print_float32(FROM_FLOAT(d));
print_binary(*(int*)&d);
return 0;
}
--
Timothy Normand Miller
http://www.cse.ohio-state.edu/~millerti
Open Graphics Project
_______________________________________________
Open-graphics mailing list
[email protected]
http://lists.duskglow.com/mailman/listinfo/open-graphics
List service provided by Duskglow Consulting, LLC (www.duskglow.com)