Well Sean, here is the code I used to test the both the new and old algorithms. Actually I did not write all of it from scratch. But adapted the code to suit the test i needed to perform. Added a make file to enable compiler code easily.
Cheers! Nyah On Thu, Jul 25, 2013 at 7:50 PM, Christopher Sean Morrison <[email protected]>wrote: > > > On Jul 25, 2013, at 01:25 PM, Check Nyah <[email protected]> wrote: > > Hey Sean, > Just ran the tests comparing the old and new routines for computing the > determinant and which gave me the following results: > > > Excellent Nyah. That definitely looks like a nice improvement. Can you > attach the test you used? > > Also, were the numbers stable? That is, your two results are > approximately 65s vs 55s which is about 15% reduction in time. If you ran > the comparison a couple more times with different matrices, is always about > 15%? > > Cheers! > Sean > > p.s. Please remember about bottom-posting and deleting irrelevant portions. > > > ------------------------------------------------------------------------------ > See everything from the browser to the database with AppDynamics > Get end-to-end visibility with application monitoring from AppDynamics > Isolate bottlenecks and diagnose root cause in seconds. > Start your free trial of AppDynamics Pro today! > http://pubads.g.doubleclick.net/gampad/clk?id=48808831&iu=/4140/ostg.clktrk > _______________________________________________ > BRL-CAD Developer mailing list > [email protected] > https://lists.sourceforge.net/lists/listinfo/brlcad-devel > >
/**
*Determinant.c
*This is the benchmark test for the determinant
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "functions.h"
#define COUNT 1000000
/* CPU cycle counter -- available on x86 and x86-64 architectures.
*/
static inline uint64_t cpu_cycles(void)
{
register unsigned int hi, lo;
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return ((uint64_t)lo) | ((uint64_t)hi << 32U);
}
/* Xorshift pseudorandom number generator.
*/
static uint32_t xorshift_state[4] = { 123456789U, 362436069U, 521288629U, 88675123U };
static inline uint32_t xorshift(void){
uint32_t temp;
temp = xorshift_state[0] ^ (xorshift_state[0] << 11U);
xorshift_state[0] = xorshift_state[1];
xorshift_state[1] = xorshift_state[2];
xorshift_state[2] = xorshift_state[3];
return xorshift_state[3] ^= (temp >> 8U) ^ temp ^ (xorshift_state[3] >> 19U);
}
static inline double drand(void)
{
return ((double)xorshift() / 18446744073709551616.0)
+ ((double)xorshift() / 4294967296.0);
}
static int compare_ints(const void *a, const void *b)
{
return *(const int *)a - *(const int *)b;
}
static int compare_fastf(const void *a, const void *b)
{
const fastf_t d = *(const fastf_t *)a - *(const fastf_t *)b;
if (d < 0.0)
return -1;
else if (d > 0.0)
return +1;
else
return 0;
}
void sort_ints(int *const array, const size_t count)
{
qsort(array, count, sizeof (int), compare_ints);
}
void sort_fastf(fastf_t *const array, const size_t count)
{
qsort(array, count, sizeof (fastf_t), compare_fastf);
}
int main(void)
{
uint64_t cpu_started, cpu_stopped;
int *my_cycles, *bn_cycles;
fastf_t *rel_error;
mat_t my, bn;
fastf_t my_det, bn_det;
size_t i, k;
/* Allocate arrays to hold the cycle counts. */
my_cycles = malloc(COUNT * sizeof *my_cycles);
bn_cycles = malloc(COUNT * sizeof *bn_cycles);
rel_error = malloc(COUNT * sizeof *rel_error);
if (!my_cycles || !bn_cycles || !rel_error) {
fprintf(stderr, "%s.\n", strerror(ENOMEM));
return 1;
}
for (i = 0; i < COUNT; i++) {
/* Fill the matrix with random values. */
for (k = 0; k < 16; k++)
my[k] = bn[k] = drand() * 65536.0;
/* Measure bn result */
cpu_started = cpu_cycles();
bn_det = bn_mat_determinant(my);
cpu_stopped = cpu_cycles();
bn_cycles[i] = cpu_stopped - cpu_started;
/* Measure my result. */
cpu_started = cpu_cycles();
my_det = my_mat_determinant(my);
cpu_stopped = cpu_cycles();
my_cycles[i] = cpu_stopped - cpu_started;
/* Store the relative error in the results. */
if (bn_det != 0.0)
rel_error[i] = (my_det - bn_det) / bn_det;
else
rel_error[i] = my_det;
}
sort_ints(my_cycles, COUNT);
sort_ints(bn_cycles, COUNT);
sort_fastf(rel_error, COUNT);
printf("My implementation:\n");
printf("\t%9d cycles minimum\n", my_cycles[0]);
printf("\t%9d cycles median\n", my_cycles[COUNT/2]);
printf("libbn implementation:\n");
printf("\t%9d cycles minimum\n", bn_cycles[0]);
printf("\t%9d cycles median\n", bn_cycles[COUNT/2]);
printf("Results (relative differences of calculated determinants):\n");
printf("\t%.16g minimum\n", rel_error[0]);
printf("\t%.16g median\n", rel_error[COUNT/2]);
printf("\t%.16g maximum\n", rel_error[COUNT-1]);
return 0;
}
/**
*Functions.c
*Contains the function definitions used in test.
*/
#include "functions.h"
#if defined(vax)
/* DEC VAX "D" format, the most restrictive */
# define MAX_FASTF 1.0e37 /* Very close to the largest number */
# define SQRT_MAX_FASTF 1.0e18 /* This squared just avoids overflow */
# define SMALL_FASTF 1.0e-37 /* Anything smaller is zero */
# define SQRT_SMALL_FASTF 1.0e-18 /* This squared gives zero */
#else
/* IBM format, being the next most restrictive format */
# define MAX_FASTF 1.0e73 /* Very close to the largest number */
# define SQRT_MAX_FASTF 1.0e36 /* This squared just avoids overflow */
# define SMALL_FASTF 1.0e-77 /* Anything smaller is zero */
# if defined(aux)
# define SQRT_SMALL_FASTF 1.0e-40 /* _doprnt error in libc */
# else
# define SQRT_SMALL_FASTF 1.0e-39 /* This squared gives zero */
# endif
#endif
/**
* Return truthfully whether a value is within a specified epsilon
* distance from zero.
*/
#define NEAR_ZERO(val, epsilon) (((val) > -epsilon) && ((val) < epsilon))
/**
* Return truthfully whether a value is within a minimum
* representation tolerance from zero.
*
* Use not recommended due to compilation-variant tolerance.
*/
#define ZERO(_a) NEAR_ZERO((_a), SMALL_FASTF)
/** @brief Copy a matrix. */
#define MAT_COPY(d, s) do { \
(d)[0] = (s)[0]; \
(d)[1] = (s)[1]; \
(d)[2] = (s)[2]; \
(d)[3] = (s)[3]; \
(d)[4] = (s)[4]; \
(d)[5] = (s)[5]; \
(d)[6] = (s)[6]; \
(d)[7] = (s)[7]; \
(d)[8] = (s)[8]; \
(d)[9] = (s)[9]; \
(d)[10] = (s)[10]; \
(d)[11] = (s)[11]; \
(d)[12] = (s)[12]; \
(d)[13] = (s)[13]; \
(d)[14] = (s)[14]; \
(d)[15] = (s)[15]; \
} while (0)
/**
* MY _ M A T _ D E T E R M I N A N T
*
* Calculates the determinant of the 4X4 matrix
* (This implementation requires only 34 multiplications
* and 20 additions or substractions, saving 6 multiplications
* and 3 additions or substractions compared to the previous one.)
*/
fastf_t
my_mat_determinant(const mat_t input)
{
const fastf_t tmp1 = input[12] * input[ 9] - input[13] * input[ 8];
const fastf_t tmp2 = input[14] * input[ 8] - input[10] * input[12];
const fastf_t tmp3 = input[14] * input[ 9] - input[10] * input[13];
return ( tmp3 * input[ 0] - tmp2 * input[ 1] - tmp1 * input[ 2] ) * input[ 7]
- ( tmp3 * input[ 4] - tmp2 * input[ 5] - tmp1 * input[ 6] ) * input[ 3]
+ ( ( input[14] * input[ 1] - input[13] * input[ 2] ) * input[ 4]
+ ( input[12] * input[ 2] - input[ 0] * input[14] ) * input[ 5]
+ ( input[ 0] * input[13] - input[12] * input[ 1] ) * input[ 6] ) * input[11]
+ ( ( input[ 2] * input[ 9] - input[ 1] * input[10] ) * input[ 4]
+ ( input[ 0] * input[10] - input[ 8] * input[ 2] ) * input[ 5]
+ ( input[ 8] * input[ 1] - input[ 0] * input[ 9] ) * input[ 6] ) * input[15];
}
fastf_t
bn_mat_determinant(const mat_t m)
{
fastf_t det[4];
fastf_t sum;
det[0] = m[5] * (m[10]*m[15] - m[11]*m[14])
-m[6] * (m[ 9]*m[15] - m[11]*m[13])
+m[7] * (m[ 9]*m[14] - m[10]*m[13]);
det[1] = m[4] * (m[10]*m[15] - m[11]*m[14])
-m[6] * (m[ 8]*m[15] - m[11]*m[12])
+m[7] * (m[ 8]*m[14] - m[10]*m[12]);
det[2] = m[4] * (m[ 9]*m[15] - m[11]*m[13])
-m[5] * (m[ 8]*m[15] - m[11]*m[12])
+m[7] * (m[ 8]*m[13] - m[ 9]*m[12]);
det[3] = m[4] * (m[ 9]*m[14] - m[10]*m[13])
-m[5] * (m[ 8]*m[14] - m[10]*m[12])
+m[6] * (m[ 8]*m[13] - m[ 9]*m[12]);
sum = m[0]*det[0] - m[1]*det[1] + m[2]*det[2] - m[3]*det[3];
return sum;
}
int
bn_mat_inverse(register mat_t output, const mat_t input)
{
register int i, j; /* Indices */
int k; /* Indices */
int z[4]; /* Temporary */
fastf_t b[4]; /* Temporary */
fastf_t c[4]; /* Temporary */
MAT_COPY(output, input); /* Duplicate */
/* Initialization */
for (j = 0; j < 4; j++)
z[j] = j;
/* Main Loop */
for (i = 0; i < 4; i++) {
register fastf_t y; /* local temporary */
k = i;
y = output[i*4+i];
for (j = i+1; j < 4; j++) {
register fastf_t w; /* local temporary */
w = output[i*4+j];
if (fabs(w) > fabs(y)) {
k = j;
y = w;
}
}
if (ZERO(y)) {
/* SINGULAR */
return 0;
}
y = 1.0 / y;
for (j = 0; j < 4; j++) {
register fastf_t temp; /* Local */
c[j] = output[j*4+k];
output[j*4+k] = output[j*4+i];
output[j*4+i] = - c[j] * y;
temp = output[i*4+j] * y;
b[j] = temp;
output[i*4+j] = temp;
}
output[i*4+i] = y;
j = z[i];
z[i] = z[k];
z[k] = j;
for (k = 0; k < 4; k++) {
if (k == i)
continue;
for (j = 0; j < 4; j++) {
if (j == i)
continue;
output[k*4+j] = output[k*4+j] - b[j] * c[k];
}
}
}
/* Second Loop */
for (i = 0; i < 4; i++) {
while ((k = z[i]) != i) {
int p; /* Local temp */
for (j = 0; j < 4; j++) {
register fastf_t w; /* Local temp */
w = output[i*4+j];
output[i*4+j] = output[k*4+j];
output[k*4+j] = w;
}
p = z[i];
z[i] = z[k];
z[k] = p;
}
}
return 1;
}
/**
* MY _ M A T _ I N V E R S E
*
* The matrix pointed at by "input" is inverted and stored in the area
* pointed at by "output".
*
* Invert a 4-by-4 matrix using direct computation.
* Uses 100 multiplications and 61 additions or substractions, total.
*
* @return 1 if OK.
* @return 0 if matrix is singular.
*/
int
my_mat_inverse(register mat_t output, const mat_t input)
{
const fastf_t tmp01 = input[ 8] * input[ 2] - input[ 0] * input[10],
tmp02 = input[12] * input[ 1] - input[ 0] * input[13],
tmp03 = input[ 0] * input[15] - input[12] * input[ 3],
tmp04 = input[14] * input[ 1] - input[13] * input[ 2],
tmp05 = input[15] * input[ 1] - input[13] * input[ 3],
tmp06 = input[14] * input[ 3] - input[15] * input[ 2],
tmp07 = input[ 8] * input[ 1] - input[ 0] * input[ 9],
tmp08 = input[ 8] * input[ 3] - input[ 0] * input[11],
tmp09 = input[ 0] * input[14] - input[12] * input[ 2],
tmp10 = input[ 2] * input[ 9] - input[ 1] * input[10],
tmp11 = input[ 3] * input[ 9] - input[ 1] * input[11],
tmp12 = input[ 3] * input[10] - input[ 2] * input[11],
tmp13 = input[12] * input[ 9] - input[13] * input[ 8],
tmp14 = input[12] * input[10] - input[14] * input[ 8],
tmp15 = input[12] * input[11] - input[15] * input[ 8],
tmp16 = input[13] * input[10] - input[14] * input[ 9],
tmp17 = input[13] * input[11] - input[15] * input[ 9],
tmp18 = input[14] * input[11] - input[15] * input[10];
const fastf_t det = ( input[4] * ( tmp18 * input[1]
- tmp17 * input[2]
+ tmp16 * input[3] )
+ input[5] * ( tmp15 * input[2]
- tmp18 * input[0]
- tmp14 * input[3] )
+ input[6] * ( tmp17 * input[0]
- tmp15 * input[1]
+ tmp13 * input[3] )
+ input[7] * ( tmp14 * input[1]
- tmp16 * input[0]
- tmp13 * input[2] ) );
if (ZERO(det))
return 0;
output[ 0] = (tmp17 * input[6] - tmp18 * input[5] - tmp16 * input[7]) / det;
output[ 1] = (tmp18 * input[1] - tmp17 * input[2] + tmp16 * input[3]) / det;
output[ 2] = (tmp06 * input[5] - tmp04 * input[7] + tmp05 * input[6]) / det;
output[ 3] = (tmp11 * input[6] - tmp12 * input[5] - tmp10 * input[7]) / det;
output[ 4] = (tmp18 * input[4] - tmp15 * input[6] + tmp14 * input[7]) / det;
output[ 5] = (tmp15 * input[2] - tmp18 * input[0] - tmp14 * input[3]) / det;
output[ 6] = (tmp09 * input[7] - tmp06 * input[4] - tmp03 * input[6]) / det;
output[ 7] = (tmp12 * input[4] - tmp08 * input[6] + tmp01 * input[7]) / det;
output[ 8] = (tmp15 * input[5] - tmp17 * input[4] - tmp13 * input[7]) / det;
output[ 9] = (tmp17 * input[0] - tmp15 * input[1] + tmp13 * input[3]) / det;
output[10] = (tmp03 * input[5] - tmp05 * input[4] + tmp02 * input[7]) / det;
output[11] = (tmp08 * input[5] - tmp11 * input[4] - tmp07 * input[7]) / det;
output[12] = (tmp16 * input[4] - tmp14 * input[5] + tmp13 * input[6]) / det;
output[13] = (tmp14 * input[1] - tmp16 * input[0] - tmp13 * input[2]) / det;
output[14] = (tmp04 * input[4] - tmp09 * input[5] - tmp02 * input[6]) / det;
output[15] = (tmp10 * input[4] - tmp01 * input[5] + tmp07 * input[6]) / det;
return 1;
}
/** *Functions.h *The prototypes for the functions to be used in the test. Actual functions in functions.c */ #ifndef FUNCTIONS_H #define FUNCTIONS_H typedef double fastf_t; typedef fastf_t mat_t[16]; fastf_t my_mat_determinant(const mat_t m); fastf_t bn_mat_determinant(const mat_t m); int my_mat_inverse(mat_t output, const mat_t input); int bn_mat_inverse(register mat_t output, const mat_t input); #endif /* FUNCTIONS_H */
/**
*inverse.c
*The benchmark code for the matrix inversion is in inverse.c:
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <math.h>
#include "functions.h"
#define COUNT 100000
typedef struct {
fastf_t x;
fastf_t y;
fastf_t z;
} vec_t;
/* CPU cycle counter -- available on x86 and x86-64 architectures.
*/
static inline uint64_t cpu_cycles(void)
{
register unsigned int hi, lo;
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return ((uint64_t)lo) | ((uint64_t)hi << 32U);
}
/* Xorshift pseudorandom number generator.
*/
static uint32_t xorshift_state[4] = { 123456789U, 362436069U, 521288629U, 88675123U };
static inline uint32_t xorshift(void)
{
uint32_t temp;
temp = xorshift_state[0] ^ (xorshift_state[0] << 11U);
xorshift_state[0] = xorshift_state[1];
xorshift_state[1] = xorshift_state[2];
xorshift_state[2] = xorshift_state[3];
return xorshift_state[3] ^= (temp >> 8U) ^ temp ^ (xorshift_state[3] >> 19U);
}
static inline double drand(void)
{
return ((double)xorshift() / 18446744073709551616.0)
+ ((double)xorshift() / 4294967296.0);
}
static inline vec_t vrand(void)
{
return (vec_t){ drand(), drand(), drand() };
}
static inline vec_t vsub(const vec_t a, const vec_t b)
{
return (vec_t){ a.x - b.x, a.y - b.y, a.z - b.z };
}
static inline vec_t vnormalize(const vec_t v)
{
const fastf_t s = sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
if (s > 0.0)
return (vec_t){ v.x / s, v.y / s, v.z / s };
else
return (vec_t){ 0.0, 0.0, 0.0 };
}
static inline fastf_t vdot(const vec_t a, const vec_t b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
static inline vec_t vscale(const vec_t a, const fastf_t s)
{
return (vec_t){ a.x * s, a.y * s, a.z * s };
}
static inline vec_t vcross(const vec_t a, const vec_t b)
{
return (vec_t){ a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x };
}
static int compare_ints(const void *a, const void *b)
{
return *(const int *)a - *(const int *)b;
}
void sort_ints(int *const array, const size_t count)
{
qsort(array, count, sizeof (int), compare_ints);
}
int main(void)
{
uint64_t cpu_started, cpu_stopped;
int *my_cycles, *bn_cycles;
mat_t my, bn, my_out, bn_out;
vec_t x, y, z, t;
size_t i = 0;
/* Allocate arrays to hold the cycle counts. */
my_cycles = malloc(COUNT * sizeof *my_cycles);
bn_cycles = malloc(COUNT * sizeof *bn_cycles);
if (!my_cycles || !bn_cycles) {
fprintf(stderr, "%s.\n", strerror(ENOMEM));
return 1;
}
while (i < COUNT) {
/* Generate a random transformation matrix. */
x = vnormalize(vrand());
y = vrand();
y = vnormalize(vsub(y, vscale(y, vdot(x, y))));
z = vcross(x, y);
t = vrand();
my[ 0] = bn[ 0] = x.x;
my[ 1] = bn[ 1] = y.x;
my[ 2] = bn[ 2] = z.x;
my[ 3] = bn[ 3] = t.x;
my[ 4] = bn[ 4] = x.y;
my[ 5] = bn[ 5] = y.y;
my[ 6] = bn[ 6] = z.y;
my[ 7] = bn[ 7] = t.y;
my[ 8] = bn[ 8] = x.z;
my[ 9] = bn[ 9] = y.z;
my[10] = bn[10] = z.z;
my[11] = bn[11] = t.z;
my[12] = bn[12] = 0.0;
my[13] = bn[13] = 0.0;
my[14] = bn[14] = 0.0;
my[15] = bn[15] = 1.0;
/* Ignore matrices that are singular. */
/* Measure bn result */
cpu_started = cpu_cycles();
if (!bn_mat_inverse(bn_out, bn)) continue;
cpu_stopped = cpu_cycles();
bn_cycles[i] = cpu_stopped - cpu_started;
/* Measure my result. */
cpu_started = cpu_cycles();
if (!my_mat_inverse(my_out, my)) continue;
cpu_stopped = cpu_cycles();
my_cycles[i] = cpu_stopped - cpu_started;
/* Next. */
i++;
}
/* Print the last matrix. */
printf("Example:\n");
printf(" My implementation:\n");
printf("\t[ %9.6f %9.6f %9.6g %9.6f ]-1 [ %9.6f %9.6f %9.6g %9.6f ]\n", my[ 0], my[ 1], my[ 2], my[ 3], my_out[ 0], my_out[ 1], my_out[ 2], my_out[ 3]);
printf("\t[ %9.6f %9.6f %9.6g %9.6f ] [ %9.6f %9.6f %9.6g %9.6f ]\n", my[ 4], my[ 5], my[ 6], my[ 7], my_out[ 4], my_out[ 5], my_out[ 6], my_out[ 7]);
printf("\t[ %9.6f %9.6f %9.6g %9.6f ] = [ %9.6f %9.6f %9.6g %9.6f ]\n", my[ 8], my[ 9], my[10], my[11], my_out[ 8], my_out[ 9], my_out[10], my_out[11]);
printf("\t[ %9.6f %9.6f %9.6g %9.6f ] [ %9.6f %9.6f %9.6g %9.6f ]\n", my[12], my[13], my[14], my[15], my_out[12], my_out[13], my_out[14], my_out[15]);
printf(" libbn implementation:\n");
printf("\t[ %9.6f %9.6f %9.6g %9.6f ]-1 [ %9.6f %9.6f %9.6g %9.6f ]\n", bn[ 0], bn[ 1], bn[ 2], bn[ 3], bn_out[ 0], bn_out[ 1], bn_out[ 2], bn_out[ 3]);
printf("\t[ %9.6f %9.6f %9.6g %9.6f ] [ %9.6f %9.6f %9.6g %9.6f ]\n", bn[ 4], bn[ 5], bn[ 6], bn[ 7], bn_out[ 4], bn_out[ 5], bn_out[ 6], bn_out[ 7]);
printf("\t[ %9.6f %9.6f %9.6g %9.6f ] = [ %9.6f %9.6f %9.6g %9.6f ]\n", bn[ 8], bn[ 9], bn[10], bn[11], bn_out[ 8], bn_out[ 9], bn_out[10], bn_out[11]);
printf("\t[ %9.6f %9.6f %9.6g %9.6f ] [ %9.6f %9.6f %9.6g %9.6f ]\n", bn[12], bn[13], bn[14], bn[15], bn_out[12], bn_out[13], bn_out[14], bn_out[15]);
sort_ints(my_cycles, COUNT);
sort_ints(bn_cycles, COUNT);
printf("My implementation:\n");
printf("\t%9d cycles minimum\n", my_cycles[0]);
printf("\t%9d cycles median\n", my_cycles[COUNT/2]);
printf("libbn implementation:\n");
printf("\t%9d cycles minimum\n", bn_cycles[0]);
printf("\t%9d cycles median\n", bn_cycles[COUNT/2]);
return 0;
}
Makefile
Description: Binary data
------------------------------------------------------------------------------ See everything from the browser to the database with AppDynamics Get end-to-end visibility with application monitoring from AppDynamics Isolate bottlenecks and diagnose root cause in seconds. Start your free trial of AppDynamics Pro today! http://pubads.g.doubleclick.net/gampad/clk?id=48808831&iu=/4140/ostg.clktrk
_______________________________________________ BRL-CAD Developer mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/brlcad-devel
