#define USE_SSE_VECTORIZE 1
#include <stdio.h>
#include <time.h>
#include "sse.h"

/*
 * Lookup table for the four inputs of an ordinary point, indexed by the
 * board value n for that point (0..15).  Row n holds
 *   { n == 1, n == 2, n >= 3, n > 3 ? (n - 3) / 2 : 0 }
 * which is what baseInputsOrg() computes arithmetically for points 0..23.
 */
float inpvec[16][4] = {
	{ 0.0f, 0.0f, 0.0f, 0.0f },	/* n =  0 */
	{ 1.0f, 0.0f, 0.0f, 0.0f },	/* n =  1 */
	{ 0.0f, 1.0f, 0.0f, 0.0f },	/* n =  2 */
	{ 0.0f, 0.0f, 1.0f, 0.0f },	/* n =  3 */
	{ 0.0f, 0.0f, 1.0f, 0.5f },	/* n =  4 */
	{ 0.0f, 0.0f, 1.0f, 1.0f },	/* n =  5 */
	{ 0.0f, 0.0f, 1.0f, 1.5f },	/* n =  6 */
	{ 0.0f, 0.0f, 1.0f, 2.0f },	/* n =  7 */
	{ 0.0f, 0.0f, 1.0f, 2.5f },	/* n =  8 */
	{ 0.0f, 0.0f, 1.0f, 3.0f },	/* n =  9 */
	{ 0.0f, 0.0f, 1.0f, 3.5f },	/* n = 10 */
	{ 0.0f, 0.0f, 1.0f, 4.0f },	/* n = 11 */
	{ 0.0f, 0.0f, 1.0f, 4.5f },	/* n = 12 */
	{ 0.0f, 0.0f, 1.0f, 5.0f },	/* n = 13 */
	{ 0.0f, 0.0f, 1.0f, 5.5f },	/* n = 14 */
	{ 0.0f, 0.0f, 1.0f, 6.0f }	/* n = 15 */
};

/*
 * Lookup table for the four inputs of the bar, indexed by the board value
 * n for the bar entry (0..15).  Row n holds
 *   { n >= 1, n >= 2, n >= 3, n > 3 ? (n - 3) / 2 : 0 }
 * which is what baseInputsOrg() computes arithmetically for entry 24.
 * Unlike inpvec, the first two columns are cumulative (>=) rather than
 * exact matches.
 */
float inpvecb[16][4] = {
	{ 0.0f, 0.0f, 0.0f, 0.0f },	/* n =  0 */
	{ 1.0f, 0.0f, 0.0f, 0.0f },	/* n =  1 */
	{ 1.0f, 1.0f, 0.0f, 0.0f },	/* n =  2 */
	{ 1.0f, 1.0f, 1.0f, 0.0f },	/* n =  3 */
	{ 1.0f, 1.0f, 1.0f, 0.5f },	/* n =  4 */
	{ 1.0f, 1.0f, 1.0f, 1.0f },	/* n =  5 */
	{ 1.0f, 1.0f, 1.0f, 1.5f },	/* n =  6 */
	{ 1.0f, 1.0f, 1.0f, 2.0f },	/* n =  7 */
	{ 1.0f, 1.0f, 1.0f, 2.5f },	/* n =  8 */
	{ 1.0f, 1.0f, 1.0f, 3.0f },	/* n =  9 */
	{ 1.0f, 1.0f, 1.0f, 3.5f },	/* n = 10 */
	{ 1.0f, 1.0f, 1.0f, 4.0f },	/* n = 11 */
	{ 1.0f, 1.0f, 1.0f, 4.5f },	/* n = 12 */
	{ 1.0f, 1.0f, 1.0f, 5.0f },	/* n = 13 */
	{ 1.0f, 1.0f, 1.0f, 5.5f },	/* n = 14 */
	{ 1.0f, 1.0f, 1.0f, 6.0f }	/* n = 15 */
};

/*
 * Fill arInput with the 200 base inputs for both sides of anBoard by table
 * lookup instead of arithmetic.
 *
 * For each of the two rows of anBoard, entries 0..23 each select one
 * 4-float row of inpvec and entry 24 (the bar) selects one 4-float row of
 * inpvecb.  The rows are written back to back, so side j occupies
 * arInput[j * 100 .. j * 100 + 99].
 *
 * Preconditions (not checked here):
 *   - every board entry is in 0..15, because it is used directly as a
 *     table index;
 *   - arInput has room for 200 floats and is 16-byte aligned, because the
 *     rows are written with _mm_store_ps.
 *
 * The table rows are read with _mm_loadu_ps rather than _mm_load_ps: the
 * lookup tables are declared as plain float arrays with no alignment
 * request, so an aligned load is not guaranteed to be valid on them.
 */
extern void
baseInputs(int anBoard[2][25], float arInput[])
{
	float *pInput = arInput;
	int side;
	int i;

	for (side = 0; side < 2; ++side) {
		const int *board = anBoard[side];

		/* points */
		for (i = 0; i < 24; ++i) {
			_mm_store_ps(pInput, _mm_loadu_ps(inpvec[board[i]]));
			pInput += 4;
		}

		/* bar */
		_mm_store_ps(pInput, _mm_loadu_ps(inpvecb[board[24]]));
		pInput += 4;
	}
}

/*
 * Scalar reference encoder: compute the 200 base inputs for both sides of
 * anBoard directly from the board values.
 *
 * Side j writes 25 groups of four floats starting at arInput[j * 100].
 * For entries 0..23 (points) with value n the group is
 *   { n == 1, n == 2, n >= 3, n > 3 ? (n - 3) / 2 : 0 }
 * and for entry 24 (bar) it is
 *   { n >= 1, n >= 2, n >= 3, n > 3 ? (n - 3) / 2 : 0 }.
 */
extern void
baseInputsOrg(int anBoard[2][25], float arInput[])
{
	int side;

	for (side = 0; side < 2; side++) {
		const int *counts = anBoard[side];
		float *out = &arInput[side * 25 * 4];
		int pt;
		int n;

		/* Points: first two inputs flag exactly one / exactly two. */
		for (pt = 0; pt < 24; pt++, out += 4) {
			n = counts[pt];

			out[0] = (n == 1) ? 1.0f : 0.0f;
			out[1] = (n == 2) ? 1.0f : 0.0f;
			out[2] = (n >= 3) ? 1.0f : 0.0f;
			out[3] = (n > 3) ? (float) ((n - 3) / 2.0) : 0.0f;
		}

		/* Bar: out now points at group 24; first two inputs are cumulative. */
		n = counts[24];

		out[0] = (n >= 1) ? 1.0f : 0.0f;
		out[1] = (n >= 2) ? 1.0f : 0.0f;
		out[2] = (n >= 3) ? 1.0f : 0.0f;
		out[3] = (n > 3) ? (float) ((n - 3) / 2.0) : 0.0f;
	}
}

/*
 * Benchmark and self-check driver.
 *
 * Runs baseInputs() and baseInputsOrg() 100,000,000 times each on the same
 * fixed board, prints the wall-clock seconds taken by each (time() has
 * one-second resolution), then compares the two 200-float outputs element
 * by element.
 *
 * Returns 0 when the outputs are identical and 1 when any element differs.
 */
int main(int argc, char *argv[])
{
	time_t t0, t1;
	int nErrorCount = 0;

	int anBoard[2][25] = {
		{ 0, 0, 0, 0, 0, 4,   0, 4, 0, 0, 0, 0,   5, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 2, 0 },
		{ 0, 0, 0, 0, 0, 5,   0, 3, 0, 0, 0, 0,   5, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 2, 0 }};
	/* baseInputs() writes its output with aligned SSE stores, so both
	 * buffers are declared through the SSE_ALIGN macro from sse.h. */
	SSE_ALIGN(float arInput[ 200 ]);
	SSE_ALIGN(float arInputOrg[ 200 ]);

	long i;

	/* Command-line arguments are not used. */
	(void) argc;
	(void) argv;

	t0 = time(NULL);
	for ( i = 0l; i < 100000000l; i++)
		baseInputs(anBoard, arInput);
	t1 = time(NULL);
	/* difftime() gives the interval portably; the previous "(long) t1 - t0"
	 * cast only t1, so the argument type depended on time_t and need not
	 * have matched %ld. */
	printf("Elapsed time for new baseInputs: %ld\n", (long) difftime(t1, t0));

	t0 = time(NULL);
	for ( i = 0l; i < 100000000l; i++)
		baseInputsOrg(anBoard, arInputOrg);
	t1 = time(NULL);
	printf("Elapsed time for org baseInputs: %ld\n", (long) difftime(t1, t0));

	/* Check: every element of the two encodings must be exactly equal. */
	for (i = 0; i < 200; i++) {
		if (arInput[i] != arInputOrg[i])
			nErrorCount++;
	}

	if (nErrorCount)
		printf("Whoops, there's some errors in the input calculation: %d\n", nErrorCount);

	/* Report a mismatch through the exit status as well as the message. */
	return nErrorCount ? 1 : 0;
}

				

