/*
(C) 2000 Arjan van de Ven and others  licensed under the terms of the GPL
Modified by VDA
*/

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* This makes adding/removing test functions easier */
#include "clear_normal.c"
#include "clear_repstosl.c"
#include "clear_movq.c"
#include "clear_movntq.c"

#include "copy_normal.c"
#include "copy_repmovsl.c"
#include "copy_movq.c"
#include "copy_movqpf.c"
#include "copy_movqpf1.c"
#include "copy_movqpf2.c"
#include "copy_movqpf3.c"
#include "copy_movntq.c"
#include "copy_movntq1.c"
#include "copy_movntq11.c"
#include "copy_movntq12.c"
#include "copy_movntq2.c"
#include "copy_movntq3.c"
#include "copy_movntq4.c"


/*
 * Size in bytes of the working buffer the tests walk over.
 *
 * It should be significantly bigger than any CPU cache, but a very big
 * buffer makes the tests slow and raises the chance of an interrupt
 * landing inside the timed window.
 * The author found that 1/4 MB is still affected by the cache while
 * 1/2 MB is not, and settled on 1 MB as a safe bet for his box. YMMV.
 */
const int BUFSIZE = 1 * (1024 * 1024);

/*
 * Number of timed passes per routine. More passes give a better minimum
 * but take longer; pick a value that still runs reasonably fast.
 */
const int TRY_TIMES = 64;

/*
 * Size in bytes of one page, used as the stride through the buffer.
 * PAGESIZE is not tunable!
 */
const int PAGESIZE = 4 * 1024;

/*
 * Reads the CPU time-stamp counter with the x86 "rdtsc" instruction.
 *
 * The instruction delivers the low 32 bits in EAX and the high 32 bits in
 * EDX (bound below through the "=a" and "=d" output constraints).
 *
 * Returns the combined 64-bit counter value as a long long.
 */
static inline long long rdtsc(void)
{
	unsigned int low, high;

	__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high));

	/*
	 * Combine the halves in unsigned arithmetic: left-shifting a signed
	 * long long into its sign bit is undefined behavior, which the old
	 * "(long long)high << 32" form would hit once bit 31 of high is set.
	 */
	return (long long)(((unsigned long long)high << 32) | low);
}

/*
 * Function type of a page-clearing routine under test: it receives a
 * single pointer, which the benchmark advances in PAGESIZE steps.
 */
typedef void clear_func(void *);
/*
 * Function type of a page-copying routine under test: it receives two
 * pointers. The benchmark passes a page from the lower half of the buffer
 * first and the matching page from the upper half second.
 * NOTE(review): which argument is the destination is decided by the copy
 * routines themselves, which are not visible here -- confirm there.
 */
typedef void copy_func(void *,void *);

/*
 * Benchmarks one page-clearing routine and prints its cost per page.
 *
 * func   - routine under test; called once for every PAGESIZE step
 * name   - label printed right-aligned in a 32-column field
 * buffer - working area of at least BUFSIZE bytes; func is applied to
 *          every PAGESIZE-sized piece of it
 *
 * One untimed pass over the buffer is made first, followed by TRY_TIMES
 * timed passes. The longest and shortest pass durations (in rdtsc ticks)
 * are divided by the number of pages per pass and printed.
 */
void test_one_clearpage(clear_func *func, char *name, char *buffer)
{
	const unsigned long long pages = BUFSIZE / PAGESIZE;
	char *const end = buffer + BUFSIZE;
	char *temp;
	int i;
	unsigned long long before, after, min, max;

	// dummy run to ensure coherent mem/cache state
	for (temp = buffer; temp < end; temp += PAGESIZE)
		func(temp);

	// track the fastest and the slowest timed pass
	min = ~0ULL;
	max = 0;
	for (i = 0; i < TRY_TIMES; i++) {
		temp = buffer;
		before = rdtsc();
		while (temp < end) {
			func(temp);
			temp += PAGESIZE;
		}
		after = rdtsc();
		if (before > after) {
			// counter appears to have gone backwards: discard the sample
			printf("timer overflow\n");
			continue;
		}
		after -= before;
		if (min > after)
			min = after;
		if (max < after)
			max = after;
	}

	// the values are unsigned long long, so the conversion must be %llu
	printf("%32s - took %5llu max,%5llu min cycles per page\n",
		name,
		max / pages,
		min / pages
	);
}

/*
 * Benchmarks one page-copying routine, walking the buffer upwards, and
 * prints its cost per page.
 *
 * func   - routine under test; called as func(page, page + BUFSIZE/2)
 * name   - label printed right-aligned in a 32-column field
 * buffer - working area of at least BUFSIZE bytes; pages in its lower
 *          half are paired with the pages at the same offset in its
 *          upper half
 *
 * One untimed pass is made first, followed by TRY_TIMES timed passes.
 * The longest and shortest pass durations (in rdtsc ticks) are divided
 * by the number of page pairs per pass and printed.
 */
void test_one_copypage(copy_func *func, char *name, char *buffer)
{
	const int half = BUFSIZE / 2;
	const unsigned long long pages = half / PAGESIZE;
	char *const end = buffer + half;
	char *temp;
	int i;
	unsigned long long before, after, min, max;

	// dummy run to ensure coherent mem/cache state
	for (temp = buffer; temp < end; temp += PAGESIZE)
		func(temp, temp + half);

	// track the fastest and the slowest timed pass
	min = ~0ULL;
	max = 0;
	for (i = 0; i < TRY_TIMES; i++) {
		temp = buffer;
		before = rdtsc();
		while (temp < end) {
			func(temp, temp + half);
			temp += PAGESIZE;
		}
		after = rdtsc();
		if (before > after) {
			// counter appears to have gone backwards: discard the sample
			printf("timer overflow\n");
			continue;
		}
		after -= before;
		if (min > after)
			min = after;
		if (max < after)
			max = after;
	}

	// the values are unsigned long long, so the conversion must be %llu
	printf("%32s - took %5llu max,%5llu min cycles per page\n",
		name,
		max / pages,
		min / pages
	);
}

/*
 * Benchmarks one page-copying routine, walking the buffer downwards
 * (highest page pair first), and prints its cost per page.
 *
 * func   - routine under test; called as func(page, page + BUFSIZE/2)
 * name   - label printed right-aligned in a 32-column field
 * buffer - working area of at least BUFSIZE bytes; pages in its lower
 *          half are paired with the pages at the same offset in its
 *          upper half
 *
 * One untimed pass is made first, followed by TRY_TIMES timed passes.
 * The longest and shortest pass durations (in rdtsc ticks) are divided
 * by the number of page pairs per pass and printed.
 */
void test_one_copypage_r(copy_func *func, char *name, char *buffer)
{
	const int half = BUFSIZE / 2;
	const unsigned long long pages = half / PAGESIZE;
	char *temp;
	int i;
	unsigned long long before, after, min, max;

	/*
	 * The walk steps the pointer down *before* using it and stops while a
	 * whole page still fits above buffer. That visits the same pages in
	 * the same order as starting at buffer+half-PAGESIZE, but never forms
	 * a pointer below the start of the buffer (which would be undefined).
	 */

	// dummy run to ensure coherent mem/cache state
	temp = buffer + half;
	while (temp - buffer >= PAGESIZE) {
		temp -= PAGESIZE;
		func(temp, temp + half);
	}

	// track the fastest and the slowest timed pass
	min = ~0ULL;
	max = 0;
	for (i = 0; i < TRY_TIMES; i++) {
		temp = buffer + half;
		before = rdtsc();
		while (temp - buffer >= PAGESIZE) {
			temp -= PAGESIZE;
			func(temp, temp + half);
		}
		after = rdtsc();
		if (before > after) {
			// counter appears to have gone backwards: discard the sample
			printf("timer overflow\n");
			continue;
		}
		after -= before;
		if (min > after)
			min = after;
		if (max < after)
			max = after;
	}

	// the values are unsigned long long, so the conversion must be %llu
	printf("%32s - took %5llu max,%5llu min cycles per page\n",
		name,
		max / pages,
		min / pages
	);
}


/*
 * Runs the page-clear benchmark for every clear routine, in a fixed
 * order, printing a heading followed by one result line per routine.
 *
 * buffer - working area handed unchanged to test_one_clearpage()
 *
 * To add or remove a routine, edit the table below.
 */
void test_clearpage(char *buffer)
{
	/*
	 * Routine/label pairs. normal_clear_page is deliberately listed twice,
	 * so it is measured twice; presumably the first measurement acts as an
	 * extra warm-up -- confirm before removing the duplicate.
	 */
	static const struct {
		void (*routine)(void *);
		char *label;
	} cases[] = {
		{ normal_clear_page,   "normal_clear_page"   },
		{ normal_clear_page,   "normal_clear_page"   },
		{ repstosl_clear_page, "repstosl_clear_page" },
		{ movq_clear_page,     "movq_clear_page"     },
		{ movntq_clear_page,   "movntq_clear_page"   },
	};
	size_t idx;

	printf("clear_page() tests:\n");

	for (idx = 0; idx < sizeof cases / sizeof cases[0]; idx++)
		test_one_clearpage(cases[idx].routine, cases[idx].label, buffer);
}

/*
 * Runs the forward-walk page-copy benchmark for every copy routine, in a
 * fixed order, printing a heading followed by one result line per routine.
 *
 * buffer - working area handed unchanged to test_one_copypage()
 *
 * To add or remove a routine, edit the table below.
 */
void test_copypage(char *buffer)
{
	/*
	 * Routine/label pairs. normal_copy_page is deliberately listed twice,
	 * so it is measured twice; presumably the first measurement acts as an
	 * extra warm-up -- confirm before removing the duplicate.
	 */
	static const struct {
		void (*routine)(void *, void *);
		char *label;
	} cases[] = {
		{ normal_copy_page,   "normal_copy_page"   },
		{ normal_copy_page,   "normal_copy_page"   },
		{ repmovsl_copy_page, "repmovsl_copy_page" },
		{ movq_copy_page,     "movq_copy_page"     },
		{ movqpf_copy_page,   "movqpf_copy_page"   },
		{ movqpf1_copy_page,  "movqpf1_copy_page"  },
		{ movqpf2_copy_page,  "movqpf2_copy_page"  },
		{ movqpf3_copy_page,  "movqpf3_copy_page"  },
		{ movntq_copy_page,   "movntq_copy_page"   },
		{ movntq1_copy_page,  "movntq1_copy_page"  },
		{ movntq11_copy_page, "movntq11_copy_page" },
		{ movntq12_copy_page, "movntq12_copy_page" },
		{ movntq2_copy_page,  "movntq2_copy_page"  },
		{ movntq3_copy_page,  "movntq3_copy_page"  },
		{ movntq4_copy_page,  "movntq4_copy_page"  },
	};
	size_t idx;

	printf("copy_page() tests:\n");

	for (idx = 0; idx < sizeof cases / sizeof cases[0]; idx++)
		test_one_copypage(cases[idx].routine, cases[idx].label, buffer);
}

/*
 * Runs the reverse-walk page-copy benchmark for every copy routine, in a
 * fixed order, printing a heading followed by one result line per routine.
 *
 * buffer - working area handed unchanged to test_one_copypage_r()
 */
void test_copypage_r(char *buffer)
{
	printf("copy_page() tests (reversed walk order):\n");

/* Stringifies the routine name so the printed label always matches it. */
#define	TEST_COPY_R(a) test_one_copypage_r(a,#a,buffer)

	/*
	 * normal_copy_page is deliberately listed twice, so it is measured
	 * twice; presumably the first measurement acts as an extra warm-up --
	 * confirm before removing the duplicate.
	 */
	TEST_COPY_R(normal_copy_page	);
	TEST_COPY_R(normal_copy_page	);
	TEST_COPY_R(repmovsl_copy_page	);
	TEST_COPY_R(movq_copy_page	);
	TEST_COPY_R(movqpf_copy_page	);
	TEST_COPY_R(movqpf1_copy_page	);
	TEST_COPY_R(movqpf2_copy_page	);
	TEST_COPY_R(movqpf3_copy_page	);
	TEST_COPY_R(movntq_copy_page	);
	TEST_COPY_R(movntq1_copy_page	);
	TEST_COPY_R(movntq11_copy_page	);
	TEST_COPY_R(movntq12_copy_page	);
	TEST_COPY_R(movntq2_copy_page	);
	TEST_COPY_R(movntq3_copy_page	);
	TEST_COPY_R(movntq4_copy_page	);
	/* Add further routines here with TEST_COPY_R(name). */

/* Undefine the macro actually defined above so it does not leak out. */
#undef TEST_COPY_R

}

int main()
{
	char *buffer;
	printf("Page clear/copy benchmark program.\n"
		"buffer size: %i Mb\n"
		"Each test tried %i times, max and min CPU cycles per page are reported.\n"
		"Please disregard max values. They are due to system interference only.\n",
		BUFSIZE/1024/1024,
		TRY_TIMES
	);
	buffer = malloc(BUFSIZE);
	if(!buffer) {
		printf("Malloc failed.\n");
		exit(1);
	}

	test_clearpage(buffer);
	printf("\n");
	test_copypage(buffer);
	printf("\n");
	test_copypage_r(buffer);

	printf("Done.\n");
	free(buffer);
	return 0;
}
