/* Athlon improved version */

// Note: this version prefetches data past end of page -
// this is not good.
// Results for this routine are skewed by the fact we are
// doing tests on successive pages, extra prefetch past end
// of page helps in this case. See 'reverse walk order' tests!
// Real world usage is most likely random pages

// Note: 6 iterations prefetch is too deep for my CPU
// (your mileage may vary)
/*
 * Copy one 4096-byte page from 'from' to 'to'.
 *
 * Data is loaded 64 bytes at a time into the eight MMX registers and
 * written back with non-temporal movntq stores.  The source is
 * prefetched with prefetchnta 320 bytes (five 64-byte chunks) ahead of
 * the chunk being copied.
 *
 * to:   destination buffer, at least 4096 bytes.
 * from: source buffer, at least 4096 bytes.
 *
 * Notes:
 *  - During the last five iterations the prefetch address lies beyond
 *    the end of the source page.  Only prefetch hints touch that
 *    memory; no load or store goes past the 4096 bytes.
 *  - The MMX registers alias the x87 register file, so the complete
 *    x87 state is saved with fsave before the copy and reloaded with
 *    frstor afterwards.
 */
static void movntq_copy_page(void *to, void *from)
{
	enum { MOVNTQ_PAGE_BYTES = 4096, MOVNTQ_CHUNK_BYTES = 64 };

	/* Byte cursors: arithmetic on void * is a GNU extension. */
	const char *src = from;
	char *dst = to;
	char fpu_save[108];	/* fsave/frstor image, 108 bytes in 32-bit format */
	int i;

	// Prefetch data for 5 first iterations
	__asm__ __volatile__ (
		"   prefetchnta (%0)\n"
		"   prefetchnta 64(%0)\n"
		"   prefetchnta 128(%0)\n"
		"   prefetchnta 192(%0)\n"
		"   prefetchnta 256(%0)\n"
		: : "r" (src) );

	/*
	 * fsave stores into the buffer, so the whole array is declared as
	 * an output operand; otherwise the compiler is not told that the
	 * 108 bytes are written by this statement.
	 */
	__asm__ __volatile__ ( " fsave %0; fwait\n" : "=m" (fpu_save) );

	// Each iteration copies 64 bytes
	// Prefetch is for 6th iteration
	for (i = 0; i < MOVNTQ_PAGE_BYTES / MOVNTQ_CHUNK_BYTES; i++) {
		__asm__ __volatile__ (
		"   prefetchnta 320(%0)\n"
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq 32(%0), %%mm4\n"
		"   movq 40(%0), %%mm5\n"
		"   movq 48(%0), %%mm6\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm0, (%1)\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (src), "r" (dst) : "memory");
		src += MOVNTQ_CHUNK_BYTES;
		dst += MOVNTQ_CHUNK_BYTES;
	}

	/*
	 * Order the non-temporal stores before anything that follows.  The
	 * "memory" clobber keeps the compiler from moving memory accesses
	 * across the fence.
	 */
	__asm__ __volatile__ ( " sfence\n" : : : "memory" );

	/* Reload the saved x87 state; this also discards the MMX contents. */
	__asm__ __volatile__ ( " frstor %0;\n" : : "m" (fpu_save) );
}
