/* Athlon improved version */

// Load/store reordered
// This is most strange:
//   reordering in the second for() loop doesn't do any good!
/*
 * Copy one 4096-byte page from `from` to `to` using MMX loads (movq) and
 * non-temporal stores (movntq), prefetching the source 256 bytes ahead.
 *
 * The x87/MMX state is saved with fsave before the MMX registers are used
 * and put back with frstor at the end, so the MMX traffic in here is not
 * visible to the caller.
 *
 * to:   destination, at least 4096 writable bytes.
 * from: source, at least 4096 readable bytes.
 *
 * NOTE(review): presumably both pointers are page-aligned; nothing in this
 * function checks or enforces that.
 */
static void movntq3_copy_page(void *to, void *from)
{
	/*
	 * Byte-typed cursors: arithmetic on void * is only a GNU extension,
	 * so the copy walks the pages through unsigned char pointers.
	 */
	const unsigned char *src = from;
	unsigned char *dst = to;
	int i;
	char fpu_save[108];	/* save area filled by fsave, consumed by frstor */

#define PF "prefetchnta "

	// Prefetch data for 4 first iterations
	__asm__ __volatile__ (
		PF " (%0)\n"
		PF " 64(%0)\n"
		PF " 128(%0)\n"
		PF " 192(%0)\n"
		: : "r" (src) );

	/*
	 * fsave stores into the save area, so the whole array is declared as
	 * a memory OUTPUT; naming a single byte as an input would hide the
	 * write from the compiler.
	 */
	__asm__ __volatile__ ( " fsave %0; fwait\n" : "=m" (fpu_save) );

	// Each iteration copies 64 bytes
	// Prefetch is for 5th iteration
	// (stops 256 bytes early so the prefetch never reaches past the page)
	for (i = 0; i < (4096 - 256) / 64; i++) {
		__asm__ __volatile__ (
		PF " 256(%0)\n"
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm0, (%1)\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movq 40(%0), %%mm5\n"
		"   movq 48(%0), %%mm6\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
	}

	// Last 256 bytes: same 64-byte copy, but nothing left to prefetch.
	// All eight loads are issued before the eight stores here.
	for (; i < 4096 / 64; i++) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq 32(%0), %%mm4\n"
		"   movq 40(%0), %%mm5\n"
		"   movq 48(%0), %%mm6\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm0, (%1)\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
	}

	/*
	 * Fence the non-temporal stores before returning.  The "memory"
	 * clobber additionally keeps the compiler from moving memory
	 * accesses across the fence.
	 */
	__asm__ __volatile__ ( " sfence\n" : : : "memory" );

	// Restore the x87/MMX state captured by fsave above.
	__asm__ __volatile__ ( " frstor %0;\n" : : "m" (fpu_save) );
}
