Rewrite memset/memcpy using inline assembly code. It makes the code
run much faster if memcpy runs in ROM.

Signed-off-by: Zheng Bao <zheng.bao@amd.com>

Index: src/lib/memcpy.c
===================================================================
--- src/lib/memcpy.c	(revision 5133)
+++ src/lib/memcpy.c	(working copy)
@@ -3,10 +3,14 @@
 {
 	const char *src = vsrc;
 	char *dest = vdest;
-	int i;
 
-	for (i = 0; i < (int)bytes; i++)
-		dest[i] = src[i];
+	__asm__ __volatile__ (
+		"cld\n\t"
+		"rep\n\t"
+		"movsb"
+		: "+S"(src), "+D"(dest), "+c"(bytes)	/* rep movsb modifies all three */
+		: /* no input-only operands */
+		: "memory", "cc");	/* copies through the pointers; cld touches EFLAGS */
 
 	return vdest;
 }
Index: src/lib/memset.c
===================================================================
--- src/lib/memset.c	(revision 5133)
+++ src/lib/memset.c	(working copy)
@@ -2,11 +2,15 @@
 
 void *memset(void *s, int c, size_t n)
 {
-	int i;
 	char *ss = (char *) s;
 
-	for (i = 0; i < (int)n; i++)
-		ss[i] = c;
+	__asm__ __volatile__ (
+		"cld\n\t"
+		"rep\n\t"
+		"stosb"
+		: "+D"(ss), "+c"(n)	/* rep stosb modifies both registers */
+		: "a"(c)		/* only the low byte (AL) is stored */
+		: "memory", "cc");	/* stores through ss; cld touches EFLAGS */
 
 	return s;
 }
