Carmelo AMOROSO wrote: > Paul Mundt wrote: >> On Sun, Mar 25, 2007 at 09:18:33AM -0400, Mike Frysinger wrote: >> >>> On Wednesday 21 March 2007, Carmelo AMOROSO wrote: >>> >>>> I'm currently using on uClibc-nptl for sh4 an optimized version >>>> of the memcpy function (from Stuart Menefy @STMicroelectronics). >>>> This implementation is based on 'backward copying' >>>> and brakes the current implementation of 'memmove' >>>> (libc/string/generic/memmove.c) >>>> that, as clearly stated, assumes memcpy does a forward copying. >>>> >>>> The attached patch provides a solution for this adding a config option >>>> to specify what kind of memcpy implementation the architecture >>>> provides. >>>> In this way the memmove works with both implementation. >>>> >>> if anything, this option should not be exported for the user to try >>> and figure out ... either the architecture provides it or it doesnt >>> which means it'd be a hardcoded selection in the arch-specific >>> config.in files ... >>> >>> wouldnt it be simpler to provide a superh optimized memmove/memcpy ? >>> then it wouldnt matter what the generic implementations assume ... >>> >> >> It has to be split out separately for sh4, given the movca.l usage. >> > Hi All, > I've updated the previous patch to keep into account both suggestions > made by Mike and Paul. > A brief explanation of the changes follows: > > extra/Configs/Config.in -> set the TARGET_SUBARCH for the sh4 > architecture > extra/Configs/Config.in.sh -> set on the ARCH_HAS_BWD_MEMCPY for > sh4 architecture only > > libc/string/sh/sh4 -> new file memcpy.S (sh4 specific) > libc/string/generic/memmove.c -> use the new macro > __ARCH_HAS_BWD_MEMCPY__ instead of #if 1 > libc/string/generic/memcpy.c -> move static function from C source > to common header file with some reorder > libc/string/generic/memcopy.h -> "" > libc/string/Makefile.in -> add code the manage subarch > specific code in addition to the arch specific one. > > Any comments are welcome. 
> > Cheers, > Carmelo > > Hi Mike, Paul, did you have time to look at this?
If accepted, may reduce a bit the diff from sh4 port and trunk. This code is currently used on nptl/sh4 port. Carmelo > ------------------------------------------------------------------------ > > diff -Naupr uClibc-trunk/extra/Configs/Config.in > uClibc-trunk-st/extra/Configs/Config.in > --- uClibc-trunk/extra/Configs/Config.in 2007-04-24 15:19:31.000000000 > +0200 > +++ uClibc-trunk-st/extra/Configs/Config.in 2007-05-07 10:24:22.045984000 > +0200 > @@ -180,6 +180,7 @@ config TARGET_SUBARCH > string > default "e500" if CONFIG_E500 > default "classic" if CONFIG_CLASSIC > + default "sh4" if CONFIG_SH4 > default "" > > source "extra/Configs/Config.in.arch" > diff -Naupr uClibc-trunk/extra/Configs/Config.sh > uClibc-trunk-st/extra/Configs/Config.sh > --- uClibc-trunk/extra/Configs/Config.sh 2007-03-16 21:38:22.000000000 > +0100 > +++ uClibc-trunk-st/extra/Configs/Config.sh 2007-05-07 14:02:04.426778000 > +0200 > @@ -48,3 +48,8 @@ config CONFIG_SH4 > bool "SH4" > > endchoice > + > +config ARCH_HAS_BWD_MEMCPY > + bool > + default y > + depends CONFIG_SH4 > diff -Naupr uClibc-trunk/libc/string/Makefile.in > uClibc-trunk-st/libc/string/Makefile.in > --- uClibc-trunk/libc/string/Makefile.in 2006-09-19 09:43:04.000000000 > +0200 > +++ uClibc-trunk-st/libc/string/Makefile.in 2007-05-07 10:27:07.516749000 > +0200 > @@ -8,6 +8,18 @@ > # > # Arch specific fun > # > +# Collect the subarch specific implementation (asm files) > +ifneq ($(strip $(TARGET_SUBARCH)),) > +STRING_SUBARCH_DIR := > $(top_srcdir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH) > +STRING_SUBARCH_OUT := > $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH) > + > +STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.S) > +STRING_SUBARCH_SOBJ := $(patsubst > $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC)) > + > +STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ) > +endif > + > +# Collect the arch specific implementation (asm, c files) > STRING_ARCH_DIR := 
$(top_srcdir)libc/string/$(TARGET_ARCH) > STRING_ARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH) > > @@ -15,13 +27,18 @@ STRING_ARCH_SRC := $(wildcard $(STRING_A > STRING_ARCH_OBJ := $(patsubst > $(STRING_ARCH_DIR)/%.c,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SRC)) > > STRING_ARCH_SSRC := $(wildcard $(STRING_ARCH_DIR)/*.S) > -STRING_ARCH_SOBJ := $(patsubst > $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC)) > > +# Exclude the subarch implementation from the arch ones > +ifneq ($(strip $(STRING_SUBARCH_OBJS)),) > +STRING_ARCH_SSRC := $(filter-out $(patsubst > %.o,$(STRING_ARCH_DIR)/%.S,$(notdir > $(STRING_SUBARCH_OBJS))),$(STRING_ARCH_SSRC)) > +endif > + > +STRING_ARCH_SOBJ := $(patsubst > $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC)) > STRING_ARCH_OBJS := $(STRING_ARCH_OBJ) $(STRING_ARCH_SOBJ) > > -libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS) > +libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS) > $(STRING_SUBARCH_OBJS) > > -libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ) > +libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ) > $(STRING_SUBARCH_OBJS) > > # > # Generic stuff > @@ -35,6 +52,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y) > ifneq ($(strip $(STRING_ARCH_OBJS)),) > STRING_GENERIC_SRC := $(filter-out $(patsubst > %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir > $(STRING_ARCH_OBJS))),$(STRING_GENERIC_SRC)) > endif > +ifneq ($(strip $(STRING_SUBARCH_OBJS)),) > +STRING_GENERIC_SRC := $(filter-out $(patsubst > %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir > $(STRING_SUBARCH_OBJS))),$(STRING_GENERIC_SRC)) > +endif > endif > > STRING_GENERIC_OBJS := $(patsubst > $(STRING_GENERIC_DIR)/%.c,$(STRING_GENERIC_OUT)/%.o,$(STRING_GENERIC_SRC)) > @@ -93,6 +113,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y) > ifneq ($(strip $(STRING_ARCH_OBJS)),) > STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir > $(STRING_ARCH_OBJS))),$(STRING_CSRC)) > endif > +ifneq ($(strip 
$(STRING_SUBARCH_OBJS)),) > +STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir > $(STRING_SUBARCH_OBJS))),$(STRING_CSRC)) > +endif > endif > > ifeq ($(UCLIBC_HAS_STRING_GENERIC_OPT),y) > diff -Naupr uClibc-trunk/libc/string/generic/memcopy.h > uClibc-trunk-st/libc/string/generic/memcopy.h > --- uClibc-trunk/libc/string/generic/memcopy.h 2006-09-19 > 09:43:00.000000000 +0200 > +++ uClibc-trunk-st/libc/string/generic/memcopy.h 2007-05-07 > 10:27:55.056971000 +0200 > @@ -107,24 +107,6 @@ typedef unsigned char byte; > } \ > } while (0) > > -/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with > - the assumption that DST_BP is aligned on an OPSIZ multiple. If > - not all bytes could be easily copied, store remaining number of bytes > - in NBYTES_LEFT, otherwise store 0. */ > -/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */ > -/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, > size_t)); */ > -#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ > - do \ > - { > \ > - if (src_bp % OPSIZ == 0) > \ > - _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \ > - else \ > - _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \ > - src_bp += (nbytes) & -OPSIZ; \ > - dst_bp += (nbytes) & -OPSIZ; \ > - (nbytes_left) = (nbytes) % OPSIZ; > \ > - } while (0) > - > /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR, > beginning at the words (of type op_t) right before the pointers and > continuing towards smaller addresses. May take advantage of that > @@ -148,3 +130,213 @@ typedef unsigned char byte; > > /* Threshold value for when to enter the unrolled loops. */ > #define OP_T_THRES 16 > + > +#ifdef __ARCH_HAS_BWD_MEMCPY__ > + > +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to > + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). > + Both SRCP and DSTP should be aligned for memory operations on `op_t's. 
*/ > + > +static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len) > +{ > + op_t a0, a1; > + a0 = a1 = 0L; > + switch (len % 8) > + { > + case 2: > + a0 = ((op_t *) srcp)[0]; > + srcp -= 6 * OPSIZ; > + dstp -= 7 * OPSIZ; > + len += 6; > + goto do1; > + case 3: > + a1 = ((op_t *) srcp)[0]; > + srcp -= 5 * OPSIZ; > + dstp -= 6 * OPSIZ; > + len += 5; > + goto do2; > + case 4: > + a0 = ((op_t *) srcp)[0]; > + srcp -= 4 * OPSIZ; > + dstp -= 5 * OPSIZ; > + len += 4; > + goto do3; > + case 5: > + a1 = ((op_t *) srcp)[0]; > + srcp -= 3 * OPSIZ; > + dstp -= 4 * OPSIZ; > + len += 3; > + goto do4; > + case 6: > + a0 = ((op_t *) srcp)[0]; > + srcp -= 2 * OPSIZ; > + dstp -= 3 * OPSIZ; > + len += 2; > + goto do5; > + case 7: > + a1 = ((op_t *) srcp)[0]; > + srcp -= 1 * OPSIZ; > + dstp -= 2 * OPSIZ; > + len += 1; > + goto do6; > + > + case 0: > + if (OP_T_THRES <= 3 * OPSIZ && len == 0) > + return; > + a0 = ((op_t *) srcp)[0]; > + srcp -= 0 * OPSIZ; > + dstp -= 1 * OPSIZ; > + goto do7; > + case 1: > + a1 = ((op_t *) srcp)[0]; > + srcp -=-1 * OPSIZ; > + dstp -= 0 * OPSIZ; > + len -= 1; > + if (OP_T_THRES <= 3 * OPSIZ && len == 0) > + goto do0; > + goto do8; /* No-op. */ > + } > + > + do > + { > + do8: > + a0 = ((op_t *) srcp)[0]; > + ((op_t *) dstp)[0] = a1; > + do7: > + a1 = ((op_t *) srcp)[1]; > + ((op_t *) dstp)[1] = a0; > + do6: > + a0 = ((op_t *) srcp)[2]; > + ((op_t *) dstp)[2] = a1; > + do5: > + a1 = ((op_t *) srcp)[3]; > + ((op_t *) dstp)[3] = a0; > + do4: > + a0 = ((op_t *) srcp)[4]; > + ((op_t *) dstp)[4] = a1; > + do3: > + a1 = ((op_t *) srcp)[5]; > + ((op_t *) dstp)[5] = a0; > + do2: > + a0 = ((op_t *) srcp)[6]; > + ((op_t *) dstp)[6] = a1; > + do1: > + a1 = ((op_t *) srcp)[7]; > + ((op_t *) dstp)[7] = a0; > + > + srcp += 8 * OPSIZ; > + dstp += 8 * OPSIZ; > + len -= 8; > + } > + while (len != 0); > + > + /* This is the right position for do0. Please don't move > + it into the loop. 
*/ > + do0: > + ((op_t *) dstp)[0] = a1; > +} > + > +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to > + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). > + DSTP should be aligned for memory operations on `op_t's, but SRCP must > + *not* be aligned. */ > + > +static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t > len) > +{ > + op_t a0, a1, a2, a3; > + int sh_1, sh_2; > + > + /* Calculate how to shift a word read at the memory operation > + aligned srcp to make it aligned for copy. */ > + a0 = a1 = a2 = a3 = 0L; > + sh_1 = 8 * (srcp % OPSIZ); > + sh_2 = 8 * OPSIZ - sh_1; > + > + /* Make SRCP aligned by rounding it down to the beginning of the `op_t' > + it points in the middle of. */ > + srcp &= -OPSIZ; > + > + switch (len % 4) > + { > + case 2: > + a1 = ((op_t *) srcp)[0]; > + a2 = ((op_t *) srcp)[1]; > + srcp -= 1 * OPSIZ; > + dstp -= 3 * OPSIZ; > + len += 2; > + goto do1; > + case 3: > + a0 = ((op_t *) srcp)[0]; > + a1 = ((op_t *) srcp)[1]; > + srcp -= 0 * OPSIZ; > + dstp -= 2 * OPSIZ; > + len += 1; > + goto do2; > + case 0: > + if (OP_T_THRES <= 3 * OPSIZ && len == 0) > + return; > + a3 = ((op_t *) srcp)[0]; > + a0 = ((op_t *) srcp)[1]; > + srcp -=-1 * OPSIZ; > + dstp -= 1 * OPSIZ; > + len += 0; > + goto do3; > + case 1: > + a2 = ((op_t *) srcp)[0]; > + a3 = ((op_t *) srcp)[1]; > + srcp -=-2 * OPSIZ; > + dstp -= 0 * OPSIZ; > + len -= 1; > + if (OP_T_THRES <= 3 * OPSIZ && len == 0) > + goto do0; > + goto do4; /* No-op. 
*/ > + } > + > + do > + { > + do4: > + a0 = ((op_t *) srcp)[0]; > + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); > + do3: > + a1 = ((op_t *) srcp)[1]; > + ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2); > + do2: > + a2 = ((op_t *) srcp)[2]; > + ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2); > + do1: > + a3 = ((op_t *) srcp)[3]; > + ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2); > + > + srcp += 4 * OPSIZ; > + dstp += 4 * OPSIZ; > + len -= 4; > + } > + while (len != 0); > + > + /* This is the right position for do0. Please don't move > + it into the loop. */ > + do0: > + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); > +} > + > + > +/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with > + the assumption that DST_BP is aligned on an OPSIZ multiple. If > + not all bytes could be easily copied, store remaining number of bytes > + in NBYTES_LEFT, otherwise store 0. */ > +/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */ > +/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, > size_t)); */ > +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ > + do \ > + { > \ > + if (src_bp % OPSIZ == 0) > \ > + _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \ > + else \ > + _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \ > + src_bp += (nbytes) & -OPSIZ; \ > + dst_bp += (nbytes) & -OPSIZ; \ > + (nbytes_left) = (nbytes) % OPSIZ; > \ > + } while (0) > + > +#endif /* __ARCH_HAS_BWD_MEMCPY__ */ > + > diff -Naupr uClibc-trunk/libc/string/generic/memcpy.c > uClibc-trunk-st/libc/string/generic/memcpy.c > --- uClibc-trunk/libc/string/generic/memcpy.c 2006-09-19 09:43:00.000000000 > +0200 > +++ uClibc-trunk-st/libc/string/generic/memcpy.c 2007-05-07 > 10:28:20.217087000 +0200 > @@ -25,192 +25,6 @@ > > libc_hidden_proto(memcpy) > > -/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to > - block beginning at DSTP with LEN `op_t' words (not LEN bytes!). 
> - Both SRCP and DSTP should be aligned for memory operations on `op_t's. */ > - > -static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len) > -{ > - op_t a0, a1; > - > - switch (len % 8) > - { > - case 2: > - a0 = ((op_t *) srcp)[0]; > - srcp -= 6 * OPSIZ; > - dstp -= 7 * OPSIZ; > - len += 6; > - goto do1; > - case 3: > - a1 = ((op_t *) srcp)[0]; > - srcp -= 5 * OPSIZ; > - dstp -= 6 * OPSIZ; > - len += 5; > - goto do2; > - case 4: > - a0 = ((op_t *) srcp)[0]; > - srcp -= 4 * OPSIZ; > - dstp -= 5 * OPSIZ; > - len += 4; > - goto do3; > - case 5: > - a1 = ((op_t *) srcp)[0]; > - srcp -= 3 * OPSIZ; > - dstp -= 4 * OPSIZ; > - len += 3; > - goto do4; > - case 6: > - a0 = ((op_t *) srcp)[0]; > - srcp -= 2 * OPSIZ; > - dstp -= 3 * OPSIZ; > - len += 2; > - goto do5; > - case 7: > - a1 = ((op_t *) srcp)[0]; > - srcp -= 1 * OPSIZ; > - dstp -= 2 * OPSIZ; > - len += 1; > - goto do6; > - > - case 0: > - if (OP_T_THRES <= 3 * OPSIZ && len == 0) > - return; > - a0 = ((op_t *) srcp)[0]; > - srcp -= 0 * OPSIZ; > - dstp -= 1 * OPSIZ; > - goto do7; > - case 1: > - a1 = ((op_t *) srcp)[0]; > - srcp -=-1 * OPSIZ; > - dstp -= 0 * OPSIZ; > - len -= 1; > - if (OP_T_THRES <= 3 * OPSIZ && len == 0) > - goto do0; > - goto do8; /* No-op. */ > - } > - > - do > - { > - do8: > - a0 = ((op_t *) srcp)[0]; > - ((op_t *) dstp)[0] = a1; > - do7: > - a1 = ((op_t *) srcp)[1]; > - ((op_t *) dstp)[1] = a0; > - do6: > - a0 = ((op_t *) srcp)[2]; > - ((op_t *) dstp)[2] = a1; > - do5: > - a1 = ((op_t *) srcp)[3]; > - ((op_t *) dstp)[3] = a0; > - do4: > - a0 = ((op_t *) srcp)[4]; > - ((op_t *) dstp)[4] = a1; > - do3: > - a1 = ((op_t *) srcp)[5]; > - ((op_t *) dstp)[5] = a0; > - do2: > - a0 = ((op_t *) srcp)[6]; > - ((op_t *) dstp)[6] = a1; > - do1: > - a1 = ((op_t *) srcp)[7]; > - ((op_t *) dstp)[7] = a0; > - > - srcp += 8 * OPSIZ; > - dstp += 8 * OPSIZ; > - len -= 8; > - } > - while (len != 0); > - > - /* This is the right position for do0. Please don't move > - it into the loop. 
*/ > - do0: > - ((op_t *) dstp)[0] = a1; > -} > - > -/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to > - block beginning at DSTP with LEN `op_t' words (not LEN bytes!). > - DSTP should be aligned for memory operations on `op_t's, but SRCP must > - *not* be aligned. */ > - > -static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t > len) > -{ > - op_t a0, a1, a2, a3; > - int sh_1, sh_2; > - > - /* Calculate how to shift a word read at the memory operation > - aligned srcp to make it aligned for copy. */ > - > - sh_1 = 8 * (srcp % OPSIZ); > - sh_2 = 8 * OPSIZ - sh_1; > - > - /* Make SRCP aligned by rounding it down to the beginning of the `op_t' > - it points in the middle of. */ > - srcp &= -OPSIZ; > - > - switch (len % 4) > - { > - case 2: > - a1 = ((op_t *) srcp)[0]; > - a2 = ((op_t *) srcp)[1]; > - srcp -= 1 * OPSIZ; > - dstp -= 3 * OPSIZ; > - len += 2; > - goto do1; > - case 3: > - a0 = ((op_t *) srcp)[0]; > - a1 = ((op_t *) srcp)[1]; > - srcp -= 0 * OPSIZ; > - dstp -= 2 * OPSIZ; > - len += 1; > - goto do2; > - case 0: > - if (OP_T_THRES <= 3 * OPSIZ && len == 0) > - return; > - a3 = ((op_t *) srcp)[0]; > - a0 = ((op_t *) srcp)[1]; > - srcp -=-1 * OPSIZ; > - dstp -= 1 * OPSIZ; > - len += 0; > - goto do3; > - case 1: > - a2 = ((op_t *) srcp)[0]; > - a3 = ((op_t *) srcp)[1]; > - srcp -=-2 * OPSIZ; > - dstp -= 0 * OPSIZ; > - len -= 1; > - if (OP_T_THRES <= 3 * OPSIZ && len == 0) > - goto do0; > - goto do4; /* No-op. 
*/ > - } > - > - do > - { > - do4: > - a0 = ((op_t *) srcp)[0]; > - ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); > - do3: > - a1 = ((op_t *) srcp)[1]; > - ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2); > - do2: > - a2 = ((op_t *) srcp)[2]; > - ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2); > - do1: > - a3 = ((op_t *) srcp)[3]; > - ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2); > - > - srcp += 4 * OPSIZ; > - dstp += 4 * OPSIZ; > - len -= 4; > - } > - while (len != 0); > - > - /* This is the right position for do0. Please don't move > - it into the loop. */ > - do0: > - ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); > -} > - > void *memcpy (void *dstpp, const void *srcpp, size_t len) > { > unsigned long int dstp = (long int) dstpp; > diff -Naupr uClibc-trunk/libc/string/generic/memmove.c > uClibc-trunk-st/libc/string/generic/memmove.c > --- uClibc-trunk/libc/string/generic/memmove.c 2006-09-19 > 09:43:00.000000000 +0200 > +++ uClibc-trunk-st/libc/string/generic/memmove.c 2007-05-07 > 10:29:26.717396000 +0200 > @@ -29,7 +29,8 @@ libc_hidden_proto(memcpy) > > static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len) > { > - op_t a0, a1; > + op_t a0 = 0; > + op_t a1 = 0; > > switch (len % 8) > { > @@ -133,7 +134,10 @@ static void _wordcopy_bwd_aligned (long > > static void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t > len) > { > - op_t a0, a1, a2, a3; > + op_t a0 = 0; > + op_t a1 = 0; > + op_t a2 = 0; > + op_t a3 = 0; > int sh_1, sh_2; > > /* Calculate how to shift a word read at the memory operation > @@ -218,8 +222,8 @@ void *memmove (void *dest, const void *s > Reduces the working set. */ > if (dstp - srcp >= len) /* *Unsigned* compare! */ > { > -#if 1 > -#warning REMINDER: generic-opt memmove assumes memcpy does forward copying! > +#ifndef __ARCH_HAS_BWD_MEMCPY__ > + /* generic-opt memmove assumes memcpy does forward copying! */ > memcpy(dest, src, len); > #else > /* Copy from the beginning to the end. 
*/ > diff -Naupr uClibc-trunk/libc/string/sh/sh4/memcpy.S > uClibc-trunk-st/libc/string/sh/sh4/memcpy.S > --- uClibc-trunk/libc/string/sh/sh4/memcpy.S 1970-01-01 01:00:00.000000000 > +0100 > +++ uClibc-trunk-st/libc/string/sh/sh4/memcpy.S 2007-05-07 > 13:43:16.291529000 +0200 > @@ -0,0 +1,807 @@ > +/* > + * "memcpy" implementation of SuperH > + * > + * Copyright (C) 1999 Niibe Yutaka > + * Copyright (c) 2002 STMicroelectronics Ltd > + * Modified from memcpy.S and micro-optimised for SH4 > + * Stuart Menefy ([EMAIL PROTECTED]) > + * > + */ > + > +/* > + * void *memcpy(void *dst, const void *src, size_t n); > + * > + * It is assumed that there is no overlap between src and dst. > + * If there is an overlap, then the results are undefined. > + */ > + > +#include <endian.h> > + > + ! > + ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. > + ! > + > + ! Size is 16 or greater, and may have trailing bytes > + > + .balign 32 > +.Lcase1: > + ! Read a long word and write a long word at once > + ! At the start of each iteration, r7 contains last long load > + add #-1,r5 ! 79 EX > + mov r4,r2 ! 5 MT (0 cycles latency) > + > + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) > + add #-4,r5 ! 50 EX > + > + add #7,r2 ! 79 EX > + ! > +#ifdef __LITTLE_ENDIAN__ > + ! 6 cycles, 4 bytes per iteration > +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK > + mov r7, r3 ! 5 MT (latency=0) ! RQPO > + > + cmp/hi r2,r0 ! 57 MT > + shll16 r3 ! 103 EX > + > + mov r1,r6 ! 5 MT (latency=0) > + shll8 r3 ! 102 EX ! Oxxx > + > + shlr8 r6 ! 106 EX ! xNML > + mov r1, r7 ! 5 MT (latency=0) > + > + or r6,r3 ! 82 EX ! ONML > + bt/s 3b ! 109 BR > + > + mov.l r3,@-r0 ! 30 LS > +#else > +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN > + mov r7,r3 ! 5 MT (latency=0) ! OPQR > + > + cmp/hi r2,r0 ! 57 MT > + shlr16 r3 ! 107 EX > + > + shlr8 r3 ! 106 EX ! xxxO > + mov r1,r6 ! 5 MT (latency=0) > + > + shll8 r6 ! 102 EX ! LMNx > + mov r1,r7 ! 5 MT (latency=0) > + > + or r6,r3 ! 82 EX ! LMNO > + bt/s 3b ! 
109 BR > + > + mov.l r3,@-r0 ! 30 LS > +#endif > + ! Finally, copy a byte at once, if necessary > + > + add #4,r5 ! 50 EX > + cmp/eq r4,r0 ! 54 MT > + > + add #-6,r2 ! 50 EX > + bt 9f ! 109 BR > + > +8: cmp/hi r2,r0 ! 57 MT > + mov.b @(r0,r5),r1 ! 20 LS (latency=2) > + > + bt/s 8b ! 109 BR > + > + mov.b r1,@-r0 ! 29 LS > + > +9: rts > + nop > + > + > + ! > + ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... > + ! > + > + ! Size is 16 or greater, and may have trailing bytes > + > + .balign 32 > +.Lcase3: > + ! Read a long word and write a long word at once > + ! At the start of each iteration, r7 contains last long load > + add #-3,r5 ! 79 EX > + mov r4,r2 ! 5 MT (0 cycles latency) > + > + mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) > + add #-4,r5 ! 50 EX > + > + add #7,r2 ! 79 EX > + ! > +#ifdef __LITTLE_ENDIAN__ > + ! 6 cycles, 4 bytes per iteration > +3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK > + mov r7, r3 ! 5 MT (latency=0) ! RQPO > + > + cmp/hi r2,r0 ! 57 MT > + shll8 r3 ! 102 EX ! QPOx > + > + mov r1,r6 ! 5 MT (latency=0) > + shlr16 r6 ! 107 EX > + > + shlr8 r6 ! 106 EX ! xxxN > + mov r1, r7 ! 5 MT (latency=0) > + > + or r6,r3 ! 82 EX ! QPON > + bt/s 3b ! 109 BR > + > + mov.l r3,@-r0 ! 30 LS > +#else > +3: mov r1,r3 ! OPQR > + shlr8 r3 ! xOPQ > + mov.l @(r0,r5),r1 ! KLMN > + mov r1,r6 > + shll16 r6 > + shll8 r6 ! Nxxx > + or r6,r3 ! NOPQ > + cmp/hi r2,r0 > + bt/s 3b > + mov.l r3,@-r0 > +#endif > + > + ! Finally, copy a byte at once, if necessary > + > + add #6,r5 ! 50 EX > + cmp/eq r4,r0 ! 54 MT > + > + add #-6,r2 ! 50 EX > + bt 9f ! 109 BR > + > +8: cmp/hi r2,r0 ! 57 MT > + mov.b @(r0,r5),r1 ! 20 LS (latency=2) > + > + bt/s 8b ! 109 BR > + > + mov.b r1,@-r0 ! 29 LS > + > +9: rts > + nop > + > +/* void *memcpy(void *dst, const void *src, size_t len) */ > +.text > +.align 5 > +.type memcpy,@function > +.globl memcpy; > + > +memcpy: > + ! Calculate the invariants which will be used in the remainder > + ! of the code: > + ! > + ! r4 --> [ ... ] DST [ ... 
] SRC > + ! [ ... ] [ ... ] > + ! : : > + ! r0 --> [ ... ] r0+r5 --> [ ... ] > + ! > + ! > + > + ! Short circuit the common case of src, dst and len being 32 bit aligned > + ! and test for zero length move > + > + mov r6, r0 ! 5 MT (0 cycle latency) > + or r4, r0 ! 82 EX > + > + or r5, r0 ! 82 EX > + tst r6, r6 ! 86 MT > + > + bt/s 99f ! 111 BR (zero len) > + tst #3, r0 ! 87 MT > + > + mov r4, r0 ! 5 MT (0 cycle latency) > + add r6, r0 ! 49 EX > + > + mov #16, r1 ! 6 EX > + bt/s .Lcase00 ! 111 BR (aligned) > + > + sub r4, r5 ! 75 EX > + > + ! Arguments are not nicely long word aligned or zero len. > + ! Check for small copies, and if so do a simple byte at a time copy. > + ! > + ! Deciding on an exact value of 'small' is not easy, as the point at > which > + ! using the optimised routines become worthwhile varies (these are the > + ! cycle counts for differnet sizes using byte-at-a-time vs. optimised): > + ! size byte-at-time long word byte > + ! 16 42 39-40 46-50 50-55 > + ! 24 58 43-44 54-58 62-67 > + ! 36 82 49-50 66-70 80-85 > + ! However the penalty for getting it 'wrong' is much higher for long > word > + ! aligned data (and this is more common), so use a value of 16. > + > + cmp/gt r6,r1 ! 56 MT > + > + add #-1,r5 ! 50 EX > + bf/s 6f ! 108 BR (not small) > + > + mov r5, r3 ! 5 MT (latency=0) > + shlr r6 ! 104 EX > + > + mov.b @(r0,r5),r1 ! 20 LS (latency=2) > + bf/s 4f ! 111 BR > + > + add #-1,r3 ! 50 EX > + tst r6, r6 ! 86 MT > + > + bt/s 98f ! 110 BR > + mov.b r1,@-r0 ! 29 LS > + > + ! 4 cycles, 2 bytes per iteration > +3: mov.b @(r0,r5),r1 ! 20 LS (latency=2) > + > +4: mov.b @(r0,r3),r2 ! 20 LS (latency=2) > + dt r6 ! 67 EX > + > + mov.b r1,@-r0 ! 29 LS > + bf/s 3b ! 111 BR > + > + mov.b r2,@-r0 ! 29 LS > +98: > + rts > + nop > + > +99: rts > + mov r4, r0 > + > + ! Size is not small, so its worthwhile looking for optimisations. > + ! First align destination to a long word boundary. > + ! > + ! r5 = normal value -1 > + > +6: tst #3, r0 ! 
87 MT > + mov #3, r3 ! 6 EX > + > + bt/s 2f ! 111 BR > + and r0,r3 ! 78 EX > + > + ! 3 cycles, 1 byte per iteration > +1: dt r3 ! 67 EX > + mov.b @(r0,r5),r1 ! 19 LS (latency=2) > + > + add #-1, r6 ! 79 EX > + bf/s 1b ! 109 BR > + > + mov.b r1,@-r0 ! 28 LS > + > +2: add #1, r5 ! 79 EX > + > + ! Now select the appropriate bulk transfer code based on relative > + ! alignment of src and dst. > + > + mov r0, r3 ! 5 MT (latency=0) > + > + mov r5, r0 ! 5 MT (latency=0) > + tst #1, r0 ! 87 MT > + > + bf/s 1f ! 111 BR > + mov #64, r7 ! 6 EX > + > + ! bit 0 clear > + > + cmp/ge r7, r6 ! 55 MT > + > + bt/s 2f ! 111 BR > + tst #2, r0 ! 87 MT > + > + ! small > + bt/s .Lcase0 > + mov r3, r0 > + > + bra .Lcase2 > + nop > + > + ! big > +2: bt/s .Lcase0b > + mov r3, r0 > + > + bra .Lcase2b > + nop > + > + ! bit 0 set > +1: tst #2, r0 ! 87 MT > + > + bt/s .Lcase1 > + mov r3, r0 > + > + bra .Lcase3 > + nop > + > + > + ! > + ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR > + ! > + > + ! src, dst and size are all long word aligned > + ! size is non-zero > + > + .balign 32 > +.Lcase00: > + mov #64, r1 ! 6 EX > + mov r5, r3 ! 5 MT (latency=0) > + > + cmp/gt r6, r1 ! 56 MT > + add #-4, r5 ! 50 EX > + > + bf .Lcase00b ! 108 BR (big loop) > + shlr2 r6 ! 105 EX > + > + shlr r6 ! 104 EX > + mov.l @(r0, r5), r1 ! 21 LS (latency=2) > + > + bf/s 4f ! 111 BR > + add #-8, r3 ! 50 EX > + > + tst r6, r6 ! 86 MT > + bt/s 5f ! 110 BR > + > + mov.l r1,@-r0 ! 30 LS > + > + ! 4 cycles, 2 long words per iteration > +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) > + > +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) > + dt r6 ! 67 EX > + > + mov.l r1, @-r0 ! 30 LS > + bf/s 3b ! 109 BR > + > + mov.l r2, @-r0 ! 30 LS > + > +5: rts > + nop > + > + > + ! Size is 16 or greater and less than 64, but may have trailing bytes > + > + .balign 32 > +.Lcase0: > + add #-4, r5 ! 50 EX > + mov r4, r7 ! 5 MT (latency=0) > + > + mov.l @(r0, r5), r1 ! 21 LS (latency=2) > + mov #4, r2 ! 6 EX > + > + add #11, r7 ! 50 EX > + tst r2, r6 ! 
86 MT > + > + mov r5, r3 ! 5 MT (latency=0) > + bt/s 4f ! 111 BR > + > + add #-4, r3 ! 50 EX > + mov.l r1,@-r0 ! 30 LS > + > + ! 4 cycles, 2 long words per iteration > +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) > + > +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) > + cmp/hi r7, r0 > + > + mov.l r1, @-r0 ! 30 LS > + bt/s 3b ! 109 BR > + > + mov.l r2, @-r0 ! 30 LS > + > + ! Copy the final 0-3 bytes > + > + add #3,r5 ! 50 EX > + > + cmp/eq r0, r4 ! 54 MT > + add #-10, r7 ! 50 EX > + > + bt 9f ! 110 BR > + > + ! 3 cycles, 1 byte per iteration > +1: mov.b @(r0,r5),r1 ! 19 LS > + cmp/hi r7,r0 ! 57 MT > + > + bt/s 1b ! 111 BR > + mov.b r1,@-r0 ! 28 LS > + > +9: rts > + nop > + > + ! Size is at least 64 bytes, so will be going round the big loop at > least once. > + ! > + ! r2 = rounded up r4 > + ! r3 = rounded down r0 > + > + .balign 32 > +.Lcase0b: > + add #-4, r5 ! 50 EX > + > +.Lcase00b: > + mov r0, r3 ! 5 MT (latency=0) > + mov #(~0x1f), r1 ! 6 EX > + > + and r1, r3 ! 78 EX > + mov r4, r2 ! 5 MT (latency=0) > + > + cmp/eq r3, r0 ! 54 MT > + add #0x1f, r2 ! 50 EX > + > + bt/s 1f ! 110 BR > + and r1, r2 ! 78 EX > + > + ! copy initial words until cache line aligned > + > + mov.l @(r0, r5), r1 ! 21 LS (latency=2) > + tst #4, r0 ! 87 MT > + > + mov r5, r6 ! 5 MT (latency=0) > + add #-4, r6 ! 50 EX > + > + bt/s 4f ! 111 BR > + add #8, r3 ! 50 EX > + > + tst #0x18, r0 ! 87 MT > + > + bt/s 1f ! 109 BR > + mov.l r1,@-r0 ! 30 LS > + > + ! 4 cycles, 2 long words per iteration > +3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) > + > +4: mov.l @(r0, r6), r7 ! 21 LS (latency=2) > + cmp/eq r3, r0 ! 54 MT > + > + mov.l r1, @-r0 ! 30 LS > + bf/s 3b ! 109 BR > + > + mov.l r7, @-r0 ! 30 LS > + > + ! Copy the cache line aligned blocks > + ! > + ! In use: r0, r2, r4, r5 > + ! Scratch: r1, r3, r6, r7 > + ! > + ! We could do this with the four scratch registers, but if src > + ! and dest hit the same cache line, this will thrash, so make > + ! use of additional registers. > + ! > + ! 
We also need r0 as a temporary (for movca), so 'undo' the invariant: > + ! r5: src (was r0+r5) > + ! r1: dest (was r0) > + ! this can be reversed at the end, so we don't need to save any extra > + ! state. > + ! > +1: mov.l r8, @-r15 ! 30 LS > + add r0, r5 ! 49 EX > + > + mov.l r9, @-r15 ! 30 LS > + mov r0, r1 ! 5 MT (latency=0) > + > + mov.l r10, @-r15 ! 30 LS > + add #-0x1c, r5 ! 50 EX > + > + mov.l r11, @-r15 ! 30 LS > + > + ! 16 cycles, 32 bytes per iteration > +2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2) > + add #-0x20, r1 ! 50 EX > + mov.l @(0x04,r5),r3 ! 18 LS (latency=2) > + mov.l @(0x08,r5),r6 ! 18 LS (latency=2) > + mov.l @(0x0c,r5),r7 ! 18 LS (latency=2) > + mov.l @(0x10,r5),r8 ! 18 LS (latency=2) > + mov.l @(0x14,r5),r9 ! 18 LS (latency=2) > + mov.l @(0x18,r5),r10 ! 18 LS (latency=2) > + mov.l @(0x1c,r5),r11 ! 18 LS (latency=2) > + movca.l r0,@r1 ! 40 LS (latency=3-7) > + mov.l r3,@(0x04,r1) ! 33 LS > + mov.l r6,@(0x08,r1) ! 33 LS > + mov.l r7,@(0x0c,r1) ! 33 LS > + > + mov.l r8,@(0x10,r1) ! 33 LS > + add #-0x20, r5 ! 50 EX > + > + mov.l r9,@(0x14,r1) ! 33 LS > + cmp/eq r2,r1 ! 54 MT > + > + mov.l r10,@(0x18,r1) ! 33 LS > + bf/s 2b ! 109 BR > + > + mov.l r11,@(0x1c,r1) ! 33 LS > + > + mov r1, r0 ! 5 MT (latency=0) > + > + mov.l @r15+, r11 ! 15 LS > + sub r1, r5 ! 75 EX > + > + mov.l @r15+, r10 ! 15 LS > + cmp/eq r4, r0 ! 54 MT > + > + bf/s 1f ! 109 BR > + mov.l @r15+, r9 ! 15 LS > + > + rts > +1: mov.l @r15+, r8 ! 15 LS > + sub r4, r1 ! 75 EX (len remaining) > + > + ! number of trailing bytes is non-zero > + ! > + ! invariants restored (r5 already decremented by 4) > + ! also r1=num bytes remaining > + > + mov #4, r2 ! 6 EX > + mov r4, r7 ! 5 MT (latency=0) > + > + add #0x1c, r5 ! 50 EX (back to -4) > + cmp/hs r2, r1 ! 58 MT > + > + bf/s 5f ! 108 BR > + add #11, r7 ! 50 EX > + > + mov.l @(r0, r5), r6 ! 21 LS (latency=2) > + tst r2, r1 ! 86 MT > + > + mov r5, r3 ! 5 MT (latency=0) > + bt/s 4f ! 111 BR > + > + add #-4, r3 ! 50 EX > + cmp/hs r2, r1 ! 
58 MT > + > + bt/s 5f ! 111 BR > + mov.l r6,@-r0 ! 30 LS > + > + ! 4 cycles, 2 long words per iteration > +3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) > + > +4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) > + cmp/hi r7, r0 > + > + mov.l r6, @-r0 ! 30 LS > + bt/s 3b ! 109 BR > + > + mov.l r2, @-r0 ! 30 LS > + > + ! Copy the final 0-3 bytes > + > +5: cmp/eq r0, r4 ! 54 MT > + add #-10, r7 ! 50 EX > + > + bt 9f ! 110 BR > + add #3,r5 ! 50 EX > + > + ! 3 cycles, 1 byte per iteration > +1: mov.b @(r0,r5),r1 ! 19 LS > + cmp/hi r7,r0 ! 57 MT > + > + bt/s 1b ! 111 BR > + mov.b r1,@-r0 ! 28 LS > + > +9: rts > + nop > + > + ! > + ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. > + ! > + > + .balign 32 > +.Lcase2: > + ! Size is 16 or greater and less then 64, but may have trailing bytes > + > +2: mov r5, r6 ! 5 MT (latency=0) > + add #-2,r5 ! 50 EX > + > + mov r4,r2 ! 5 MT (latency=0) > + add #-4,r6 ! 50 EX > + > + add #7,r2 ! 50 EX > +3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) > + > + mov.w @(r0,r6),r3 ! 20 LS (latency=2) > + cmp/hi r2,r0 ! 57 MT > + > + mov.w r1,@-r0 ! 29 LS > + bt/s 3b ! 111 BR > + > + mov.w r3,@-r0 ! 29 LS > + > + bra 10f > + nop > + > + > + .balign 32 > +.Lcase2b: > + ! Size is at least 64 bytes, so will be going round the big loop at > least once. > + ! > + ! r2 = rounded up r4 > + ! r3 = rounded down r0 > + > + mov r0, r3 ! 5 MT (latency=0) > + mov #(~0x1f), r1 ! 6 EX > + > + and r1, r3 ! 78 EX > + mov r4, r2 ! 5 MT (latency=0) > + > + cmp/eq r3, r0 ! 54 MT > + add #0x1f, r2 ! 50 EX > + > + add #-2, r5 ! 50 EX > + bt/s 1f ! 110 BR > + and r1, r2 ! 78 EX > + > + ! Copy a short word one at a time until we are cache line aligned > + ! Normal values: r0, r2, r3, r4 > + ! Unused: r1, r6, r7 > + ! Mod: r5 (=r5-2) > + ! > + add #2, r3 ! 50 EX > + > +2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) > + cmp/eq r3,r0 ! 54 MT > + > + bf/s 2b ! 111 BR > + > + mov.w r1,@-r0 ! 29 LS > + > + ! Copy the cache line aligned blocks > + ! > + ! In use: r0, r2, r4, r5 (=r5-2) > + ! 
Scratch: r1, r3, r6, r7 > + ! > + ! We could do this with the four scratch registers, but if src > + ! and dest hit the same cache line, this will thrash, so make > + ! use of additional registers. > + ! > + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: > + ! r5: src (was r0+r5) > + ! r1: dest (was r0) > + ! this can be reversed at the end, so we don't need to save any extra > + ! state. > + ! > +1: mov.l r8, @-r15 ! 30 LS > + add r0, r5 ! 49 EX > + > + mov.l r9, @-r15 ! 30 LS > + mov r0, r1 ! 5 MT (latency=0) > + > + mov.l r10, @-r15 ! 30 LS > + add #-0x1e, r5 ! 50 EX > + > + mov.l r11, @-r15 ! 30 LS > + > + mov.l r12, @-r15 ! 30 LS > + > + ! 17 cycles, 32 bytes per iteration > +#ifdef __LITTLE_ENDIAN__ > +2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI > + add #-0x20, r1 ! 50 EX > + > + mov.l @r5+, r3 ! 15 LS (latency=2) NMLK > + > + mov.l @r5+, r6 ! 15 LS (latency=2) RQPO > + shll16 r0 ! 103 EX JI.. > + > + mov.l @r5+, r7 ! 15 LS (latency=2) > + xtrct r3, r0 ! 48 EX LKJI > + > + mov.l @r5+, r8 ! 15 LS (latency=2) > + xtrct r6, r3 ! 48 EX PONM > + > + mov.l @r5+, r9 ! 15 LS (latency=2) > + xtrct r7, r6 ! 48 EX > + > + mov.l @r5+, r10 ! 15 LS (latency=2) > + xtrct r8, r7 ! 48 EX > + > + mov.l @r5+, r11 ! 15 LS (latency=2) > + xtrct r9, r8 ! 48 EX > + > + mov.w @r5+, r12 ! 15 LS (latency=2) > + xtrct r10, r9 ! 48 EX > + > + movca.l r0,@r1 ! 40 LS (latency=3-7) > + xtrct r11, r10 ! 48 EX > + > + mov.l r3, @(0x04,r1) ! 33 LS > + xtrct r12, r11 ! 48 EX > + > + mov.l r6, @(0x08,r1) ! 33 LS > + > + mov.l r7, @(0x0c,r1) ! 33 LS > + > + mov.l r8, @(0x10,r1) ! 33 LS > + add #-0x40, r5 ! 50 EX > + > + mov.l r9, @(0x14,r1) ! 33 LS > + cmp/eq r2,r1 ! 54 MT > + > + mov.l r10, @(0x18,r1) ! 33 LS > + bf/s 2b ! 109 BR > + > + mov.l r11, @(0x1c,r1) ! 33 LS > +#else > +2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) > + add #-2, r5 ! 50 EX > + > + mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) > + add #-4, r1 ! 50 EX > + > + mov.l @(0x18,r5), r6 ! 
18 LS (latency=2) > + shll16 r0 ! 103 EX > + > + mov.l @(0x14,r5), r7 ! 18 LS (latency=2) > + xtrct r3, r0 ! 48 EX > + > + mov.l @(0x10,r5), r8 ! 18 LS (latency=2) > + xtrct r6, r3 ! 48 EX > + > + mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) > + xtrct r7, r6 ! 48 EX > + > + mov.l @(0x08,r5), r10 ! 18 LS (latency=2) > + xtrct r8, r7 ! 48 EX > + > + mov.l @(0x04,r5), r11 ! 18 LS (latency=2) > + xtrct r9, r8 ! 48 EX > + > + mov.w @(0x02,r5), r12 ! 18 LS (latency=2) > + xtrct r10, r9 ! 48 EX > + > + movca.l r0,@r1 ! 40 LS (latency=3-7) > + add #-0x1c, r1 ! 50 EX > + > + mov.l r3, @(0x1c,r1) ! 33 LS > + xtrct r11, r10 ! 48 EX > + > + mov.l r6, @(0x18,r1) ! 33 LS > + xtrct r12, r11 ! 48 EX > + > + mov.l r7, @(0x14,r1) ! 33 LS > + > + mov.l r8, @(0x10,r1) ! 33 LS > + add #-0x3e, r5 ! 50 EX > + > + mov.l r9, @(0x0c,r1) ! 33 LS > + cmp/eq r2,r1 ! 54 MT > + > + mov.l r10, @(0x08,r1) ! 33 LS > + bf/s 2b ! 109 BR > + > + mov.l r11, @(0x04,r1) ! 33 LS > +#endif > + > + mov.l @r15+, r12 > + mov r1, r0 ! 5 MT (latency=0) > + > + mov.l @r15+, r11 ! 15 LS > + sub r1, r5 ! 75 EX > + > + mov.l @r15+, r10 ! 15 LS > + cmp/eq r4, r0 ! 54 MT > + > + bf/s 1f ! 109 BR > + mov.l @r15+, r9 ! 15 LS > + > + rts > +1: mov.l @r15+, r8 ! 15 LS > + > + add #0x1e, r5 ! 50 EX > + > + ! Finish off a short word at a time > + ! r5 must be invariant - 2 > +10: mov r4,r2 ! 5 MT (latency=0) > + add #1,r2 ! 50 EX > + > + cmp/hi r2, r0 ! 57 MT > + bf/s 1f ! 109 BR > + > + add #2, r2 ! 50 EX > + > +3: mov.w @(r0,r5),r1 ! 20 LS > + cmp/hi r2,r0 ! 57 MT > + > + bt/s 3b ! 109 BR > + > + mov.w r1,@-r0 ! 29 LS > +1: > + > + ! > + ! Finally, copy the last byte if necessary > + cmp/eq r4,r0 ! 
54 MT > + bt/s 9b > + add #1,r5 > + mov.b @(r0,r5),r1 > + rts > + mov.b r1,@-r0 > + > +.size memcpy,.-memcpy; > +libc_hidden_def (memcpy) > > > ------------------------------------------------------------------------ > > _______________________________________________ > uClibc mailing list > uClibc@uclibc.org > http://busybox.net/cgi-bin/mailman/listinfo/uclibc _______________________________________________ uClibc mailing list uClibc@uclibc.org http://busybox.net/cgi-bin/mailman/listinfo/uclibc