Carmelo AMOROSO wrote:
> Paul Mundt wrote:
>> On Sun, Mar 25, 2007 at 09:18:33AM -0400, Mike Frysinger wrote:
>>  
>>> On Wednesday 21 March 2007, Carmelo AMOROSO wrote:
>>>    
>>>> I'm currently using on uClibc-nptl for sh4 an optimized version
>>>> of the memcpy function (from Stuart Menefy @STMicroelectronics).
>>>> This implementation is based on 'backward copying'
>>>> and breaks the current implementation of 'memmove'
>>>> (libc/string/generic/memmove.c)
>>>> that, as clearly stated, assumes memcpy does a forward copying.
>>>>
>>>> The attached patch provides a solution for this, adding a config option
>>>> to specify what kind of memcpy implementation the architecture
>>>> provides.
>>>> In this way memmove works with both implementations.
>>>>       
>>> if anything, this option should not be exported for the user to try
>>> and figure out ... either the architecture provides it or it doesn't,
>>> which means it'd be a hardcoded selection in the arch-specific
>>> config.in files ...
>>>
>>> wouldn't it be simpler to provide a superh optimized memmove/memcpy?
>>> then it wouldn't matter what the generic implementations assume ...
>>>     
>>
>> It has to be split out separately for sh4, given the movca.l usage.
>>   
> Hi All,
> I've updated the previous patch to take into account both suggestions made by Mike and Paul.
> A brief explanation of the changes follows:
> 
> extra/Configs/Config.in         -> set TARGET_SUBARCH for the sh4 architecture
> extra/Configs/Config.sh         -> enable ARCH_HAS_BWD_MEMCPY for the sh4 architecture only
> 
> libc/string/sh/sh4              -> new file memcpy.S (sh4 specific)
> libc/string/generic/memmove.c   -> use the new macro __ARCH_HAS_BWD_MEMCPY__ instead of #if 1
> libc/string/generic/memcpy.c    -> move the static functions from the C source to the common header file, with some reordering
> libc/string/generic/memcopy.h   -> ""
> libc/string/Makefile.in         -> add code to manage subarch-specific code in addition to the arch-specific code.
> 
> Any comments are welcome.
> 
> Cheers,
> Carmelo
> 
> 
Hi Mike, Paul,
did you have time to look at this?

If accepted, it would reduce the diff between the sh4 port and trunk a bit.
This code is currently used in the nptl/sh4 port.

Carmelo
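
For reference, the hazard the option guards against: the generic memmove
delegates its "copy forward" branch to memcpy, which is only correct if
memcpy walks the buffers in ascending order. A minimal sketch of what a
backward-copying memcpy would break (illustration only, not part of the
patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[8] = "abcdef";

    /* dest < src with overlap: this case needs a forward copy.  */
    memmove(buf, buf + 2, 4);       /* correct result: "cdefef" */

    /* A descending copy would first store buf[3] = buf[5] ('f') and
       buf[2] = buf[4] ('e'), and then read the already-clobbered
       buf[3] and buf[2] when filling buf[1] and buf[0].  */
    printf("%s\n", buf);
    return 0;
}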

> ------------------------------------------------------------------------
> 
> diff -Naupr uClibc-trunk/extra/Configs/Config.in uClibc-trunk-st/extra/Configs/Config.in
> --- uClibc-trunk/extra/Configs/Config.in      2007-04-24 15:19:31.000000000 +0200
> +++ uClibc-trunk-st/extra/Configs/Config.in   2007-05-07 10:24:22.045984000 +0200
> @@ -180,6 +180,7 @@ config TARGET_SUBARCH
>       string
>       default "e500" if CONFIG_E500
>       default "classic" if CONFIG_CLASSIC
> +     default "sh4" if CONFIG_SH4
>       default ""
>  
>  source "extra/Configs/Config.in.arch"
> diff -Naupr uClibc-trunk/extra/Configs/Config.sh uClibc-trunk-st/extra/Configs/Config.sh
> --- uClibc-trunk/extra/Configs/Config.sh      2007-03-16 21:38:22.000000000 +0100
> +++ uClibc-trunk-st/extra/Configs/Config.sh   2007-05-07 14:02:04.426778000 +0200
> @@ -48,3 +48,8 @@ config CONFIG_SH4
>       bool "SH4"
>  
>  endchoice
> +
> +config ARCH_HAS_BWD_MEMCPY
> +       bool
> +       default y
> +       depends CONFIG_SH4
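
For context: uClibc's kconfig mirrors enabled options as __NAME__ macros in
a generated header (include/bits/uClibc_config.h, assuming the usual build
plumbing), which is how the new symbol reaches the preprocessor test in
memmove.c:

/* expected to be emitted by the build from .config (assumption) */
#define __ARCH_HAS_BWD_MEMCPY__ 1
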
> diff -Naupr uClibc-trunk/libc/string/Makefile.in uClibc-trunk-st/libc/string/Makefile.in
> --- uClibc-trunk/libc/string/Makefile.in      2006-09-19 09:43:04.000000000 +0200
> +++ uClibc-trunk-st/libc/string/Makefile.in   2007-05-07 10:27:07.516749000 +0200
> @@ -8,6 +8,18 @@
>  #
>  # Arch specific fun
>  #
> +# Collect the subarch specific implementation (asm files)
> +ifneq ($(strip $(TARGET_SUBARCH)),)
> +STRING_SUBARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
> +STRING_SUBARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
> +
> +STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_DIR)/*.S)
> +STRING_SUBARCH_SOBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC))
> +
> +STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ)
> +endif
> +
> +# Collect the arch specific implementation (asm, c files)
>  STRING_ARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)
>  STRING_ARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)
>  
> @@ -15,13 +27,18 @@ STRING_ARCH_SRC := $(wildcard $(STRING_A
>  STRING_ARCH_OBJ := $(patsubst $(STRING_ARCH_DIR)/%.c,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SRC))
>  
>  STRING_ARCH_SSRC := $(wildcard $(STRING_ARCH_DIR)/*.S)
> -STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
>  
> +# Exclude the subarch implementation from the arch ones
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_ARCH_SSRC := $(filter-out $(patsubst %.o,$(STRING_ARCH_DIR)/%.S,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_ARCH_SSRC))
> +endif
> +
> +STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
>  STRING_ARCH_OBJS := $(STRING_ARCH_OBJ) $(STRING_ARCH_SOBJ)
>  
> -libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS)
> +libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS) $(STRING_SUBARCH_OBJS)
>  
> -libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ)
> +libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ) $(STRING_SUBARCH_OBJS)
>  
>  #
>  # Generic stuff
> @@ -35,6 +52,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
>  ifneq ($(strip $(STRING_ARCH_OBJS)),)
>  STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_GENERIC_SRC))
>  endif
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_GENERIC_SRC))
> +endif
>  endif
>  
>  STRING_GENERIC_OBJS := $(patsubst $(STRING_GENERIC_DIR)/%.c,$(STRING_GENERIC_OUT)/%.o,$(STRING_GENERIC_SRC))
> @@ -93,6 +113,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
>  ifneq ($(strip $(STRING_ARCH_OBJS)),)
>  STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_CSRC))
>  endif
> +ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
> +STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_CSRC))
> +endif
>  endif
>  
>  ifeq ($(UCLIBC_HAS_STRING_GENERIC_OPT),y)
> diff -Naupr uClibc-trunk/libc/string/generic/memcopy.h uClibc-trunk-st/libc/string/generic/memcopy.h
> --- uClibc-trunk/libc/string/generic/memcopy.h        2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memcopy.h     2007-05-07 10:27:55.056971000 +0200
> @@ -107,24 +107,6 @@ typedef unsigned char byte;
>       }                                                                     \
>      } while (0)
>  
> -/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
> -   the assumption that DST_BP is aligned on an OPSIZ multiple.  If
> -   not all bytes could be easily copied, store remaining number of bytes
> -   in NBYTES_LEFT, otherwise store 0.  */
> -/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */
> -/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, size_t)); */
> -#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)                 \
> -  do                                                                       \
> -    {                                                                      \
> -      if (src_bp % OPSIZ == 0)                                             \
> -     _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);             \
> -      else                                                                 \
> -     _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);        \
> -      src_bp += (nbytes) & -OPSIZ;                                         \
> -      dst_bp += (nbytes) & -OPSIZ;                                         \
> -      (nbytes_left) = (nbytes) % OPSIZ;                                    \
> -    } while (0)
> -
>  /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR,
>     beginning at the words (of type op_t) right before the pointers and
>     continuing towards smaller addresses.  May take advantage of that
> @@ -148,3 +130,213 @@ typedef unsigned char byte;
>  
>  /* Threshold value for when to enter the unrolled loops.  */
>  #define      OP_T_THRES      16
> +
> +#ifdef __ARCH_HAS_BWD_MEMCPY__
> +
> +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
> +   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> +   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
> +
> +static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
> +{
> +  op_t a0, a1;
> +  a0 = a1 = 0L;
> +  switch (len % 8)
> +    {
> +    case 2:
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 6 * OPSIZ;
> +      dstp -= 7 * OPSIZ;
> +      len += 6;
> +      goto do1;
> +    case 3:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -= 5 * OPSIZ;
> +      dstp -= 6 * OPSIZ;
> +      len += 5;
> +      goto do2;
> +    case 4:
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 4 * OPSIZ;
> +      dstp -= 5 * OPSIZ;
> +      len += 4;
> +      goto do3;
> +    case 5:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -= 3 * OPSIZ;
> +      dstp -= 4 * OPSIZ;
> +      len += 3;
> +      goto do4;
> +    case 6:
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 2 * OPSIZ;
> +      dstp -= 3 * OPSIZ;
> +      len += 2;
> +      goto do5;
> +    case 7:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -= 1 * OPSIZ;
> +      dstp -= 2 * OPSIZ;
> +      len += 1;
> +      goto do6;
> +
> +    case 0:
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +     return;
> +      a0 = ((op_t *) srcp)[0];
> +      srcp -= 0 * OPSIZ;
> +      dstp -= 1 * OPSIZ;
> +      goto do7;
> +    case 1:
> +      a1 = ((op_t *) srcp)[0];
> +      srcp -=-1 * OPSIZ;
> +      dstp -= 0 * OPSIZ;
> +      len -= 1;
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +     goto do0;
> +      goto do8;                      /* No-op.  */
> +    }
> +
> +  do
> +    {
> +    do8:
> +      a0 = ((op_t *) srcp)[0];
> +      ((op_t *) dstp)[0] = a1;
> +    do7:
> +      a1 = ((op_t *) srcp)[1];
> +      ((op_t *) dstp)[1] = a0;
> +    do6:
> +      a0 = ((op_t *) srcp)[2];
> +      ((op_t *) dstp)[2] = a1;
> +    do5:
> +      a1 = ((op_t *) srcp)[3];
> +      ((op_t *) dstp)[3] = a0;
> +    do4:
> +      a0 = ((op_t *) srcp)[4];
> +      ((op_t *) dstp)[4] = a1;
> +    do3:
> +      a1 = ((op_t *) srcp)[5];
> +      ((op_t *) dstp)[5] = a0;
> +    do2:
> +      a0 = ((op_t *) srcp)[6];
> +      ((op_t *) dstp)[6] = a1;
> +    do1:
> +      a1 = ((op_t *) srcp)[7];
> +      ((op_t *) dstp)[7] = a0;
> +
> +      srcp += 8 * OPSIZ;
> +      dstp += 8 * OPSIZ;
> +      len -= 8;
> +    }
> +  while (len != 0);
> +
> +  /* This is the right position for do0.  Please don't move
> +     it into the loop.  */
> + do0:
> +  ((op_t *) dstp)[0] = a1;
> +}
> +
> +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
> +   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> +   DSTP should be aligned for memory operations on `op_t's, but SRCP must
> +   *not* be aligned.  */
> +
> +static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
> +{
> +  op_t a0, a1, a2, a3;
> +  int sh_1, sh_2;
> +
> +  /* Calculate how to shift a word read at the memory operation
> +     aligned srcp to make it aligned for copy.  */
> +  a0 = a1 = a2 = a3 = 0L;
> +  sh_1 = 8 * (srcp % OPSIZ);
> +  sh_2 = 8 * OPSIZ - sh_1;
> + 
> +  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
> +     it points in the middle of.  */
> +  srcp &= -OPSIZ;
> +
> +  switch (len % 4)
> +    {
> +    case 2:
> +      a1 = ((op_t *) srcp)[0];
> +      a2 = ((op_t *) srcp)[1];
> +      srcp -= 1 * OPSIZ;
> +      dstp -= 3 * OPSIZ;
> +      len += 2;
> +      goto do1;
> +    case 3:
> +      a0 = ((op_t *) srcp)[0];
> +      a1 = ((op_t *) srcp)[1];
> +      srcp -= 0 * OPSIZ;
> +      dstp -= 2 * OPSIZ;
> +      len += 1;
> +      goto do2;
> +    case 0:
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +     return;
> +      a3 = ((op_t *) srcp)[0];
> +      a0 = ((op_t *) srcp)[1];
> +      srcp -=-1 * OPSIZ;
> +      dstp -= 1 * OPSIZ;
> +      len += 0;
> +      goto do3;
> +    case 1:
> +      a2 = ((op_t *) srcp)[0];
> +      a3 = ((op_t *) srcp)[1];
> +      srcp -=-2 * OPSIZ;
> +      dstp -= 0 * OPSIZ;
> +      len -= 1;
> +      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> +     goto do0;
> +      goto do4;                      /* No-op.  */
> +    }
> +
> +  do
> +    {
> +    do4:
> +      a0 = ((op_t *) srcp)[0];
> +      ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> +    do3:
> +      a1 = ((op_t *) srcp)[1];
> +      ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
> +    do2:
> +      a2 = ((op_t *) srcp)[2];
> +      ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
> +    do1:
> +      a3 = ((op_t *) srcp)[3];
> +      ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
> +
> +      srcp += 4 * OPSIZ;
> +      dstp += 4 * OPSIZ;
> +      len -= 4;
> +    }
> +  while (len != 0);
> +
> +  /* This is the right position for do0.  Please don't move
> +     it into the loop.  */
> + do0:
> +  ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> +}
> +
> +
> +/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
> +   the assumption that DST_BP is aligned on an OPSIZ multiple.  If
> +   not all bytes could be easily copied, store remaining number of bytes
> +   in NBYTES_LEFT, otherwise store 0.  */
> +/* extern void _wordcopy_fwd_aligned __P ((long int, long int, size_t)); */
> +/* extern void _wordcopy_fwd_dest_aligned __P ((long int, long int, size_t)); */
> +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)                 \
> +  do                                                                       \
> +    {                                                                      \
> +      if (src_bp % OPSIZ == 0)                                             \
> +     _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);             \
> +      else                                                                 \
> +     _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ);        \
> +      src_bp += (nbytes) & -OPSIZ;                                         \
> +      dst_bp += (nbytes) & -OPSIZ;                                         \
> +      (nbytes_left) = (nbytes) % OPSIZ;                                    \
> +    } while (0)
> +
> +#endif /* __ARCH_HAS_BWD_MEMCPY__ */
> +
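
As a reading aid: moving the forward-copy helpers into memcopy.h does not
change how they are consumed. The generic memcpy body stays roughly as below
(paraphrased from the glibc-derived libc/string/generic/memcpy.c; OPSIZ,
OP_T_THRES, BYTE_COPY_FWD and WORD_COPY_FWD all come from memcopy.h, and
details may differ slightly from trunk):

void *memcpy (void *dstpp, const void *srcpp, size_t len)
{
  unsigned long int dstp = (long int) dstpp;
  unsigned long int srcp = (long int) srcpp;

  if (len >= OP_T_THRES)
    {
      /* Copy just a few bytes to make DSTP aligned on an OPSIZ boundary.  */
      len -= (-dstp) % OPSIZ;
      BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);

      /* Copy whole words forward; WORD_COPY_FWD advances both pointers
         and leaves the byte remainder in LEN.  */
      WORD_COPY_FWD (dstp, srcp, len, len);
    }

  /* Mop up the trailing bytes.  */
  BYTE_COPY_FWD (dstp, srcp, len);

  return dstpp;
}
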
> diff -Naupr uClibc-trunk/libc/string/generic/memcpy.c uClibc-trunk-st/libc/string/generic/memcpy.c
> --- uClibc-trunk/libc/string/generic/memcpy.c 2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memcpy.c      2007-05-07 10:28:20.217087000 +0200
> @@ -25,192 +25,6 @@
>  
>  libc_hidden_proto(memcpy)
>  
> -/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
> -   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> -   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
> -
> -static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
> -{
> -  op_t a0, a1;
> -
> -  switch (len % 8)
> -    {
> -    case 2:
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 6 * OPSIZ;
> -      dstp -= 7 * OPSIZ;
> -      len += 6;
> -      goto do1;
> -    case 3:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -= 5 * OPSIZ;
> -      dstp -= 6 * OPSIZ;
> -      len += 5;
> -      goto do2;
> -    case 4:
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 4 * OPSIZ;
> -      dstp -= 5 * OPSIZ;
> -      len += 4;
> -      goto do3;
> -    case 5:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -= 3 * OPSIZ;
> -      dstp -= 4 * OPSIZ;
> -      len += 3;
> -      goto do4;
> -    case 6:
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 2 * OPSIZ;
> -      dstp -= 3 * OPSIZ;
> -      len += 2;
> -      goto do5;
> -    case 7:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -= 1 * OPSIZ;
> -      dstp -= 2 * OPSIZ;
> -      len += 1;
> -      goto do6;
> -
> -    case 0:
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -     return;
> -      a0 = ((op_t *) srcp)[0];
> -      srcp -= 0 * OPSIZ;
> -      dstp -= 1 * OPSIZ;
> -      goto do7;
> -    case 1:
> -      a1 = ((op_t *) srcp)[0];
> -      srcp -=-1 * OPSIZ;
> -      dstp -= 0 * OPSIZ;
> -      len -= 1;
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -     goto do0;
> -      goto do8;                      /* No-op.  */
> -    }
> -
> -  do
> -    {
> -    do8:
> -      a0 = ((op_t *) srcp)[0];
> -      ((op_t *) dstp)[0] = a1;
> -    do7:
> -      a1 = ((op_t *) srcp)[1];
> -      ((op_t *) dstp)[1] = a0;
> -    do6:
> -      a0 = ((op_t *) srcp)[2];
> -      ((op_t *) dstp)[2] = a1;
> -    do5:
> -      a1 = ((op_t *) srcp)[3];
> -      ((op_t *) dstp)[3] = a0;
> -    do4:
> -      a0 = ((op_t *) srcp)[4];
> -      ((op_t *) dstp)[4] = a1;
> -    do3:
> -      a1 = ((op_t *) srcp)[5];
> -      ((op_t *) dstp)[5] = a0;
> -    do2:
> -      a0 = ((op_t *) srcp)[6];
> -      ((op_t *) dstp)[6] = a1;
> -    do1:
> -      a1 = ((op_t *) srcp)[7];
> -      ((op_t *) dstp)[7] = a0;
> -
> -      srcp += 8 * OPSIZ;
> -      dstp += 8 * OPSIZ;
> -      len -= 8;
> -    }
> -  while (len != 0);
> -
> -  /* This is the right position for do0.  Please don't move
> -     it into the loop.  */
> - do0:
> -  ((op_t *) dstp)[0] = a1;
> -}
> -
> -/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
> -   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
> -   DSTP should be aligned for memory operations on `op_t's, but SRCP must
> -   *not* be aligned.  */
> -
> -static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
> -{
> -  op_t a0, a1, a2, a3;
> -  int sh_1, sh_2;
> -
> -  /* Calculate how to shift a word read at the memory operation
> -     aligned srcp to make it aligned for copy.  */
> -
> -  sh_1 = 8 * (srcp % OPSIZ);
> -  sh_2 = 8 * OPSIZ - sh_1;
> -
> -  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
> -     it points in the middle of.  */
> -  srcp &= -OPSIZ;
> -
> -  switch (len % 4)
> -    {
> -    case 2:
> -      a1 = ((op_t *) srcp)[0];
> -      a2 = ((op_t *) srcp)[1];
> -      srcp -= 1 * OPSIZ;
> -      dstp -= 3 * OPSIZ;
> -      len += 2;
> -      goto do1;
> -    case 3:
> -      a0 = ((op_t *) srcp)[0];
> -      a1 = ((op_t *) srcp)[1];
> -      srcp -= 0 * OPSIZ;
> -      dstp -= 2 * OPSIZ;
> -      len += 1;
> -      goto do2;
> -    case 0:
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -     return;
> -      a3 = ((op_t *) srcp)[0];
> -      a0 = ((op_t *) srcp)[1];
> -      srcp -=-1 * OPSIZ;
> -      dstp -= 1 * OPSIZ;
> -      len += 0;
> -      goto do3;
> -    case 1:
> -      a2 = ((op_t *) srcp)[0];
> -      a3 = ((op_t *) srcp)[1];
> -      srcp -=-2 * OPSIZ;
> -      dstp -= 0 * OPSIZ;
> -      len -= 1;
> -      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
> -     goto do0;
> -      goto do4;                      /* No-op.  */
> -    }
> -
> -  do
> -    {
> -    do4:
> -      a0 = ((op_t *) srcp)[0];
> -      ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> -    do3:
> -      a1 = ((op_t *) srcp)[1];
> -      ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
> -    do2:
> -      a2 = ((op_t *) srcp)[2];
> -      ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
> -    do1:
> -      a3 = ((op_t *) srcp)[3];
> -      ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
> -
> -      srcp += 4 * OPSIZ;
> -      dstp += 4 * OPSIZ;
> -      len -= 4;
> -    }
> -  while (len != 0);
> -
> -  /* This is the right position for do0.  Please don't move
> -     it into the loop.  */
> - do0:
> -  ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
> -}
> -
>  void *memcpy (void *dstpp, const void *srcpp, size_t len)
>  {
>    unsigned long int dstp = (long int) dstpp;
> diff -Naupr uClibc-trunk/libc/string/generic/memmove.c uClibc-trunk-st/libc/string/generic/memmove.c
> --- uClibc-trunk/libc/string/generic/memmove.c        2006-09-19 09:43:00.000000000 +0200
> +++ uClibc-trunk-st/libc/string/generic/memmove.c     2007-05-07 10:29:26.717396000 +0200
> @@ -29,7 +29,8 @@ libc_hidden_proto(memcpy)
>  
>  static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
>  {
> -  op_t a0, a1;
> +  op_t a0 = 0;
> +  op_t a1 = 0;
>  
>    switch (len % 8)
>      {
> @@ -133,7 +134,10 @@ static void _wordcopy_bwd_aligned (long 
>  
>  static void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
>  {
> -  op_t a0, a1, a2, a3;
> +  op_t a0 = 0;
> +  op_t a1 = 0;
> +  op_t a2 = 0;
> +  op_t a3 = 0;
>    int sh_1, sh_2;
>  
>    /* Calculate how to shift a word read at the memory operation
> @@ -218,8 +222,8 @@ void *memmove (void *dest, const void *s
>       Reduces the working set.  */
>    if (dstp - srcp >= len)    /* *Unsigned* compare!  */
>      {
> -#if 1
> -#warning REMINDER: generic-opt memmove assumes memcpy does forward copying!
> +#ifndef __ARCH_HAS_BWD_MEMCPY__
> +      /* generic-opt memmove assumes memcpy does forward copying! */
>        memcpy(dest, src, len);
>  #else
>        /* Copy from the beginning to the end.  */
> diff -Naupr uClibc-trunk/libc/string/sh/sh4/memcpy.S uClibc-trunk-st/libc/string/sh/sh4/memcpy.S
> --- uClibc-trunk/libc/string/sh/sh4/memcpy.S  1970-01-01 01:00:00.000000000 +0100
> +++ uClibc-trunk-st/libc/string/sh/sh4/memcpy.S       2007-05-07 13:43:16.291529000 +0200
> @@ -0,0 +1,807 @@
> +/*
> + * "memcpy" implementation of SuperH
> + *
> + * Copyright (C) 1999  Niibe Yutaka
> + * Copyright (c) 2002  STMicroelectronics Ltd
> + *   Modified from memcpy.S and micro-optimised for SH4
> + *   Stuart Menefy ([EMAIL PROTECTED])
> + *
> + */
> +
> +/*
> + * void *memcpy(void *dst, const void *src, size_t n);
> + *
> + * It is assumed that there is no overlap between src and dst.
> + * If there is an overlap, then the results are undefined.
> + */
> +
> +#include <endian.h>
> +
> +     !
> +     !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
> +     !
> +     
> +     ! Size is 16 or greater, and may have trailing bytes
> +
> +     .balign 32
> +.Lcase1:
> +     ! Read a long word and write a long word at once
> +     ! At the start of each iteration, r7 contains last long load
> +     add     #-1,r5          !  79 EX
> +     mov     r4,r2           !   5 MT (0 cycles latency)
> +
> +     mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
> +     add     #-4,r5          !  50 EX
> +
> +     add     #7,r2           !  79 EX
> +     !
> +#ifdef __LITTLE_ENDIAN__
> +     ! 6 cycles, 4 bytes per iteration
> +3:   mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
> +     mov     r7, r3          !   5 MT (latency=0)    ! RQPO
> +     
> +     cmp/hi  r2,r0           !  57 MT
> +     shll16  r3              ! 103 EX
> +
> +     mov     r1,r6           !   5 MT (latency=0)
> +     shll8   r3              ! 102 EX                ! Oxxx
> +
> +     shlr8   r6              ! 106 EX                ! xNML
> +     mov     r1, r7          !   5 MT (latency=0)
> +     
> +     or      r6,r3           !  82 EX                ! ONML
> +     bt/s    3b              ! 109 BR
> +
> +      mov.l  r3,@-r0         !  30 LS
> +#else
> +3:   mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
> +     mov     r7,r3           !   5 MT (latency=0)    ! OPQR
> +
> +     cmp/hi  r2,r0           !  57 MT
> +     shlr16  r3              ! 107 EX
> +
> +     shlr8   r3              ! 106 EX                ! xxxO
> +     mov     r1,r6           !   5 MT (latency=0)
> +
> +     shll8   r6              ! 102 EX                ! LMNx
> +     mov     r1,r7           !   5 MT (latency=0)
> +
> +     or      r6,r3           !  82 EX                ! LMNO
> +     bt/s    3b              ! 109 BR
> +
> +      mov.l  r3,@-r0         !  30 LS
> +#endif
> +     ! Finally, copy a byte at once, if necessary
> +
> +     add     #4,r5           !  50 EX
> +     cmp/eq  r4,r0           !  54 MT
> +
> +     add     #-6,r2          !  50 EX
> +     bt      9f              ! 109 BR
> +
> +8:   cmp/hi  r2,r0           !  57 MT
> +     mov.b   @(r0,r5),r1     !  20 LS (latency=2)
> +     
> +     bt/s    8b              ! 109 BR
> +
> +      mov.b  r1,@-r0         !  29 LS
> +
> +9:   rts
> +      nop
> +
> +     
> +     !
> +     !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
> +     !
> +     
> +     ! Size is 16 or greater, and may have trailing bytes
> +
> +     .balign 32
> +.Lcase3:
> +     ! Read a long word and write a long word at once
> +     ! At the start of each iteration, r7 contains last long load
> +     add     #-3,r5          ! 79 EX
> +     mov     r4,r2           !  5 MT (0 cycles latency)
> +
> +     mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
> +     add     #-4,r5          ! 50 EX
> +
> +     add     #7,r2           !  79 EX
> +     !
> +#ifdef __LITTLE_ENDIAN__
> +     ! 6 cycles, 4 bytes per iteration
> +3:   mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
> +     mov     r7, r3          !   5 MT (latency=0)    ! RQPO
> +     
> +     cmp/hi  r2,r0           !  57 MT
> +     shll8   r3              ! 102 EX                ! QPOx
> +
> +     mov     r1,r6           !   5 MT (latency=0)
> +     shlr16  r6              ! 107 EX
> +
> +     shlr8   r6              ! 106 EX                ! xxxN
> +     mov     r1, r7          !   5 MT (latency=0)
> +     
> +     or      r6,r3           !  82 EX                ! QPON
> +     bt/s    3b              ! 109 BR
> +
> +      mov.l  r3,@-r0         !  30 LS
> +#else
> +3:   mov     r1,r3           ! OPQR
> +     shlr8   r3              ! xOPQ
> +     mov.l   @(r0,r5),r1     ! KLMN
> +     mov     r1,r6
> +     shll16  r6
> +     shll8   r6              ! Nxxx
> +     or      r6,r3           ! NOPQ
> +     cmp/hi  r2,r0
> +     bt/s    3b
> +      mov.l  r3,@-r0
> +#endif
> +
> +     ! Finally, copy a byte at once, if necessary
> +
> +     add     #6,r5           !  50 EX
> +     cmp/eq  r4,r0           !  54 MT
> +
> +     add     #-6,r2          !  50 EX
> +     bt      9f              ! 109 BR
> +
> +8:   cmp/hi  r2,r0           !  57 MT
> +     mov.b   @(r0,r5),r1     !  20 LS (latency=2)
> +     
> +     bt/s    8b              ! 109 BR
> +
> +      mov.b  r1,@-r0         !  29 LS
> +
> +9:   rts
> +      nop
> +     
> +/* void *memcpy(void *dst, const void *src, size_t len) */
> +.text
> +.align 5
> +.type memcpy,@function
> +.globl memcpy;
> +
> +memcpy:
> +     ! Calculate the invariants which will be used in the remainder
> +     ! of the code:
> +     !
> +     !      r4   -->  [ ...  ] DST             [ ...  ] SRC
> +     !                [ ...  ]                 [ ...  ]
> +     !                  :                        :
> +     !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
> +     !
> +     !
> +     
> +     ! Short circuit the common case of src, dst and len being 32 bit aligned
> +     ! and test for zero length move
> +
> +     mov     r6, r0          !   5 MT (0 cycle latency)
> +     or      r4, r0          !  82 EX
> +
> +     or      r5, r0          !  82 EX
> +     tst     r6, r6          !  86 MT
> +
> +     bt/s    99f             ! 111 BR                (zero len)
> +      tst    #3, r0          !  87 MT
> +
> +     mov     r4, r0          !   5 MT (0 cycle latency)
> +     add     r6, r0          !  49 EX
> +
> +     mov     #16, r1         !   6 EX
> +     bt/s    .Lcase00        ! 111 BR                (aligned)
> +
> +      sub    r4, r5          !  75 EX
> +
> +     ! Arguments are not nicely long word aligned or zero len.
> +     ! Check for small copies, and if so do a simple byte at a time copy.
> +     !
> +     ! Deciding on an exact value of 'small' is not easy, as the point at which
> +     ! using the optimised routines becomes worthwhile varies (these are the
> +     ! cycle counts for different sizes using byte-at-a-time vs. optimised):
> +     !       size    byte-at-time    long    word    byte
> +     !       16      42              39-40   46-50   50-55
> +     !       24      58              43-44   54-58   62-67
> +     !       36      82              49-50   66-70   80-85
> +     ! However the penalty for getting it 'wrong' is much higher for long word
> +     ! aligned data (and this is more common), so use a value of 16.
> +     
> +     cmp/gt  r6,r1           !  56 MT
> +
> +     add     #-1,r5          !  50 EX
> +     bf/s    6f              ! 108 BR                (not small)
> +
> +      mov    r5, r3          !   5 MT (latency=0)
> +     shlr    r6              ! 104 EX
> +
> +     mov.b   @(r0,r5),r1     !  20 LS (latency=2)
> +     bf/s    4f              ! 111 BR
> +     
> +      add    #-1,r3          !  50 EX
> +     tst     r6, r6          !  86 MT
> +
> +     bt/s    98f             ! 110 BR
> +      mov.b  r1,@-r0         !  29 LS
> +
> +     ! 4 cycles, 2 bytes per iteration
> +3:   mov.b   @(r0,r5),r1     !  20 LS (latency=2)
> +
> +4:   mov.b   @(r0,r3),r2     !  20 LS (latency=2)
> +     dt      r6              !  67 EX
> +
> +     mov.b   r1,@-r0         !  29 LS
> +     bf/s    3b              ! 111 BR
> +
> +      mov.b  r2,@-r0         !  29 LS
> +98:
> +     rts
> +      nop
> +
> +99:  rts
> +      mov    r4, r0
> +
> +     ! Size is not small, so it's worthwhile looking for optimisations.
> +     ! First align destination to a long word boundary.
> +     !
> +     ! r5 = normal value -1
> +
> +6:   tst     #3, r0          !  87 MT
> +        mov  #3, r3          !   6 EX
> +
> +     bt/s    2f              ! 111 BR
> +      and    r0,r3           !  78 EX
> +
> +     ! 3 cycles, 1 byte per iteration        
> +1:   dt      r3              !  67 EX
> +     mov.b   @(r0,r5),r1     !  19 LS (latency=2)
> +
> +     add     #-1, r6         !  79 EX
> +     bf/s    1b              ! 109 BR
> +
> +      mov.b  r1,@-r0         !  28 LS
> +
> +2:   add     #1, r5          !  79 EX
> +     
> +     ! Now select the appropriate bulk transfer code based on relative
> +     ! alignment of src and dst.
> +     
> +     mov     r0, r3          !   5 MT (latency=0)
> +
> +     mov     r5, r0          !   5 MT (latency=0)
> +     tst     #1, r0          !  87 MT
> +
> +     bf/s    1f              ! 111 BR
> +      mov    #64, r7         !   6 EX
> +
> +     ! bit 0 clear
> +             
> +     cmp/ge  r7, r6          !  55 MT
> +
> +     bt/s    2f              ! 111 BR
> +      tst    #2, r0          !  87 MT
> +
> +     ! small
> +     bt/s    .Lcase0
> +      mov    r3, r0
> +
> +     bra     .Lcase2
> +      nop
> +
> +     ! big
> +2:   bt/s    .Lcase0b
> +      mov    r3, r0
> +
> +     bra     .Lcase2b
> +      nop
> +     
> +     ! bit 0 set
> +1:   tst     #2, r0          ! 87 MT
> +
> +     bt/s    .Lcase1
> +      mov    r3, r0
> +
> +     bra     .Lcase3
> +      nop
> +     
> +
> +     !
> +     !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
> +     !
> +
> +     ! src, dst and size are all long word aligned
> +     ! size is non-zero
> +     
> +     .balign 32
> +.Lcase00:
> +     mov     #64, r1         !   6 EX
> +     mov     r5, r3          !   5 MT (latency=0)
> +
> +     cmp/gt  r6, r1          !  56 MT
> +     add     #-4, r5         !  50 EX
> +
> +     bf      .Lcase00b       ! 108 BR                (big loop)
> +     shlr2   r6              ! 105 EX
> +
> +     shlr    r6              ! 104 EX
> +     mov.l   @(r0, r5), r1   !  21 LS (latency=2)    
> +
> +     bf/s    4f              ! 111 BR
> +      add    #-8, r3         !  50 EX
> +
> +     tst     r6, r6          !  86 MT
> +     bt/s    5f              ! 110 BR
> +
> +      mov.l  r1,@-r0         !  30 LS
> +
> +     ! 4 cycles, 2 long words per iteration
> +3:   mov.l   @(r0, r5), r1   !  21 LS (latency=2)
> +
> +4:   mov.l   @(r0, r3), r2   !  21 LS (latency=2)
> +     dt      r6              !  67 EX
> +
> +     mov.l   r1, @-r0        !  30 LS
> +     bf/s    3b              ! 109 BR
> +
> +      mov.l  r2, @-r0        !  30 LS
> +
> +5:   rts
> +      nop
> +
> +     
> +     ! Size is 16 or greater and less than 64, but may have trailing bytes
> +
> +     .balign 32
> +.Lcase0:
> +     add     #-4, r5         !  50 EX
> +     mov     r4, r7          !   5 MT (latency=0)
> +
> +     mov.l   @(r0, r5), r1   !  21 LS (latency=2)    
> +     mov     #4, r2          !   6 EX
> +
> +     add     #11, r7         !  50 EX
> +     tst     r2, r6          !  86 MT
> +
> +     mov     r5, r3          !   5 MT (latency=0)
> +     bt/s    4f              ! 111 BR
> +
> +      add    #-4, r3         !  50 EX
> +     mov.l   r1,@-r0         !  30 LS
> +
> +     ! 4 cycles, 2 long words per iteration
> +3:   mov.l   @(r0, r5), r1   !  21 LS (latency=2)
> +
> +4:   mov.l   @(r0, r3), r2   !  21 LS (latency=2)
> +     cmp/hi  r7, r0
> +
> +     mov.l   r1, @-r0        !  30 LS
> +     bt/s    3b              ! 109 BR
> +
> +      mov.l  r2, @-r0        !  30 LS
> +
> +     ! Copy the final 0-3 bytes
> +
> +     add     #3,r5           !  50 EX
> +     
> +     cmp/eq  r0, r4          !  54 MT
> +     add     #-10, r7        !  50 EX
> +
> +     bt      9f              ! 110 BR
> +
> +     ! 3 cycles, 1 byte per iteration
> +1:   mov.b   @(r0,r5),r1     !  19 LS
> +     cmp/hi  r7,r0           !  57 MT
> +     
> +     bt/s    1b              ! 111 BR
> +      mov.b  r1,@-r0         !  28 LS
> +
> +9:   rts
> +      nop
> +
> +     ! Size is at least 64 bytes, so will be going round the big loop at least once.
> +     !
> +     !   r2 = rounded up r4
> +     !   r3 = rounded down r0
> +
> +     .balign 32
> +.Lcase0b:
> +     add     #-4, r5         !  50 EX
> +
> +.Lcase00b:
> +     mov     r0, r3          !   5 MT (latency=0)
> +     mov     #(~0x1f), r1    !   6 EX
> +
> +     and     r1, r3          !  78 EX
> +     mov     r4, r2          !   5 MT (latency=0)
> +
> +     cmp/eq  r3, r0          !  54 MT
> +     add     #0x1f, r2       !  50 EX
> +
> +     bt/s    1f              ! 110 BR
> +      and    r1, r2          !  78 EX
> +
> +     ! copy initial words until cache line aligned
> +
> +     mov.l   @(r0, r5), r1   !  21 LS (latency=2)
> +     tst     #4, r0          !  87 MT
> +
> +     mov     r5, r6          !   5 MT (latency=0)
> +     add     #-4, r6         !  50 EX
> +
> +     bt/s    4f              ! 111 BR
> +      add    #8, r3          !  50 EX
> +
> +     tst     #0x18, r0       !  87 MT
> +     
> +     bt/s    1f              ! 109 BR
> +      mov.l  r1,@-r0         !  30 LS
> +     
> +     ! 4 cycles, 2 long words per iteration
> +3:   mov.l   @(r0, r5), r1   !  21 LS (latency=2)
> +
> +4:   mov.l   @(r0, r6), r7   !  21 LS (latency=2)
> +     cmp/eq  r3, r0          !  54 MT
> +
> +     mov.l   r1, @-r0        !  30 LS
> +     bf/s    3b              ! 109 BR
> +
> +      mov.l  r7, @-r0        !  30 LS
> +
> +     ! Copy the cache line aligned blocks
> +     !
> +     ! In use: r0, r2, r4, r5
> +     ! Scratch: r1, r3, r6, r7
> +     !
> +     ! We could do this with the four scratch registers, but if src
> +     ! and dest hit the same cache line, this will thrash, so make
> +     ! use of additional registers.
> +     ! 
> +     ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
> +     !   r5:  src (was r0+r5)
> +     !   r1:  dest (was r0)
> +     ! this can be reversed at the end, so we don't need to save any extra
> +     ! state.
> +     !
> +1:   mov.l   r8, @-r15       !  30 LS
> +     add     r0, r5          !  49 EX
> +     
> +     mov.l   r9, @-r15       !  30 LS
> +     mov     r0, r1          !   5 MT (latency=0)
> +     
> +     mov.l   r10, @-r15      !  30 LS
> +     add     #-0x1c, r5      !  50 EX
> +     
> +     mov.l   r11, @-r15      !  30 LS                        
> +
> +     ! 16 cycles, 32 bytes per iteration
> +2:   mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
> +     add     #-0x20, r1      ! 50 EX
> +     mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
> +     mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
> +     mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
> +     mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
> +     mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
> +     mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
> +     mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
> +     movca.l r0,@r1          ! 40 LS (latency=3-7)
> +     mov.l   r3,@(0x04,r1)   ! 33 LS
> +     mov.l   r6,@(0x08,r1)   ! 33 LS
> +     mov.l   r7,@(0x0c,r1)   ! 33 LS
> +
> +     mov.l   r8,@(0x10,r1)   ! 33 LS
> +     add     #-0x20, r5      ! 50 EX
> +
> +     mov.l   r9,@(0x14,r1)   ! 33 LS
> +     cmp/eq  r2,r1           ! 54 MT
> +
> +     mov.l   r10,@(0x18,r1)  !  33 LS
> +     bf/s    2b              ! 109 BR
> +
> +      mov.l  r11,@(0x1c,r1)  !  33 LS
> +
> +     mov     r1, r0          !   5 MT (latency=0)
> +
> +     mov.l   @r15+, r11      !  15 LS
> +     sub     r1, r5          !  75 EX
> +
> +     mov.l   @r15+, r10      !  15 LS
> +     cmp/eq  r4, r0          !  54 MT
> +
> +     bf/s    1f              ! 109 BR
> +      mov.l   @r15+, r9      !  15 LS
> +
> +     rts
> +1:    mov.l  @r15+, r8       !  15 LS
> +     sub     r4, r1          !  75 EX                (len remaining)
> +
> +     ! number of trailing bytes is non-zero
> +     !       
> +     ! invariants restored (r5 already decremented by 4)
> +     ! also r1=num bytes remaining
> +     
> +     mov     #4, r2          !   6 EX
> +     mov     r4, r7          !   5 MT (latency=0)
> +
> +     add     #0x1c, r5       !  50 EX                (back to -4)
> +     cmp/hs  r2, r1          !  58 MT
> +
> +     bf/s    5f              ! 108 BR
> +      add     #11, r7        !  50 EX
> +
> +     mov.l   @(r0, r5), r6   !  21 LS (latency=2)    
> +     tst     r2, r1          !  86 MT
> +
> +     mov     r5, r3          !   5 MT (latency=0)
> +     bt/s    4f              ! 111 BR
> +
> +      add    #-4, r3         !  50 EX
> +     cmp/hs  r2, r1          !  58 MT
> +
> +     bt/s    5f              ! 111 BR
> +      mov.l  r6,@-r0         !  30 LS
> +
> +     ! 4 cycles, 2 long words per iteration
> +3:   mov.l   @(r0, r5), r6   !  21 LS (latency=2)
> +
> +4:   mov.l   @(r0, r3), r2   !  21 LS (latency=2)
> +     cmp/hi  r7, r0
> +
> +     mov.l   r6, @-r0        !  30 LS
> +     bt/s    3b              ! 109 BR
> +
> +      mov.l  r2, @-r0        !  30 LS
> +
> +     ! Copy the final 0-3 bytes
> +
> +5:   cmp/eq  r0, r4          !  54 MT
> +     add     #-10, r7        !  50 EX
> +
> +     bt      9f              ! 110 BR
> +     add     #3,r5           !  50 EX
> +     
> +     ! 3 cycles, 1 byte per iteration
> +1:   mov.b   @(r0,r5),r1     !  19 LS
> +     cmp/hi  r7,r0           !  57 MT
> +     
> +     bt/s    1b              ! 111 BR
> +      mov.b  r1,@-r0         !  28 LS
> +
> +9:   rts
> +      nop
> +
> +     !
> +     !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
> +     !
> +             
> +     .balign 32
> +.Lcase2:
> +     ! Size is 16 or greater and less than 64, but may have trailing bytes
> +
> +2:   mov     r5, r6          !   5 MT (latency=0)
> +     add     #-2,r5          !  50 EX
> +
> +     mov     r4,r2           !   5 MT (latency=0)
> +     add     #-4,r6          !  50 EX
> +
> +     add     #7,r2           !  50 EX
> +3:   mov.w   @(r0,r5),r1     !  20 LS (latency=2)
> +
> +     mov.w   @(r0,r6),r3     !  20 LS (latency=2)
> +     cmp/hi  r2,r0           !  57 MT
> +
> +     mov.w   r1,@-r0         !  29 LS
> +     bt/s    3b              ! 111 BR
> +
> +      mov.w  r3,@-r0         !  29 LS
> +
> +     bra     10f
> +      nop
> +
> +
> +     .balign 32
> +.Lcase2b:
> +     ! Size is at least 64 bytes, so will be going round the big loop at least once.
> +     !
> +     !   r2 = rounded up r4
> +     !   r3 = rounded down r0
> +
> +     mov     r0, r3          !   5 MT (latency=0)
> +     mov     #(~0x1f), r1    !   6 EX
> +
> +     and     r1, r3          !  78 EX
> +     mov     r4, r2          !   5 MT (latency=0)
> +
> +     cmp/eq  r3, r0          !  54 MT
> +     add     #0x1f, r2       !  50 EX
> +     
> +     add     #-2, r5         !  50 EX
> +     bt/s    1f              ! 110 BR
> +      and    r1, r2          !  78 EX
> +     
> +     ! Copy a short word one at a time until we are cache line aligned
> +     !   Normal values: r0, r2, r3, r4
> +     !   Unused: r1, r6, r7
> +     !   Mod: r5 (=r5-2)
> +     !
> +     add     #2, r3          !  50 EX
> +     
> +2:   mov.w   @(r0,r5),r1     !  20 LS (latency=2)
> +     cmp/eq  r3,r0           !  54 MT
> +             
> +     bf/s    2b              ! 111 BR
> +
> +      mov.w  r1,@-r0         !  29 LS
> +
> +     ! Copy the cache line aligned blocks
> +     !
> +     ! In use: r0, r2, r4, r5 (=r5-2)
> +     ! Scratch: r1, r3, r6, r7
> +     !
> +     ! We could do this with the four scratch registers, but if src
> +     ! and dest hit the same cache line, this will thrash, so make
> +     ! use of additional registers.
> +     ! 
> +     ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
> +     !   r5:  src (was r0+r5)
> +     !   r1:  dest (was r0)
> +     ! this can be reversed at the end, so we don't need to save any extra
> +     ! state.
> +     !
> +1:   mov.l   r8, @-r15       !  30 LS
> +     add     r0, r5          !  49 EX
> +     
> +     mov.l   r9, @-r15       !  30 LS
> +     mov     r0, r1          !   5 MT (latency=0)
> +     
> +     mov.l   r10, @-r15      !  30 LS
> +     add     #-0x1e, r5      !  50 EX
> +     
> +     mov.l   r11, @-r15      !  30 LS                        
> +     
> +     mov.l   r12, @-r15      !  30 LS                        
> +
> +     ! 17 cycles, 32 bytes per iteration
> +#ifdef __LITTLE_ENDIAN__
> +2:   mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
> +     add     #-0x20, r1      !  50 EX
> +
> +     mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
> +
> +     mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
> +     shll16  r0              ! 103 EX                        JI..
> +
> +     mov.l   @r5+, r7        !  15 LS (latency=2)
> +     xtrct   r3, r0          !  48 EX                        LKJI
> +
> +     mov.l   @r5+, r8        !  15 LS (latency=2)
> +     xtrct   r6, r3          !  48 EX                        PONM
> +
> +     mov.l   @r5+, r9        !  15 LS (latency=2)
> +     xtrct   r7, r6          !  48 EX
> +
> +     mov.l   @r5+, r10       !  15 LS (latency=2)
> +     xtrct   r8, r7          !  48 EX
> +
> +     mov.l   @r5+, r11       !  15 LS (latency=2)
> +     xtrct   r9, r8          !  48 EX
> +
> +     mov.w   @r5+, r12       !  15 LS (latency=2)
> +     xtrct   r10, r9         !  48 EX
> +
> +     movca.l r0,@r1          !  40 LS (latency=3-7)
> +     xtrct   r11, r10        !  48 EX
> +
> +     mov.l   r3, @(0x04,r1)  !  33 LS
> +     xtrct   r12, r11        !  48 EX
> +
> +     mov.l   r6, @(0x08,r1)  !  33 LS
> +     
> +     mov.l   r7, @(0x0c,r1)  !  33 LS
> +
> +     mov.l   r8, @(0x10,r1)  !  33 LS
> +     add     #-0x40, r5      !  50 EX
> +
> +     mov.l   r9, @(0x14,r1)  !  33 LS
> +     cmp/eq  r2,r1           !  54 MT
> +
> +     mov.l   r10, @(0x18,r1) !  33 LS
> +     bf/s    2b              ! 109 BR
> +
> +      mov.l  r11, @(0x1c,r1) !  33 LS
> +#else
> +2:   mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
> +     add     #-2, r5         !  50 EX
> +
> +     mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
> +     add     #-4, r1         !  50 EX
> +
> +     mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
> +     shll16  r0              ! 103 EX
> +
> +     mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
> +     xtrct   r3, r0          !  48 EX
> +
> +     mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
> +     xtrct   r6, r3          !  48 EX
> +
> +     mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
> +     xtrct   r7, r6          !  48 EX
> +
> +     mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
> +     xtrct   r8, r7          !  48 EX
> +
> +     mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
> +     xtrct   r9, r8          !  48 EX
> +
> +     mov.w   @(0x02,r5), r12 !  18 LS (latency=2)
> +     xtrct   r10, r9         !  48 EX
> +
> +     movca.l r0,@r1          !  40 LS (latency=3-7)
> +     add     #-0x1c, r1      !  50 EX
> +
> +     mov.l   r3, @(0x1c,r1)  !  33 LS
> +     xtrct   r11, r10        !  48 EX
> +
> +     mov.l   r6, @(0x18,r1)  !  33 LS
> +     xtrct   r12, r11        !  48 EX
> +     
> +     mov.l   r7, @(0x14,r1)  !  33 LS
> +
> +     mov.l   r8, @(0x10,r1)  !  33 LS
> +     add     #-0x3e, r5      !  50 EX
> +
> +     mov.l   r9, @(0x0c,r1)  !  33 LS
> +     cmp/eq  r2,r1           !  54 MT
> +
> +     mov.l   r10, @(0x08,r1) !  33 LS
> +     bf/s    2b              ! 109 BR
> +
> +      mov.l  r11, @(0x04,r1) !  33 LS
> +#endif
> +
> +     mov.l   @r15+, r12
> +     mov     r1, r0          !   5 MT (latency=0)
> +
> +     mov.l   @r15+, r11      !  15 LS
> +     sub     r1, r5          !  75 EX
> +
> +     mov.l   @r15+, r10      !  15 LS
> +     cmp/eq  r4, r0          !  54 MT
> +
> +     bf/s    1f              ! 109 BR
> +      mov.l   @r15+, r9      !  15 LS
> +
> +     rts
> +1:    mov.l  @r15+, r8       !  15 LS
> +
> +     add     #0x1e, r5       !  50 EX
> +     
> +     ! Finish off a short word at a time
> +     ! r5 must be invariant - 2
> +10:  mov     r4,r2           !   5 MT (latency=0)
> +     add     #1,r2           !  50 EX
> +
> +     cmp/hi  r2, r0          !  57 MT
> +     bf/s    1f              ! 109 BR
> +
> +      add    #2, r2          !  50 EX
> +     
> +3:   mov.w   @(r0,r5),r1     !  20 LS
> +     cmp/hi  r2,r0           !  57 MT
> +
> +     bt/s    3b              ! 109 BR
> +
> +      mov.w  r1,@-r0         !  29 LS
> +1:
> +             
> +     !
> +     ! Finally, copy the last byte if necessary
> +     cmp/eq  r4,r0           !  54 MT
> +     bt/s    9b
> +      add    #1,r5
> +     mov.b   @(r0,r5),r1
> +     rts
> +      mov.b  r1,@-r0
> +
> +.size memcpy,.-memcpy;
> +libc_hidden_def (memcpy)
> 
> 
> ------------------------------------------------------------------------
> 

_______________________________________________
uClibc mailing list
uClibc@uclibc.org
http://busybox.net/cgi-bin/mailman/listinfo/uclibc
