Author: mjg
Date: Fri Nov 16 00:44:22 2018
New Revision: 340472
URL: https://svnweb.freebsd.org/changeset/base/340472

Log:
  amd64: handle small memset buffers with overlapping stores
  
  Instead of jumping to code which stores the exact number of remaining
  bytes, add the length to the destination address as a displacement so
  that the last store ends exactly at the end of the buffer.
  
  In particular, the following sets an area of 8 to 16 bytes (inclusive)
  without any branches:
  
  movq    %r10,(%rdi)
  movq    %r10,-8(%rdi,%rcx)
  
  For instance, with rcx equal to 10 the second store targets
  rdi + 10 - 8 = rdi + 2. Writing 8 bytes starting at that address overlaps
  6 bytes already written by the first store and writes 2 new ones, giving
  10 bytes in total.
  
  Provides a nice win for smaller stores. Results for other sizes are
  erratic and depend on the microarchitecture.
  
  General idea taken from NetBSD (restricted use of the trick) and bionic
  string functions (use for various ranges like in this patch).
  
  Reviewed by:  kib (previous version)
  Sponsored by: The FreeBSD Foundation
  Differential Revision:        https://reviews.freebsd.org/D17660

Modified:
  head/lib/libc/amd64/string/memset.S
  head/sys/amd64/amd64/support.S

Modified: head/lib/libc/amd64/string/memset.S
==============================================================================
--- head/lib/libc/amd64/string/memset.S Fri Nov 16 00:03:31 2018        (r340471)
+++ head/lib/libc/amd64/string/memset.S Fri Nov 16 00:44:22 2018        (r340472)
@@ -41,12 +41,12 @@ __FBSDID("$FreeBSD$");
        imulq   %r8,%r10
 
        cmpq    $32,%rcx
-       jb      1016f
+       jbe     101632f
 
        cmpq    $256,%rcx
        ja      1256f
 
-1032:
+103200:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
        movq    %r10,16(%rdi)
@@ -54,43 +54,49 @@ __FBSDID("$FreeBSD$");
        leaq    32(%rdi),%rdi
        subq    $32,%rcx
        cmpq    $32,%rcx
-       jae     1032b
-       cmpb    $0,%cl
-       je      1000f
-1016:
+       ja      103200b
        cmpb    $16,%cl
-       jl      1008f
+       ja      201632f
+       movq    %r10,-16(%rdi,%rcx)
+       movq    %r10,-8(%rdi,%rcx)
+       ret
+       ALIGN_TEXT
+101632:
+       cmpb    $16,%cl
+       jl      100816f
+201632:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
-       subb    $16,%cl
-       jz      1000f
-       leaq    16(%rdi),%rdi
-1008:
+       movq    %r10,-16(%rdi,%rcx)
+       movq    %r10,-8(%rdi,%rcx)
+       ret
+       ALIGN_TEXT
+100816:
        cmpb    $8,%cl
-       jl      1004f
+       jl      100408f
        movq    %r10,(%rdi)
-       subb    $8,%cl
-       jz      1000f
-       leaq    8(%rdi),%rdi
-1004:
+       movq    %r10,-8(%rdi,%rcx)
+       ret
+       ALIGN_TEXT
+100408:
        cmpb    $4,%cl
-       jl      1002f
+       jl      100204f
        movl    %r10d,(%rdi)
-       subb    $4,%cl
-       jz      1000f
-       leaq    4(%rdi),%rdi
-1002:
+       movl    %r10d,-4(%rdi,%rcx)
+       ret
+       ALIGN_TEXT
+100204:
        cmpb    $2,%cl
-       jl      1001f
+       jl      100001f
        movw    %r10w,(%rdi)
-       subb    $2,%cl
-       jz      1000f
-       leaq    2(%rdi),%rdi
-1001:
-       cmpb    $1,%cl
-       jl      1000f
+       movw    %r10w,-2(%rdi,%rcx)
+       ret
+       ALIGN_TEXT
+100001:
+       cmpb    $0,%cl
+       je      100000f
        movb    %r10b,(%rdi)
-1000:
+100000:
        ret
        ALIGN_TEXT
 1256:
@@ -127,6 +133,7 @@ __FBSDID("$FreeBSD$");
        leaq    16(%rdi,%r8),%rdi
        jmp     1b
 .endm
+
 
 ENTRY(memset)
        MEMSET erms=0

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S      Fri Nov 16 00:03:31 2018        (r340471)
+++ head/sys/amd64/amd64/support.S      Fri Nov 16 00:44:22 2018        (r340472)
@@ -459,12 +459,12 @@ END(memcpy_erms)
        imulq   %r8,%r10
 
        cmpq    $32,%rcx
-       jb      1016f
+       jbe     101632f
 
        cmpq    $256,%rcx
        ja      1256f
 
-1032:
+103200:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
        movq    %r10,16(%rdi)
@@ -472,43 +472,54 @@ END(memcpy_erms)
        leaq    32(%rdi),%rdi
        subq    $32,%rcx
        cmpq    $32,%rcx
-       jae     1032b
-       cmpb    $0,%cl
-       je      1000f
-1016:
+       ja      103200b
        cmpb    $16,%cl
-       jl      1008f
+       ja      201632f
+       movq    %r10,-16(%rdi,%rcx)
+       movq    %r10,-8(%rdi,%rcx)
+       POP_FRAME_POINTER
+       ret
+       ALIGN_TEXT
+101632:
+       cmpb    $16,%cl
+       jl      100816f
+201632:
        movq    %r10,(%rdi)
        movq    %r10,8(%rdi)
-       subb    $16,%cl
-       jz      1000f
-       leaq    16(%rdi),%rdi
-1008:
+       movq    %r10,-16(%rdi,%rcx)
+       movq    %r10,-8(%rdi,%rcx)
+       POP_FRAME_POINTER
+       ret
+       ALIGN_TEXT
+100816:
        cmpb    $8,%cl
-       jl      1004f
+       jl      100408f
        movq    %r10,(%rdi)
-       subb    $8,%cl
-       jz      1000f
-       leaq    8(%rdi),%rdi
-1004:
+       movq    %r10,-8(%rdi,%rcx)
+       POP_FRAME_POINTER
+       ret
+       ALIGN_TEXT
+100408:
        cmpb    $4,%cl
-       jl      1002f
+       jl      100204f
        movl    %r10d,(%rdi)
-       subb    $4,%cl
-       jz      1000f
-       leaq    4(%rdi),%rdi
-1002:
+       movl    %r10d,-4(%rdi,%rcx)
+       POP_FRAME_POINTER
+       ret
+       ALIGN_TEXT
+100204:
        cmpb    $2,%cl
-       jl      1001f
+       jl      100001f
        movw    %r10w,(%rdi)
-       subb    $2,%cl
-       jz      1000f
-       leaq    2(%rdi),%rdi
-1001:
-       cmpb    $1,%cl
-       jl      1000f
+       movw    %r10w,-2(%rdi,%rcx)
+       POP_FRAME_POINTER
+       ret
+       ALIGN_TEXT
+100001:
+       cmpb    $0,%cl
+       je      100000f
        movb    %r10b,(%rdi)
-1000:
+100000:
        POP_FRAME_POINTER
        ret
        ALIGN_TEXT
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to