On Tue, Jan 13, 2015 at 11:34 AM, Nikos Mavrogiannopoulos
<[email protected]> wrote:
> On Tue, Jan 13, 2015 at 11:20 AM, Niels Möller <[email protected]> wrote:
>> [email protected] (Niels Möller) writes:
>>> Clearly, this will be more useful after adding support for
>>> fat binaries, detecting presence of these instructions at runtime.
>>
>> I've now had a first go at fat-library support. Checked in on the branch
>> fat-library. See
>> https://git.lysator.liu.se/nettle/nettle/blob/fat-library/x86_64/fat/fat.c

A quick and dirty patch that enables SSE2 instructions for memxor() on
Intel CPUs is attached.
I tried to follow the logic in the fat.c file, but I may have missed
something. I have not added an SSE2 variant of memxor3(), because that
function is actually slower with SSE2 than without it.

SSE2:
            memxor     aligned 26081.83
            memxor   unaligned 25893.69

No-SSE2:
            memxor     aligned 17806.94
            memxor   unaligned 16581.48

regards,
Nikos
From f3e529a40fa376a9ce73a229fa223655504a1ac5 Mon Sep 17 00:00:00 2001
From: Nikos Mavrogiannopoulos <[email protected]>
Date: Fri, 16 Jan 2015 16:43:23 +0100
Subject: [PATCH] Select SSE2 XOR when on Intel x86-64

---
 x86_64/fat/fat.c      |  33 +++++++
 x86_64/fat/memxor.asm | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 x86_64/fat/memxor.asm

diff --git a/x86_64/fat/fat.c b/x86_64/fat/fat.c
index 3585cf5..e892537 100644
--- a/x86_64/fat/fat.c
+++ b/x86_64/fat/fat.c
@@ -36,6 +36,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "nettle-types.h"
 
@@ -96,6 +97,12 @@ aes_crypt_internal_func _aes_decrypt IFUNC ("_aes_decrypt_resolve");
 aes_crypt_internal_func _nettle_aes_decrypt_x86_64;
 aes_crypt_internal_func _nettle_aes_decrypt_aesni;
 
+typedef void *(memxor_func)(void *dst_in, const void *src_in, size_t n);
+
+memxor_func nettle_memxor IFUNC ("_memxor_resolve");
+memxor_func _nettle_memxor_x86_64;
+memxor_func _nettle_memxor_sse2;
+
 #if HAVE_LINK_IFUNC
 #define _aes_encrypt_init NULL
 #define _aes_decrypt_init NULL
@@ -106,6 +113,7 @@ static aes_crypt_internal_func _aes_decrypt_init;
 
 static aes_crypt_internal_func *_aes_encrypt_vec = _aes_encrypt_init;
 static aes_crypt_internal_func *_aes_decrypt_vec = _aes_decrypt_init;
+static memxor_func *_memxor_vec = _nettle_memxor_x86_64;
 
 /* This function should usually be called only once, at startup. But
    it is idempotent, and on x86, pointer updates are atomic, so
@@ -144,6 +152,16 @@ fat_init (void)
       _aes_encrypt_vec = _nettle_aes_encrypt_x86_64;
       _aes_decrypt_vec = _nettle_aes_decrypt_x86_64;
     }
+
+  _nettle_cpuid (0, cpuid_data);
+  if (memcmp(&cpuid_data[1], "Genu", 4) == 0 &&
+      memcmp(&cpuid_data[3], "ineI", 4) == 0 &&
+      memcmp(&cpuid_data[2], "ntel", 4) == 0) {
+      if (verbose)
+	fprintf (stderr, "libnettle: intel SSE2 will be used for XOR.\n");
+      _memxor_vec = _nettle_memxor_sse2;
+  }
+
   /* FIXME: We ought to use some thread-aware memory barrier before
      setting the initialized flag. For now, just do another cpuinfo
      call to get some synchronization. */
@@ -179,6 +197,15 @@ _aes_decrypt_resolve (void)
   return (void_func *) _aes_decrypt_vec;
 }
 
+/* ifunc resolver for memxor: ensures fat_init() has run (it is
+   idempotent), then returns the implementation selected into
+   _memxor_vec.  Follows the same pattern as _aes_encrypt_resolve
+   and _aes_decrypt_resolve.  */
+static void_func *
+_memxor_resolve (void)
+{
+  if (getenv ("NETTLE_FAT_VERBOSE"))
+    fprintf (stderr, "libnettle: _memxor_resolve\n");
+  fat_init ();
+  return (void_func *) _memxor_vec;
+}
+
 #else /* !HAVE_LINK_IFUNC */
 
 /* We need wrapper functions jumping via the function pointer. */
@@ -226,4 +253,10 @@ _aes_decrypt_init (unsigned rounds, const uint32_t *keys,
   _aes_decrypt (rounds, keys, T, length, dst, src);
 }
 
+/* Fallback wrapper for toolchains without ifunc support: dispatches
+   through the _memxor_vec function pointer.
+   NOTE(review): unlike the AES *_init wrappers, this never calls
+   fat_init() itself, and _memxor_vec is statically initialized to
+   _nettle_memxor_x86_64 -- so the SSE2 variant is only used after
+   something else has triggered fat_init().  Confirm this is
+   intended, or route the first call through an init trampoline.  */
+void *
+memxor(void *dst_in, const void *src_in, size_t n)
+{
+  return _memxor_vec (dst_in, src_in, n);
+}
+
 #endif /* !HAVE_LINK_IFUNC */
diff --git a/x86_64/fat/memxor.asm b/x86_64/fat/memxor.asm
new file mode 100644
index 0000000..118447a
--- /dev/null
+++ b/x86_64/fat/memxor.asm
@@ -0,0 +1,250 @@
+C x86_64/memxor.asm
+
+ifelse(<
+   Copyright (C) 2010, 2014, Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+C DST doubles as the return value; CNT reuses the incoming dst
+C register (%rdi) after dst has been copied to %rax.
+define(<DST>, <%rax>) C Originally in %rdi
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>) C Overlaps with CNT 
+C NOTE(review): S0/S1 are not referenced anywhere below -- presumably
+C leftovers from memxor3; confirm and drop if so.
+
+	.file "memxor.asm"
+
+	.text
+
+	C memxor(void *dst, const void *src, size_t n)
+	C 	          %rdi               %rsi      %rdx
+	ALIGN(16)
+
+C Generic (non-SSE2) memxor: dst[i] ^= src[i] for 0 <= i < n.
+C Processes the buffers from the end towards the start.  Returns dst.
+PROLOGUE(_nettle_memxor_x86_64)
+	W64_ENTRY(3, 0)
+
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	add 	N, CNT
+	and	$7, CNT
+	
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+	
+	C XOR trailing bytes one at a time until DST+N is 8-byte aligned
+	C (CNT holds the low three bits of the end address).
+	sub	$1, N
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
+	C DST+N is now 8-byte aligned.  Do one odd word if needed, then
+	C fall into the 16-bytes-per-iteration unrolled loop.
+
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+	
+	jmp	.Lword_next
+
+	ALIGN(16)
+
+.Lword_loop:
+	C Main loop: two 8-byte XORs per iteration, high addresses first.
+	mov	8(SRC, N), TMP
+	mov	(SRC, N), TMP2
+	xor	TMP, 8(DST, N)
+	xor	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Final operation is word aligned
+	mov	8(SRC, N), TMP
+	xor	TMP, 8(DST, N)
+	
+.Lone_word:
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+
+	W64_EXIT(3, 0)
+	ret
+
+.Lfinal:
+	C N went negative in the word loop; restore it to the index of
+	C the last unprocessed byte and finish byte by byte.
+	add	$15, N
+
+.Lfinal_loop:
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	C Loop until N goes below zero (carry set by the subtraction).
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	W64_EXIT(3, 0)
+	ret
+
+EPILOGUE(_nettle_memxor_x86_64)
+
+C SSE2 memxor: same contract as _nettle_memxor_x86_64, but once the
+C tail is 8-byte aligned and N >= 16, switches to 16-byte XMM blocks.
+C NOTE(review): the scalar paths duplicate _nettle_memxor_x86_64
+C verbatim; consider sharing them via m4 rather than copying.
+PROLOGUE(_nettle_memxor_sse2)
+	W64_ENTRY(3, 0)
+
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.SLdone
+	add 	N, CNT
+	and	$7, CNT
+	
+	jz	.SLaligned
+
+	cmp	$8, N
+	jc	.SLfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.SLalign_loop:
+	
+	C XOR trailing bytes one at a time until DST+N is 8-byte aligned.
+	sub	$1, N
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.SLalign_loop
+
+.SLaligned:
+	C Buffers of 16+ remaining bytes take the XMM path; smaller
+	C ones fall through to the scalar word loop.
+	cmp	$16, N
+	jnc	.SLsse2_case
+
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.SLword_next
+
+	sub	$8, N
+	jz	.SLone_word
+
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+	
+	jmp	.SLword_next
+
+	ALIGN(16)
+
+.SLword_loop:
+	C Scalar fallback loop: two 8-byte XORs per iteration.
+	mov	8(SRC, N), TMP
+	mov	(SRC, N), TMP2
+	xor	TMP, 8(DST, N)
+	xor	TMP2, (DST, N)
+
+.SLword_next:
+	sub	$16, N
+	ja	.SLword_loop	C Not zero and no carry
+	jnz	.SLfinal
+
+	C Final operation is word aligned
+	mov	8(SRC, N), TMP
+	xor	TMP, 8(DST, N)
+	
+.SLone_word:
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+
+	W64_EXIT(3, 0)
+	ret
+
+.SLfinal:
+	C N went negative; restore it to the last unprocessed index and
+	C finish byte by byte.
+	add	$15, N
+
+.SLfinal_loop:
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+.SLfinal_next:
+	sub	$1, N
+	jnc	.SLfinal_loop
+
+.SLdone:
+	W64_EXIT(3, 0)
+	ret
+
+.SLsse2_case:
+	C Align DST+N to 16 bytes with one scalar word if needed, so
+	C the movdqa accesses on DST below are safe (movdqa faults on
+	C unaligned addresses; SRC stays unaligned and uses movdqu).
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.SLsse2_next
+	sub	$8, N
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+	jmp	.SLsse2_next
+
+	ALIGN(16)
+.SLsse2_loop:
+	C 16 bytes per iteration: unaligned load from SRC, aligned
+	C load/store on DST.
+	movdqu	(SRC, N), %xmm0
+	movdqa	(DST, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.SLsse2_next:
+	sub	$16, N
+	ja	.SLsse2_loop
+	
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.SLfinal		
+
+	C Final operation is aligned
+	C (N reached exactly 0, and DST+N stayed 16-byte aligned as N
+	C dropped in steps of 16, so DST itself is aligned here.)
+	movdqu	(SRC), %xmm0
+	movdqa	(DST), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+
+	W64_EXIT(3, 0)
+	ret
+
+EPILOGUE(_nettle_memxor_sse2)
+
+
-- 
2.1.0

_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs

Reply via email to