On Tue, Jan 13, 2015 at 11:34 AM, Nikos Mavrogiannopoulos
<[email protected]> wrote:
> On Tue, Jan 13, 2015 at 11:20 AM, Niels Möller <[email protected]> wrote:
>> [email protected] (Niels Möller) writes:
>>> Clearly, this will be more useful after adding support for
>>> fat binaries, detecting presence of these instructions at runtime.
>>
>> I've now had a first go at fat-library support. Checked in on the branch
>> fat-library. See
>> https://git.lysator.liu.se/nettle/nettle/blob/fat-library/x86_64/fat/fat.c
Attached is a quick and dirty patch that enables SSE2 instructions for
memxor() on Intel CPUs.
I tried to follow the logic in fat.c, but I may have missed something.
I haven't added memxor3(), because with SSE2 it is actually slower.
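For reference, the Intel detection in fat_init() boils down to checking
the CPUID leaf 0 vendor string. A minimal standalone sketch of that check
(using GCC's <cpuid.h> instead of the _nettle_cpuid() helper the patch
actually calls) would look roughly like this:

/* Sketch of the vendor check done in fat_init(): with CPUID leaf 0 the
   vendor string comes back as EBX:EDX:ECX, i.e. "Genu" "ineI" "ntel"
   on Intel CPUs. */
#include <cpuid.h>
#include <string.h>

static int
is_genuine_intel (void)
{
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx))
    return 0;
  return memcmp (&ebx, "Genu", 4) == 0
    && memcmp (&edx, "ineI", 4) == 0
    && memcmp (&ecx, "ntel", 4) == 0;
}

The memxor numbers with and without the SSE2 path: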
SSE2:
memxor aligned 26081.83
memxor unaligned 25893.69
No-SSE2:
memxor aligned 17806.94
memxor unaligned 16581.48
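Presumably these come from timing memxor() over a buffer. A rough
standalone loop that produces the same kind of throughput number is below;
it is only a sketch, not necessarily the harness used for the figures
above, and the buffer size, iteration count and MB/s interpretation are
my own assumptions:

/* Rough memxor() throughput measurement.  Buffer size and iteration
   count are arbitrary; build with: gcc bench-memxor.c -lnettle */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <nettle/memxor.h>

int
main (void)
{
  size_t n = 64 * 1024;
  unsigned iterations = 100000, i;
  unsigned char *dst = calloc (1, n);
  unsigned char *src = calloc (1, n);
  struct timespec t0, t1;
  double seconds;

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (i = 0; i < iterations; i++)
    memxor (dst, src, n);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  seconds = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
  printf ("memxor: %.2f MB/s\n", (double) n * iterations / seconds / 1e6);

  free (dst);
  free (src);
  return 0;
}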
regards,
Nikos
From f3e529a40fa376a9ce73a229fa223655504a1ac5 Mon Sep 17 00:00:00 2001
From: Nikos Mavrogiannopoulos <[email protected]>
Date: Fri, 16 Jan 2015 16:43:23 +0100
Subject: [PATCH] Select SSE2 XOR when on Intel x86-64
---
x86_64/fat/fat.c | 33 +++++++
x86_64/fat/memxor.asm | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 283 insertions(+)
create mode 100644 x86_64/fat/memxor.asm
diff --git a/x86_64/fat/fat.c b/x86_64/fat/fat.c
index 3585cf5..e892537 100644
--- a/x86_64/fat/fat.c
+++ b/x86_64/fat/fat.c
@@ -36,6 +36,7 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include "nettle-types.h"
@@ -96,6 +97,12 @@ aes_crypt_internal_func _aes_decrypt IFUNC ("_aes_decrypt_resolve");
aes_crypt_internal_func _nettle_aes_decrypt_x86_64;
aes_crypt_internal_func _nettle_aes_decrypt_aesni;
+typedef void *(memxor_func)(void *dst_in, const void *src_in, size_t n);
+
+memxor_func nettle_memxor IFUNC ("_memxor_resolve");
+memxor_func _nettle_memxor_x86_64;
+memxor_func _nettle_memxor_sse2;
+
#if HAVE_LINK_IFUNC
#define _aes_encrypt_init NULL
#define _aes_decrypt_init NULL
@@ -106,6 +113,7 @@ static aes_crypt_internal_func _aes_decrypt_init;
static aes_crypt_internal_func *_aes_encrypt_vec = _aes_encrypt_init;
static aes_crypt_internal_func *_aes_decrypt_vec = _aes_decrypt_init;
+static memxor_func *_memxor_vec = _nettle_memxor_x86_64;
/* This function should usually be called only once, at startup. But
it is idempotent, and on x86, pointer updates are atomic, so
@@ -144,6 +152,16 @@ fat_init (void)
_aes_encrypt_vec = _nettle_aes_encrypt_x86_64;
_aes_decrypt_vec = _nettle_aes_decrypt_x86_64;
}
+
+ _nettle_cpuid (0, cpuid_data);
+ if (memcmp(&cpuid_data[1], "Genu", 4) == 0 &&
+ memcmp(&cpuid_data[3], "ineI", 4) == 0 &&
+ memcmp(&cpuid_data[2], "ntel", 4) == 0) {
+ if (verbose)
+ fprintf (stderr, "libnettle: intel SSE2 will be used for XOR.\n");
+ _memxor_vec = _nettle_memxor_sse2;
+ }
+
/* FIXME: We ought to use some thread-aware memory barrier before
setting the initialized flag. For now, just do another cpuinfo
call to get some synchronization. */
@@ -179,6 +197,15 @@ _aes_decrypt_resolve (void)
return (void_func *) _aes_decrypt_vec;
}
+static void_func *
+_memxor_resolve (void)
+{
+ if (getenv ("NETTLE_FAT_VERBOSE"))
+ fprintf (stderr, "libnettle: _memxor_resolve\n");
+ fat_init ();
+ return (void_func *) _memxor_vec;
+}
+
#else /* !HAVE_LINK_IFUNC */
/* We need wrapper functions jumping via the function pointer. */
@@ -226,4 +253,10 @@ _aes_decrypt_init (unsigned rounds, const uint32_t *keys,
_aes_decrypt (rounds, keys, T, length, dst, src);
}
+void *
+memxor(void *dst_in, const void *src_in, size_t n)
+{
+ return _memxor_vec (dst_in, src_in, n);
+}
+
#endif /* !HAVE_LINK_IFUNC */
diff --git a/x86_64/fat/memxor.asm b/x86_64/fat/memxor.asm
new file mode 100644
index 0000000..118447a
--- /dev/null
+++ b/x86_64/fat/memxor.asm
@@ -0,0 +1,250 @@
+C x86_64/fat/memxor.asm
+
+ifelse(<
+ Copyright (C) 2010, 2014, Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+define(<DST>, <%rax>) C Originally in %rdi
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>) C Overlaps with CNT
+
+ .file "memxor.asm"
+
+ .text
+
+ C memxor(void *dst, const void *src, size_t n)
+ C %rdi %rsi %rdx
+ ALIGN(16)
+
+PROLOGUE(_nettle_memxor_x86_64)
+ W64_ENTRY(3, 0)
+
+ test N, N
+ C Get number of unaligned bytes at the end
+ C %rdi is used as CNT, %rax as DST and as return value
+ mov %rdi, %rax
+ jz .Ldone
+ add N, CNT
+ and $7, CNT
+
+ jz .Laligned
+
+ cmp $8, N
+ jc .Lfinal_next
+
+ C FIXME: Instead of this loop, could try cmov with memory
+ C destination, as a sequence of one 8-bit, one 16-bit and one
+ C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+ C that step has to use a conditional).
+.Lalign_loop:
+
+ sub $1, N
+ movb (SRC, N), LREG(TMP)
+ xorb LREG(TMP), (DST, N)
+ sub $1, CNT
+ jnz .Lalign_loop
+
+.Laligned:
+
+ C Next destination word is -8(DST, N)
+ C Setup for unrolling
+ test $8, N
+ jz .Lword_next
+
+ sub $8, N
+ jz .Lone_word
+
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
+
+ jmp .Lword_next
+
+ ALIGN(16)
+
+.Lword_loop:
+ mov 8(SRC, N), TMP
+ mov (SRC, N), TMP2
+ xor TMP, 8(DST, N)
+ xor TMP2, (DST, N)
+
+.Lword_next:
+ sub $16, N
+ ja .Lword_loop C Not zero and no carry
+ jnz .Lfinal
+
+ C Final operation is word aligned
+ mov 8(SRC, N), TMP
+ xor TMP, 8(DST, N)
+
+.Lone_word:
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
+
+ W64_EXIT(3, 0)
+ ret
+
+.Lfinal:
+ add $15, N
+
+.Lfinal_loop:
+ movb (SRC, N), LREG(TMP)
+ xorb LREG(TMP), (DST, N)
+.Lfinal_next:
+ sub $1, N
+ jnc .Lfinal_loop
+
+.Ldone:
+ W64_EXIT(3, 0)
+ ret
+
+EPILOGUE(_nettle_memxor_x86_64)
+
+PROLOGUE(_nettle_memxor_sse2)
+ W64_ENTRY(3, 0)
+
+ test N, N
+ C Get number of unaligned bytes at the end
+ C %rdi is used as CNT, %rax as DST and as return value
+ mov %rdi, %rax
+ jz .SLdone
+ add N, CNT
+ and $7, CNT
+
+ jz .SLaligned
+
+ cmp $8, N
+ jc .SLfinal_next
+
+ C FIXME: Instead of this loop, could try cmov with memory
+ C destination, as a sequence of one 8-bit, one 16-bit and one
+ C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+ C that step has to use a conditional).
+.SLalign_loop:
+
+ sub $1, N
+ movb (SRC, N), LREG(TMP)
+ xorb LREG(TMP), (DST, N)
+ sub $1, CNT
+ jnz .SLalign_loop
+
+.SLaligned:
+ cmp $16, N
+ jnc .SLsse2_case
+
+ C Next destination word is -8(DST, N)
+ C Setup for unrolling
+ test $8, N
+ jz .SLword_next
+
+ sub $8, N
+ jz .SLone_word
+
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
+
+ jmp .SLword_next
+
+ ALIGN(16)
+
+.SLword_loop:
+ mov 8(SRC, N), TMP
+ mov (SRC, N), TMP2
+ xor TMP, 8(DST, N)
+ xor TMP2, (DST, N)
+
+.SLword_next:
+ sub $16, N
+ ja .SLword_loop C Not zero and no carry
+ jnz .SLfinal
+
+ C Final operation is word aligned
+ mov 8(SRC, N), TMP
+ xor TMP, 8(DST, N)
+
+.SLone_word:
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
+
+ W64_EXIT(3, 0)
+ ret
+
+.SLfinal:
+ add $15, N
+
+.SLfinal_loop:
+ movb (SRC, N), LREG(TMP)
+ xorb LREG(TMP), (DST, N)
+.SLfinal_next:
+ sub $1, N
+ jnc .SLfinal_loop
+
+.SLdone:
+ W64_EXIT(3, 0)
+ ret
+
+.SLsse2_case:
+ lea (DST, N), TMP
+ test $8, TMP
+ jz .SLsse2_next
+ sub $8, N
+ mov (SRC, N), TMP
+ xor TMP, (DST, N)
+ jmp .SLsse2_next
+
+ ALIGN(16)
+.SLsse2_loop:
+ movdqu (SRC, N), %xmm0
+ movdqa (DST, N), %xmm1
+ pxor %xmm0, %xmm1
+ movdqa %xmm1, (DST, N)
+.SLsse2_next:
+ sub $16, N
+ ja .SLsse2_loop
+
+ C FIXME: See if we can do a full word first, before the
+ C byte-wise final loop.
+ jnz .SLfinal
+
+ C Final operation is aligned
+ movdqu (SRC), %xmm0
+ movdqa (DST), %xmm1
+ pxor %xmm0, %xmm1
+ movdqa %xmm1, (DST)
+
+ W64_EXIT(3, 0)
+ ret
+
+EPILOGUE(_nettle_memxor_sse2)
+
--
2.1.0