On Fri, 2015-01-16 at 22:18 +0100, Niels Möller wrote:
> Nikos Mavrogiannopoulos <[email protected]> writes:
>
> > A quick and dirty patch to enable SSE2 instructions for memxor() on
> > Intel CPUs is attached.
> > I tried to follow the logic in the fat.c file, but I may have missed
> > something. I've not added memxor3() because it is actually slower with
> > SSE2.
>
> Cool!
>
> > SSE2:
> > memxor aligned 26081.83
> > memxor unaligned 25893.69
> >
> > No-SSE2:
> > memxor aligned 17806.94
> > memxor unaligned 16581.48
>
> How confident are you that the intel vs amd check is the right way
> to enable sse2? I guess we could add check on the particular cpu model
> later, if needed. Which model(s) did you benchmark on?
The benchmarks (if it is same as the older code I've sent you few years
ago), have been done on intel i7, i5 and a xeon. In all of them there
was an improvement. The benchmark above is on i7.
About that not improving on AMD I have no more data than what I've wrote
you last time (which was few years ago). No idea if newer AMD processors
behave better.
> It would be nice in a way if we could share code with x86_64/memxor.asm.
> E.g., by defining x86_64/fat/memxor-1.asm and x86_64/fat/memxor-2.asm
> which each include the same file with a different setting of USE_SSE2.
> But I haven't looked at that carefully, it might be better to have a
> unified x86_64/fat/memxor.asm with two entry points, like you do.
> I've also been considering m4 hacks to let a single fat .asm file
> include multiple other .asm files, or including the same file twice,
> without labels or m4 definitions colliding, but I'm not sure that's
> worth the effort. The foo-1.asm, foo-2.asm, ... scheme is a bit
> inelegant, but it is easy to understand.
I didn't like the duplication of code either. I'm not very skilled in
m4, but I though that x86_64/ could include the fat variant and use the
non-sse2 variant.
The code in fat.c is quite elaborate on the cases it handles. The more
functions added the more unmanageable the code will become. Would it
make sense to restrict that support to the systems where ifunc is
available? Then the addition of new optimized functions becomes very
simple.
regards,
Nikos
diff --git a/x86_64/fat/fat.c b/x86_64/fat/fat.c
index 3585cf5..e6fb76e 100644
--- a/x86_64/fat/fat.c
+++ b/x86_64/fat/fat.c
@@ -74,12 +74,10 @@
call to any fat function.
*/
-#if HAVE_LINK_IFUNC
-# define IFUNC(resolve) __attribute__ ((ifunc (resolve)))
-#else
-# define IFUNC(resolve)
-#endif
+#define IFUNC(resolve) __attribute__ ((ifunc (resolve)))
+
+static void fat_init (void);
void _nettle_cpuid (uint32_t input, uint32_t regs[4]);
typedef void void_func (void);
@@ -88,24 +86,34 @@ typedef void aes_crypt_internal_func (unsigned rounds,
const uint32_t *keys,
const struct aes_table *T,
size_t length, uint8_t *dst,
const uint8_t *src);
-aes_crypt_internal_func _aes_encrypt IFUNC ("_aes_encrypt_resolve");
-aes_crypt_internal_func _nettle_aes_encrypt_x86_64;
-aes_crypt_internal_func _nettle_aes_encrypt_aesni;
-
-aes_crypt_internal_func _aes_decrypt IFUNC ("_aes_decrypt_resolve");
-aes_crypt_internal_func _nettle_aes_decrypt_x86_64;
-aes_crypt_internal_func _nettle_aes_decrypt_aesni;
-
-#if HAVE_LINK_IFUNC
-#define _aes_encrypt_init NULL
-#define _aes_decrypt_init NULL
-#else
-static aes_crypt_internal_func _aes_encrypt_init;
-static aes_crypt_internal_func _aes_decrypt_init;
-#endif
-static aes_crypt_internal_func *_aes_encrypt_vec = _aes_encrypt_init;
-static aes_crypt_internal_func *_aes_decrypt_vec = _aes_decrypt_init;
+/* name: the name of the function to be overriden
+ * type: the type of the function (e.g. aes_crypt_internal_func)
+ * var: specify an implementation of it; the symbol _nettle_name_var should
exist
+ */
+#define DEFINE_FAT_FUNC(name, type) \
+ type name IFUNC(#name"_resolve"); \
+ static type * name##_vec = NULL; \
+ static void_func * name##_resolve(void) { \
+ if (getenv ("NETTLE_FAT_VERBOSE")) \
+ fprintf (stderr, "libnettle: "#name"_resolve\n"); \
+ fat_init(); \
+ return (void_func *) name##_vec; \
+ }
+
+#define DEFINE_FAT_FUNC_VAR(name, type, var) \
+ type _nettle##name##_##var;
+
+#define ENABLE_FAT_FUNC_VAR(name, var) \
+ name##_vec = _nettle##name##_##var;
+
+DEFINE_FAT_FUNC(_aes_encrypt, aes_crypt_internal_func);
+DEFINE_FAT_FUNC_VAR(_aes_encrypt, aes_crypt_internal_func, x86_64);
+DEFINE_FAT_FUNC_VAR(_aes_encrypt, aes_crypt_internal_func, aesni);
+
+DEFINE_FAT_FUNC(_aes_decrypt, aes_crypt_internal_func);
+DEFINE_FAT_FUNC_VAR(_aes_decrypt, aes_crypt_internal_func, x86_64);
+DEFINE_FAT_FUNC_VAR(_aes_decrypt, aes_crypt_internal_func, aesni);
/* This function should usually be called only once, at startup. But
it is idempotent, and on x86, pointer updates are atomic, so
@@ -134,15 +142,15 @@ fat_init (void)
{
if (verbose)
fprintf (stderr, "libnettle: aes instructions available.\n");
- _aes_encrypt_vec = _nettle_aes_encrypt_aesni;
- _aes_decrypt_vec = _nettle_aes_decrypt_aesni;
+ ENABLE_FAT_FUNC_VAR(_aes_encrypt, aesni);
+ ENABLE_FAT_FUNC_VAR(_aes_decrypt, aesni);
}
else
{
if (verbose)
fprintf (stderr, "libnettle: aes instructions not available.\n");
- _aes_encrypt_vec = _nettle_aes_encrypt_x86_64;
- _aes_decrypt_vec = _nettle_aes_decrypt_x86_64;
+ ENABLE_FAT_FUNC_VAR(_aes_encrypt, x86_64);
+ ENABLE_FAT_FUNC_VAR(_aes_decrypt, x86_64);
}
/* FIXME: We ought to use some thread-aware memory barrier before
setting the initialized flag. For now, just do another cpuinfo
@@ -159,71 +167,3 @@ fat_constructor (void)
}
#endif
-#if HAVE_LINK_IFUNC
-
-static void_func *
-_aes_encrypt_resolve (void)
-{
- if (getenv ("NETTLE_FAT_VERBOSE"))
- fprintf (stderr, "libnettle: _aes_encrypt_resolve\n");
- fat_init ();
- return (void_func *) _aes_encrypt_vec;
-}
-
-static void_func *
-_aes_decrypt_resolve (void)
-{
- if (getenv ("NETTLE_FAT_VERBOSE"))
- fprintf (stderr, "libnettle: _aes_decrypt_resolve\n");
- fat_init ();
- return (void_func *) _aes_decrypt_vec;
-}
-
-#else /* !HAVE_LINK_IFUNC */
-
-/* We need wrapper functions jumping via the function pointer. */
-void
-_aes_encrypt (unsigned rounds, const uint32_t *keys,
- const struct aes_table *T,
- size_t length, uint8_t *dst,
- const uint8_t *src)
-{
- _aes_encrypt_vec (rounds, keys, T, length, dst, src);
-}
-
-static void
-_aes_encrypt_init (unsigned rounds, const uint32_t *keys,
- const struct aes_table *T,
- size_t length, uint8_t *dst,
- const uint8_t *src)
-{
- if (getenv ("NETTLE_FAT_VERBOSE"))
- fprintf (stderr, "libnettle: _aes_encrypt_init\n");
- fat_init ();
- assert (_aes_encrypt_vec != _aes_encrypt_init);
- _aes_encrypt (rounds, keys, T, length, dst, src);
-}
-
-void
-_aes_decrypt (unsigned rounds, const uint32_t *keys,
- const struct aes_table *T,
- size_t length, uint8_t *dst,
- const uint8_t *src)
-{
- _aes_decrypt_vec (rounds, keys, T, length, dst, src);
-}
-
-static void
-_aes_decrypt_init (unsigned rounds, const uint32_t *keys,
- const struct aes_table *T,
- size_t length, uint8_t *dst,
- const uint8_t *src)
-{
- if (getenv ("NETTLE_FAT_VERBOSE"))
- fprintf (stderr, "libnettle: _aes_decrypt_init\n");
- fat_init ();
- assert (_aes_decrypt_vec != _aes_decrypt_init);
- _aes_decrypt (rounds, keys, T, length, dst, src);
-}
-
-#endif /* !HAVE_LINK_IFUNC */
_______________________________________________
nettle-bugs mailing list
[email protected]
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs