Re: [FFmpeg-devel] [PATCH 06/10] lavu/aes: add x86 AESNI optimizations

2015-10-13 Thread Henrik Gramner
I changed the asm a bit and made it about 1 cycle faster on Haswell
and slightly smaller (-48 bytes overall incl. alignment on 64-bit
Linux).

;------------------------------------------------------------------------------
; AES_CRYPT enc|dec
;
; Expands to aes_encrypt or aes_decrypt (cglobal adds the project prefix):
;
;   void ff_aes_{en,de}crypt(AVAES *a, uint8_t *dst, const uint8_t *src,
;                            int count, uint8_t *iv, int rounds)
;
; Register roles (x86inc argument registers):
;   r0 = a      round-key base, biased by +0x60 (see below)
;   r1 = dst    advanced to the END of the output buffer
;   r2 = src    advanced to the END of the input buffer
;   r3 = count  turned into a negative byte offset that counts up to 0
;   r4 = iv     NULL selects plain block mode (no chaining)
;   r5 = rounds doubled so that 8*r5 == 16*rounds
;   m0 = current block, m1 = chaining value (IV / previous ciphertext)
;
; The round keys are read from a + 16*rounds (initial whitening key) down to
; a + 0 (final round key). Ten rounds are assumed when rounds < 12 and
; fourteen when rounds > 12.
;
; The loop body runs before the counter is tested, so the caller must pass
; count > 0.
;
; When iv is non-NULL it is updated on return to the last ciphertext block
; for BOTH directions, so that a stream can be processed across several
; calls. (Previously only the enc variant stored it back.)
;------------------------------------------------------------------------------
%macro AES_CRYPT 1
cglobal aes_%1rypt, 6,6,2
    shl       r3d, 4                ; r3 = count * 16 = length in bytes
    add       r5d, r5d              ; r5 = 2 * rounds (scaled by 8 when used)
    add        r0, 0x60             ; bias the key pointer so that every
                                    ; round-key displacement fits in a
                                    ; signed byte (-0x60 .. +0x70)
    add        r2, r3               ; r2 = src + length
    add        r1, r3               ; r1 = dst + length
    neg        r3                   ; r3 = -length; [r2+r3] is the 1st block
    pxor       m1, m1               ; zero chaining value when iv == NULL
    test       r4, r4
    je .block
    movu       m1, [r4]             ; iv
.block:
    movu       m0, [r2+r3]          ; state
%ifidn %1, enc
    pxor       m0, m1               ; chain: plaintext ^ previous ciphertext
%endif
    pxor       m0, [r0+8*r5-0x60]   ; initial key at a + 16*rounds
    cmp       r5d, 24               ; 2*rounds == 24  <=>  rounds == 12
    je .rounds12
    jl .rounds10
    ; rounds > 12: two extra rounds on top of the 12-round path
    aes%1      m0, [r0+0x70]
    aes%1      m0, [r0+0x60]
.rounds12:
    aes%1      m0, [r0+0x50]
    aes%1      m0, [r0+0x40]
.rounds10:
    aes%1      m0, [r0+0x30]
    aes%1      m0, [r0+0x20]
    aes%1      m0, [r0+0x10]
    aes%1      m0, [r0+0x00]
    aes%1      m0, [r0-0x10]
    aes%1      m0, [r0-0x20]
    aes%1      m0, [r0-0x30]
    aes%1      m0, [r0-0x40]
    aes%1      m0, [r0-0x50]
    aes%1last  m0, [r0-0x60]        ; final round key at a + 0
    test       r4, r4
    je .noiv
%ifidn %1, enc
    mova       m1, m0               ; ciphertext chains into the next block
%else
    pxor       m0, m1               ; unchain: ^ previous ciphertext / iv
    movu       m1, [r2+r3]          ; reload this ciphertext block BEFORE the
                                    ; store below, so dst == src still works
%endif
.noiv:
    movu  [r1+r3], m0
    add        r3, 16
    jl .block                       ; loop until the negative offset hits 0
    ; Store the chaining value for the next call. m1 is the last ciphertext
    ; block in both variants (enc: copy of m0, dec: reloaded from src).
    test       r4, r4
    je .ret
    movu     [r4], m1
.ret:
    REP_RET
%endmacro

; Emit the AES-NI functions only when HAVE_AESNI_EXTERNAL is non-zero
; (presumably set by the build configuration when the assembler can encode
; AES-NI instructions -- confirm against configure).
%if HAVE_AESNI_EXTERNAL
; NOTE(review): INIT_XMM is provided by the x86inc framework, not by this
; file; it is expected to map m0/m1 to XMM registers and tag the generated
; symbols with the "aesni" CPU suffix -- confirm.
INIT_XMM aesni
; Expand the macro once per direction: "enc" and "dec".
AES_CRYPT enc
AES_CRYPT dec
%endif
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 06/10] lavu/aes: add x86 AESNI optimizations

2015-10-13 Thread Henrik Gramner
On Tue, Oct 13, 2015 at 2:33 AM, Rodger Combs  wrote:
> +%macro AES_CRYPT 1
> +%if %1 == 1
> +%define CRYPT aesdec
> +%define LAST  aesdeclast
> +cglobal aes_decrypt, 6,6,2
> +%else
> +%define CRYPT aesenc
> +%define LAST  aesenclast
> +cglobal aes_encrypt, 6,6,2
> +%endif
> +pxor xm1, xm1
> +shl r5d, 4
> +sub r5, 0x60
> +test r4, r4
> +je .block
> +movdqu xm1, [r4] ; iv
> +.block:
> +movdqu xm0, [r2] ; state
> +%if %1 == 0
> +pxor xm0, xm1
> +%endif
> +pxor  xm0, [r0 + r5 + 0x60]
> +CRYPT xm0, [r0 + r5 + 0x50]
> +CRYPT xm0, [r0 + r5 + 0x40]
> +CRYPT xm0, [r0 + r5 + 0x30]
> +CRYPT xm0, [r0 + r5 + 0x20]
> +CRYPT xm0, [r0 + r5 + 0x10]
> +CRYPT xm0, [r0 + r5 + 0x00]
> +CRYPT xm0, [r0 + r5 - 0x10]
> +CRYPT xm0, [r0 + r5 - 0x20]
> +CRYPT xm0, [r0 + r5 - 0x30]
> +cmp r5, 0x60
> +jl .last
> +CRYPT xm0, [r0 + r5 - 0x40]
> +CRYPT xm0, [r0 + r5 - 0x50]
> +cmp r5, 0x80
> +jl .last
> +CRYPT xm0, [r0 + 0x20]
> +CRYPT xm0, [r0 + 0x10]
> +.last:
> +LAST xm0, [r0]
> +test r4, r4
> +je .noiv
> +%if %1 == 1
> +pxor xm0, xm1
> +movdqu xm1, [r2]
> +%else
> +movdqa xm1, xm0
> +%endif
> +.noiv
> +movdqu [r1], xm0
> +dec r3d
> +add r2, 16
> +add r1, 16
> +test r3d, r3d
> +jne .block
> +%if %1 == 0
> +test r4, r4
> +je .ret
> +movdqu [r4], xm0
> +.ret:
> +%endif
> +REP_RET
> +%endmacro

If you use enc and dec as macro arguments instead of 0 and 1, you could
get rid of some of the %if blocks, e.g. by using cglobal aes_%1rypt and
the instructions aes%1 and aes%1last instead of the CRYPT/LAST macros.
"%if %1 == 0" can be replaced by "%ifidn %1, enc" as well (which is
also clearer).

Use m# instead of xm# since you're not dealing with mixing xmm and ymm
registers.

Use mova instead of movdqa and movu instead of movdqu.

Vertically align the lines on the first comma.

You can also adjust the r5 offset to make the "cmp r5, 0x80" fit in a
signed byte as well, but that's pretty minor.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 06/10] lavu/aes: add x86 AESNI optimizations

2015-10-12 Thread Rodger Combs
---
 libavutil/aes.c  |  4 ++
 libavutil/aes_internal.h |  2 +
 libavutil/x86/Makefile   |  4 +-
 libavutil/x86/aes.asm| 98 
 libavutil/x86/aes_init.c | 37 ++
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 libavutil/x86/aes.asm
 create mode 100644 libavutil/x86/aes_init.c

diff --git a/libavutil/aes.c b/libavutil/aes.c
index 1fac4e8..672d0eb 100644
--- a/libavutil/aes.c
+++ b/libavutil/aes.c
@@ -161,6 +161,8 @@ static void aes_decrypt(AVAES *a, uint8_t *dst, const 
uint8_t *src,
 void av_aes_crypt(AVAES *a, uint8_t *dst, const uint8_t *src,
   int count, uint8_t *iv, int decrypt)
 {
+if (count <= 0)
+return;
 a->crypt(a, dst, src, count, iv, a->rounds);
 }
 
@@ -200,6 +202,8 @@ int av_aes_init(AVAES *a, const uint8_t *key, int key_bits, 
int decrypt)
 uint8_t alog8[512];
 
 a->crypt = decrypt ? aes_decrypt : aes_encrypt;
+if (ARCH_X86)
+ff_init_aes_x86(a, decrypt);
 
 if 
(!enc_multbl[FF_ARRAY_ELEMS(enc_multbl)-1][FF_ARRAY_ELEMS(enc_multbl[0])-1]) {
 j = 1;
diff --git a/libavutil/aes_internal.h b/libavutil/aes_internal.h
index 4944258..dfa2039 100644
--- a/libavutil/aes_internal.h
+++ b/libavutil/aes_internal.h
@@ -40,4 +40,6 @@ typedef struct AVAES {
 void (*crypt)(struct AVAES *a, uint8_t *dst, const uint8_t *src, int 
count, uint8_t *iv, int rounds);
 } AVAES;
 
+void ff_init_aes_x86(AVAES *a, int decrypt);
+
 #endif /* AVUTIL_AES_INTERNAL_H */
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index eb70a62..4ac6219 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,4 +1,5 @@
-OBJS += x86/cpu.o   \
+OBJS += x86/aes_init.o  \
+x86/cpu.o   \
 x86/float_dsp_init.o\
 x86/lls_init.o  \
 
@@ -10,5 +11,6 @@ YASM-OBJS += x86/cpuid.o  
  \
  $(EMMS_OBJS__yes_)  \
  x86/float_dsp.o\
  x86/lls.o  \
+ x86/aes.o  \
 
 YASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o  \
diff --git a/libavutil/x86/aes.asm b/libavutil/x86/aes.asm
new file mode 100644
index 000..f56068a
--- /dev/null
+++ b/libavutil/x86/aes.asm
@@ -0,0 +1,98 @@
+;*
+;* Copyright (c) 2015 Rodger Combs 
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include "x86util.asm"
+
+SECTION .text
+
+;-
+; void ff_aes_decrypt(AVAES *a, uint8_t *dst, const uint8_t *src,
+; int count, uint8_t *iv, int rounds)
+;-
+%macro AES_CRYPT 1
+%if %1 == 1
+%define CRYPT aesdec
+%define LAST  aesdeclast
+cglobal aes_decrypt, 6,6,2
+%else
+%define CRYPT aesenc
+%define LAST  aesenclast
+cglobal aes_encrypt, 6,6,2
+%endif
+pxor xm1, xm1
+shl r5d, 4
+sub r5, 0x60
+test r4, r4
+je .block
+movdqu xm1, [r4] ; iv
+.block:
+movdqu xm0, [r2] ; state
+%if %1 == 0
+pxor xm0, xm1
+%endif
+pxor  xm0, [r0 + r5 + 0x60]
+CRYPT xm0, [r0 + r5 + 0x50]
+CRYPT xm0, [r0 + r5 + 0x40]
+CRYPT xm0, [r0 + r5 + 0x30]
+CRYPT xm0, [r0 + r5 + 0x20]
+CRYPT xm0, [r0 + r5 + 0x10]
+CRYPT xm0, [r0 + r5 + 0x00]
+CRYPT xm0, [r0 + r5 - 0x10]
+CRYPT xm0, [r0 + r5 - 0x20]
+CRYPT xm0, [r0 + r5 - 0x30]
+cmp r5, 0x60
+jl .last
+CRYPT xm0, [r0 + r5 - 0x40]
+CRYPT xm0, [r0 + r5 - 0x50]
+cmp r5, 0x80
+jl .last
+CRYPT xm0, [r0 + 0x20]
+CRYPT xm0, [r0 + 0x10]
+.last:
+LAST xm0, [r0]
+test r4, r4
+je .noiv
+%if %1 == 1
+px