Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread Hendrik Leppkes
On Thu, Aug 21, 2014 at 12:42 AM, James Almer jamr...@gmail.com wrote:
 * Reduced xmm register count to 7 (as such, they are now enabled for x86_32).
 * Removed four movdqa (affects the sse2 version only).
 * pxor is now used to clear m0 only once.

 ~5% faster.

 Signed-off-by: James Almer jamr...@gmail.com
 ---

Good job, faster and 32-bit compat!

  libavcodec/x86/hevc_res_add.asm | 122
  libavcodec/x86/hevcdsp_init.c   |  10 ++--
  2 files changed, 51 insertions(+), 81 deletions(-)

 diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
 index feea50c..7238fb3 100644
 --- a/libavcodec/x86/hevc_res_add.asm
 +++ b/libavcodec/x86/hevc_res_add.asm
 @@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6
  movhps   [r0+r3  ], m1
  %endmacro

 -%macro TR_ADD_INIT_SSE_8 0
 -pxor  m0, m0
 -
 -mova  m4, [r1]
 -mova  m1, [r1+16]
 -psubw m2, m0, m1
 -psubw m5, m0, m4
 -packuswb  m4, m1
 -packuswb  m5, m2
 -
 -mova  m6, [r1+32]
 -mova  m1, [r1+48]
 -psubw m2, m0, m1
 -psubw m7, m0, m6
 -packuswb  m6, m1
 -packuswb  m7, m2
 -
 -mova  m8, [r1+64]
 -mova  m1, [r1+80]
 -psubw m2, m0, m1
 -psubw m9, m0, m8
 -packuswb  m8, m1
 -packuswb  m9, m2
 -
 -mova m10, [r1+96]
 -mova  m1, [r1+112]
 -psubw m2, m0, m1
 -psubw m11, m0, m10
 -packuswb m10, m1
 -packuswb m11, m2
 -%endmacro
 -
 -
 -%macro TR_ADD_SSE_16_8 0
 -TR_ADD_INIT_SSE_8
 -
 -paddusb   m0, m4, [r0 ]
 -paddusb   m1, m6, [r0+r2  ]
 -paddusb   m2, m8, [r0+r2*2]
 -paddusb   m3, m10,[r0+r3  ]
 -psubusb   m0, m5
 -psubusb   m1, m7
 -psubusb   m2, m9
 -psubusb   m3, m11
 -mova   [r0 ], m0
 -mova   [r0+r2  ], m1
 -mova   [r0+2*r2], m2
 -mova   [r0+r3  ], m3
 -%endmacro
 -
 -%macro TR_ADD_SSE_32_8 0
 -TR_ADD_INIT_SSE_8
 -
 -paddusb   m0, m4, [r0  ]
 -paddusb   m1, m6, [r0+16   ]
 -paddusb   m2, m8, [r0+r2   ]
 -paddusb   m3, m10,[r0+r2+16]
 -psubusb   m0, m5
 -psubusb   m1, m7
 -psubusb   m2, m9
 -psubusb   m3, m11
 -mova  [r0  ], m0
 -mova  [r0+16   ], m1
 -mova  [r0+r2   ], m2
 -mova  [r0+r2+16], m3
 +%macro TR_ADD_SSE_16_32_8 3
 +mova  m2, [r1+%1   ]
 +mova  m6, [r1+%1+16]
 +%if cpuflag(avx)
 +psubw m1, m0, m2
 +psubw m5, m0, m6
 +%else
 +mova  m1, m0
 +mova  m5, m0
 +psubw m1, m2
 +psubw m5, m6
 +%endif

I was wondering about these blocks - doesn't the x264asm layer
automatically add the movas when you just use the 3-arg form on sse2?
Or is there a speed benefit to grouping the movs?

- Hendrik


Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread James Almer
On 21/08/14 10:03 AM, Hendrik Leppkes wrote:
 On Thu, Aug 21, 2014 at 12:42 AM, James Almer jamr...@gmail.com wrote:
 * Reduced xmm register count to 7 (as such, they are now enabled for x86_32).
 * Removed four movdqa (affects the sse2 version only).
 * pxor is now used to clear m0 only once.

 ~5% faster.

 Signed-off-by: James Almer jamr...@gmail.com
 ---
 
 Good job, faster and 32-bit compat!
 
  libavcodec/x86/hevc_res_add.asm | 122
  libavcodec/x86/hevcdsp_init.c   |  10 ++--
  2 files changed, 51 insertions(+), 81 deletions(-)

 diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
 index feea50c..7238fb3 100644
 --- a/libavcodec/x86/hevc_res_add.asm
 +++ b/libavcodec/x86/hevc_res_add.asm
 @@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6
  movhps   [r0+r3  ], m1
  %endmacro

 -%macro TR_ADD_INIT_SSE_8 0
 -pxor  m0, m0
 -
 -mova  m4, [r1]
 -mova  m1, [r1+16]
 -psubw m2, m0, m1
 -psubw m5, m0, m4
 -packuswb  m4, m1
 -packuswb  m5, m2
 -
 -mova  m6, [r1+32]
 -mova  m1, [r1+48]
 -psubw m2, m0, m1
 -psubw m7, m0, m6
 -packuswb  m6, m1
 -packuswb  m7, m2
 -
 -mova  m8, [r1+64]
 -mova  m1, [r1+80]
 -psubw m2, m0, m1
 -psubw m9, m0, m8
 -packuswb  m8, m1
 -packuswb  m9, m2
 -
 -mova m10, [r1+96]
 -mova  m1, [r1+112]
 -psubw m2, m0, m1
 -psubw m11, m0, m10
 -packuswb m10, m1
 -packuswb m11, m2
 -%endmacro
 -
 -
 -%macro TR_ADD_SSE_16_8 0
 -TR_ADD_INIT_SSE_8
 -
 -paddusb   m0, m4, [r0 ]
 -paddusb   m1, m6, [r0+r2  ]
 -paddusb   m2, m8, [r0+r2*2]
 -paddusb   m3, m10,[r0+r3  ]
 -psubusb   m0, m5
 -psubusb   m1, m7
 -psubusb   m2, m9
 -psubusb   m3, m11
 -mova   [r0 ], m0
 -mova   [r0+r2  ], m1
 -mova   [r0+2*r2], m2
 -mova   [r0+r3  ], m3
 -%endmacro
 -
 -%macro TR_ADD_SSE_32_8 0
 -TR_ADD_INIT_SSE_8
 -
 -paddusb   m0, m4, [r0  ]
 -paddusb   m1, m6, [r0+16   ]
 -paddusb   m2, m8, [r0+r2   ]
 -paddusb   m3, m10,[r0+r2+16]
 -psubusb   m0, m5
 -psubusb   m1, m7
 -psubusb   m2, m9
 -psubusb   m3, m11
 -mova  [r0  ], m0
 -mova  [r0+16   ], m1
 -mova  [r0+r2   ], m2
 -mova  [r0+r2+16], m3
 +%macro TR_ADD_SSE_16_32_8 3
 +mova  m2, [r1+%1   ]
 +mova  m6, [r1+%1+16]
 +%if cpuflag(avx)
 +psubw m1, m0, m2
 +psubw m5, m0, m6
 +%else
 +mova  m1, m0
 +mova  m5, m0
 +psubw m1, m2
 +psubw m5, m6
 +%endif
 
 I was wondering about these blocks - doesn't the x264asm layer
 automatically add the movas when you just use the 3-arg form on sse2?
 Or is there a speed benefit to grouping the movs?
 
 - Hendrik

It does that, but on older SSE2 cpus with not-so-good OOO execution, grouping
instructions like this might help reduce dependencies a bit.
This trick is used all over the tree (including some macros from x86util),
so it's certainly useful for some cpus.
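
For anyone not familiar with the x86inc emulation, this is roughly the
difference in question (a minimal sketch; register numbers are illustrative,
not taken from the patch):

    ; 3-operand form in the source:
    ;     psubw m1, m0, m2
    ; On AVX this maps directly to vpsubw. On SSE2, x86inc emulates it
    ; (when dst != src1) by emitting a mova plus the 2-operand op:
    mova  m1, m0
    psubw m1, m2

    ; The patch instead writes the SSE2 path by hand so both movas are
    ; issued back to back before either psubw consumes its copy, which
    ; can shorten the visible dependency chains on older cores:
    mova  m1, m0
    mova  m5, m0
    psubw m1, m2
    psubw m5, m6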


Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread Christophe Gisquet
Hi,

2014-08-21 0:42 GMT+02:00 James Almer jamr...@gmail.com:
 * Reduced xmm register count to 7 (as such, they are now enabled for x86_32).
 * Removed four movdqa (affects the sse2 version only).
 * pxor is now used to clear m0 only once.

OK.

-- 
Christophe


Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread James Almer
On 21/08/14 2:15 PM, Christophe Gisquet wrote:
 Hi,
 
 2014-08-21 0:42 GMT+02:00 James Almer jamr...@gmail.com:
 * Reduced xmm register count to 7 (as such, they are now enabled for x86_32).
 * Removed four movdqa (affects the sse2 version only).
 * pxor is now used to clear m0 only once.
 
 OK.

Pushed. Thanks.



Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread Kieran Kunhya
 It does that, but on older SSE2 cpus with not-so-good OOO execution, grouping
 instructions like this might help reduce dependencies a bit.

Are any older SSE2 CPUs actually capable of decoding reasonable HEVC?


Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread James Almer
On 21/08/14 5:40 PM, Kieran Kunhya wrote:
 It does that, but on older SSE2 cpus with not-so-good OOO execution, grouping
 instructions like this might help reduce dependencies a bit.
 
 Are any older SSE2 CPUs actually capable of decoding reasonable HEVC?

Probably not (at least nothing above DVD resolution), but aside from uglifying
the asm a bit it doesn't hurt.


Re: [FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-21 Thread Reimar Döffinger
On 21.08.2014, at 22:40, Kieran Kunhya kier...@obe.tv wrote:
 It does that, but on older SSE2 cpus with not-so-good OOO execution, grouping
 instructions like this might help reduce dependencies a bit.
 
 Are any older SSE2 CPUs actually capable of decoding reasonable HEVC?

Of course they are. Not in real time, but people would use the same method that
was used for H.264 back in the day: re-encode to a simpler format (and/or skip
processing steps).
So any speedup will still be welcome on these, even if less critical.


[FFmpeg-devel] [PATCH] x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8

2014-08-20 Thread James Almer
* Reduced xmm register count to 7 (as such, they are now enabled for x86_32).
* Removed four movdqa (affects the sse2 version only).
* pxor is now used to clear m0 only once.

~5% faster.

Signed-off-by: James Almer jamr...@gmail.com
---
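Note for reviewers: the new macro is built on the usual unsigned-saturation
trick. Each signed 16-bit residual r is reduced to two unsigned bytes,
clip_uint8(r) and clip_uint8(-r), so that dst = clip_uint8(dst + r) becomes
one paddusb plus one psubusb. A minimal one-vector sketch (register use is
illustrative, not the exact code below):

    pxor      m0, m0        ; zero register, cleared once per function
    mova      m2, [r1]      ; eight signed 16-bit residuals
    psubw     m1, m0, m2    ; m1 = -r
    packuswb  m2, m2        ; pos = clip_uint8(r),  packed in the low qword
    packuswb  m1, m1        ; neg = clip_uint8(-r), packed in the low qword
    movq      m3, [r0]      ; eight destination pixels
    paddusb   m3, m2        ; dst + pos, saturating at 255
    psubusb   m3, m1        ; ... - neg, saturating at 0
    movq      [r0], m3
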
 libavcodec/x86/hevc_res_add.asm | 122 
 libavcodec/x86/hevcdsp_init.c   |  10 ++--
 2 files changed, 51 insertions(+), 81 deletions(-)

diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index feea50c..7238fb3 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6
 movhps   [r0+r3  ], m1
 %endmacro
 
-%macro TR_ADD_INIT_SSE_8 0
-pxor  m0, m0
-
-mova  m4, [r1]
-mova  m1, [r1+16]
-psubw m2, m0, m1
-psubw m5, m0, m4
-packuswb  m4, m1
-packuswb  m5, m2
-
-mova  m6, [r1+32]
-mova  m1, [r1+48]
-psubw m2, m0, m1
-psubw m7, m0, m6
-packuswb  m6, m1
-packuswb  m7, m2
-
-mova  m8, [r1+64]
-mova  m1, [r1+80]
-psubw m2, m0, m1
-psubw m9, m0, m8
-packuswb  m8, m1
-packuswb  m9, m2
-
-mova m10, [r1+96]
-mova  m1, [r1+112]
-psubw m2, m0, m1
-psubw m11, m0, m10
-packuswb m10, m1
-packuswb m11, m2
-%endmacro
-
-
-%macro TR_ADD_SSE_16_8 0
-TR_ADD_INIT_SSE_8
-
-paddusb   m0, m4, [r0 ]
-paddusb   m1, m6, [r0+r2  ]
-paddusb   m2, m8, [r0+r2*2]
-paddusb   m3, m10,[r0+r3  ]
-psubusb   m0, m5
-psubusb   m1, m7
-psubusb   m2, m9
-psubusb   m3, m11
-mova   [r0 ], m0
-mova   [r0+r2  ], m1
-mova   [r0+2*r2], m2
-mova   [r0+r3  ], m3
-%endmacro
-
-%macro TR_ADD_SSE_32_8 0
-TR_ADD_INIT_SSE_8
-
-paddusb   m0, m4, [r0  ]
-paddusb   m1, m6, [r0+16   ]
-paddusb   m2, m8, [r0+r2   ]
-paddusb   m3, m10,[r0+r2+16]
-psubusb   m0, m5
-psubusb   m1, m7
-psubusb   m2, m9
-psubusb   m3, m11
-mova  [r0  ], m0
-mova  [r0+16   ], m1
-mova  [r0+r2   ], m2
-mova  [r0+r2+16], m3
+%macro TR_ADD_SSE_16_32_8 3
+mova  m2, [r1+%1   ]
+mova  m6, [r1+%1+16]
+%if cpuflag(avx)
+psubw m1, m0, m2
+psubw m5, m0, m6
+%else
+mova  m1, m0
+mova  m5, m0
+psubw m1, m2
+psubw m5, m6
+%endif
+packuswb  m2, m6
+packuswb  m1, m5
+
+mova  m4, [r1+%1+32]
+mova  m6, [r1+%1+48]
+%if cpuflag(avx)
+psubw m3, m0, m4
+psubw m5, m0, m6
+%else
+mova  m3, m0
+mova  m5, m0
+psubw m3, m4
+psubw m5, m6
+%endif
+packuswb  m4, m6
+packuswb  m3, m5
+
+paddusb   m2, [%2]
+paddusb   m4, [%3]
+psubusb   m2, m1
+psubusb   m4, m3
+mova[%2], m2
+mova[%3], m4
 %endmacro
 
 
@@ -166,30 +136,32 @@ cglobal hevc_transform_add8_8, 3, 4, 8
 TR_ADD_SSE_8_8
 RET
 
-%if ARCH_X86_64
 ; void ff_hevc_transform_add16_8_opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add16_8, 3, 4, 12
+cglobal hevc_transform_add16_8, 3, 4, 7
+pxor  m0, m0
 lea   r3, [r2*3]
-TR_ADD_SSE_16_8
+TR_ADD_SSE_16_32_8  0, r0,  r0+r2
+TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
 %rep 3
 addr1, 128
 lear0, [r0+r2*4]
-TR_ADD_SSE_16_8
+TR_ADD_SSE_16_32_8  0, r0,  r0+r2
+TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
 %endrep
 RET
 
 ; void ff_hevc_transform_add32_8_opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_transform_add32_8, 3, 4, 12
-
-TR_ADD_SSE_32_8
+cglobal hevc_transform_add32_8, 3, 4, 7
+pxor   m0, m0
+TR_ADD_SSE_16_32_8  0, r0,r0+16
+TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
 %rep 15
 addr1, 128
 lear0, [r0+r2*2]
-TR_ADD_SSE_32_8
+TR_ADD_SSE_16_32_8  0, r0,r0+16
+TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
 %endrep
 RET
-
-%endif ;ARCH_X86_64
 %endmacro
 
 INIT_XMM sse2
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f6f0a4b..0709158 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -477,15 +477,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 if (ARCH_X86_64) {