Re: [FFmpeg-devel] [PATCH] x86/hevc_sao: move 10/12bit functions into a separate file

2015-09-29 Thread James Almer
On 9/29/2015 11:39 PM, Michael Niedermayer wrote:
> On Tue, Sep 29, 2015 at 01:40:51PM -0300, James Almer wrote:
>> Sorry for the attachment. git send-email is giving me an unusual error when
>> i try to send this.
> 
>>  Makefile   |3 
>>  hevc_sao.asm   |  394 ++--
>>  hevc_sao_10bit.asm |  433 
>> +
>>  3 files changed, 490 insertions(+), 340 deletions(-)
>> f12242898081e1241e29947a3f2e83cc7fe86013  
>> 0001-x86-hevc_sao-move-10-12bit-functions-into-a-separate.patch
>> From 9e8ec4d51566cdda677b15e50240e8842ec6cd34 Mon Sep 17 00:00:00 2001
>> From: James Almer 
>> Date: Mon, 28 Sep 2015 00:58:01 -0300
>> Subject: [PATCH] x86/hevc_sao: move 10/12bit functions into a separate file
>>
>> Signed-off-by: James Almer 
>> ---
>> There's a bit of code duplication now (init functions), but it's cleaner
>> and should hopefully be easier to read.
> 
> tested, seems working

Pushed then, thanks.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] x86/hevc_sao: move 10/12bit functions into a separate file

2015-09-29 Thread Michael Niedermayer
On Tue, Sep 29, 2015 at 01:40:51PM -0300, James Almer wrote:
> Sorry for the attachment. git send-email is giving me an unusual error when
> i try to send this.

>  Makefile   |3 
>  hevc_sao.asm   |  394 ++--
>  hevc_sao_10bit.asm |  433 
> +
>  3 files changed, 490 insertions(+), 340 deletions(-)
> f12242898081e1241e29947a3f2e83cc7fe86013  
> 0001-x86-hevc_sao-move-10-12bit-functions-into-a-separate.patch
> From 9e8ec4d51566cdda677b15e50240e8842ec6cd34 Mon Sep 17 00:00:00 2001
> From: James Almer 
> Date: Mon, 28 Sep 2015 00:58:01 -0300
> Subject: [PATCH] x86/hevc_sao: move 10/12bit functions into a separate file
> 
> Signed-off-by: James Almer 
> ---
> There's a bit of code duplication now (init functions), but it's cleaner
> and should hopefully be easier to read.

tested, seems working


[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Answering whether a program halts or runs forever is
On a Turing machine, in general impossible (Turing's halting problem).
On any real computer, always possible as a real computer has a finite number
of states N, and will either halt in less than N cycles or never halt.


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] x86/hevc_sao: move 10/12bit functions into a separate file

2015-09-29 Thread James Almer
Sorry for the attachment. git send-email is giving me an unusual error when
I try to send this.
From 9e8ec4d51566cdda677b15e50240e8842ec6cd34 Mon Sep 17 00:00:00 2001
From: James Almer 
Date: Mon, 28 Sep 2015 00:58:01 -0300
Subject: [PATCH] x86/hevc_sao: move 10/12bit functions into a separate file

Signed-off-by: James Almer 
---
There's a bit of code duplication now (init functions), but it's cleaner
and should hopefully be easier to read.

 libavcodec/x86/Makefile   |   3 +-
 libavcodec/x86/hevc_sao.asm   | 394 +-
 libavcodec/x86/hevc_sao_10bit.asm | 433 ++
 3 files changed, 490 insertions(+), 340 deletions(-)
 create mode 100644 libavcodec/x86/hevc_sao_10bit.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index b3cfb0b..febaccd 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -140,7 +140,8 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER)   += x86/hevc_mc.o \
   x86/hevc_deblock.o\
   x86/hevc_idct.o   \
   x86/hevc_res_add.o\
-  x86/hevc_sao.o
+  x86/hevc_sao.o\
+  x86/hevc_sao_10bit.o
 YASM-OBJS-$(CONFIG_JPEG2000_DECODER)   += x86/jpeg2000dsp.o
 YASM-OBJS-$(CONFIG_MLP_DECODER)+= x86/mlpdsp.o
 YASM-OBJS-$(CONFIG_MPEG4_DECODER)  += x86/xvididct.o
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
index fa45a24..888a28a 100644
--- a/libavcodec/x86/hevc_sao.asm
+++ b/libavcodec/x86/hevc_sao.asm
@@ -1,5 +1,5 @@
 ;**
-;* SIMD optimized SAO functions for HEVC decoding
+;* SIMD optimized SAO functions for HEVC 8bit decoding
 ;*
 ;* Copyright (c) 2013 Pierre-Edouard LEPERE
 ;* Copyright (c) 2014 James Almer
@@ -25,27 +25,18 @@
 
 SECTION_RODATA 32
 
-pw_mask10: times 16 dw 0x03FF
-pw_mask12: times 16 dw 0x0FFF
-pw_m2: times 16 dw -2
 pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 pb_eo:   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
-cextern pw_m1
-cextern pw_1
-cextern pw_2
 cextern pb_1
 cextern pb_2
 
 SECTION .text
 
-%define MAX_PB_SIZE  64
-%define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
-
 ;**
 ;SAO Band Filter
 ;**
 
-%macro HEVC_SAO_BAND_FILTER_INIT 1
+%macro HEVC_SAO_BAND_FILTER_INIT 0
 andleftq, 31
 movd xm0, leftd
 addleftq, 1
@@ -76,9 +67,6 @@ SECTION .text
 %endif
 
 %if ARCH_X86_64
-%if %1 > 8
-mova m13, [pw_mask %+ %1]
-%endif
 pxor m14, m14
 
 %else ; ARCH_X86_32
@@ -90,9 +78,6 @@ SECTION .text
 mova  [rsp+mmsize*5], m5
 mova  [rsp+mmsize*6], m6
 pxor  m0, m0
-%if %1 > 8
-mova  m1, [pw_mask %+ %1]
-%endif
 %assign MMSIZE mmsize
 %define m14 m0
 %define m13 m1
@@ -103,49 +88,49 @@ DEFINE_ARGS dst, src, dststride, srcstride, offset, height
 mov  heightd, r7m
 %endmacro
 
-%macro HEVC_SAO_BAND_FILTER_COMPUTE 3
-psraw %2, %3, %1-5
+%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
+psraw %1, %2, 3
 %if ARCH_X86_64
-pcmpeqw  m10, %2, m0
-pcmpeqw  m11, %2, m1
-pcmpeqw  m12, %2, m2
-pcmpeqw   %2, m3
+pcmpeqw  m10, %1, m0
+pcmpeqw  m11, %1, m1
+pcmpeqw  m12, %1, m2
+pcmpeqw   %1, m3
 pand m10, m4
 pand m11, m5
 pand m12, m6
-pand  %2, m7
+pand  %1, m7
 por  m10, m11
-por  m12, %2
+por  m12, %1
 por  m10, m12
-paddw %3, m10
+paddw %2, m10
 %else ; ARCH_X86_32
-pcmpeqw   m4, %2, [rsp+MMSIZE*0]
-pcmpeqw   m5, %2, [rsp+MMSIZE*1]
-pcmpeqw   m6, %2, [rsp+MMSIZE*2]
-pcmpeqw   %2, [rsp+MMSIZE*3]
+pcmpeqw   m4, %1, [rsp+MMSIZE*0]
+pcmpeqw   m5, %1, [rsp+MMSIZE*1]
+pcmpeqw   m6, %1, [rsp+MMSIZE*2]
+pcmpeqw   %1, [rsp+MMSIZE*3]
 pand  m4, [rsp+MMSIZE*4]
 pand  m5, [rsp+MMSIZE*5]
 pand  m6, [rsp+MMSIZE*6]
-pand  %2, m7
+pand  %1, m7
 por   m4, m5
-por   m6, %2
+por   m6, %1
 por   m4, m6
-paddw %3, m4
+paddw %2, m4
 %endif ; ARCH
 %endmacro
 
 ;void ff_hevc_sao_band_filter__