Re: [FFmpeg-devel] [PATCH 4/9] x86: simple_idct_put: 12bits versions

2015-10-09 Thread Michael Niedermayer
On Thu, Oct 08, 2015 at 08:22:51AM +0200, Christophe Gisquet wrote:
> On 12 frames of a 444p 12 bits DNxHR sequence:
> C: 78902 decicycles in idct,  262071 runs, 73 skips
> avx:   32478 decicycles in idct,  262045 runs, 99 skips
> 
> Difference between the 2:
> stddev:0.39 PSNR:104.47 MAXDIFF:2
> 
> This is unavoidable and due to the scale factors used in the x86
> version, which cannot match the C ones, as this would cause
> overflows (there is one bit less of precision).
> ---
>  libavcodec/x86/idctdsp_init.c    | 19 +++++++++++++++++--
>  libavcodec/x86/simple_idct.h     |  3 +++
>  libavcodec/x86/simple_idct10.asm | 18 +++++++++++++++---
>  3 files changed, 35 insertions(+), 5 deletions(-)

this causes overflows somewhere with mjpeg
see https://trac.ffmpeg.org/attachment/ticket/4683/12bpp.jpg

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I know you won't believe me, but the highest form of Human Excellence is
to question oneself and others. -- Socrates


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 4/9] x86: simple_idct_put: 12bits versions

2015-10-07 Thread Christophe Gisquet
On 12 frames of a 444p 12 bits DNxHR sequence:
C: 78902 decicycles in idct,  262071 runs, 73 skips
avx:   32478 decicycles in idct,  262045 runs, 99 skips

Difference between the 2:
stddev:0.39 PSNR:104.47 MAXDIFF:2

This is unavoidable and due to the scale factors used in the x86
version, which cannot match the C ones, as this would cause
overflows (there is one bit less of precision).
---
 libavcodec/x86/idctdsp_init.c    | 19 +++++++++++++++++--
 libavcodec/x86/simple_idct.h     |  3 +++
 libavcodec/x86/simple_idct10.asm | 18 +++++++++++++++---
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 17ddc9e..0051461 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -86,8 +86,8 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->add_pixels_clamped= ff_add_pixels_clamped_sse2;
 }
 
-if (ARCH_X86_64 &&
-avctx->bits_per_raw_sample == 10 && avctx->lowres == 0 &&
+if (ARCH_X86_64 && avctx->lowres == 0) {
+if (avctx->bits_per_raw_sample == 10 &&
 (avctx->idct_algo == FF_IDCT_AUTO ||
  avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
  avctx->idct_algo == FF_IDCT_SIMPLE)) {
@@ -100,5 +100,20 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, 
AVCodecContext *avctx,
 c->idct_put  = ff_simple_idct10_put_avx;
 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
 }
+}
+
+if (avctx->bits_per_raw_sample == 12 &&
+(avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+if (EXTERNAL_SSE2(cpu_flags)) {
+c->idct_put  = ff_simple_idct12_put_sse2;
+c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+
+}
+if (EXTERNAL_AVX(cpu_flags)) {
+c->idct_put  = ff_simple_idct12_put_avx;
+c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+}
+}
 }
 }
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d886434..0a90c36 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -28,4 +28,7 @@ void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, 
int16_t *block);
 void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block);
 
+void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block);
+
 #endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 77db0a7..06290ae 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -29,25 +29,37 @@
 
 SECTION_RODATA
 
+cextern pw_1
+cextern pw_2
 cextern pw_8
 cextern pw_1023
+cextern pw_4095
 pd_round: times 4 dd 1<<(13-1)
 
 %include "libavcodec/x86/simple_idct10_template.asm"
 
 section .text align=16
 
-%macro idct_put_fn 1
+%macro idct_put_fn10 1
 cglobal simple_idct10_put, 3, 3, %1
 IDCT_PUT_FN"", 13, pw_8, 18, 0, pw_1023
 RET
 %endmacro
 
+%macro idct_put_fn12 1
+cglobal simple_idct12_put, 3, 3, %1
+; range isn't known, so the C simple_idct range is used
+IDCT_PUT_FNpw_1, 15, pw_2, 16, 0, pw_4095
+RET
+%endmacro
+
 INIT_XMM sse2
-idct_put_fn 16
+idct_put_fn10 16
+idct_put_fn12 16
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
-idct_put_fn 16
+idct_put_fn10 16
+idct_put_fn12 16
 %endif
 
 %endif
-- 
2.6.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel