Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-29 Thread Rostislav Pehlivanov
On 27 June 2016 at 22:38, James Almer  wrote:

> On 6/27/2016 8:53 AM, Rostislav Pehlivanov wrote:
> > I've attached another patch which should work fine now.
> > I did this after the put_signed_rect so it does require the first patch,
> > but if this patch is okay I'll amend and tidy things before I push.
> > For some reason changing dstq to be stored at r4 or r3 broke it and I've
> no
> > idea why. Neither is used after loading m2 and m3. Should work on x86_32
> > now, but I'm wondering why I can't save that register.
>
> [...]
>
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index c5cc530..4bc8b2d 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -266,9 +266,45 @@ HPEL_FILTER sse2
> >  ADD_OBMC 32, sse2
> >  ADD_OBMC 16, sse2
> >
> > -%if ARCH_X86_64 == 1
> >  INIT_XMM sse4
> >
> > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> const int qf, const int qs, int tot_v, int tot_h)
> > +cglobal dequant_subband_32, 7, 8, 4, src, dst, stride, qf, qs, tot_v,
> tot_h
>
> x86_32 has 8 gprs but you can only use 7 as the last one is reserved
> to keep the stack pointer.
>
> > +
> > +movd   m2, qfd
> > +movd   m3, qsd
> > +SPLATD m2
> > +SPLATD m3
> > +mov    r4, tot_hq
> > +mov    r7, dstq
> > +
> > +.loop_v:
> > +mov    tot_hq, r4
> > +mov    dstq,   r7
> > +
> > +.loop_h:
> > +movu   m0, [srcq]
> > +
> > +pabsd  m1, m0
> > +pmulld m1, m2
> > +paddd  m1, m3
> > +psrld  m1,  2
> > +psignd m1, m0
> > +
> > +movu   [dstq], m1
> > +
> > +add    srcq, mmsize
> > +add    dstq, mmsize
> > +sub    tot_hd, 4
> > +jg .loop_h
> > +
> > +add    r7, strideq
> > +dec    tot_vd
> > +jg .loop_v
> > +
> > +RET
>
> I'm not sure why you say using r3 instead of r7 here didn't work for
> you. I just tried it (after applying all patches up to 6/10) and fate
> at least still passes, on both x86_64 and x86_32.
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Odd, works fine now. I guess it just needed a clean build.
Attached a working patch.

I'd like to get some feedback on the other patches before I push though,
particularly the Golomb reader.
From 4ed0be9216175d5f394a1176596e6cbd5eee7b9a Mon Sep 17 00:00:00 2001
From: Rostislav Pehlivanov 
Date: Thu, 23 Jun 2016 18:06:56 +0100
Subject: [PATCH] diracdsp: add dequantization SIMD

Currently unused, to be used in the following commits.

Signed-off-by: Rostislav Pehlivanov 
---
 libavcodec/diracdsp.c  | 24 
 libavcodec/diracdsp.h  |  4 
 libavcodec/x86/diracdsp.asm| 38 +-
 libavcodec/x86/diracdsp_init.c |  7 +--
 4 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
index ab8d149..cd1209e 100644
--- a/libavcodec/diracdsp.c
+++ b/libavcodec/diracdsp.c
@@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
 }
 }
 
+#define DEQUANT_SUBBAND(PX)\
+static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride, \
+ const int qf, const int qs, int tot_v, int tot_h) \
+{  \
+int i, y;  \
+for (y = 0; y < tot_v; y++) {  \
+PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;\
+for (i = 0; i < tot_h; i++) {  \
+c = *src_r++;  \
+sign = FFSIGN(c)*(!!c);\
+c = (FFABS(c)*qf + qs) >> 2;   \
+*dst_r++ = c*sign; \
+}  \
+src += tot_h << (sizeof(PX) >> 1); \
+dst += stride; \
+}  \
+}
+
+DEQUANT_SUBBAND(int16_t)
+DEQUANT_SUBBAND(int32_t)
+
 #define PIXFUNC(PFX, WIDTH) \
 c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
 c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## W

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-27 Thread James Almer
On 6/27/2016 8:53 AM, Rostislav Pehlivanov wrote:
> I've attached another patch which should work fine now.
> I did this after the put_signed_rect so it does require the first patch,
> but if this patch is okay I'll amend and tidy things before I push.
> For some reason changing dstq to be stored at r4 or r3 broke it and I've no
> idea why. Neither is used after loading m2 and m3. Should work on x86_32
> now, but I'm wondering why I can't save that register.

[...]

> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index c5cc530..4bc8b2d 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -266,9 +266,45 @@ HPEL_FILTER sse2
>  ADD_OBMC 32, sse2
>  ADD_OBMC 16, sse2
>  
> -%if ARCH_X86_64 == 1
>  INIT_XMM sse4
>  
> +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 
> const int qf, const int qs, int tot_v, int tot_h)
> +cglobal dequant_subband_32, 7, 8, 4, src, dst, stride, qf, qs, tot_v, tot_h

x86_32 has 8 gprs but you can only use 7 as the last one is reserved
to keep the stack pointer.

> +
> +movd   m2, qfd
> +movd   m3, qsd
> +SPLATD m2
> +SPLATD m3
> +mov    r4, tot_hq
> +mov    r7, dstq
> +
> +.loop_v:
> +mov    tot_hq, r4
> +mov    dstq,   r7
> +
> +.loop_h:
> +movu   m0, [srcq]
> +
> +pabsd  m1, m0
> +pmulld m1, m2
> +paddd  m1, m3
> +psrld  m1,  2
> +psignd m1, m0
> +
> +movu   [dstq], m1
> +
> +add    srcq, mmsize
> +add    dstq, mmsize
> +sub    tot_hd, 4
> +jg .loop_h
> +
> +add    r7, strideq
> +dec    tot_vd
> +jg .loop_v
> +
> +RET

I'm not sure why you say using r3 instead of r7 here didn't work for
you. I just tried it (after applying all patches up to 6/10) and fate
at least still passes, on both x86_64 and x86_32.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-27 Thread Michael Niedermayer
On Mon, Jun 27, 2016 at 12:53:47PM +0100, Rostislav Pehlivanov wrote:
> On 24 June 2016 at 16:38, James Almer  wrote:
> 
> > On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote:
> > > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
> > > From: Rostislav Pehlivanov 
> > > Date: Thu, 23 Jun 2016 18:06:56 +0100
> > > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD
> > >
> > > Currently unused, to be used in the following commits.
> > >
> > > Signed-off-by: Rostislav Pehlivanov 
> > > ---
> > >  libavcodec/diracdsp.c  | 24 
> > >  libavcodec/diracdsp.h  |  4 
> > >  libavcodec/x86/diracdsp.asm| 36 
> > >  libavcodec/x86/diracdsp_init.c |  2 ++
> > >  4 files changed, 66 insertions(+)
> > >
> > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > > index ab8d149..cd1209e 100644
> > > --- a/libavcodec/diracdsp.c
> > > +++ b/libavcodec/diracdsp.c
> > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> > uint16_t *src, int stride,
> > >  }
> > >  }
> > >
> > > +#define DEQUANT_SUBBAND(PX)
> > \
> > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> > ptrdiff_t stride, \
> > > + const int qf, const int qs,
> > int tot_v, int tot_h) \
> > > +{
> > \
> > > +int i, y;
> > \
> > > +for (y = 0; y < tot_v; y++) {
> > \
> > > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
> > \
> > > +for (i = 0; i < tot_h; i++) {
> > \
> > > +c = *src_r++;
> > \
> > > +sign = FFSIGN(c)*(!!c);
> > \
> > > +c = (FFABS(c)*qf + qs) >> 2;
> >\
> > > +*dst_r++ = c*sign;
> >\
> > > +}
> > \
> > > +src += tot_h << (sizeof(PX) >> 1);
> >\
> > > +dst += stride;
> >\
> > > +}
> > \
> > > +}
> > > +
> > > +DEQUANT_SUBBAND(int16_t)
> > > +DEQUANT_SUBBAND(int32_t)
> > > +
> > >  #define PIXFUNC(PFX, WIDTH)
> >  \
> > >  c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> > _dirac_pixels ## WIDTH ## _c; \
> > >  c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> > _dirac_pixels ## WIDTH ## _l2_c; \
> > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> > >  c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> > >  c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> > >
> > > +c->dequant_subband[0] = c->dequant_subband[2] =
> > dequant_subband_int16_t_c;
> > > +c->dequant_subband[1] = c->dequant_subband[3] =
> > dequant_subband_int32_t_c;
> > > +
> > >  PIXFUNC(put, 8);
> > >  PIXFUNC(put, 16);
> > >  PIXFUNC(put, 32);
> > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > > index 25a872d..224828d 100644
> > > --- a/libavcodec/diracdsp.h
> > > +++ b/libavcodec/diracdsp.h
> > > @@ -22,6 +22,7 @@
> > >  #define AVCODEC_DIRACDSP_H
> > >
> > > >  #include <stdint.h>
> > > > +#include <stddef.h>
> > >
> > >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> > log2_denom, int weight, int h);
> > >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> > int stride, int log2_denom, int weightd, int weights, int h);
> > > @@ -46,6 +47,9 @@ typedef struct {
> > >  void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> > idwt_stride, int width, int height/*mod 2*/);
> > >  void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> > stride, const uint8_t *obmc_weight, int yblen);
> > >
> > > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> > stride, const int qf, const int qs, int tot_v, int tot_h);
> > > +
> > >  dirac_weight_func weight_dirac_pixels_tab[3];
> > >  dirac_biweight_func biweight_dirac_pixels_tab[3];
> > >  } DiracDSPContext;
> > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > > index a0d6788..a764706 100644
> > > --- a/libavcodec/x86/diracdsp.asm
> > > +++ b/libavcodec/x86/diracdsp.asm
> > > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> > dst_stride, src, src_stride, w
> > >
> > >  RET
> > >
> > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> > const int qf, const int qs, int tot_v, int tot_h)
> > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> > tot_h
> > > +
> > > +movd   m2, qfd
> > > +movd   m3, qsd
> > > +SPLATD m2
> > > +SPLATD m3
> > > +mov    r7, dstq
> > > +mov    r8, tot_hq
> >
> > 

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-27 Thread Rostislav Pehlivanov
On 24 June 2016 at 16:38, James Almer  wrote:

> On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote:
> > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
> > From: Rostislav Pehlivanov 
> > Date: Thu, 23 Jun 2016 18:06:56 +0100
> > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD
> >
> > Currently unused, to be used in the following commits.
> >
> > Signed-off-by: Rostislav Pehlivanov 
> > ---
> >  libavcodec/diracdsp.c  | 24 
> >  libavcodec/diracdsp.h  |  4 
> >  libavcodec/x86/diracdsp.asm| 36 
> >  libavcodec/x86/diracdsp_init.c |  2 ++
> >  4 files changed, 66 insertions(+)
> >
> > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > index ab8d149..cd1209e 100644
> > --- a/libavcodec/diracdsp.c
> > +++ b/libavcodec/diracdsp.c
> > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> uint16_t *src, int stride,
> >  }
> >  }
> >
> > +#define DEQUANT_SUBBAND(PX)
> \
> > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> ptrdiff_t stride, \
> > + const int qf, const int qs,
> int tot_v, int tot_h) \
> > +{
> \
> > +int i, y;
> \
> > +for (y = 0; y < tot_v; y++) {
> \
> > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
> \
> > +for (i = 0; i < tot_h; i++) {
> \
> > +c = *src_r++;
> \
> > +sign = FFSIGN(c)*(!!c);
> \
> > +c = (FFABS(c)*qf + qs) >> 2;
>\
> > +*dst_r++ = c*sign;
>\
> > +}
> \
> > +src += tot_h << (sizeof(PX) >> 1);
>\
> > +dst += stride;
>\
> > +}
> \
> > +}
> > +
> > +DEQUANT_SUBBAND(int16_t)
> > +DEQUANT_SUBBAND(int32_t)
> > +
> >  #define PIXFUNC(PFX, WIDTH)
>  \
> >  c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _c; \
> >  c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _l2_c; \
> > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> >  c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> >  c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> >
> > +c->dequant_subband[0] = c->dequant_subband[2] =
> dequant_subband_int16_t_c;
> > +c->dequant_subband[1] = c->dequant_subband[3] =
> dequant_subband_int32_t_c;
> > +
> >  PIXFUNC(put, 8);
> >  PIXFUNC(put, 16);
> >  PIXFUNC(put, 32);
> > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > index 25a872d..224828d 100644
> > --- a/libavcodec/diracdsp.h
> > +++ b/libavcodec/diracdsp.h
> > @@ -22,6 +22,7 @@
> >  #define AVCODEC_DIRACDSP_H
> >
> >  #include <stdint.h>
> > +#include <stddef.h>
> >
> >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> log2_denom, int weight, int h);
> >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> int stride, int log2_denom, int weightd, int weights, int h);
> > @@ -46,6 +47,9 @@ typedef struct {
> >  void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> idwt_stride, int width, int height/*mod 2*/);
> >  void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> stride, const uint8_t *obmc_weight, int yblen);
> >
> > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
> > +
> >  dirac_weight_func weight_dirac_pixels_tab[3];
> >  dirac_biweight_func biweight_dirac_pixels_tab[3];
> >  } DiracDSPContext;
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index a0d6788..a764706 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> dst_stride, src, src_stride, w
> >
> >  RET
> >
> > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> const int qf, const int qs, int tot_v, int tot_h)
> > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> tot_h
> > +
> > +movd   m2, qfd
> > +movd   m3, qsd
> > +SPLATD m2
> > +SPLATD m3
> > +mov    r7, dstq
> > +mov    r8, tot_hq
>
> Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4
> and the function will work on x86_32.
>
> > +
> > +.loop_v:
> > +mov    dstq,   r7
> > +mov    tot_hq, r8
> > +
> > +.loop_h:
> > +movu   m0, [srcq]
> > +
> > +pabsd  m1, m0
> > +pmulld m1, m2
> > +paddd  m1, m3
> > +psrld  m1

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-24 Thread James Almer
On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote:
> From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
> From: Rostislav Pehlivanov 
> Date: Thu, 23 Jun 2016 18:06:56 +0100
> Subject: [PATCH 2/2] diracdsp: add dequantization SIMD
> 
> Currently unused, to be used in the following commits.
> 
> Signed-off-by: Rostislav Pehlivanov 
> ---
>  libavcodec/diracdsp.c  | 24 
>  libavcodec/diracdsp.h  |  4 
>  libavcodec/x86/diracdsp.asm| 36 
>  libavcodec/x86/diracdsp_init.c |  2 ++
>  4 files changed, 66 insertions(+)
> 
> diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> index ab8d149..cd1209e 100644
> --- a/libavcodec/diracdsp.c
> +++ b/libavcodec/diracdsp.c
> @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const 
> uint16_t *src, int stride,
>  }
>  }
>  
> +#define DEQUANT_SUBBAND(PX)  
>   \
> +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, 
> ptrdiff_t stride, \
> + const int qf, const int qs, int 
> tot_v, int tot_h) \
> +{
>   \
> +int i, y;
>   \
> +for (y = 0; y < tot_v; y++) {
>   \
> +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;  
>   \
> +for (i = 0; i < tot_h; i++) {
>   \
> +c = *src_r++;
>   \
> +sign = FFSIGN(c)*(!!c);  
>   \
> +c = (FFABS(c)*qf + qs) >> 2; 
>   \
> +*dst_r++ = c*sign;   
>   \
> +}
>   \
> +src += tot_h << (sizeof(PX) >> 1);   
>   \
> +dst += stride;   
>   \
> +}
>   \
> +}
> +
> +DEQUANT_SUBBAND(int16_t)
> +DEQUANT_SUBBAND(int32_t)
> +
>  #define PIXFUNC(PFX, WIDTH) \
>  c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels 
> ## WIDTH ## _c; \
>  c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels 
> ## WIDTH ## _l2_c; \
> @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
>  c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
>  c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
>  
> +c->dequant_subband[0] = c->dequant_subband[2] = 
> dequant_subband_int16_t_c;
> +c->dequant_subband[1] = c->dequant_subband[3] = 
> dequant_subband_int32_t_c;
> +
>  PIXFUNC(put, 8);
>  PIXFUNC(put, 16);
>  PIXFUNC(put, 32);
> diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> index 25a872d..224828d 100644
> --- a/libavcodec/diracdsp.h
> +++ b/libavcodec/diracdsp.h
> @@ -22,6 +22,7 @@
>  #define AVCODEC_DIRACDSP_H
>  
>  #include <stdint.h>
> +#include <stddef.h>
>  
>  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int 
> log2_denom, int weight, int h);
>  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int 
> stride, int log2_denom, int weightd, int weights, int h);
> @@ -46,6 +47,9 @@ typedef struct {
>  void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t 
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int 
> idwt_stride, int width, int height/*mod 2*/);
>  void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, 
> const uint8_t *obmc_weight, int yblen);
>  
> +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 
> const int qf, const int qs, int tot_v, int tot_h);
> +
>  dirac_weight_func weight_dirac_pixels_tab[3];
>  dirac_biweight_func biweight_dirac_pixels_tab[3];
>  } DiracDSPContext;
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index a0d6788..a764706 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, 
> dst_stride, src, src_stride, w
>  
>  RET
>  
> +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 
> const int qf, const int qs, int tot_v, int tot_h)
> +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h
> +
> +movd   

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-24 Thread Rostislav Pehlivanov
On 23 June 2016 at 21:01, James Almer  wrote:

> On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> > Currently unused, to be used in the following commits.
> >
> > Signed-off-by: Rostislav Pehlivanov 
> > ---
> >  libavcodec/diracdsp.c  | 24 
> >  libavcodec/diracdsp.h  |  4 
> >  libavcodec/x86/diracdsp.asm| 41
> +
> >  libavcodec/x86/diracdsp_init.c |  4 +++-
> >  4 files changed, 72 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > index ab8d149..d0cfd00 100644
> > --- a/libavcodec/diracdsp.c
> > +++ b/libavcodec/diracdsp.c
> > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> uint16_t *src, int stride,
> >  }
> >  }
> >
> > +#define DEQUANT_SUBBAND(PX)
> \
> > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> ptrdiff_t stride, \
> > + const int qf, const int qs,
> int64_t tot_v, int64_t tot_h) \
>
> Shouldn't this be int (or ptrdiff_t)? Seeing they are int in the
> SliceCoeffs struct introduced by patch 6, i don't see why they
> should be int64_t here. Unless I'm missing something.
>
> > +{
> \
> > +int i, y;
> \
> > +for (y = 0; y < tot_v; y++) {
> \
> > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
> \
> > +for (i = 0; i < tot_h; i++) {
> \
> > +c = *src_r++;
> \
> > +sign = FFSIGN(c)*(!!c);
> \
> > +c = (FFABS(c)*qf + qs) >> 2;
>\
> > +*dst_r++ = c*sign;
>\
> > +}
> \
> > +src += tot_h << (sizeof(PX) >> 1);
>\
> > +dst += stride;
>\
> > +}
> \
> > +}
> > +
> > +DEQUANT_SUBBAND(int16_t)
> > +DEQUANT_SUBBAND(int32_t)
> > +
> >  #define PIXFUNC(PFX, WIDTH)
>  \
> >  c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _c; \
> >  c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _l2_c; \
> > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> >  c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> >  c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> >
> > +c->dequant_subband[0] = c->dequant_subband[2] =
> dequant_subband_int16_t_c;
> > +c->dequant_subband[1] = c->dequant_subband[3] =
> dequant_subband_int32_t_c;
> > +
> >  PIXFUNC(put, 8);
> >  PIXFUNC(put, 16);
> >  PIXFUNC(put, 32);
> > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > index 25a872d..c0ac56b 100644
> > --- a/libavcodec/diracdsp.h
> > +++ b/libavcodec/diracdsp.h
> > @@ -22,6 +22,7 @@
> >  #define AVCODEC_DIRACDSP_H
> >
> >  #include <stdint.h>
> > +#include <stddef.h>
> >
> >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> log2_denom, int weight, int h);
> >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> int stride, int log2_denom, int weightd, int weights, int h);
> > @@ -46,6 +47,9 @@ typedef struct {
> >  void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> idwt_stride, int width, int height/*mod 2*/);
> >  void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> stride, const uint8_t *obmc_weight, int yblen);
> >
> > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h);
> > +
> >  dirac_weight_func weight_dirac_pixels_tab[3];
> >  dirac_biweight_func biweight_dirac_pixels_tab[3];
> >  } DiracDSPContext;
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index 9db7b67..f743363 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src,
> stride, obmc, yblen
> >  RET
> >  %endm
> >
> > +%macro DEQUANT_SUBBAND_32 0
> > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> const int qf, const int qs, int64_t tot_v, int64_t tot_h)
> > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> tot_h
>
> Again, x86_64 only as is.
>
> > +
> > +movd   m2, qfd
> > +movd   m3, qsd
> > +SPLATD m2
> > +SPLATD m3
> > +neg    tot_vq
> > +neg    tot_hq
>
> Same as with put_signed_rect_clamped_10, no reason to neg these.
>
> > +mov    r7, dstq
> > +mov    r8, tot_hq
>
> You have qf 

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-23 Thread James Almer
On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> Currently unused, to be used in the following commits.
> 
> Signed-off-by: Rostislav Pehlivanov 
> ---
>  libavcodec/diracdsp.c  | 24 
>  libavcodec/diracdsp.h  |  4 
>  libavcodec/x86/diracdsp.asm| 41 +
>  libavcodec/x86/diracdsp_init.c |  4 +++-
>  4 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> index ab8d149..d0cfd00 100644
> --- a/libavcodec/diracdsp.c
> +++ b/libavcodec/diracdsp.c
> @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const 
> uint16_t *src, int stride,
>  }
>  }
>  
> +#define DEQUANT_SUBBAND(PX)  
>   \
> +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, 
> ptrdiff_t stride, \
> + const int qf, const int qs, int64_t 
> tot_v, int64_t tot_h) \

Shouldn't this be int (or ptrdiff_t)? Seeing they are int in the
SliceCoeffs struct introduced by patch 6, i don't see why they
should be int64_t here. Unless I'm missing something.

> +{
>   \
> +int i, y;
>   \
> +for (y = 0; y < tot_v; y++) {
>   \
> +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;  
>   \
> +for (i = 0; i < tot_h; i++) {
>   \
> +c = *src_r++;
>   \
> +sign = FFSIGN(c)*(!!c);  
>   \
> +c = (FFABS(c)*qf + qs) >> 2; 
>   \
> +*dst_r++ = c*sign;   
>   \
> +}
>   \
> +src += tot_h << (sizeof(PX) >> 1);   
>   \
> +dst += stride;   
>   \
> +}
>   \
> +}
> +
> +DEQUANT_SUBBAND(int16_t)
> +DEQUANT_SUBBAND(int32_t)
> +
>  #define PIXFUNC(PFX, WIDTH) \
>  c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels 
> ## WIDTH ## _c; \
>  c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels 
> ## WIDTH ## _l2_c; \
> @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
>  c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
>  c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
>  
> +c->dequant_subband[0] = c->dequant_subband[2] = 
> dequant_subband_int16_t_c;
> +c->dequant_subband[1] = c->dequant_subband[3] = 
> dequant_subband_int32_t_c;
> +
>  PIXFUNC(put, 8);
>  PIXFUNC(put, 16);
>  PIXFUNC(put, 32);
> diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> index 25a872d..c0ac56b 100644
> --- a/libavcodec/diracdsp.h
> +++ b/libavcodec/diracdsp.h
> @@ -22,6 +22,7 @@
>  #define AVCODEC_DIRACDSP_H
>  
>  #include <stdint.h>
> +#include <stddef.h>
>  
>  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int 
> log2_denom, int weight, int h);
>  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int 
> stride, int log2_denom, int weightd, int weights, int h);
> @@ -46,6 +47,9 @@ typedef struct {
>  void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t 
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int 
> idwt_stride, int width, int height/*mod 2*/);
>  void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, 
> const uint8_t *obmc_weight, int yblen);
>  
> +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 
> const int qf, const int qs, int64_t tot_v, int64_t tot_h);
> +
>  dirac_weight_func weight_dirac_pixels_tab[3];
>  dirac_biweight_func biweight_dirac_pixels_tab[3];
>  } DiracDSPContext;
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index 9db7b67..f743363 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, 
> obmc, yblen
>  RET
>  %endm
>  
> +%macro DEQUANT_SUBBAND_32 0
> +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_

[FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

2016-06-23 Thread Rostislav Pehlivanov
Currently unused, to be used in the following commits.

Signed-off-by: Rostislav Pehlivanov 
---
 libavcodec/diracdsp.c  | 24 
 libavcodec/diracdsp.h  |  4 
 libavcodec/x86/diracdsp.asm| 41 +
 libavcodec/x86/diracdsp_init.c |  4 +++-
 4 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
index ab8d149..d0cfd00 100644
--- a/libavcodec/diracdsp.c
+++ b/libavcodec/diracdsp.c
@@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const 
uint16_t *src, int stride,
 }
 }
 
+#define DEQUANT_SUBBAND(PX)
\
+static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t 
stride, \
+ const int qf, const int qs, int64_t 
tot_v, int64_t tot_h) \
+{  
\
+int i, y;  
\
+for (y = 0; y < tot_v; y++) {  
\
+PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
\
+for (i = 0; i < tot_h; i++) {  
\
+c = *src_r++;  
\
+sign = FFSIGN(c)*(!!c);
\
+c = (FFABS(c)*qf + qs) >> 2;   
\
+*dst_r++ = c*sign; 
\
+}  
\
+src += tot_h << (sizeof(PX) >> 1); 
\
+dst += stride; 
\
+}  
\
+}
+
+DEQUANT_SUBBAND(int16_t)
+DEQUANT_SUBBAND(int32_t)
+
 #define PIXFUNC(PFX, WIDTH) \
 c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## 
WIDTH ## _c; \
 c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## 
WIDTH ## _l2_c; \
@@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
 c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
 c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
 
+c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c;
+c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c;
+
 PIXFUNC(put, 8);
 PIXFUNC(put, 16);
 PIXFUNC(put, 32);
diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
index 25a872d..c0ac56b 100644
--- a/libavcodec/diracdsp.h
+++ b/libavcodec/diracdsp.h
@@ -22,6 +22,7 @@
 #define AVCODEC_DIRACDSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, 
int weight, int h);
 typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int 
stride, int log2_denom, int weightd, int weights, int h);
@@ -46,6 +47,9 @@ typedef struct {
 void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t 
*src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, 
int width, int height/*mod 2*/);
 void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, 
const uint8_t *obmc_weight, int yblen);
 
+/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
+void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, 
const int qf, const int qs, int64_t tot_v, int64_t tot_h);
+
 dirac_weight_func weight_dirac_pixels_tab[3];
 dirac_biweight_func biweight_dirac_pixels_tab[3];
 } DiracDSPContext;
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 9db7b67..f743363 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, 
obmc, yblen
 RET
 %endm
 
+%macro DEQUANT_SUBBAND_32 0
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const 
int qf, const int qs, int64_t tot_v, int64_t tot_h)
+cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h
+
+movd   m2, qfd
+movd   m3, qsd
+SPLATD m2
+SPLATD m3
+neg    tot_vq
+neg    tot_hq
+mov    r7, dstq
+mov    r8, tot_hq
+
+.loop_v:
+mov    dstq,   r7
+mov    tot_hq, r8
+
+.loop_h:
+movu   m0, [srcq]
+
+pabsd  m1, m0
+pmulld m1, m2
+paddd  m1, m3