Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On 27 June 2016 at 22:38, James Almer wrote: > On 6/27/2016 8:53 AM, Rostislav Pehlivanov wrote: > > I've attached another patch which should work fine now. > > I did this after the put_signed_rect so it does require the first patch, > > but if this patch is okay I'll amend and tidy things before I push. > > For some reason changing dstq to be stored at r4 or r3 broke it and I've > no > > idea why. Neither is used after loading m2 and m3. Should work on x86_32 > > now, but I'm wondering why I can't save that register. > > [...] > > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index c5cc530..4bc8b2d 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -266,9 +266,45 @@ HPEL_FILTER sse2 > > ADD_OBMC 32, sse2 > > ADD_OBMC 16, sse2 > > > > -%if ARCH_X86_64 == 1 > > INIT_XMM sse4 > > > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int tot_v, int tot_h) > > +cglobal dequant_subband_32, 7, 8, 4, src, dst, stride, qf, qs, tot_v, > tot_h > > x86_32 has 8 gprs but you can only use 7 as the last one is reserved > to keep the stack pointer. > > > + > > +movd m2, qfd > > +movd m3, qsd > > +SPLATD m2 > > +SPLATD m3 > > +movr4, tot_hq > > +movr7, dstq > > + > > +.loop_v: > > +movtot_hq, r4 > > +movdstq, r7 > > + > > +.loop_h: > > +movu m0, [srcq] > > + > > +pabsd m1, m0 > > +pmulld m1, m2 > > +paddd m1, m3 > > +psrld m1, 2 > > +psignd m1, m0 > > + > > +movu [dstq], m1 > > + > > +addsrcq, mmsize > > +adddstq, mmsize > > +subtot_hd, 4 > > +jg .loop_h > > + > > +addr7, strideq > > +dectot_vd > > +jg .loop_v > > + > > +RET > > I'm not sure why you say using r3 instead of r7 here didn't work for > you. I just tried it (after applying all patches up to 6/10) and fate > at least still passes, on both x86_64 and x86_32. > > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > Odd, works fine now. 
I guess it just needed a clean build. Attached a working patch. I'd like to get some feedback on the other patches before I push though, particularly the Golomb reader. From 4ed0be9216175d5f394a1176596e6cbd5eee7b9a Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov Date: Thu, 23 Jun 2016 18:06:56 +0100 Subject: [PATCH] diracdsp: add dequantization SIMD Currently unused, to be used in the following commits. Signed-off-by: Rostislav Pehlivanov --- libavcodec/diracdsp.c | 24 libavcodec/diracdsp.h | 4 libavcodec/x86/diracdsp.asm| 38 +- libavcodec/x86/diracdsp_init.c | 7 +-- 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c index ab8d149..cd1209e 100644 --- a/libavcodec/diracdsp.c +++ b/libavcodec/diracdsp.c @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride, } } +#define DEQUANT_SUBBAND(PX)\ +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride, \ + const int qf, const int qs, int tot_v, int tot_h) \ +{ \ +int i, y; \ +for (y = 0; y < tot_v; y++) { \ +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;\ +for (i = 0; i < tot_h; i++) { \ +c = *src_r++; \ +sign = FFSIGN(c)*(!!c);\ +c = (FFABS(c)*qf + qs) >> 2; \ +*dst_r++ = c*sign; \ +} \ +src += tot_h << (sizeof(PX) >> 1); \ +dst += stride; \ +} \ +} + +DEQUANT_SUBBAND(int16_t) +DEQUANT_SUBBAND(int32_t) + #define PIXFUNC(PFX, WIDTH) \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## W
Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On 6/27/2016 8:53 AM, Rostislav Pehlivanov wrote: > I've attached another patch which should work fine now. > I did this after the put_signed_rect so it does require the first patch, > but if this patch is okay I'll amend and tidy things before I push. > For some reason changing dstq to be stored at r4 or r3 broke it and I've no > idea why. Neither is used after loading m2 and m3. Should work on x86_32 > now, but I'm wondering why I can't save that register. [...] > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > index c5cc530..4bc8b2d 100644 > --- a/libavcodec/x86/diracdsp.asm > +++ b/libavcodec/x86/diracdsp.asm > @@ -266,9 +266,45 @@ HPEL_FILTER sse2 > ADD_OBMC 32, sse2 > ADD_OBMC 16, sse2 > > -%if ARCH_X86_64 == 1 > INIT_XMM sse4 > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int tot_v, int tot_h) > +cglobal dequant_subband_32, 7, 8, 4, src, dst, stride, qf, qs, tot_v, tot_h x86_32 has 8 gprs but you can only use 7 as the last one is reserved to keep the stack pointer. > + > +movd m2, qfd > +movd m3, qsd > +SPLATD m2 > +SPLATD m3 > +movr4, tot_hq > +movr7, dstq > + > +.loop_v: > +movtot_hq, r4 > +movdstq, r7 > + > +.loop_h: > +movu m0, [srcq] > + > +pabsd m1, m0 > +pmulld m1, m2 > +paddd m1, m3 > +psrld m1, 2 > +psignd m1, m0 > + > +movu [dstq], m1 > + > +addsrcq, mmsize > +adddstq, mmsize > +subtot_hd, 4 > +jg .loop_h > + > +addr7, strideq > +dectot_vd > +jg .loop_v > + > +RET I'm not sure why you say using r3 instead of r7 here didn't work for you. I just tried it (after applying all patches up to 6/10) and fate at least still passes, on both x86_64 and x86_32. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On Mon, Jun 27, 2016 at 12:53:47PM +0100, Rostislav Pehlivanov wrote: > On 24 June 2016 at 16:38, James Almer wrote: > > > On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote: > > > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001 > > > From: Rostislav Pehlivanov > > > Date: Thu, 23 Jun 2016 18:06:56 +0100 > > > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD > > > > > > Currently unused, to be used in the following commits. > > > > > > Signed-off-by: Rostislav Pehlivanov > > > --- > > > libavcodec/diracdsp.c | 24 > > > libavcodec/diracdsp.h | 4 > > > libavcodec/x86/diracdsp.asm| 36 > > > libavcodec/x86/diracdsp_init.c | 2 ++ > > > 4 files changed, 66 insertions(+) > > > > > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > > > index ab8d149..cd1209e 100644 > > > --- a/libavcodec/diracdsp.c > > > +++ b/libavcodec/diracdsp.c > > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > > uint16_t *src, int stride, > > > } > > > } > > > > > > +#define DEQUANT_SUBBAND(PX) > > \ > > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > > ptrdiff_t stride, \ > > > + const int qf, const int qs, > > int tot_v, int tot_h) \ > > > +{ > > \ > > > +int i, y; > > \ > > > +for (y = 0; y < tot_v; y++) { > > \ > > > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > > \ > > > +for (i = 0; i < tot_h; i++) { > > \ > > > +c = *src_r++; > > \ > > > +sign = FFSIGN(c)*(!!c); > > \ > > > +c = (FFABS(c)*qf + qs) >> 2; > >\ > > > +*dst_r++ = c*sign; > >\ > > > +} > > \ > > > +src += tot_h << (sizeof(PX) >> 1); > >\ > > > +dst += stride; > >\ > > > +} > > \ > > > +} > > > + > > > +DEQUANT_SUBBAND(int16_t) > > > +DEQUANT_SUBBAND(int32_t) > > > + > > > #define PIXFUNC(PFX, WIDTH) > > \ > > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## > > _dirac_pixels ## WIDTH ## _c; \ > > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## > > _dirac_pixels ## WIDTH ## _l2_c; \ > > > @@ -214,6 +235,9 @@ 
av_cold void ff_diracdsp_init(DiracDSPContext *c) > > > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > > > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > > > > > +c->dequant_subband[0] = c->dequant_subband[2] = > > dequant_subband_int16_t_c; > > > +c->dequant_subband[1] = c->dequant_subband[3] = > > dequant_subband_int32_t_c; > > > + > > > PIXFUNC(put, 8); > > > PIXFUNC(put, 16); > > > PIXFUNC(put, 32); > > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > > > index 25a872d..224828d 100644 > > > --- a/libavcodec/diracdsp.h > > > +++ b/libavcodec/diracdsp.h > > > @@ -22,6 +22,7 @@ > > > #define AVCODEC_DIRACDSP_H > > > > > > #include > > > +#include > > > > > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > > log2_denom, int weight, int h); > > > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, > > int stride, int log2_denom, int weightd, int weights, int h); > > > @@ -46,6 +47,9 @@ typedef struct { > > > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > > idwt_stride, int width, int height/*mod 2*/); > > > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int > > stride, const uint8_t *obmc_weight, int yblen); > > > > > > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > > > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t > > stride, const int qf, const int qs, int tot_v, int tot_h); > > > + > > > dirac_weight_func weight_dirac_pixels_tab[3]; > > > dirac_biweight_func biweight_dirac_pixels_tab[3]; > > > } DiracDSPContext; > > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > > index a0d6788..a764706 100644 > > > --- a/libavcodec/x86/diracdsp.asm > > > +++ b/libavcodec/x86/diracdsp.asm > > > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, > > dst_stride, src, src_stride, w > > > > > > RET > > 
> > > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > > const int qf, const int qs, int tot_v, int tot_h) > > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, > > tot_h > > > + > > > +movd m2, qfd > > > +movd m3, qsd > > > +SPLATD m2 > > > +SPLATD m3 > > > +movr7, dstq > > > +movr8, tot_hq > > > >
Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On 24 June 2016 at 16:38, James Almer wrote: > On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote: > > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001 > > From: Rostislav Pehlivanov > > Date: Thu, 23 Jun 2016 18:06:56 +0100 > > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD > > > > Currently unused, to be used in the following commits. > > > > Signed-off-by: Rostislav Pehlivanov > > --- > > libavcodec/diracdsp.c | 24 > > libavcodec/diracdsp.h | 4 > > libavcodec/x86/diracdsp.asm| 36 > > libavcodec/x86/diracdsp_init.c | 2 ++ > > 4 files changed, 66 insertions(+) > > > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > > index ab8d149..cd1209e 100644 > > --- a/libavcodec/diracdsp.c > > +++ b/libavcodec/diracdsp.c > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > uint16_t *src, int stride, > > } > > } > > > > +#define DEQUANT_SUBBAND(PX) > \ > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > ptrdiff_t stride, \ > > + const int qf, const int qs, > int tot_v, int tot_h) \ > > +{ > \ > > +int i, y; > \ > > +for (y = 0; y < tot_v; y++) { > \ > > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > \ > > +for (i = 0; i < tot_h; i++) { > \ > > +c = *src_r++; > \ > > +sign = FFSIGN(c)*(!!c); > \ > > +c = (FFABS(c)*qf + qs) >> 2; >\ > > +*dst_r++ = c*sign; >\ > > +} > \ > > +src += tot_h << (sizeof(PX) >> 1); >\ > > +dst += stride; >\ > > +} > \ > > +} > > + > > +DEQUANT_SUBBAND(int16_t) > > +DEQUANT_SUBBAND(int32_t) > > + > > #define PIXFUNC(PFX, WIDTH) > \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _c; \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _l2_c; \ > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > > > +c->dequant_subband[0] = 
c->dequant_subband[2] = > dequant_subband_int16_t_c; > > +c->dequant_subband[1] = c->dequant_subband[3] = > dequant_subband_int32_t_c; > > + > > PIXFUNC(put, 8); > > PIXFUNC(put, 16); > > PIXFUNC(put, 32); > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > > index 25a872d..224828d 100644 > > --- a/libavcodec/diracdsp.h > > +++ b/libavcodec/diracdsp.h > > @@ -22,6 +22,7 @@ > > #define AVCODEC_DIRACDSP_H > > > > #include > > +#include > > > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > log2_denom, int weight, int h); > > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, > int stride, int log2_denom, int weightd, int weights, int h); > > @@ -46,6 +47,9 @@ typedef struct { > > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > idwt_stride, int width, int height/*mod 2*/); > > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int > stride, const uint8_t *obmc_weight, int yblen); > > > > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t > stride, const int qf, const int qs, int tot_v, int tot_h); > > + > > dirac_weight_func weight_dirac_pixels_tab[3]; > > dirac_biweight_func biweight_dirac_pixels_tab[3]; > > } DiracDSPContext; > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index a0d6788..a764706 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, > dst_stride, src, src_stride, w > > > > RET > > > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int tot_v, int tot_h) > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, > tot_h > > + > > +movd m2, qfd > > +movd m3, qsd > > +SPLATD m2 > > +SPLATD m3 > > +movr7, dstq > > 
+mov r8, tot_hq > > Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4 > and the function will work on x86_32. > > > + > > +.loop_v: > > +mov dstq, r7 > > +mov tot_hq, r8 > > + > > +.loop_h: > > +movu m0, [srcq] > > + > > +pabsd m1, m0 > > +pmulld m1, m2 > > +paddd m1, m3 > > +psrld m1
Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote: > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001 > From: Rostislav Pehlivanov > Date: Thu, 23 Jun 2016 18:06:56 +0100 > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD > > Currently unused, to be used in the following commits. > > Signed-off-by: Rostislav Pehlivanov > --- > libavcodec/diracdsp.c | 24 > libavcodec/diracdsp.h | 4 > libavcodec/x86/diracdsp.asm| 36 > libavcodec/x86/diracdsp_init.c | 2 ++ > 4 files changed, 66 insertions(+) > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > index ab8d149..cd1209e 100644 > --- a/libavcodec/diracdsp.c > +++ b/libavcodec/diracdsp.c > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > uint16_t *src, int stride, > } > } > > +#define DEQUANT_SUBBAND(PX) > \ > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > ptrdiff_t stride, \ > + const int qf, const int qs, int > tot_v, int tot_h) \ > +{ > \ > +int i, y; > \ > +for (y = 0; y < tot_v; y++) { > \ > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > \ > +for (i = 0; i < tot_h; i++) { > \ > +c = *src_r++; > \ > +sign = FFSIGN(c)*(!!c); > \ > +c = (FFABS(c)*qf + qs) >> 2; > \ > +*dst_r++ = c*sign; > \ > +} > \ > +src += tot_h << (sizeof(PX) >> 1); > \ > +dst += stride; > \ > +} > \ > +} > + > +DEQUANT_SUBBAND(int16_t) > +DEQUANT_SUBBAND(int32_t) > + > #define PIXFUNC(PFX, WIDTH) \ > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels > ## WIDTH ## _c; \ > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels > ## WIDTH ## _l2_c; \ > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > +c->dequant_subband[0] = c->dequant_subband[2] = > dequant_subband_int16_t_c; > +c->dequant_subband[1] = c->dequant_subband[3] = > dequant_subband_int32_t_c; > + > PIXFUNC(put, 
8); > PIXFUNC(put, 16); > PIXFUNC(put, 32); > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > index 25a872d..224828d 100644 > --- a/libavcodec/diracdsp.h > +++ b/libavcodec/diracdsp.h > @@ -22,6 +22,7 @@ > #define AVCODEC_DIRACDSP_H > > #include > +#include > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > log2_denom, int weight, int h); > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int > stride, int log2_denom, int weightd, int weights, int h); > @@ -46,6 +47,9 @@ typedef struct { > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > idwt_stride, int width, int height/*mod 2*/); > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, > const uint8_t *obmc_weight, int yblen); > > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int tot_v, int tot_h); > + > dirac_weight_func weight_dirac_pixels_tab[3]; > dirac_biweight_func biweight_dirac_pixels_tab[3]; > } DiracDSPContext; > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > index a0d6788..a764706 100644 > --- a/libavcodec/x86/diracdsp.asm > +++ b/libavcodec/x86/diracdsp.asm > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, > dst_stride, src, src_stride, w > > RET > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int tot_v, int tot_h) > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h > + > +movd
Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On 23 June 2016 at 21:01, James Almer wrote: > On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote: > > Currently unused, to be used in the following commits. > > > > Signed-off-by: Rostislav Pehlivanov > > --- > > libavcodec/diracdsp.c | 24 > > libavcodec/diracdsp.h | 4 > > libavcodec/x86/diracdsp.asm| 41 > + > > libavcodec/x86/diracdsp_init.c | 4 +++- > > 4 files changed, 72 insertions(+), 1 deletion(-) > > > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > > index ab8d149..d0cfd00 100644 > > --- a/libavcodec/diracdsp.c > > +++ b/libavcodec/diracdsp.c > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > uint16_t *src, int stride, > > } > > } > > > > +#define DEQUANT_SUBBAND(PX) > \ > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > ptrdiff_t stride, \ > > + const int qf, const int qs, > int64_t tot_v, int64_t tot_h) \ > > Shouldn't this be int (or ptrdiff_t)? Seeing they are int in the > SliceCoeffs struct introduced by patch 6, i don't see why they > should be int64_t here. Unless I'm missing something. 
> > > +{ > \ > > +int i, y; > \ > > +for (y = 0; y < tot_v; y++) { > \ > > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > \ > > +for (i = 0; i < tot_h; i++) { > \ > > +c = *src_r++; > \ > > +sign = FFSIGN(c)*(!!c); > \ > > +c = (FFABS(c)*qf + qs) >> 2; >\ > > +*dst_r++ = c*sign; >\ > > +} > \ > > +src += tot_h << (sizeof(PX) >> 1); >\ > > +dst += stride; >\ > > +} > \ > > +} > > + > > +DEQUANT_SUBBAND(int16_t) > > +DEQUANT_SUBBAND(int32_t) > > + > > #define PIXFUNC(PFX, WIDTH) > \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _c; \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _l2_c; \ > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > > > +c->dequant_subband[0] = c->dequant_subband[2] = > dequant_subband_int16_t_c; > > +c->dequant_subband[1] = c->dequant_subband[3] = > dequant_subband_int32_t_c; > > + > > PIXFUNC(put, 8); > > PIXFUNC(put, 16); > > PIXFUNC(put, 32); > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > > index 25a872d..c0ac56b 100644 > > --- a/libavcodec/diracdsp.h > > +++ b/libavcodec/diracdsp.h > > @@ -22,6 +22,7 @@ > > #define AVCODEC_DIRACDSP_H > > > > #include > > +#include > > > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > log2_denom, int weight, int h); > > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, > int stride, int log2_denom, int weightd, int weights, int h); > > @@ -46,6 +47,9 @@ typedef struct { > > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > idwt_stride, int width, int height/*mod 2*/); > > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int > stride, const uint8_t *obmc_weight, int yblen); > > > > +/* 0-1: int16_t and 
int32_t asm/c, 2-3: int16 and int32_t, C only */ > > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t > stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h); > > + > > dirac_weight_func weight_dirac_pixels_tab[3]; > > dirac_biweight_func biweight_dirac_pixels_tab[3]; > > } DiracDSPContext; > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index 9db7b67..f743363 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, > stride, obmc, yblen > > RET > > %endm > > > > +%macro DEQUANT_SUBBAND_32 0 > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int64_t tot_v, int64_t tot_h) > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, > tot_h > > Again, x86_64 only as is. > > > + > > +movd m2, qfd > > +movd m3, qsd > > +SPLATD m2 > > +SPLATD m3 > > +negtot_vq > > +negtot_hq > > Same as with put_signed_rect_clamped_10, no reason to neg these. > > > +movr7, dstq > > +movr8, tot_hq > > You have qf
Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote: > Currently unused, to be used in the following commits. > > Signed-off-by: Rostislav Pehlivanov > --- > libavcodec/diracdsp.c | 24 > libavcodec/diracdsp.h | 4 > libavcodec/x86/diracdsp.asm| 41 + > libavcodec/x86/diracdsp_init.c | 4 +++- > 4 files changed, 72 insertions(+), 1 deletion(-) > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > index ab8d149..d0cfd00 100644 > --- a/libavcodec/diracdsp.c > +++ b/libavcodec/diracdsp.c > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > uint16_t *src, int stride, > } > } > > +#define DEQUANT_SUBBAND(PX) > \ > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > ptrdiff_t stride, \ > + const int qf, const int qs, int64_t > tot_v, int64_t tot_h) \ Shouldn't this be int (or ptrdiff_t)? Seeing they are int in the SliceCoeffs struct introduced by patch 6, i don't see why they should be int64_t here. Unless I'm missing something. > +{ > \ > +int i, y; > \ > +for (y = 0; y < tot_v; y++) { > \ > +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > \ > +for (i = 0; i < tot_h; i++) { > \ > +c = *src_r++; > \ > +sign = FFSIGN(c)*(!!c); > \ > +c = (FFABS(c)*qf + qs) >> 2; > \ > +*dst_r++ = c*sign; > \ > +} > \ > +src += tot_h << (sizeof(PX) >> 1); > \ > +dst += stride; > \ > +} > \ > +} > + > +DEQUANT_SUBBAND(int16_t) > +DEQUANT_SUBBAND(int32_t) > + > #define PIXFUNC(PFX, WIDTH) \ > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels > ## WIDTH ## _c; \ > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels > ## WIDTH ## _l2_c; \ > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > +c->dequant_subband[0] = c->dequant_subband[2] = > dequant_subband_int16_t_c; > +c->dequant_subband[1] = c->dequant_subband[3] = > dequant_subband_int32_t_c; > + > 
PIXFUNC(put, 8); > PIXFUNC(put, 16); > PIXFUNC(put, 32); > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > index 25a872d..c0ac56b 100644 > --- a/libavcodec/diracdsp.h > +++ b/libavcodec/diracdsp.h > @@ -22,6 +22,7 @@ > #define AVCODEC_DIRACDSP_H > > #include > +#include > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > log2_denom, int weight, int h); > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int > stride, int log2_denom, int weightd, int weights, int h); > @@ -46,6 +47,9 @@ typedef struct { > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > idwt_stride, int width, int height/*mod 2*/); > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, > const uint8_t *obmc_weight, int yblen); > > +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int64_t tot_v, int64_t tot_h); > + > dirac_weight_func weight_dirac_pixels_tab[3]; > dirac_biweight_func biweight_dirac_pixels_tab[3]; > } DiracDSPContext; > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > index 9db7b67..f743363 100644 > --- a/libavcodec/x86/diracdsp.asm > +++ b/libavcodec/x86/diracdsp.asm > @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, > obmc, yblen > RET > %endm > > +%macro DEQUANT_SUBBAND_32 0 > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_
[FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD
Currently unused, to be used in the following commits. Signed-off-by: Rostislav Pehlivanov --- libavcodec/diracdsp.c | 24 libavcodec/diracdsp.h | 4 libavcodec/x86/diracdsp.asm| 41 + libavcodec/x86/diracdsp_init.c | 4 +++- 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c index ab8d149..d0cfd00 100644 --- a/libavcodec/diracdsp.c +++ b/libavcodec/diracdsp.c @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride, } } +#define DEQUANT_SUBBAND(PX) \ +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride, \ + const int qf, const int qs, int64_t tot_v, int64_t tot_h) \ +{ \ +int i, y; \ +for (y = 0; y < tot_v; y++) { \ +PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; \ +for (i = 0; i < tot_h; i++) { \ +c = *src_r++; \ +sign = FFSIGN(c)*(!!c); \ +c = (FFABS(c)*qf + qs) >> 2; \ +*dst_r++ = c*sign; \ +} \ +src += tot_h << (sizeof(PX) >> 1); \ +dst += stride; \ +} \ +} + +DEQUANT_SUBBAND(int16_t) +DEQUANT_SUBBAND(int32_t) + #define PIXFUNC(PFX, WIDTH) \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \ @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; +c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c; +c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c; + PIXFUNC(put, 8); PIXFUNC(put, 16); PIXFUNC(put, 32); diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h index 25a872d..c0ac56b 100644 --- a/libavcodec/diracdsp.h +++ b/libavcodec/diracdsp.h @@ -22,6 +22,7 @@ #define AVCODEC_DIRACDSP_H #include +#include typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h); typedef void 
(*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h); @@ -46,6 +47,9 @@ typedef struct { void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/); void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); +/* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ +void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h); + dirac_weight_func weight_dirac_pixels_tab[3]; dirac_biweight_func biweight_dirac_pixels_tab[3]; } DiracDSPContext; diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm index 9db7b67..f743363 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen RET %endm +%macro DEQUANT_SUBBAND_32 0 +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h) +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h + +movd m2, qfd +movd m3, qsd +SPLATD m2 +SPLATD m3 +negtot_vq +negtot_hq +movr7, dstq +movr8, tot_hq + +.loop_v: +movdstq, r7 +movtot_hq, r8 + +.loop_h: +movu m0, [srcq] + +pabsd m1, m0 +pmulld m1, m2 +paddd m1, m3