Hello, I've been working on assembly for the vc2 encoder and have reached an impasse. My code produces very visible errors: obvious vertical streaks in the bottom-right half of the image, plus what looks like some low-frequency distortion.
I cannot see the problem in my code, so I need some fresh eyes to look at it. You can test the code with these two commands:

./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -cpuflags 0 -y temp.vc2
./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -y temp.vc2

Decode each output to a PNG image to see the problem (example decode commands below). Attached is a diff from master (my branch is about two dozen small commits). Thanks.
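The decode step is not part of the patch; something along these lines should work (output names are just examples), run after each encode since both commands write temp.vc2 — the -cpuflags 0 run gives the plain C reference, the other run uses the SIMD path:

./ffmpeg -i temp.vc2 -vframes 1 -y c_version.png
./ffmpeg -i temp.vc2 -vframes 1 -y simd_version.png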
diff --git a/libavcodec/vc2enc_dwt.c b/libavcodec/vc2enc_dwt.c
index c60b003..bac262d 100644
--- a/libavcodec/vc2enc_dwt.c
+++ b/libavcodec/vc2enc_dwt.c
@@ -23,6 +23,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 #include "vc2enc_dwt.h"
+#include "libavcodec/x86/vc2dsp.h"
 
 /* Since the transforms spit out interleaved coefficients, this function
  * rearranges the coefficients into the more traditional subdivision,
@@ -76,16 +77,21 @@ static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
     for (y = 0; y < synth_height; y++) {
         /* Lifting stage 2. */
         synthl[1] -= (8*synthl[0] + 9*synthl[2] - synthl[4] + 8) >> 4;
+
         for (x = 1; x < width - 2; x++)
-            synthl[2*x + 1] -= (9*synthl[2*x] + 9*synthl[2*x + 2] - synthl[2*x + 4] -
-                                synthl[2 * x - 2] + 8) >> 4;
+            synthl[2*x + 1] -= (9*synthl[2*x] +
+                                9*synthl[2*x + 2] -
+                                synthl[2*x + 4] -
+                                synthl[2*x - 2] + 8) >> 4;
+
         synthl[synth_width - 1] -= (17*synthl[synth_width - 2] -
-                                    synthl[synth_width - 4] + 8) >> 4;
+                                     synthl[synth_width - 4] + 8) >> 4;
         synthl[synth_width - 3] -= (8*synthl[synth_width - 2] +
                                     9*synthl[synth_width - 4] -
-                                    synthl[synth_width - 6] + 8) >> 4;
+                                     synthl[synth_width - 6] + 8) >> 4;
         /* Lifting stage 1. */
         synthl[0] += (synthl[1] + synthl[1] + 2) >> 2;
+
         for (x = 1; x < width - 1; x++)
             synthl[2*x] += (synthl[2*x - 1] + synthl[2*x + 1] + 2) >> 2;
 
@@ -97,25 +103,28 @@
     /* Vertical synthesis: Lifting stage 2. */
     synthl = synth + synth_width;
     for (x = 0; x < synth_width; x++)
-        synthl[x] -= (8*synthl[x - synth_width] + 9*synthl[x + synth_width] -
-                      synthl[x + 3 * synth_width] + 8) >> 4;
+        synthl[x] -= (8*synthl[x - synth_width] +
+                      9*synthl[x + synth_width] -
+                      synthl[x + 3*synth_width] + 8) >> 4;
 
     synthl = synth + (synth_width << 1);
     for (y = 1; y < height - 2; y++) {
         for (x = 0; x < synth_width; x++)
             synthl[x + synth_width] -= (9*synthl[x] +
-                                        9*synthl[x + 2 * synth_width] -
-                                        synthl[x - 2 * synth_width] -
-                                        synthl[x + 4 * synth_width] + 8) >> 4;
+                                        9*synthl[x + 2*synth_width] -
+                                        synthl[x - 2*synth_width] -
+                                        synthl[x + 4*synth_width] + 8) >> 4;
         synthl += synth_width << 1;
     }
 
     synthl = synth + (synth_height - 1) * synth_width;
     for (x = 0; x < synth_width; x++) {
         synthl[x] -= (17*synthl[x - synth_width] -
-                      synthl[x - 3*synth_width] + 8) >> 4;
-        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
-            8*synthl[x - 1*synth_width] - synthl[x - 5*synth_width] + 8) >> 4;
+                       synthl[x - 3*synth_width] + 8) >> 4;
+
+        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
+                                      8*synthl[x - 1*synth_width] -
+                                      synthl[x - 5*synth_width] + 8) >> 4;
     }
 
     /* Vertical synthesis: Lifting stage 1. */
@@ -262,6 +271,10 @@ av_cold int ff_vc2enc_init_transforms(VC2TransformContext *s, int p_width, int p
     s->vc2_subband_dwt[VC2_TRANSFORM_HAAR]   = vc2_subband_dwt_haar;
     s->vc2_subband_dwt[VC2_TRANSFORM_HAAR_S] = vc2_subband_dwt_haar_shift;
 
+#if ARCH_X86
+    ff_vc2enc_init_x86(s, p_width, p_height);
+#endif
+
     s->buffer = av_malloc(2*p_width*p_height*sizeof(dwtcoef));
     if (!s->buffer)
         return 1;
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 839b5bc..24aedda 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -34,6 +34,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
 OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
 OBJS-$(CONFIG_RV34DSP)                 += x86/rv34dsp_init.o
 OBJS-$(CONFIG_VC1DSP)                  += x86/vc1dsp_init.o
+OBJS-$(CONFIG_VC2_ENCODER)             += x86/vc2dsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
 OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
@@ -122,6 +123,7 @@ YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                \
 YASM-OBJS-$(CONFIG_RV34DSP)            += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_VC1DSP)             += x86/vc1dsp_loopfilter.o      \
                                           x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_VC2_ENCODER)        += x86/vc2enc_dwt.o
 YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/simple_idct10.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
diff --git a/libavcodec/x86/vc2dsp.h b/libavcodec/x86/vc2dsp.h
new file mode 100644
index 0000000..825862e
--- /dev/null
+++ b/libavcodec/x86/vc2dsp.h
@@ -0,0 +1,28 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VC2DSP_H
+#define AVCODEC_X86_VC2DSP_H
+
+#include "libavcodec/vc2enc_dwt.h"
+
+void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height);
+
+#endif /* AVCODEC_X86_VC2DSP_H */
diff --git a/libavcodec/x86/vc2dsp_init.c b/libavcodec/x86/vc2dsp_init.c
new file mode 100644
index 0000000..b3bdb53
--- /dev/null
+++ b/libavcodec/x86/vc2dsp_init.c
@@ -0,0 +1,67 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/x86/vc2dsp.h"
+
+#if HAVE_SSE2_EXTERNAL
+void ff_vc2_subband_dwt_97_sse2(VC2TransformContext*, dwtcoef*, ptrdiff_t, int , int);
+#endif
+
+static av_always_inline void deinterleave(dwtcoef *linell, ptrdiff_t stride,
+                                          int width, int height, dwtcoef *synthl)
+{
+    int x, y;
+    ptrdiff_t synthw = width << 1;
+    dwtcoef *linehl = linell + width;
+    dwtcoef *linelh = linell + height*stride;
+    dwtcoef *linehh = linelh + width;
+
+    /* Deinterleave the coefficients. */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            linell[x] = synthl[(x << 1)];
+            linehl[x] = synthl[(x << 1) + 1];
+            linelh[x] = synthl[(x << 1) + synthw];
+            linehh[x] = synthl[(x << 1) + synthw + 1];
+        }
+        synthl += synthw << 1;
+        linell += stride;
+        linelh += stride;
+        linehl += stride;
+        linehh += stride;
+    }
+}
+
+void static wrapper_97(VC2TransformContext *t, dwtcoef *data, ptrdiff_t stride, int width, int height)
+{
+    ff_vc2_subband_dwt_97_sse2(t, data, stride, width, height);
+    deinterleave(data, stride, width, height, t->buffer);
+}
+
+av_cold void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height)
+{
+    int cpuflags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpuflags)) {
+        s->vc2_subband_dwt[VC2_TRANSFORM_9_7] = wrapper_97;
+    }
+}
diff --git a/libavcodec/x86/vc2enc_dwt.asm b/libavcodec/x86/vc2enc_dwt.asm
new file mode 100644
index 0000000..c6fdf36
--- /dev/null
+++ b/libavcodec/x86/vc2enc_dwt.asm
@@ -0,0 +1,312 @@
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pw_2
+cextern pw_8
+cextern pw_9
+cextern pw_17
+
+section .text
+
+INIT_XMM sse2
+cglobal vc2_subband_dwt_97, 5, 12, 5, synth, data, stride, width, height
+    sub rsp, 5 * gprsize
+    mov synthq, [synthq]
+    movsxdifnidn widthq, widthd
+    movsxdifnidn heightq, heightd
+    mov [rsp + gprsize*0], synthq
+    mov [rsp + gprsize*1], dataq
+    mov [rsp + gprsize*2], strideq
+    mov [rsp + gprsize*3], widthq
+    mov [rsp + gprsize*4], heightq
+
+    DECLARE_REG_TMP 9, 10, 11
+    %define synth_w r5
+    %define synth_h r6
+    mov synth_w, widthq
+    mov synth_h, heightq
+    add synth_w, synth_w
+    add synth_h, synth_h
+
+    %define x r7
+    %define y r8
+    ; shifting loop (for precision and copy)
+    lea synthq, [synthq + synth_w*2] ; add 1 synth_w to save comparing x against synth_w
+    lea dataq, [dataq + synth_w*2]
+    mov y, synth_h
+.loop1_y:
+    mov x, synth_w
+    neg x
+.loop1_x:
+    movu m0, [dataq + x*2]
+    paddw m0, m0
+    movu [synthq + x*2], m0
+    add x, mmsize/2
+    jl .loop1_x
+
+    lea dataq, [dataq + strideq*2]
+    lea synthq, [synthq + synth_w*2]
+    sub y, 1
+    jg .loop1_y
+
+    ; Horizontal synthesis
+    mov synthq, [rsp]
+    mov y, synth_h
+.loop2_y:
+    ; Lifting stage 2
+    mov t0w, word [synthq+0]
+    mov t1w, word [synthq+4]
+    imul t0w, 8
+    imul t1w, 9
+    add t0w, 8
+    sub t1w, word [synthq+8]
+    add t0w, t1w
+    sar t0w, 4
+    sub word [synthq+2], t0w
+
+    mov x, 1
+    sub widthq, 2
+.loop2_x_1:
+%if 0 ; needs horizontal masking
+    movu m0, [synthq + x*4 + 0]
+    movu m1, [synthq + x*4 + 4]
+    movu m2, [synthq + x*4 + 8]
+    movu m3, [synthq + x*4 - 4]
+    pmullw m0, [pw_9]
+    pmullw m1, [pw_9]
+    psubw m0, m2
+    psubw m1, m3
+    paddw m0, m1
+    paddw m0, [pw_8]
+    psraw m0, 4
+    movu m4, [synthq + x*4 + 2]
+    psubw m4, m0
+    movu [synthq + x*4 + 2], m4
+    add x, mmsize / 2
+    cmp x, widthq
+%else
+    mov t0w, word [synthq + x*4 + 0]
+    mov t1w, word [synthq + x*4 + 4]
+    imul t0w, 9
+    imul t1w, 9
+    sub t0w, word [synthq + x*4 + 8]
+    sub t1w, word [synthq + x*4 - 4]
+    add t0w, 8
+    add t0w, t1w
+    sar t0w, 4
+    sub word [synthq + x*4 + 2], t0w
+    add x, 1
+    cmp x, widthq
+%endif
+    jl .loop2_x_1
+
+    mov t0w, word [synthq + synth_w*2 - 4]
+    mov t1w, word [synthq + synth_w*2 - 8]
+    imul t0w, 17
+    sub t1w, 8 ; -8 so that the sign is corrected below
+    sub t0w, t1w
+    sar t0w, 4
+    sub word [synthq + synth_w*2 - 2], t0w
+
+    mov t0w, word [synthq + synth_w*2 - 4]
+    mov t1w, word [synthq + synth_w*2 - 8]
+    imul t0w, 8
+    imul t1w, 9
+    sub t0w, word [synthq + synth_w*2 - 12]
+    add t1w, 8
+    add t0w, t1w
+    sar t0w, 4
+    sub word [synthq + synth_w*2 - 6], t0w
+
+    ; Lifting stage 1
+    mov t0w, word [synthq + 2]
+    add t0w, t0w
+    add t0w, 2
+    sar t0w, 2
+    add word [synthq], t0w
+
+    mov x, 1
+    add widthq, 1
+.loop2_x_2:
+%if 0 ; needs horizontal masking
+    movu m0, [synthq + x*4 - 2]
+    movu m1, [synthq + x*4 + 2]
+    paddw m0, m1
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m2, [synthq + x*4 + 0]
+    paddw m2, m0
+    movu [synthq + x*4 + 0], m2
+    add x, mmsize / 2
+    cmp x, widthq
+%else
+    mov t0w, word [synthq + x*4 - 2]
+    mov t1w, word [synthq + x*4 + 2]
+    add t0w, 2
+    add t0w, t1w
+    sar t0w, 2
+    add word [synthq + x*4 + 0], t0w
+    add x, 1
+    cmp x, widthq
+%endif
+    jl .loop2_x_2
+
+    mov t0w, word [synthq + synth_w*2 - 6]
+    mov t1w, word [synthq + synth_w*2 - 2]
+    add t0w, t1w
+    add t0w, 2
+    sar t0w, 2
+    add word [synthq + synth_w*2 - 4], t0w
+
+    lea synthq, [synthq + synth_w*2]
+    sub y, 1
+    jg .loop2_y
+
+    ; Vertical synthesis: Lifting stage 2
+    mov synthq, [rsp] ; No addition of synth_w so indicies in the loop have an extra synth_w added.
+    mov x, synth_w
+.loop3_x:
+    movu m0, [synthq]
+    movu m1, [synthq + synth_w*4]
+    movu m2, [synthq + synth_w*8]
+    pmullw m0, [pw_8]
+    pmullw m1, [pw_9]
+    psubw m0, m2
+    paddw m1, [pw_8]
+    paddw m0, m1
+    psraw m0, 4
+    movu m3, [synthq + synth_w*2]
+    psubw m3, m0
+    movu [synthq + synth_w*2], m3
+    add synthq, mmsize
+    sub x, mmsize/2
+    jg .loop3_x
+
+    mov synthq, [rsp]
+    mov t0, synth_w
+    neg t0
+    mov y, heightq
+    sub y, 3
+.loop4_y:
+    lea synthq, [synthq + synth_w*4]
+    mov t1, synthq
+    mov x, synth_w
+.loop4_x:
+    movu m0, [t1]
+    movu m1, [t1 + synth_w*4]
+    movu m2, [t1 + t0*4]
+    movu m3, [t1 + synth_w*8]
+    pmullw m0, [pw_9]
+    pmullw m1, [pw_9]
+    paddw m2, m3
+    paddw m0, [pw_8]
+    psubw m1, m2
+    paddw m0, m1
+    psraw m0, 4
+    movu m4, [t1 + synth_w*2]
+    psubw m4, m0
+    movu [t1 + synth_w*2], m4
+    add t1, mmsize
+    sub x, mmsize/2
+    jg .loop4_x
+    sub y, 1
+    jg .loop4_y
+
+    mov synthq, [rsp]
+    mov t0, synth_h
+    sub t0, 1
+    imul t0, synth_w
+    lea synthq, [synthq + t0*2]
+    neg synth_w
+    lea t1, [synth_w*3]
+    lea t0, [synth_w*5]
+    mov x, synth_w
+.loop5_x:
+    movu m0, [synthq + synth_w*2]
+    movu m1, [synthq + t1*2]
+    pmullw m0, [pw_17]
+    psubw m1, [pw_8] ; -8 so that the sign is corrected below
+    psubw m0, m1
+    psraw m0, 4
+    movu m2, [synthq]
+    psubw m2, m0
+    movu [synthq], m2
+
+    movu m0, [synthq + t1*2]
+    movu m1, [synthq + synth_w*2]
+    movu m2, [synthq + t0*2]
+    pmullw m0, [pw_9]
+    pmullw m1, [pw_8]
+    psubw m2, [pw_8] ; -8 so that the sign is corrected below
+    paddw m0, m1
+    psubw m0, m2
+    psraw m0, 4
+    movu m3, [synthq + synth_w*4]
+    psubw m3, m0
+    movu [synthq + synth_w*4], m3
+
+    add synthq, mmsize
+    add x, mmsize/2
+    jl .loop5_x
+
+    ; Vertical synthesis: Lifting stage 1
+    mov synthq, [rsp]
+    mov x, synth_w
+    neg synth_w
+.loop6_x:
+    movu m0, [synthq + synth_w]
+    paddw m0, m0
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m1, [synthq]
+    paddw m1, m0
+    movu [synthq], m1
+    add synthq, mmsize
+    add x, mmsize/2
+    jl .loop6_x
+
+    mov synthq, [rsp]
+    lea synthq, [synthq + synth_w*2] ; 1 synth_w left out so indicies in the loop have an extra synth_w added.
+    mov y, heightq
+    sub y, 2
+.loop7_y:
+    mov t0, synthq
+    mov x, synth_w
+.loop7_x:
+    movu m0, [t0]
+    movu m1, [t0 + synth_w*4]
+    paddw m0, m1
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m2, [t0 + synth_w*2]
+    paddw m2, m0
+    movu [t0 + synth_w*2], m2
+    add t0, mmsize
+    sub x, mmsize/2
+    jg .loop7_x
+    lea synthq, [synthq + synth_w*4]
+    sub y, 1
+    jg .loop7_y
+
+    mov synthq, [rsp]
+    mov t0, synth_h
+    sub t0, 3
+    imul t0, synth_w ; 1 synth_w left out so indicies in the loop have an extra synth_w added.
+    lea synthq, [synthq + t0*2]
+    mov x, synth_w
+.loop8_x:
+    movu m0, [synthq]
+    movu m1, [synthq + synth_w*4]
+    paddw m0, m1
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m2, [synthq + synth_w*2]
+    paddw m2, m0
+    movu [synthq + synth_w*2], m2
+    add synthq, mmsize
+    sub x, mmsize/2
+    jg .loop8_x
+
+    add rsp, 5 * gprsize
+RET
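Rather than eyeballing decoded PNGs, the C and SSE2 transforms can also be diffed coefficient by coefficient. Below is a minimal, hypothetical harness for that (it is not part of the patch): it assumes it is built inside the FFmpeg tree with this diff applied and run on an SSE2-capable machine, and the block size, stride and input range are illustrative only.

/* Sketch: run the C and SSE2 9/7 DWT on identical random input and report
 * mismatching coefficients.  Not part of the patch; dimensions are examples. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavcodec/vc2enc_dwt.h"

int main(void)
{
    /* The transform processes a 2*width x 2*height block at the given stride. */
    const int width = 256, height = 144;
    const ptrdiff_t stride = 512;
    const int total = 2 * height * stride;
    VC2TransformContext c_ctx, simd_ctx;
    dwtcoef *c_buf    = av_malloc(total * sizeof(dwtcoef));
    dwtcoef *simd_buf = av_malloc(total * sizeof(dwtcoef));
    int i, mismatches = 0;

    for (i = 0; i < total; i++)                 /* roughly 10-bit signed input */
        c_buf[i] = simd_buf[i] = rand() % 1021 - 510;

    av_force_cpu_flags(0);                      /* keep the plain C function pointers */
    ff_vc2enc_init_transforms(&c_ctx, 2 * width, 2 * height);
    c_ctx.vc2_subband_dwt[VC2_TRANSFORM_9_7](&c_ctx, c_buf, stride, width, height);

    av_force_cpu_flags(-1);                     /* back to autodetection -> SSE2 wrapper */
    ff_vc2enc_init_transforms(&simd_ctx, 2 * width, 2 * height);
    simd_ctx.vc2_subband_dwt[VC2_TRANSFORM_9_7](&simd_ctx, simd_buf, stride, width, height);

    for (i = 0; i < total; i++)
        if (c_buf[i] != simd_buf[i] && mismatches++ < 16)
            printf("mismatch at x=%d y=%d: %d vs %d\n",
                   (int)(i % stride), (int)(i / stride), c_buf[i], simd_buf[i]);
    printf("%d mismatching coefficients\n", mismatches);

    av_free(c_ctx.buffer);
    av_free(simd_ctx.buffer);
    av_free(c_buf);
    av_free(simd_buf);
    return 0;
}

The mismatch coordinates should make it obvious whether the problem is confined to the bottom-right quadrant (the HH band after deinterleaving) or spread across the whole block.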