Hello, I've been working on assembly for the vc2 encoder and have reached an impasse. My code produces very visible errors: obvious vertical streaks in the bottom-right half of the image, plus what looks like some low-frequency distortion.
I cannot see the problem in my code, so I need some fresh eyes to look at it. You can test the code with these two commands:

./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -cpuflags 0 -y temp.vc2
./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -y temp.vc2

Decode each output to a PNG image to see the problem (example decode commands below). Attached is a diff from master (my branch is about two dozen small commits). Thanks.
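The decode step is not part of the patch; something along these lines should work (output names are just examples), run after each encode since both commands write temp.vc2 — the -cpuflags 0 run gives the plain C reference, the other run uses the SIMD path:

./ffmpeg -i temp.vc2 -vframes 1 -y c_version.png
./ffmpeg -i temp.vc2 -vframes 1 -y simd_version.png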
diff --git a/libavcodec/vc2enc_dwt.c b/libavcodec/vc2enc_dwt.c
index c60b003..bac262d 100644
--- a/libavcodec/vc2enc_dwt.c
+++ b/libavcodec/vc2enc_dwt.c
@@ -23,6 +23,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 #include "vc2enc_dwt.h"
+#include "libavcodec/x86/vc2dsp.h"
 
 /* Since the transforms spit out interleaved coefficients, this function
  * rearranges the coefficients into the more traditional subdivision,
@@ -76,16 +77,21 @@ static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
     for (y = 0; y < synth_height; y++) {
         /* Lifting stage 2. */
         synthl[1] -= (8*synthl[0] + 9*synthl[2] - synthl[4] + 8) >> 4;
+
         for (x = 1; x < width - 2; x++)
-            synthl[2*x + 1] -= (9*synthl[2*x] + 9*synthl[2*x + 2] - synthl[2*x + 4] -
-                                synthl[2 * x - 2] + 8) >> 4;
+            synthl[2*x + 1] -= (9*synthl[2*x] +
+                                9*synthl[2*x + 2] -
+                                synthl[2*x + 4] -
+                                synthl[2*x - 2] + 8) >> 4;
+
         synthl[synth_width - 1] -= (17*synthl[synth_width - 2] -
-                                    synthl[synth_width - 4] + 8) >> 4;
+                                     synthl[synth_width - 4] + 8) >> 4;
         synthl[synth_width - 3] -= (8*synthl[synth_width - 2] +
                                     9*synthl[synth_width - 4] -
-                                    synthl[synth_width - 6] + 8) >> 4;
+                                     synthl[synth_width - 6] + 8) >> 4;
         /* Lifting stage 1. */
         synthl[0] += (synthl[1] + synthl[1] + 2) >> 2;
+
         for (x = 1; x < width - 1; x++)
             synthl[2*x] += (synthl[2*x - 1] + synthl[2*x + 1] + 2) >> 2;
 
@@ -97,25 +103,28 @@
     /* Vertical synthesis: Lifting stage 2. */
     synthl = synth + synth_width;
     for (x = 0; x < synth_width; x++)
-        synthl[x] -= (8*synthl[x - synth_width] + 9*synthl[x + synth_width] -
-                      synthl[x + 3 * synth_width] + 8) >> 4;
+        synthl[x] -= (8*synthl[x - synth_width] +
+                      9*synthl[x + synth_width] -
+                      synthl[x + 3*synth_width] + 8) >> 4;
 
     synthl = synth + (synth_width << 1);
     for (y = 1; y < height - 2; y++) {
         for (x = 0; x < synth_width; x++)
             synthl[x + synth_width] -= (9*synthl[x] +
-                                        9*synthl[x + 2 * synth_width] -
-                                        synthl[x - 2 * synth_width] -
-                                        synthl[x + 4 * synth_width] + 8) >> 4;
+                                        9*synthl[x + 2*synth_width] -
+                                        synthl[x - 2*synth_width] -
+                                        synthl[x + 4*synth_width] + 8) >> 4;
         synthl += synth_width << 1;
     }
 
     synthl = synth + (synth_height - 1) * synth_width;
     for (x = 0; x < synth_width; x++) {
         synthl[x] -= (17*synthl[x - synth_width] -
-                      synthl[x - 3*synth_width] + 8) >> 4;
-        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
-            8*synthl[x - 1*synth_width] - synthl[x - 5*synth_width] + 8) >> 4;
+                       synthl[x - 3*synth_width] + 8) >> 4;
+
+        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
+                                      8*synthl[x - 1*synth_width] -
+                                      synthl[x - 5*synth_width] + 8) >> 4;
     }
 
     /* Vertical synthesis: Lifting stage 1. */
@@ -262,6 +271,10 @@ av_cold int ff_vc2enc_init_transforms(VC2TransformContext *s, int p_width, int p
     s->vc2_subband_dwt[VC2_TRANSFORM_HAAR]   = vc2_subband_dwt_haar;
     s->vc2_subband_dwt[VC2_TRANSFORM_HAAR_S] = vc2_subband_dwt_haar_shift;
 
+#if ARCH_X86
+    ff_vc2enc_init_x86(s, p_width, p_height);
+#endif
+
     s->buffer = av_malloc(2*p_width*p_height*sizeof(dwtcoef));
     if (!s->buffer)
         return 1;
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 839b5bc..24aedda 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -34,6 +34,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
 OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
 OBJS-$(CONFIG_RV34DSP)                 += x86/rv34dsp_init.o
 OBJS-$(CONFIG_VC1DSP)                  += x86/vc1dsp_init.o
+OBJS-$(CONFIG_VC2_ENCODER)             += x86/vc2dsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
 OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
@@ -122,6 +123,7 @@ YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                \
 YASM-OBJS-$(CONFIG_RV34DSP)            += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_VC1DSP)             += x86/vc1dsp_loopfilter.o      \
                                           x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_VC2_ENCODER)        += x86/vc2enc_dwt.o
 YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/simple_idct10.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
diff --git a/libavcodec/x86/vc2dsp.h b/libavcodec/x86/vc2dsp.h
new file mode 100644
index 0000000..825862e
--- /dev/null
+++ b/libavcodec/x86/vc2dsp.h
@@ -0,0 +1,28 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VC2DSP_H
+#define AVCODEC_X86_VC2DSP_H
+
+#include "libavcodec/vc2enc_dwt.h"
+
+void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height);
+
+#endif /* AVCODEC_X86_VC2DSP_H */
diff --git a/libavcodec/x86/vc2dsp_init.c b/libavcodec/x86/vc2dsp_init.c
new file mode 100644
index 0000000..b3bdb53
--- /dev/null
+++ b/libavcodec/x86/vc2dsp_init.c
@@ -0,0 +1,67 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/x86/vc2dsp.h"
+
+#if HAVE_SSE2_EXTERNAL
+void ff_vc2_subband_dwt_97_sse2(VC2TransformContext*, dwtcoef*, ptrdiff_t, int , int);
+#endif
+
+static av_always_inline void deinterleave(dwtcoef *linell, ptrdiff_t stride,
+                                          int width, int height, dwtcoef *synthl)
+{
+    int x, y;
+    ptrdiff_t synthw = width << 1;
+    dwtcoef *linehl = linell + width;
+    dwtcoef *linelh = linell + height*stride;
+    dwtcoef *linehh = linelh + width;
+
+    /* Deinterleave the coefficients. */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            linell[x] = synthl[(x << 1)];
+            linehl[x] = synthl[(x << 1) + 1];
+            linelh[x] = synthl[(x << 1) + synthw];
+            linehh[x] = synthl[(x << 1) + synthw + 1];
+        }
+        synthl += synthw << 1;
+        linell += stride;
+        linelh += stride;
+        linehl += stride;
+        linehh += stride;
+    }
+}
+
+void static wrapper_97(VC2TransformContext *t, dwtcoef *data, ptrdiff_t stride, int width, int height)
+{
+    ff_vc2_subband_dwt_97_sse2(t, data, stride, width, height);
+    deinterleave(data, stride, width, height, t->buffer);
+}
+
+av_cold void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height)
+{
+    int cpuflags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpuflags)) {
+        s->vc2_subband_dwt[VC2_TRANSFORM_9_7] = wrapper_97;
+    }
+}
diff --git a/libavcodec/x86/vc2enc_dwt.asm b/libavcodec/x86/vc2enc_dwt.asm
new file mode 100644
index 0000000..c6fdf36
--- /dev/null
+++ b/libavcodec/x86/vc2enc_dwt.asm
@@ -0,0 +1,312 @@
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pw_2
+cextern pw_8
+cextern pw_9
+cextern pw_17
+
+section .text
+
+INIT_XMM sse2
+cglobal vc2_subband_dwt_97, 5, 12, 5, synth, data, stride, width, height
+    sub rsp, 5 * gprsize
+    mov synthq, [synthq]
+    movsxdifnidn widthq, widthd
+    movsxdifnidn heightq, heightd
+    mov [rsp + gprsize*0], synthq
+    mov [rsp + gprsize*1], dataq
+    mov [rsp + gprsize*2], strideq
+    mov [rsp + gprsize*3], widthq
+    mov [rsp + gprsize*4], heightq
+
+    DECLARE_REG_TMP 9, 10, 11
+    %define synth_w r5
+    %define synth_h r6
+    mov synth_w, widthq
+    mov synth_h, heightq
+    add synth_w, synth_w
+    add synth_h, synth_h
+
+    %define x r7
+    %define y r8
+    ; shifting loop (for precision and copy)
+    lea synthq, [synthq + synth_w*2] ; add 1 synth_w to save comparing x against synth_w
+    lea dataq, [dataq + synth_w*2]
+    mov y, synth_h
+.loop1_y:
+    mov x, synth_w
+    neg x
+.loop1_x:
+    movu m0, [dataq + x*2]
+    paddw m0, m0
+    movu [synthq + x*2], m0
+    add x, mmsize/2
+    jl .loop1_x
+
+    lea dataq, [dataq + strideq*2]
+    lea synthq, [synthq + synth_w*2]
+    sub y, 1
+    jg .loop1_y
+
+    ; Horizontal synthesis
+    mov synthq, [rsp]
+    mov y, synth_h
+.loop2_y:
+    ; Lifting stage 2
+    mov t0w, word [synthq+0]
+    mov t1w, word [synthq+4]
+    imul t0w, 8
+    imul t1w, 9
+    add t0w, 8
+    sub t1w, word [synthq+8]
+    add t0w, t1w
+    sar t0w, 4
+    sub word [synthq+2], t0w
+
+    mov x, 1
+    sub widthq, 2
+.loop2_x_1:
+%if 0 ; needs horizontal masking
+    movu m0, [synthq + x*4 + 0]
+    movu m1, [synthq + x*4 + 4]
+    movu m2, [synthq + x*4 + 8]
+    movu m3, [synthq + x*4 - 4]
+    pmullw m0, [pw_9]
+    pmullw m1, [pw_9]
+    psubw m0, m2
+    psubw m1, m3
+    paddw m0, m1
+    paddw m0, [pw_8]
+    psraw m0, 4
+    movu m4, [synthq + x*4 + 2]
+    psubw m4, m0
+    movu [synthq + x*4 + 2], m4
+    add x, mmsize / 2
+    cmp x, widthq
+%else
+    mov t0w, word [synthq + x*4 + 0]
+    mov t1w, word [synthq + x*4 + 4]
+    imul t0w, 9
+    imul t1w, 9
+    sub t0w, word [synthq + x*4 + 8]
+    sub t1w, word [synthq + x*4 - 4]
+    add t0w, 8
+    add t0w, t1w
+    sar t0w, 4
+    sub word [synthq + x*4 + 2], t0w
+    add x, 1
+    cmp x, widthq
+%endif
+    jl .loop2_x_1
+
+    mov t0w, word [synthq + synth_w*2 - 4]
+    mov t1w, word [synthq + synth_w*2 - 8]
+    imul t0w, 17
+    sub t1w, 8 ; -8 so that the sign is corrected below
+    sub t0w, t1w
+    sar t0w, 4
+    sub word [synthq + synth_w*2 - 2], t0w
+
+    mov t0w, word [synthq + synth_w*2 - 4]
+    mov t1w, word [synthq + synth_w*2 - 8]
+    imul t0w, 8
+    imul t1w, 9
+    sub t0w, word [synthq + synth_w*2 - 12]
+    add t1w, 8
+    add t0w, t1w
+    sar t0w, 4
+    sub word [synthq + synth_w*2 - 6], t0w
+
+    ; Lifting stage 1
+    mov t0w, word [synthq + 2]
+    add t0w, t0w
+    add t0w, 2
+    sar t0w, 2
+    add word [synthq], t0w
+
+    mov x, 1
+    add widthq, 1
+.loop2_x_2:
+%if 0 ; needs horizontal masking
+    movu m0, [synthq + x*4 - 2]
+    movu m1, [synthq + x*4 + 2]
+    paddw m0, m1
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m2, [synthq + x*4 + 0]
+    paddw m2, m0
+    movu [synthq + x*4 + 0], m2
+    add x, mmsize / 2
+    cmp x, widthq
+%else
+    mov t0w, word [synthq + x*4 - 2]
+    mov t1w, word [synthq + x*4 + 2]
+    add t0w, 2
+    add t0w, t1w
+    sar t0w, 2
+    add word [synthq + x*4 + 0], t0w
+    add x, 1
+    cmp x, widthq
+%endif
+    jl .loop2_x_2
+
+    mov t0w, word [synthq + synth_w*2 - 6]
+    mov t1w, word [synthq + synth_w*2 - 2]
+    add t0w, t1w
+    add t0w, 2
+    sar t0w, 2
+    add word [synthq + synth_w*2 - 4], t0w
+
+    lea synthq, [synthq + synth_w*2]
+    sub y, 1
+    jg .loop2_y
+
+    ; Vertical synthesis: Lifting stage 2
+    mov synthq, [rsp] ; No addition of synth_w so indicies in the loop have an extra synth_w added.
+    mov x, synth_w
+.loop3_x:
+    movu m0, [synthq]
+    movu m1, [synthq + synth_w*4]
+    movu m2, [synthq + synth_w*8]
+    pmullw m0, [pw_8]
+    pmullw m1, [pw_9]
+    psubw m0, m2
+    paddw m1, [pw_8]
+    paddw m0, m1
+    psraw m0, 4
+    movu m3, [synthq + synth_w*2]
+    psubw m3, m0
+    movu [synthq + synth_w*2], m3
+    add synthq, mmsize
+    sub x, mmsize/2
+    jg .loop3_x
+
+    mov synthq, [rsp]
+    mov t0, synth_w
+    neg t0
+    mov y, heightq
+    sub y, 3
+.loop4_y:
+    lea synthq, [synthq + synth_w*4]
+    mov t1, synthq
+    mov x, synth_w
+.loop4_x:
+    movu m0, [t1]
+    movu m1, [t1 + synth_w*4]
+    movu m2, [t1 + t0*4]
+    movu m3, [t1 + synth_w*8]
+    pmullw m0, [pw_9]
+    pmullw m1, [pw_9]
+    paddw m2, m3
+    paddw m0, [pw_8]
+    psubw m1, m2
+    paddw m0, m1
+    psraw m0, 4
+    movu m4, [t1 + synth_w*2]
+    psubw m4, m0
+    movu [t1 + synth_w*2], m4
+    add t1, mmsize
+    sub x, mmsize/2
+    jg .loop4_x
+    sub y, 1
+    jg .loop4_y
+
+    mov synthq, [rsp]
+    mov t0, synth_h
+    sub t0, 1
+    imul t0, synth_w
+    lea synthq, [synthq + t0*2]
+    neg synth_w
+    lea t1, [synth_w*3]
+    lea t0, [synth_w*5]
+    mov x, synth_w
+.loop5_x:
+    movu m0, [synthq + synth_w*2]
+    movu m1, [synthq + t1*2]
+    pmullw m0, [pw_17]
+    psubw m1, [pw_8] ; -8 so that the sign is corrected below
+    psubw m0, m1
+    psraw m0, 4
+    movu m2, [synthq]
+    psubw m2, m0
+    movu [synthq], m2
+
+    movu m0, [synthq + t1*2]
+    movu m1, [synthq + synth_w*2]
+    movu m2, [synthq + t0*2]
+    pmullw m0, [pw_9]
+    pmullw m1, [pw_8]
+    psubw m2, [pw_8] ; -8 so that the sign is corrected below
+    paddw m0, m1
+    psubw m0, m2
+    psraw m0, 4
+    movu m3, [synthq + synth_w*4]
+    psubw m3, m0
+    movu [synthq + synth_w*4], m3
+
+    add synthq, mmsize
+    add x, mmsize/2
+    jl .loop5_x
+
+    ; Vertical synthesis: Lifting stage 1
+    mov synthq, [rsp]
+    mov x, synth_w
+    neg synth_w
+.loop6_x:
+    movu m0, [synthq + synth_w]
+    paddw m0, m0
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m1, [synthq]
+    paddw m1, m0
+    movu [synthq], m1
+    add synthq, mmsize
+    add x, mmsize/2
+    jl .loop6_x
+
+    mov synthq, [rsp]
+    lea synthq, [synthq + synth_w*2] ; 1 synth_w left out so indicies in the loop have an extra synth_w added.
+    mov y, heightq
+    sub y, 2
+.loop7_y:
+    mov t0, synthq
+    mov x, synth_w
+.loop7_x:
+    movu m0, [t0]
+    movu m1, [t0 + synth_w*4]
+    paddw m0, m1
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m2, [t0 + synth_w*2]
+    paddw m2, m0
+    movu [t0 + synth_w*2], m2
+    add t0, mmsize
+    sub x, mmsize/2
+    jg .loop7_x
+    lea synthq, [synthq + synth_w*4]
+    sub y, 1
+    jg .loop7_y
+
+    mov synthq, [rsp]
+    mov t0, synth_h
+    sub t0, 3
+    imul t0, synth_w ; 1 synth_w left out so indicies in the loop have an extra synth_w added.
+    lea synthq, [synthq + t0*2]
+    mov x, synth_w
+.loop8_x:
+    movu m0, [synthq]
+    movu m1, [synthq + synth_w*4]
+    paddw m0, m1
+    paddw m0, [pw_2]
+    psraw m0, 2
+    movu m2, [synthq + synth_w*2]
+    paddw m2, m0
+    movu [synthq + synth_w*2], m2
+    add synthq, mmsize
+    sub x, mmsize/2
+    jg .loop8_x
+
+    add rsp, 5 * gprsize
+RET
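Rather than eyeballing decoded PNGs, the C and SSE2 transforms can also be diffed coefficient by coefficient. Below is a minimal, hypothetical harness for that (it is not part of the patch): it assumes it is built inside the FFmpeg tree with this diff applied and run on an SSE2-capable machine, and the block size, stride and input range are illustrative only.

/* Sketch: run the C and SSE2 9/7 DWT on identical random input and report
 * mismatching coefficients.  Not part of the patch; dimensions are examples. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavcodec/vc2enc_dwt.h"

int main(void)
{
    /* The transform processes a 2*width x 2*height block at the given stride. */
    const int width = 256, height = 144;
    const ptrdiff_t stride = 512;
    const int total = 2 * height * stride;
    VC2TransformContext c_ctx, simd_ctx;
    dwtcoef *c_buf    = av_malloc(total * sizeof(dwtcoef));
    dwtcoef *simd_buf = av_malloc(total * sizeof(dwtcoef));
    int i, mismatches = 0;

    for (i = 0; i < total; i++)                 /* roughly 10-bit signed input */
        c_buf[i] = simd_buf[i] = rand() % 1021 - 510;

    av_force_cpu_flags(0);                      /* keep the plain C function pointers */
    ff_vc2enc_init_transforms(&c_ctx, 2 * width, 2 * height);
    c_ctx.vc2_subband_dwt[VC2_TRANSFORM_9_7](&c_ctx, c_buf, stride, width, height);

    av_force_cpu_flags(-1);                     /* back to autodetection -> SSE2 wrapper */
    ff_vc2enc_init_transforms(&simd_ctx, 2 * width, 2 * height);
    simd_ctx.vc2_subband_dwt[VC2_TRANSFORM_9_7](&simd_ctx, simd_buf, stride, width, height);

    for (i = 0; i < total; i++)
        if (c_buf[i] != simd_buf[i] && mismatches++ < 16)
            printf("mismatch at x=%d y=%d: %d vs %d\n",
                   (int)(i % stride), (int)(i / stride), c_buf[i], simd_buf[i]);
    printf("%d mismatching coefficients\n", mismatches);

    av_free(c_ctx.buffer);
    av_free(simd_ctx.buffer);
    av_free(c_buf);
    av_free(simd_buf);
    return 0;
}

The mismatch coordinates should make it obvious whether the problem is confined to the bottom-right quadrant (the HH band after deinterleaving) or spread across the whole block.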