On Wed, 15 Jun 2016, Janne Grunau wrote:

On 2016-06-11 23:32:22 +0300, Martin Storsjö wrote:
The forward dcts are based partially on x264. The idct tests
themselves are inspired by similar tests for vp9 by Ronald
Bultje.
---
Should we mark this as GPL, or are x264 people ok with picking
this part (which is more or less straight from the spec)?

Marking tests for LGPL code as GPL feels strange; we should try to avoid
that.

Indeed. I'll make that code look a little less like it was copy-pasted from x264, and I guess it should be fine then.

I'm a little unsure about the scaling for the 8x8 forward dct;
the output after idct+add might be off by one occasionally.

I'm not sure if it is even supposed to be exactly reversible.

Ok, that probably explains it then.

---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/h264dsp.c  | 268 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 273 insertions(+)
 create mode 100644 tests/checkasm/h264dsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8a0cee5..3c23853 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -2,6 +2,7 @@
 # subsystems
 AVCODECOBJS-$(CONFIG_BSWAPDSP)          += bswapdsp.o
 AVCODECOBJS-$(CONFIG_FMTCONVERT)        += fmtconvert.o
+AVCODECOBJS-$(CONFIG_H264DSP)           += h264dsp.o
 AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index c75e431..15f9f68 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -74,6 +74,9 @@ static const struct {
 #if CONFIG_FMTCONVERT
     { "fmtconvert", checkasm_check_fmtconvert },
 #endif
+#if CONFIG_H264DSP
+    { "h264dsp", checkasm_check_h264dsp },
+#endif
 #if CONFIG_H264PRED
     { "h264pred", checkasm_check_h264pred },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index c7aa19c..619ebc7 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -33,6 +33,7 @@
 void checkasm_check_bswapdsp(void);
 void checkasm_check_dcadsp(void);
 void checkasm_check_fmtconvert(void);
+void checkasm_check_h264dsp(void);
 void checkasm_check_h264pred(void);
 void checkasm_check_h264qpel(void);
 void checkasm_check_hevc_mc(void);
diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c
new file mode 100644
index 0000000..bbdf74b
--- /dev/null
+++ b/tests/checkasm/h264dsp.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016 Martin Storsjo
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264dsp.h"
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x01ff01ff, 0x03ff03ff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define SIZEOF_COEF  (2 * ((bit_depth + 7) / 8))
+
+#define randomize_buffers()                                               \
+    do {                                                                  \
+        uint32_t mask = pixel_mask[bit_depth - 8];                        \
+        for (y = 0; y < sz; y++) {                                        \
+            for (x = 0; x < sz * SIZEOF_PIXEL; x += 4) {                  \
+                AV_WN32A(src + y * sz * SIZEOF_PIXEL + x, rnd() & mask);  \
+                AV_WN32A(dst + y * sz * SIZEOF_PIXEL + x, rnd() & mask);  \
+            }                                                             \
+            for (x = 0; x < sz; x++) {                                    \
+                if (bit_depth == 8) {                                     \
+                    coef[y * sz + x] = src[y * sz + x] - dst[y * sz + x]; \
+                } else {                                                  \
+                    ((int32_t *)coef)[y * sz + x] =                       \
+                        ((uint16_t *)src)[y * sz + x] -                   \
+                        ((uint16_t *)dst)[y * sz + x];                    \
+                }                                                         \
+            }                                                             \
+        }                                                                 \
+    } while (0)
+
+#define dct4x4_impl(size, dctcoef) \
+static void dct4x4_##size(dctcoef *coef)                                       
           \
+{                                                                              
           \
+    int i, y, x;                                                               
           \
+    dctcoef tmp[16];                                                           
           \
+    for (i = 0; i < 4; i++) {                                                  
           \
+        int z0 = coef[i*4 + 0] + coef[i*4 + 3];                                
           \
+        int z1 = coef[i*4 + 1] + coef[i*4 + 2];                                
           \
+        int z2 = coef[i*4 + 0] - coef[i*4 + 3];                                
           \
+        int z3 = coef[i*4 + 1] - coef[i*4 + 2];                                
           \
+        tmp[i + 4*0] =   z0 +   z1;                                            
           \
+        tmp[i + 4*1] = 2*z2 +   z3;                                            
           \
+        tmp[i + 4*2] =   z0 -   z1;                                            
           \
+        tmp[i + 4*3] =   z2 - 2*z3;                                            
           \
+    }                                                                          
           \
+    for (i = 0; i < 4; i++) {                                                  
           \
+        int z0 = tmp[i*4 + 0] + tmp[i*4 + 3];                                  
           \
+        int z1 = tmp[i*4 + 1] + tmp[i*4 + 2];                                  
           \
+        int z2 = tmp[i*4 + 0] - tmp[i*4 + 3];                                  
           \
+        int z3 = tmp[i*4 + 1] - tmp[i*4 + 2];                                  
           \
+        coef[i*4 + 0] =   z0 +   z1;                                           
           \
+        coef[i*4 + 1] = 2*z2 +   z3;                                           
           \
+        coef[i*4 + 2] =   z0 -   z1;                                           
           \
+        coef[i*4 + 3] =   z2 - 2*z3;                                           
           \
+    }                                                                          
           \
+    for (y = 0; y < 4; y++) {                                                  
           \
+        for (x = 0; x < 4; x++) {                                              
           \
+            static const int scale[] = { 13107 * 10, 8066 * 13, 5243 * 16 };   
           \
+            coef[y*4 + x] = (coef[y*4 + x] * scale[(y & 1) + (x & 1)] + (1 << 14)) 
>> 15; \
+        }                                                                      
           \
+    }                                                                          
           \
+}
+
+#define DCT8_1D {\
+    int s07 = SRC(0) + SRC(7);\
+    int s16 = SRC(1) + SRC(6);\
+    int s25 = SRC(2) + SRC(5);\
+    int s34 = SRC(3) + SRC(4);\
+    int a0 = s07 + s34;\
+    int a1 = s16 + s25;\
+    int a2 = s07 - s34;\
+    int a3 = s16 - s25;\
+    int d07 = SRC(0) - SRC(7);\
+    int d16 = SRC(1) - SRC(6);\
+    int d25 = SRC(2) - SRC(5);\
+    int d34 = SRC(3) - SRC(4);\
+    int a4 = d16 + d25 + (d07 + (d07>>1));\
+    int a5 = d07 - d34 - (d25 + (d25>>1));\
+    int a6 = d07 + d34 - (d16 + (d16>>1));\
+    int a7 = d16 - d25 + (d34 + (d34>>1));\
+    DST(0) =  a0 + a1     ;\
+    DST(1) =  a4 + (a7>>2);\
+    DST(2) =  a2 + (a3>>1);\
+    DST(3) =  a5 + (a6>>2);\
+    DST(4) =  a0 - a1     ;\
+    DST(5) =  a6 - (a5>>2);\
+    DST(6) = (a2>>1) - a3 ;\
+    DST(7) = (a4>>2) - a7 ;\
+}
+
+static void dct8x8_16(int16_t *coef)
+{
+    int i;
+    int16_t tmp[64];
+#define SRC(x) coef[x*8 + i]
+#define DST(x) tmp [x*8 + i]
+    for (i = 0; i < 8; i++)
+        DCT8_1D
+#undef SRC
+#undef DST
+
+#define SRC(x) tmp [i*8 + x]
+#define DST(x) coef[x*8 + i]
+    for (i = 0; i < 8; i++)
+        DCT8_1D
+#undef SRC
+#undef DST
+}
+
+static void dct8x8_32(int32_t *coef)
+{
+    int i;
+    int32_t tmp[64];
+#define SRC(x) coef[x*8 + i]
+#define DST(x) tmp [x*8 + i]
+    for (i = 0; i < 8; i++)
+        DCT8_1D
+#undef SRC
+#undef DST
+
+#define SRC(x) tmp [i*8 + x]
+#define DST(x) coef[x*8 + i]
+    for (i = 0; i < 8; i++)
+        DCT8_1D
+#undef SRC
+#undef DST
+}
+
+#define scale8x8_impl(size, dctcoef)                                           
      \
+static void scale8x8_##size(dctcoef *coef)                                     
      \
+{                                                                              
      \
+    int x, y;                                                                  
      \
+    for (y = 0; y < 8; y++) {                                                  
      \
+        for (x = 0; x < 8; x++) {                                              
      \
+            static const int scale[] = {                                       
      \
+                13107 * 20, 11428 * 18, 20972 * 32,                            
      \
+                12222 * 19, 16777 * 25, 15481 * 24,                            
      \
+            };                                                                 
      \
+            static const int idxmap[] = {                                      
      \
+                0, 3, 4, 3,                                                    
      \
+                3, 1, 5, 1,                                                    
      \
+                4, 5, 2, 5,                                                    
      \
+                3, 1, 5, 1,                                                    
      \
+            };                                                                 
      \
+            int idx = idxmap[(y & 3) * 4 + (x & 3)];                           
      \
+            coef[y*8 + x] = ((int64_t)coef[y*8 + x] * scale[idx] + (1 << 17)) 
>> 18; \
+        }                                                                      
      \
+    }                                                                          
      \
+}
+
+dct4x4_impl(16, int16_t)
+dct4x4_impl(32, int32_t)
+
+scale8x8_impl(16, int16_t)
+scale8x8_impl(32, int32_t)
+
+static void dct4x4(int16_t *coef, int bit_depth)
+{
+    if (bit_depth == 8)
+        dct4x4_16(coef);
+    else
+        dct4x4_32((int32_t *) coef);
+}
+
+static void dct8x8(int16_t *coef, int bit_depth)
+{
+    if (bit_depth == 8) {
+        dct8x8_16(coef);
+        scale8x8_16(coef);
+    } else {
+        dct8x8_32((int32_t *) coef);
+        scale8x8_32((int32_t *) coef);
+    }
+}
+
+
+static int iszero(void *buf, int sz)
+{
+    int i;
+    for (i = 0; i < sz; i++)
+        if (((uint8_t *)buf)[i])
+            return 0;
+    return 1;
+}
+
+static void check_idct(void)
+{
+    LOCAL_ALIGNED_16(uint8_t, src, [8 * 8 * 2]);
+    LOCAL_ALIGNED_16(uint8_t, dst, [8 * 8 * 2]);
+    LOCAL_ALIGNED_16(uint8_t, dst0, [8 * 8 * 2]);
+    LOCAL_ALIGNED_16(uint8_t, dst1, [8 * 8 * 2]);
+    LOCAL_ALIGNED_16(int16_t, coef, [8 * 8 * 2]);
+    LOCAL_ALIGNED_16(int16_t, subcoef0, [8 * 8 * 2]);
+    LOCAL_ALIGNED_16(int16_t, subcoef1, [8 * 8 * 2]);
+    H264DSPContext h;
+    int bit_depth, sz;
+    int x, y, dc;
+    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *block, int 
stride);
+
+    for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+        ff_h264dsp_init(&h, bit_depth, 1);
+        for (sz = 4; sz <= 8; sz += 4) {
+            randomize_buffers();
+
+            if (sz == 4)
+                dct4x4(coef, bit_depth);
+            else
+                dct8x8(coef, bit_depth);
+
+            for (dc = 0; dc <= 1; dc++) {
+                void (*idct)(uint8_t *, int16_t *, int);
+                switch ((sz << 1) | dc) {
+                case (4 << 1) | 0: idct = h.h264_idct_add; break;
+                case (4 << 1) | 1: idct = h.h264_idct_dc_add; break;
+                case (8 << 1) | 0: idct = h.h264_idct8_add; break;
+                case (8 << 1) | 1: idct = h.h264_idct8_dc_add; break;
+                }
+                if (check_func(idct, "h264_idct%d_add%s_%dbpp", sz, dc ? "_dc" : 
"", bit_depth)) {
+                    if (dc) {
+                        memset(subcoef0, 0, sz * sz * SIZEOF_COEF);
+                        subcoef0[0] = coef[0];
+                    } else {
+                        memcpy(subcoef0, coef, sz * sz * SIZEOF_COEF);
+                    }
+                    memcpy(dst0, dst, sz * sz * SIZEOF_PIXEL);
+                    memcpy(dst1, dst, sz * sz * SIZEOF_PIXEL);
+                    memcpy(subcoef1, subcoef0, sz * sz * SIZEOF_COEF);
+                    call_ref(dst0, subcoef0, sz * SIZEOF_PIXEL);
+                    call_new(dst1, subcoef1, sz * SIZEOF_PIXEL);
+                    if (memcmp(dst0, dst1, sz * sz * SIZEOF_PIXEL) ||
+                        !iszero(subcoef0, sz * sz * SIZEOF_COEF) ||
+                        !iszero(subcoef1, sz * sz * SIZEOF_COEF))

Just memcmp'ing subcoef0 and subcoef1 seems to be simpler. We assume for
checkasm that the C code is correct. The idct can either clear or keep
the coefficients. Not that it matters much; either way is OK.

That sounds sensible, thanks.

+                        fail();
+                    bench_new(dst0, subcoef0, sz * SIZEOF_PIXEL);

It looks a little strange to benchmark with zeroed coefficients, but it
shouldn't make a difference at this level.

Indeed - the alternative would be to copy in the original coefficients for each round, but that would require messing around inside bench_new.

// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to