hevc: speed up idct add

Josh de Kock Tue, 31 May 2016 09:32:20 -0700

x86/hevc: add avx2 dc idct

hevc: separate residual and prediction


x86/hevc_idct: replace old and unused idct functions

hevc: align coeffs to 32-byte boundary
Fixes a segfault caused by coeffs not being aligned to the 32-byte boundary
that the aligned AVX2 stores require

Authors:
James Almer <[email protected]>
Michael Niedermayer <[email protected]>
Pierre Edouard <[email protected]>
---
 libavcodec/hevc.c             |  28 ++--
 libavcodec/hevcdsp.c          |  23 ++--
 libavcodec/hevcdsp.h          |  14 +-
 libavcodec/hevcdsp_template.c | 313 ++++++++++++++++++------------------------
 libavcodec/x86/Makefile       |   3 +-
 libavcodec/x86/hevc_idct.asm  | 106 ++++++++++++++
 libavcodec/x86/hevcdsp_init.c |  57 ++++++++
 libavutil/x86/x86util.asm     |   4 +-
 8 files changed, 347 insertions(+), 201 deletions(-)
 create mode 100644 libavcodec/x86/hevc_idct.asm

diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index 177cf93..babc448 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -897,7 +897,7 @@ static void hls_residual_coding(HEVCContext *s, int x0, int 
y0,
     int vshift       = s->ps.sps->vshift[c_idx];
     uint8_t *dst     = &s->frame->data[c_idx][(y0 >> vshift) * stride +
                                               ((x0 >> hshift) << 
s->ps.sps->pixel_shift)];
-    DECLARE_ALIGNED(16, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE]) = { 0 };
+    DECLARE_ALIGNED(32, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE]) = { 0 };
     DECLARE_ALIGNED(8, uint8_t, significant_coeff_group_flag[8][8]) = { { 0 } 
};
 
     int trafo_size = 1 << log2_trafo_size;
@@ -1205,17 +1205,29 @@ static void hls_residual_coding(HEVCContext *s, int x0, 
int y0,
         }
     }
 
-    if (lc->cu.cu_transquant_bypass_flag) {
-        s->hevcdsp.transquant_bypass[log2_trafo_size - 2](dst, coeffs, stride);
-    } else {
+    if (!lc->cu.cu_transquant_bypass_flag) {
         if (transform_skip_flag)
-            s->hevcdsp.transform_skip(dst, coeffs, stride);
+            s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
         else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 &&
                  log2_trafo_size == 2)
-            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
-        else
-            s->hevcdsp.transform_add[log2_trafo_size - 2](dst, coeffs, stride);
+            s->hevcdsp.idct_4x4_luma(coeffs);
+        else {
+            int max_xy = FFMAX(last_significant_coeff_x, 
last_significant_coeff_y);
+            if (max_xy == 0)
+                s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+            else {
+                int col_limit = last_significant_coeff_x + 
last_significant_coeff_y + 4;
+                if (max_xy < 4)
+                    col_limit = FFMIN(4, col_limit);
+                else if (max_xy < 8)
+                    col_limit = FFMIN(8, col_limit);
+                else if (max_xy < 12)
+                    col_limit = FFMIN(24, col_limit);
+                s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+            }
+        }
     }
+    s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
 }
 
 static int hls_transform_unit(HEVCContext *s, int x0, int y0,
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index 15a712d..6b4b97c 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -164,16 +164,21 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int 
bit_depth)
 
 #define HEVC_DSP(depth)                                                     \
     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
-    hevcdsp->transquant_bypass[0]   = FUNC(transquant_bypass4x4, depth);    \
-    hevcdsp->transquant_bypass[1]   = FUNC(transquant_bypass8x8, depth);    \
-    hevcdsp->transquant_bypass[2]   = FUNC(transquant_bypass16x16, depth);  \
-    hevcdsp->transquant_bypass[3]   = FUNC(transquant_bypass32x32, depth);  \
+    hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
+    hevcdsp->transform_add[1]       = FUNC(transform_add8x8, depth);        \
+    hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
+    hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
-    hevcdsp->transform_4x4_luma_add = FUNC(transform_4x4_luma_add, depth);  \
-    hevcdsp->transform_add[0]       = FUNC(transform_4x4_add, depth);       \
-    hevcdsp->transform_add[1]       = FUNC(transform_8x8_add, depth);       \
-    hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
-    hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
+    hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+    hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
+    hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
+    hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
+    hevcdsp->idct[3]                = FUNC(idct_32x32, depth);              \
+                                                                            \
+    hevcdsp->idct_dc[0]             = FUNC(idct_4x4_dc, depth);             \
+    hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
+    hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
+    hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
                                                                             \
     hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
     hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 4097233..1793893 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -42,13 +42,15 @@ typedef struct HEVCDSPContext {
     void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
                     GetBitContext *gb, int pcm_bit_depth);
 
-    void (*transquant_bypass[4])(uint8_t *dst, int16_t *coeffs,
-                                 ptrdiff_t stride);
+    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t 
_stride);
 
-    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
-                                   ptrdiff_t stride);
-    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+    void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
+
+    void (*idct_4x4_luma)(int16_t *coeffs);
+
+    void (*idct[4])(int16_t *coeffs, int col_limit);
+
+    void (*idct_dc[4])(int16_t *coeffs);
 
     void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                struct SAOParams *sao, int *borders,
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 31a2e7a..3846327 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -57,48 +57,53 @@ static av_always_inline void 
FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
     }
 }
 
-static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
 }
 
-static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
                                        ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
 }
 
-static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
                                          ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
 }
 
-static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs,
+static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
                                          ptrdiff_t stride)
 {
     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
 }
 
-static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
-                                 ptrdiff_t stride)
+static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 {
-    pixel *dst = (pixel *)_dst;
-    int shift  = 13 - BIT_DEPTH;
-#if BIT_DEPTH <= 13
-    int offset = 1 << (shift - 1);
-#else
-    int offset = 0;
-#endif
+    int shift  = 15 - BIT_DEPTH - log2_size;
     int x, y;
+    int size = 1 << log2_size;
+    int16_t *coeffs = _coeffs;
 
-    stride /= sizeof(pixel);
 
-    for (y = 0; y < 4 * 4; y += 4) {
-        for (x = 0; x < 4; x++)
-            dst[x] = av_clip_pixel(dst[x] + ((coeffs[y + x] + offset) >> 
shift));
-        dst += stride;
+    if (shift > 0) {
+        int offset = 1 << (shift - 1);
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = (*coeffs + offset) >> shift;
+                coeffs++;
+            }
+        }
+    } else {
+        for (y = 0; y < size; y++) {
+            for (x = 0; x < size; x++) {
+                *coeffs = *coeffs << -shift;
+                coeffs++;
+            }
+        }
     }
 }
 
@@ -122,17 +127,13 @@ static void FUNC(transform_skip)(uint8_t *_dst, int16_t 
*coeffs,
         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
     } while (0)
 
-static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
-                                         ptrdiff_t stride)
+static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 {
     int i;
-    pixel *dst   = (pixel *)_dst;
     int shift    = 7;
     int add      = 1 << (shift - 1);
     int16_t *src = coeffs;
 
-    stride /= sizeof(pixel);
-
     for (i = 0; i < 4; i++) {
         TR_4x4_LUMA(src, src, 4, SCALE);
         src++;
@@ -141,180 +142,140 @@ static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, 
int16_t *coeffs,
     shift = 20 - BIT_DEPTH;
     add   = 1 << (shift - 1);
     for (i = 0; i < 4; i++) {
-        TR_4x4_LUMA(dst, coeffs, 1, ADD_AND_SCALE);
+        TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
         coeffs += 4;
-        dst    += stride;
     }
 }
 
 #undef TR_4x4_LUMA
 
-#define TR_4(dst, src, dstep, sstep, assign)                            \
-    do {                                                                \
-        const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
-                       transform[8 * 2][0] * src[2 * sstep];            \
-        const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
-                       transform[8 * 2][1] * src[2 * sstep];            \
-        const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
-                       transform[8 * 3][0] * src[3 * sstep];            \
-        const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
-                       transform[8 * 3][1] * src[3 * sstep];            \
-                                                                        \
-        assign(dst[0 * dstep], e0 + o0);                                \
-        assign(dst[1 * dstep], e1 + o1);                                \
-        assign(dst[2 * dstep], e1 - o1);                                \
-        assign(dst[3 * dstep], e0 - o0);                                \
+#define TR_4(dst, src, dstep, sstep, assign, end)                              
\
+    do {                                                                       
\
+        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              
\
+        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              
\
+        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              
\
+        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              
\
+                                                                               
\
+        assign(dst[0 * dstep], e0 + o0);                                       
\
+        assign(dst[1 * dstep], e1 + o1);                                       
\
+        assign(dst[2 * dstep], e1 - o1);                                       
\
+        assign(dst[3 * dstep], e0 - o0);                                       
\
     } while (0)
 
-static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 4; i++) {
-        TR_4(src, src, 4, 4, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 4; i++) {
-        TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 4;
-        dst    += stride;
-    }
-}
-
-#define TR_8(dst, src, dstep, sstep, assign)                      \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_8[4];                                               \
-        int o_8[4] = { 0 };                                       \
-        for (i = 0; i < 4; i++)                                   \
-            for (j = 1; j < 8; j += 2)                            \
-                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
-        TR_4(e_8, src, 1, 2 * sstep, SET);                        \
-                                                                  \
-        for (i = 0; i < 4; i++) {                                 \
-            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
-            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
-        }                                                         \
+#define TR_8(dst, src, dstep, sstep, assign, end)                              
\
+    do {                                                                       
\
+        int i, j;                                                              
\
+        int e_8[4];                                                            
\
+        int o_8[4] = { 0 };                                                    
\
+        for (i = 0; i < 4; i++)                                                
\
+            for (j = 1; j < end; j += 2)                                       
\
+                o_8[i] += transform[4 * j][i] * src[j * sstep];                
\
+        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  
\
+                                                                               
\
+        for (i = 0; i < 4; i++) {                                              
\
+            assign(dst[i * dstep], e_8[i] + o_8[i]);                           
\
+            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     
\
+        }                                                                      
\
     } while (0)
 
-#define TR_16(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_16[8];                                              \
-        int o_16[8] = { 0 };                                      \
-        for (i = 0; i < 8; i++)                                   \
-            for (j = 1; j < 16; j += 2)                           \
-                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
-        TR_8(e_16, src, 1, 2 * sstep, SET);                       \
-                                                                  \
-        for (i = 0; i < 8; i++) {                                 \
-            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
-            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
-        }                                                         \
+#define TR_16(dst, src, dstep, sstep, assign, end)                             
\
+    do {                                                                       
\
+        int i, j;                                                              
\
+        int e_16[8];                                                           
\
+        int o_16[8] = { 0 };                                                   
\
+        for (i = 0; i < 8; i++)                                                
\
+            for (j = 1; j < end; j += 2)                                       
\
+                o_16[i] += transform[2 * j][i] * src[j * sstep];               
\
+        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 
\
+                                                                               
\
+        for (i = 0; i < 8; i++) {                                              
\
+            assign(dst[i * dstep], e_16[i] + o_16[i]);                         
\
+            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  
\
+        }                                                                      
\
     } while (0)
 
-#define TR_32(dst, src, dstep, sstep, assign)                     \
-    do {                                                          \
-        int i, j;                                                 \
-        int e_32[16];                                             \
-        int o_32[16] = { 0 };                                     \
-        for (i = 0; i < 16; i++)                                  \
-            for (j = 1; j < 32; j += 2)                           \
-                o_32[i] += transform[j][i] * src[j * sstep];      \
-        TR_16(e_32, src, 1, 2 * sstep, SET);                      \
-                                                                  \
-        for (i = 0; i < 16; i++) {                                \
-            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
-            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
-        }                                                         \
+#define TR_32(dst, src, dstep, sstep, assign, end)                             
\
+    do {                                                                       
\
+        int i, j;                                                              
\
+        int e_32[16];                                                          
\
+        int o_32[16] = { 0 };                                                  
\
+        for (i = 0; i < 16; i++)                                               
\
+            for (j = 1; j < end; j += 2)                                       
\
+                o_32[i] += transform[j][i] * src[j * sstep];                   
\
+        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            
\
+                                                                               
\
+        for (i = 0; i < 16; i++) {                                             
\
+            assign(dst[i * dstep], e_32[i] + o_32[i]);                         
\
+            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  
\
+        }                                                                      
\
     } while (0)
 
-
-
-static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
-                                    ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
-
-    for (i = 0; i < 8; i++) {
-        TR_8(src, src, 8, 8, SCALE);
-        src++;
-    }
-
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 8; i++) {
-        TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 8;
-        dst    += stride;
-    }
+#define IDCT_VAR4(H)                                                          \
+    int      limit2   = FFMIN(col_limit + 4, H)
+#define IDCT_VAR8(H)                                                          \
+        int      limit   = FFMIN(col_limit, H);                               \
+        int      limit2   = FFMIN(col_limit + 4, H)
+#define IDCT_VAR16(H)   IDCT_VAR8(H)
+#define IDCT_VAR32(H)   IDCT_VAR8(H)
+
+#define IDCT(H)                                                              \
+static void FUNC(idct_##H ##x ##H )(                                         \
+                   int16_t *coeffs, int col_limit) {                         \
+    int i;                                                                   \
+    int      shift   = 7;                                                    \
+    int      add     = 1 << (shift - 1);                                     \
+    int16_t *src     = coeffs;                                               \
+    IDCT_VAR ##H(H);                                                         \
+                                                                             \
+    for (i = 0; i < H; i++) {                                                \
+        TR_ ## H(src, src, H, H, SCALE, limit2);                             \
+        if (limit2 < H && i%4 == 0 && !!i)                                   \
+            limit2 -= 4;                                                     \
+        src++;                                                               \
+    }                                                                        \
+                                                                             \
+    shift   = 20 - BIT_DEPTH;                                                \
+    add     = 1 << (shift - 1);                                              \
+    for (i = 0; i < H; i++) {                                                \
+        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
+        coeffs += H;                                                         \
+    }                                                                        \
 }
 
-static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
-
-    stride /= sizeof(pixel);
+#define IDCT_DC(H)                                                           \
+static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
+                   int16_t *coeffs) {                                        \
+    int i, j;                                                                \
+    int      shift   = 14 - BIT_DEPTH;                                       \
+    int      add     = 1 << (shift - 1);                                     \
+    int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
+                                                                             \
+    for (j = 0; j < H; j++) {                                                \
+        for (i = 0; i < H; i++) {                                            \
+            coeffs[i+j*H] = coeff;                                           \
+        }                                                                    \
+    }                                                                        \
+}
 
-    for (i = 0; i < 16; i++) {
-        TR_16(src, src, 16, 16, SCALE);
-        src++;
-    }
+IDCT( 4)
+IDCT( 8)
+IDCT(16)
+IDCT(32)
 
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 16; i++) {
-        TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 16;
-        dst    += stride;
-    }
-}
+IDCT_DC( 4)
+IDCT_DC( 8)
+IDCT_DC(16)
+IDCT_DC(32)
 
-static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride)
-{
-    int i;
-    pixel *dst   = (pixel *)_dst;
-    int shift    = 7;
-    int add      = 1 << (shift - 1);
-    int16_t *src = coeffs;
+#undef TR_4
+#undef TR_8
+#undef TR_16
+#undef TR_32
 
-    stride /= sizeof(pixel);
+#undef SET
+#undef SCALE
+#undef ADD_AND_SCALE
 
-    for (i = 0; i < 32; i++) {
-        TR_32(src, src, 32, 32, SCALE);
-        src++;
-    }
-    src   = coeffs;
-    shift = 20 - BIT_DEPTH;
-    add   = 1 << (shift - 1);
-    for (i = 0; i < 32; i++) {
-        TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
-        coeffs += 32;
-        dst    += stride;
-    }
-}
 
 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
                                   ptrdiff_t stride, SAOParams *sao,
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index cdf7758..1460197 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -115,7 +115,8 @@ YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o            \
-                                          x86/hevc_mc.o
+                                          x86/hevc_mc.o                 \
+                                          x86/hevc_idct.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000..46457b7
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,106 @@
+; /*
+; * SIMD optimized idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; * Copyright (c) 2014 James Almer
+; *
+; * This file is part of Libav.
+; *
+; * Libav is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * Libav is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with Libav; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+section .text
+
+; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; %1 = HxW
+; %2 = number of loops
+; %3 = bitdepth
+%macro IDCT_DC 3
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq]
+    add               tmpw, ((1 << 14-%3) + 1)
+    sar               tmpw, (15-%3)
+    movd               xm0, tmpd
+    SPLATW              m0, xm0
+    DEFINE_ARGS coeff, cnt
+    mov               cntd, %2
+.loop:
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+    add  coeffq, mmsize*8
+    dec  cntd
+    jg  .loop
+    RET
+%endmacro
+
+; %1 = HxW
+; %2 = bitdepth
+%macro IDCT_DC_NL 2 ; No loop
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq]
+    add               tmpw, ((1 << 14-%2) + 1)
+    sar               tmpw, (15-%2)
+    movd                m0, tmpd
+    SPLATW              m0, xm0
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
+%endmacro
+
+; 8-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,      8
+IDCT_DC     8,  2,  8
+
+INIT_XMM sse2
+IDCT_DC_NL  8,      8
+IDCT_DC    16,  4,  8
+IDCT_DC    32, 16,  8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2,  8
+IDCT_DC    32,  8,  8
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 10-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     10
+IDCT_DC     8,  2, 10
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     10
+IDCT_DC    16,  4, 10
+IDCT_DC    32, 16, 10
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 10
+IDCT_DC    32,  8, 10
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index fd22fc3..47cd247 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -45,6 +45,39 @@ LFC_FUNCS(uint8_t, 10)
 LFL_FUNCS(uint8_t, 8)
 LFL_FUNCS(uint8_t, 10)
 
+#define idct_dc_proto(size, bitd, opt) \
+                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, 
int16_t *coeffs, ptrdiff_t stride)
+
+idct_dc_proto(4, 8,mmxext);
+idct_dc_proto(8, 8,mmxext);
+idct_dc_proto(16,8,  sse2);
+idct_dc_proto(32,8,  sse2);
+
+idct_dc_proto(32,8,  avx2);
+
+idct_dc_proto(4, 10,mmxext);
+idct_dc_proto(8, 10,  sse2);
+idct_dc_proto(16,10,  sse2);
+idct_dc_proto(32,10,  sse2);
+idct_dc_proto(8, 10,   avx);
+idct_dc_proto(16,10,   avx);
+idct_dc_proto(32,10,   avx);
+
+idct_dc_proto(16,10,  avx2);
+idct_dc_proto(32,10,  avx2);
+
+#define IDCT_FUNCS(W, opt) \
+void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs)
+
+IDCT_FUNCS(4x4,   mmxext);
+IDCT_FUNCS(8x8,   mmxext);
+IDCT_FUNCS(8x8,   sse2);
+IDCT_FUNCS(16x16, sse2);
+IDCT_FUNCS(32x32, sse2);
+IDCT_FUNCS(16x16, avx2);
+IDCT_FUNCS(32x32, avx2);
+
 #define GET_PIXELS(width, depth, cf)                                           
                           \
 void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, 
ptrdiff_t dststride,             \
                                                            uint8_t *src, 
ptrdiff_t srcstride,             \
@@ -229,10 +262,17 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
bit_depth)
 #define SET_EPEL_FUNCS(v, h, depth, cf, name) 
SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
 
     if (bit_depth == 8) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
 
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
             SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
             SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
 
@@ -246,12 +286,21 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
bit_depth)
             SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
             SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
             SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
+
         }
     } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = 
ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = 
ff_hevc_h_loop_filter_chroma_10_sse2;
 
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
+
             SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
             SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
 
@@ -282,6 +331,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
bit_depth)
             SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
 #endif /* HAVE_AVX_EXTERNAL */
         }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+        }
     } else if (bit_depth == 10) {
         if (EXTERNAL_SSSE3(cpu_flags)) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -303,6 +356,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int 
bit_depth)
             SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
 #endif /* HAVE_AVX_EXTERNAL */
         }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
+        }
     }
 #endif /* ARCH_X86_64 */
 }
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 9f64dd1..16a9bae 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -552,7 +552,9 @@
 %endmacro
 
 %macro SPLATW 2-3 0
-%if mmsize == 16
+%if cpuflag(avx2) && %3 == 0
+    vpbroadcastw %1, %2
+%elif mmsize == 16
     pshuflw    %1, %2, (%3)*0x55
     punpcklqdq %1, %1
 %elif cpuflag(mmxext)
-- 
2.6.4 (Apple Git-63)

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 1/2] x86/hevc: speed up idct add

Reply via email to