x86/hevc: add avx2 dc idct hevc: separate residual and prediction
x86/hevc_idct: replace old and unused idct functions hevc: align coeffs to 32-byte boundary Fixes a segfault related to AVX not aligning to a 32-byte boundary Authors: James Almer <[email protected]> Michael Niedermayer <[email protected]> Pierre Edouard <[email protected]> --- libavcodec/hevc.c | 28 ++-- libavcodec/hevcdsp.c | 23 ++-- libavcodec/hevcdsp.h | 14 +- libavcodec/hevcdsp_template.c | 313 ++++++++++++++++++------------------------ libavcodec/x86/Makefile | 3 +- libavcodec/x86/hevc_idct.asm | 106 ++++++++++++++ libavcodec/x86/hevcdsp_init.c | 57 ++++++++ libavutil/x86/x86util.asm | 4 +- 8 files changed, 347 insertions(+), 201 deletions(-) create mode 100644 libavcodec/x86/hevc_idct.asm diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index 177cf93..babc448 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -897,7 +897,7 @@ static void hls_residual_coding(HEVCContext *s, int x0, int y0, int vshift = s->ps.sps->vshift[c_idx]; uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; - DECLARE_ALIGNED(16, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, int16_t, coeffs[MAX_TB_SIZE * MAX_TB_SIZE]) = { 0 }; DECLARE_ALIGNED(8, uint8_t, significant_coeff_group_flag[8][8]) = { { 0 } }; int trafo_size = 1 << log2_trafo_size; @@ -1205,17 +1205,29 @@ static void hls_residual_coding(HEVCContext *s, int x0, int y0, } } - if (lc->cu.cu_transquant_bypass_flag) { - s->hevcdsp.transquant_bypass[log2_trafo_size - 2](dst, coeffs, stride); - } else { + if (!lc->cu.cu_transquant_bypass_flag) { if (transform_skip_flag) - s->hevcdsp.transform_skip(dst, coeffs, stride); + s->hevcdsp.transform_skip(coeffs, log2_trafo_size); else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) - s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride); - else - s->hevcdsp.transform_add[log2_trafo_size - 2](dst, coeffs, stride); + s->hevcdsp.idct_4x4_luma(coeffs); + else { + int max_xy = 
FFMAX(last_significant_coeff_x, last_significant_coeff_y); + if (max_xy == 0) + s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) + col_limit = FFMIN(4, col_limit); + else if (max_xy < 8) + col_limit = FFMIN(8, col_limit); + else if (max_xy < 12) + col_limit = FFMIN(24, col_limit); + s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); + } + } } + s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); } static int hls_transform_unit(HEVCContext *s, int x0, int y0, diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c index 15a712d..6b4b97c 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -164,16 +164,21 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) #define HEVC_DSP(depth) \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ - hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth); \ - hevcdsp->transquant_bypass[1] = FUNC(transquant_bypass8x8, depth); \ - hevcdsp->transquant_bypass[2] = FUNC(transquant_bypass16x16, depth); \ - hevcdsp->transquant_bypass[3] = FUNC(transquant_bypass32x32, depth); \ + hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ + hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \ + hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ + hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ hevcdsp->transform_skip = FUNC(transform_skip, depth); \ - hevcdsp->transform_4x4_luma_add = FUNC(transform_4x4_luma_add, depth); \ - hevcdsp->transform_add[0] = FUNC(transform_4x4_add, depth); \ - hevcdsp->transform_add[1] = FUNC(transform_8x8_add, depth); \ - hevcdsp->transform_add[2] = FUNC(transform_16x16_add, depth); \ - hevcdsp->transform_add[3] = FUNC(transform_32x32_add, depth); \ + hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ + hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ + hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ + hevcdsp->idct[2] = 
FUNC(idct_16x16, depth); \ + hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ + \ + hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ + hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ \ hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth); \ hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth); \ diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index 4097233..1793893 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -42,13 +42,15 @@ typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size, GetBitContext *gb, int pcm_bit_depth); - void (*transquant_bypass[4])(uint8_t *dst, int16_t *coeffs, - ptrdiff_t stride); + void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); - void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); - void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs, - ptrdiff_t stride); - void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); + void (*transform_skip)(int16_t *coeffs, int16_t log2_size); + + void (*idct_4x4_luma)(int16_t *coeffs); + + void (*idct[4])(int16_t *coeffs, int col_limit); + + void (*idct_dc[4])(int16_t *coeffs); void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride, struct SAOParams *sao, int *borders, diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 31a2e7a..3846327 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -57,48 +57,53 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe } } -static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs, +static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { FUNC(transquant_bypass)(_dst, coeffs, stride, 4); } -static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs, +static void 
FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { FUNC(transquant_bypass)(_dst, coeffs, stride, 8); } -static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs, +static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { FUNC(transquant_bypass)(_dst, coeffs, stride, 16); } -static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs, +static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { FUNC(transquant_bypass)(_dst, coeffs, stride, 32); } -static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride) +static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size) { - pixel *dst = (pixel *)_dst; - int shift = 13 - BIT_DEPTH; -#if BIT_DEPTH <= 13 - int offset = 1 << (shift - 1); -#else - int offset = 0; -#endif + int shift = 15 - BIT_DEPTH - log2_size; int x, y; + int size = 1 << log2_size; + int16_t *coeffs = _coeffs; - stride /= sizeof(pixel); - for (y = 0; y < 4 * 4; y += 4) { - for (x = 0; x < 4; x++) - dst[x] = av_clip_pixel(dst[x] + ((coeffs[y + x] + offset) >> shift)); - dst += stride; + if (shift > 0) { + int offset = 1 << (shift - 1); + for (y = 0; y < size; y++) { + for (x = 0; x < size; x++) { + *coeffs = (*coeffs + offset) >> shift; + coeffs++; + } + } + } else { + for (y = 0; y < size; y++) { + for (x = 0; x < size; x++) { + *coeffs = *coeffs << -shift; + coeffs++; + } + } } } @@ -122,17 +127,13 @@ static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs, assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ } while (0) -static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride) +static void FUNC(transform_4x4_luma)(int16_t *coeffs) { int i; - pixel *dst = (pixel *)_dst; int shift = 7; int add = 1 << (shift - 1); int16_t *src = coeffs; - stride /= sizeof(pixel); - for (i = 0; i < 4; i++) { TR_4x4_LUMA(src, src, 4, SCALE); src++; @@ -141,180 +142,140 @@ static void 
FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs, shift = 20 - BIT_DEPTH; add = 1 << (shift - 1); for (i = 0; i < 4; i++) { - TR_4x4_LUMA(dst, coeffs, 1, ADD_AND_SCALE); + TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); coeffs += 4; - dst += stride; } } #undef TR_4x4_LUMA -#define TR_4(dst, src, dstep, sstep, assign) \ - do { \ - const int e0 = transform[8 * 0][0] * src[0 * sstep] + \ - transform[8 * 2][0] * src[2 * sstep]; \ - const int e1 = transform[8 * 0][1] * src[0 * sstep] + \ - transform[8 * 2][1] * src[2 * sstep]; \ - const int o0 = transform[8 * 1][0] * src[1 * sstep] + \ - transform[8 * 3][0] * src[3 * sstep]; \ - const int o1 = transform[8 * 1][1] * src[1 * sstep] + \ - transform[8 * 3][1] * src[3 * sstep]; \ - \ - assign(dst[0 * dstep], e0 + o0); \ - assign(dst[1 * dstep], e1 + o1); \ - assign(dst[2 * dstep], e1 - o1); \ - assign(dst[3 * dstep], e0 - o0); \ +#define TR_4(dst, src, dstep, sstep, assign, end) \ + do { \ + const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ + const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ + const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ + const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ + \ + assign(dst[0 * dstep], e0 + o0); \ + assign(dst[1 * dstep], e1 + o1); \ + assign(dst[2 * dstep], e1 - o1); \ + assign(dst[3 * dstep], e0 - o0); \ } while (0) -static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride) -{ - int i; - pixel *dst = (pixel *)_dst; - int shift = 7; - int add = 1 << (shift - 1); - int16_t *src = coeffs; - - stride /= sizeof(pixel); - - for (i = 0; i < 4; i++) { - TR_4(src, src, 4, 4, SCALE); - src++; - } - - shift = 20 - BIT_DEPTH; - add = 1 << (shift - 1); - for (i = 0; i < 4; i++) { - TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE); - coeffs += 4; - dst += stride; - } -} - -#define TR_8(dst, src, dstep, sstep, assign) \ - do { \ - int i, j; \ - int e_8[4]; \ - int o_8[4] = { 0 }; \ - for (i = 0; i < 4; i++) \ - for (j = 1; j < 8; j += 2) \ 
- o_8[i] += transform[4 * j][i] * src[j * sstep]; \ - TR_4(e_8, src, 1, 2 * sstep, SET); \ - \ - for (i = 0; i < 4; i++) { \ - assign(dst[i * dstep], e_8[i] + o_8[i]); \ - assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ - } \ +#define TR_8(dst, src, dstep, sstep, assign, end) \ + do { \ + int i, j; \ + int e_8[4]; \ + int o_8[4] = { 0 }; \ + for (i = 0; i < 4; i++) \ + for (j = 1; j < end; j += 2) \ + o_8[i] += transform[4 * j][i] * src[j * sstep]; \ + TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ + \ + for (i = 0; i < 4; i++) { \ + assign(dst[i * dstep], e_8[i] + o_8[i]); \ + assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ + } \ } while (0) -#define TR_16(dst, src, dstep, sstep, assign) \ - do { \ - int i, j; \ - int e_16[8]; \ - int o_16[8] = { 0 }; \ - for (i = 0; i < 8; i++) \ - for (j = 1; j < 16; j += 2) \ - o_16[i] += transform[2 * j][i] * src[j * sstep]; \ - TR_8(e_16, src, 1, 2 * sstep, SET); \ - \ - for (i = 0; i < 8; i++) { \ - assign(dst[i * dstep], e_16[i] + o_16[i]); \ - assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ - } \ +#define TR_16(dst, src, dstep, sstep, assign, end) \ + do { \ + int i, j; \ + int e_16[8]; \ + int o_16[8] = { 0 }; \ + for (i = 0; i < 8; i++) \ + for (j = 1; j < end; j += 2) \ + o_16[i] += transform[2 * j][i] * src[j * sstep]; \ + TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ + \ + for (i = 0; i < 8; i++) { \ + assign(dst[i * dstep], e_16[i] + o_16[i]); \ + assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ + } \ } while (0) -#define TR_32(dst, src, dstep, sstep, assign) \ - do { \ - int i, j; \ - int e_32[16]; \ - int o_32[16] = { 0 }; \ - for (i = 0; i < 16; i++) \ - for (j = 1; j < 32; j += 2) \ - o_32[i] += transform[j][i] * src[j * sstep]; \ - TR_16(e_32, src, 1, 2 * sstep, SET); \ - \ - for (i = 0; i < 16; i++) { \ - assign(dst[i * dstep], e_32[i] + o_32[i]); \ - assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ - } \ +#define TR_32(dst, src, dstep, sstep, assign, end) \ + do { \ + int i, j; \ + int e_32[16]; \ + int 
o_32[16] = { 0 }; \ + for (i = 0; i < 16; i++) \ + for (j = 1; j < end; j += 2) \ + o_32[i] += transform[j][i] * src[j * sstep]; \ + TR_16(e_32, src, 1, 2 * sstep, SET, end/2); \ + \ + for (i = 0; i < 16; i++) { \ + assign(dst[i * dstep], e_32[i] + o_32[i]); \ + assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ + } \ } while (0) - - -static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride) -{ - int i; - pixel *dst = (pixel *)_dst; - int shift = 7; - int add = 1 << (shift - 1); - int16_t *src = coeffs; - - stride /= sizeof(pixel); - - for (i = 0; i < 8; i++) { - TR_8(src, src, 8, 8, SCALE); - src++; - } - - shift = 20 - BIT_DEPTH; - add = 1 << (shift - 1); - for (i = 0; i < 8; i++) { - TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE); - coeffs += 8; - dst += stride; - } +#define IDCT_VAR4(H) \ + int limit2 = FFMIN(col_limit + 4, H) +#define IDCT_VAR8(H) \ + int limit = FFMIN(col_limit, H); \ + int limit2 = FFMIN(col_limit + 4, H) +#define IDCT_VAR16(H) IDCT_VAR8(H) +#define IDCT_VAR32(H) IDCT_VAR8(H) + +#define IDCT(H) \ +static void FUNC(idct_##H ##x ##H )( \ + int16_t *coeffs, int col_limit) { \ + int i; \ + int shift = 7; \ + int add = 1 << (shift - 1); \ + int16_t *src = coeffs; \ + IDCT_VAR ##H(H); \ + \ + for (i = 0; i < H; i++) { \ + TR_ ## H(src, src, H, H, SCALE, limit2); \ + if (limit2 < H && i%4 == 0 && !!i) \ + limit2 -= 4; \ + src++; \ + } \ + \ + shift = 20 - BIT_DEPTH; \ + add = 1 << (shift - 1); \ + for (i = 0; i < H; i++) { \ + TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ + coeffs += H; \ + } \ } -static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride) -{ - int i; - pixel *dst = (pixel *)_dst; - int shift = 7; - int add = 1 << (shift - 1); - int16_t *src = coeffs; - - stride /= sizeof(pixel); +#define IDCT_DC(H) \ +static void FUNC(idct_##H ##x ##H ##_dc)( \ + int16_t *coeffs) { \ + int i, j; \ + int shift = 14 - BIT_DEPTH; \ + int add = 1 << (shift - 1); \ + int coeff = (((coeffs[0] + 1) 
>> 1) + add) >> shift; \ + \ + for (j = 0; j < H; j++) { \ + for (i = 0; i < H; i++) { \ + coeffs[i+j*H] = coeff; \ + } \ + } \ +} - for (i = 0; i < 16; i++) { - TR_16(src, src, 16, 16, SCALE); - src++; - } +IDCT( 4) +IDCT( 8) +IDCT(16) +IDCT(32) - shift = 20 - BIT_DEPTH; - add = 1 << (shift - 1); - for (i = 0; i < 16; i++) { - TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE); - coeffs += 16; - dst += stride; - } -} +IDCT_DC( 4) +IDCT_DC( 8) +IDCT_DC(16) +IDCT_DC(32) -static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride) -{ - int i; - pixel *dst = (pixel *)_dst; - int shift = 7; - int add = 1 << (shift - 1); - int16_t *src = coeffs; +#undef TR_4 +#undef TR_8 +#undef TR_16 +#undef TR_32 - stride /= sizeof(pixel); +#undef SET +#undef SCALE +#undef ADD_AND_SCALE - for (i = 0; i < 32; i++) { - TR_32(src, src, 32, 32, SCALE); - src++; - } - src = coeffs; - shift = 20 - BIT_DEPTH; - add = 1 << (shift - 1); - for (i = 0; i < 32; i++) { - TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE); - coeffs += 32; - dst += stride; - } -} static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride, SAOParams *sao, diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index cdf7758..1460197 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -115,7 +115,8 @@ YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o \ - x86/hevc_mc.o + x86/hevc_mc.o \ + x86/hevc_idct.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm new file mode 100644 index 0000000..46457b7 --- /dev/null +++ b/libavcodec/x86/hevc_idct.asm @@ -0,0 +1,106 @@ +; /* +; * SIMD optimized idct functions for HEVC decoding +; * Copyright (c) 2014 
Pierre-Edouard LEPERE +; * Copyright (c) 2014 James Almer +; * +; * This file is part of libav. +; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +section .text + +; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs) +; %1 = HxW +; %2 = number of loops +; %3 = bitdepth +%macro IDCT_DC 3 +cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp + movsx tmpq, word [coeffq] + add tmpw, ((1 << 14-%3) + 1) + sar tmpw, (15-%3) + movd xm0, tmpd + SPLATW m0, xm0 + DEFINE_ARGS coeff, cnt + mov cntd, %2 +.loop: + mova [coeffq+mmsize*0], m0 + mova [coeffq+mmsize*1], m0 + mova [coeffq+mmsize*2], m0 + mova [coeffq+mmsize*3], m0 + mova [coeffq+mmsize*4], m0 + mova [coeffq+mmsize*5], m0 + mova [coeffq+mmsize*6], m0 + mova [coeffq+mmsize*7], m0 + add coeffq, mmsize*8 + dec cntd + jg .loop + RET +%endmacro + +; %1 = HxW +; %2 = bitdepth +%macro IDCT_DC_NL 2 ; No loop +cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp + movsx tmpq, word [coeffq] + add tmpw, ((1 << 14-%2) + 1) + sar tmpw, (15-%2) + movd m0, tmpd + SPLATW m0, xm0 + mova [coeffq+mmsize*0], m0 + mova [coeffq+mmsize*1], m0 + mova [coeffq+mmsize*2], m0 + mova [coeffq+mmsize*3], m0 +%if mmsize == 16 + mova [coeffq+mmsize*4], m0 + mova [coeffq+mmsize*5], m0 + mova [coeffq+mmsize*6], m0 + mova 
[coeffq+mmsize*7], m0 +%endif + RET +%endmacro + +; 8-bit +INIT_MMX mmxext +IDCT_DC_NL 4, 8 +IDCT_DC 8, 2, 8 + +INIT_XMM sse2 +IDCT_DC_NL 8, 8 +IDCT_DC 16, 4, 8 +IDCT_DC 32, 16, 8 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +IDCT_DC 16, 2, 8 +IDCT_DC 32, 8, 8 +%endif ;HAVE_AVX2_EXTERNAL + +; 10-bit +INIT_MMX mmxext +IDCT_DC_NL 4, 10 +IDCT_DC 8, 2, 10 + +INIT_XMM sse2 +IDCT_DC_NL 8, 10 +IDCT_DC 16, 4, 10 +IDCT_DC 32, 16, 10 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +IDCT_DC 16, 2, 10 +IDCT_DC 32, 8, 10 +%endif ;HAVE_AVX2_EXTERNAL diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index fd22fc3..47cd247 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -45,6 +45,39 @@ LFC_FUNCS(uint8_t, 10) LFL_FUNCS(uint8_t, 8) LFL_FUNCS(uint8_t, 10) +#define idct_dc_proto(size, bitd, opt) \ + void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) + +idct_dc_proto(4, 8,mmxext); +idct_dc_proto(8, 8,mmxext); +idct_dc_proto(16,8, sse2); +idct_dc_proto(32,8, sse2); + +idct_dc_proto(32,8, avx2); + +idct_dc_proto(4, 10,mmxext); +idct_dc_proto(8, 10, sse2); +idct_dc_proto(16,10, sse2); +idct_dc_proto(32,10, sse2); +idct_dc_proto(8, 10, avx); +idct_dc_proto(16,10, avx); +idct_dc_proto(32,10, avx); + +idct_dc_proto(16,10, avx2); +idct_dc_proto(32,10, avx2); + +#define IDCT_FUNCS(W, opt) \ +void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \ +void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs) + +IDCT_FUNCS(4x4, mmxext); +IDCT_FUNCS(8x8, mmxext); +IDCT_FUNCS(8x8, sse2); +IDCT_FUNCS(16x16, sse2); +IDCT_FUNCS(32x32, sse2); +IDCT_FUNCS(16x16, avx2); +IDCT_FUNCS(32x32, avx2); + #define GET_PIXELS(width, depth, cf) \ void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \ uint8_t *src, ptrdiff_t srcstride, \ @@ -229,10 +262,17 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) #define SET_EPEL_FUNCS(v, h, depth, cf, name) 
SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf) if (bit_depth == 8) { + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext; + c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext; + } if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2; + c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2; + c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2; SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels); SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels); @@ -246,12 +286,21 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v); SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h); SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v); + } } else if (bit_depth == 10) { + if (EXTERNAL_MMXEXT(cpu_flags)) { + c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext; + c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext; + } if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2; + c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2; + c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2; + SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels); SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels); @@ -282,6 +331,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv); #endif /* HAVE_AVX_EXTERNAL */ } + if (EXTERNAL_AVX2(cpu_flags)) { + c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2; + } } else if (bit_depth == 10) { if (EXTERNAL_SSSE3(cpu_flags)) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3; @@ -303,6 +356,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv); 
#endif /* HAVE_AVX_EXTERNAL */ } + if (EXTERNAL_AVX2(cpu_flags)) { + c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2; + c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2; + } } #endif /* ARCH_X86_64 */ } diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 9f64dd1..16a9bae 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -552,7 +552,9 @@ %endmacro %macro SPLATW 2-3 0 -%if mmsize == 16 +%if cpuflag(avx2) && %3 == 0 + vpbroadcastw %1, %2 +%elif mmsize == 16 pshuflw %1, %2, (%3)*0x55 punpcklqdq %1, %1 %elif cpuflag(mmxext) -- 2.6.4 (Apple Git-63) _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
