On Sun, Jul 13, 2014 at 08:18:30AM +0000, Anton Khirnov wrote:
> --- /dev/null
> +++ b/libavcodec/x86/hevc_idct.asm
> @@ -0,0 +1,180 @@
> +; /*
> +; * Provide SSE & MMX idct functions for HEVC decoding

Just say "SIMD-optimized" - comments that list specific instruction sets
get outdated quickly.  Sometimes they are wrong from the start - this
file has AVX code.

> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with Libav; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> +; */
> +%include "libavutil/x86/x86util.asm"

Add an empty line after the %include.

> +SECTION_RODATA
> +max_pixels_10:          times 8  dw ((1 << 10)-1)
> +dc_add_10:              times 4 dd ((1 << 14-10) + 1)

Stray double space before dw in the max_pixels_10 line.

> +;the idct_dc_add macros and functions were largely inspired by x264 
> project's code in the h264_idct.asm file

"inspired by the x264 project's code" - the article is missing.

nit: long line, please wrap it.

> +;-----------------------------------------------------------------------------
> +; void ff_hevc_idct4_dc_add_10_mmxext(pixel *dst, int coeff, int stride)
> +;-----------------------------------------------------------------------------
> +%macro IDCT_DC_ADD_OP_10 3
> +    pxor              m5, m5
> +    mova              m6, [max_pixels_10]
> +%if avx_enabled
> +    paddw             m1, m0, [%1+0   ]
> +    paddw             m2, m0, [%1+%2  ]
> +    paddw             m3, m0, [%1+%2*2]
> +    paddw             m4, m0, [%1+%3  ]
> +%else
> +    mova              m1, [%1+0   ]
> +    mova              m2, [%1+%2  ]
> +    mova              m3, [%1+%2*2]
> +    mova              m4, [%1+%3  ]
> +    paddw             m1, m0
> +    paddw             m2, m0
> +    paddw             m3, m0
> +    paddw             m4, m0
> +%endif

Is there a reason to check for avx_enabled instead of just cpuflag(avx) here?

> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -46,27 +46,106 @@ LFC_FUNCS(uint8_t, 10)
> +
> +#if HAVE_SSE2_EXTERNAL
> +static void hevc_idct32_dc_add_8_sse2(uint8_t *dst, int coeff, ptrdiff_t 
> stride)
> +{
> +    ff_hevc_idct16_dc_add_8_sse2(dst,                    coeff, stride);
> +    ff_hevc_idct16_dc_add_8_sse2(dst + 16,               coeff, stride);
> +    ff_hevc_idct16_dc_add_8_sse2(dst + 16 * stride,      coeff, stride);
> +    ff_hevc_idct16_dc_add_8_sse2(dst + 16 * stride + 16, coeff, stride);
> +}
> +
> +static void hevc_idct16_dc_add_10_sse2(uint8_t *dst, int coeff, ptrdiff_t 
> stride)
> +{
> +    ff_hevc_idct8_dc_add_10_sse2(dst,                   coeff, stride);
> +    ff_hevc_idct8_dc_add_10_sse2(dst + 16,              coeff, stride);
> +    ff_hevc_idct8_dc_add_10_sse2(dst + 8 * stride,      coeff, stride);
> +    ff_hevc_idct8_dc_add_10_sse2(dst + 8 * stride + 16, coeff, stride);
> +}
> +
> +static void hevc_idct32_dc_add_10_sse2(uint8_t *dst, int coeff, ptrdiff_t 
> stride)
> +{
> +    hevc_idct16_dc_add_10_sse2(dst,                    coeff, stride);
> +    hevc_idct16_dc_add_10_sse2(dst + 32,               coeff, stride);
> +    hevc_idct16_dc_add_10_sse2(dst + 16 * stride,      coeff, stride);
> +    hevc_idct16_dc_add_10_sse2(dst + 16 * stride + 32, coeff, stride);
> +}
> +#endif //HAVE_SSE2_EXTERNAL

nit: Use /* */ comments like everywhere else.

> +#if HAVE_AVX_EXTERNAL
> +static void hevc_idct16_dc_add_10_avx(uint8_t *dst, int coeff, ptrdiff_t 
> stride)
> +{
> +    ff_hevc_idct8_dc_add_10_avx(dst,                   coeff, stride);
> +    ff_hevc_idct8_dc_add_10_avx(dst + 16,              coeff, stride);
> +    ff_hevc_idct8_dc_add_10_avx(dst + 8 * stride,      coeff, stride);
> +    ff_hevc_idct8_dc_add_10_avx(dst + 8 * stride + 16, coeff, stride);
> +}
> +
> +static void hevc_idct32_dc_add_10_avx(uint8_t *dst, int coeff, ptrdiff_t 
> stride)
> +{
> +    hevc_idct16_dc_add_10_avx(dst,                    coeff, stride);
> +    hevc_idct16_dc_add_10_avx(dst + 32,               coeff, stride);
> +    hevc_idct16_dc_add_10_avx(dst + 16 * stride,      coeff, stride);
> +    hevc_idct16_dc_add_10_avx(dst + 16 * stride + 32, coeff, stride);
> +}
> +#endif //HAVE_AVX_EXTERNAL

Same here: use /* */ comments.

>  void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
>  {
>      int mm_flags = av_get_cpu_flags();
>  
>      if (bit_depth == 8) {
> +        if (EXTERNAL_MMXEXT(mm_flags)) {
> +            c->transform_dc_add[0] = ff_hevc_idct4_dc_add_8_mmxext;
> +            c->transform_dc_add[1] = ff_hevc_idct8_dc_add_8_mmxext;
> +        }
>          if (EXTERNAL_SSE2(mm_flags)) {
>              c->hevc_v_loop_filter_chroma = 
> ff_hevc_v_loop_filter_chroma_8_sse2;
>              c->hevc_h_loop_filter_chroma = 
> ff_hevc_h_loop_filter_chroma_8_sse2;
> +
> +            c->transform_dc_add[2] = ff_hevc_idct16_dc_add_8_sse2;
> +            c->transform_dc_add[3] = hevc_idct32_dc_add_8_sse2;
>          }
>          if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
>              c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
>              c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
>          }
>      } else if (bit_depth == 10) {
> +        if (EXTERNAL_MMXEXT(mm_flags)) {
> +            c->transform_dc_add[0] = ff_hevc_idct4_dc_add_10_mmxext;
> +        }
>          if (EXTERNAL_SSE2(mm_flags)) {
>              c->hevc_v_loop_filter_chroma = 
> ff_hevc_v_loop_filter_chroma_10_sse2;
>              c->hevc_h_loop_filter_chroma = 
> ff_hevc_h_loop_filter_chroma_10_sse2;
> +
> +            c->transform_dc_add[1] = ff_hevc_idct8_dc_add_10_sse2;
> +            c->transform_dc_add[2] = hevc_idct16_dc_add_10_sse2;
> +            c->transform_dc_add[3] = hevc_idct32_dc_add_10_sse2;
>          }
>          if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
>              c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
>              c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
>          }
> +        if (EXTERNAL_AVX(mm_flags)) {
> +            c->transform_dc_add[1] = ff_hevc_idct8_dc_add_10_avx;
> +            c->transform_dc_add[2] = hevc_idct16_dc_add_10_avx;
> +            c->transform_dc_add[3] = hevc_idct32_dc_add_10_avx;
> +        }
>      }
>  }

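Try compiling with optimizations disabled or pushing to oracle.
This will fail due to missing ifdefs: the wrapper functions above are
only defined under #if HAVE_SSE2_EXTERNAL / HAVE_AVX_EXTERNAL, but the
init function references them unconditionally.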

Diego
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to