On Sun, Jul 13, 2014 at 08:18:30AM +0000, Anton Khirnov wrote:
> --- /dev/null
> +++ b/libavcodec/x86/hevc_idct.asm
> @@ -0,0 +1,180 @@
> +; /*
> +; * Provide SSE & MMX idct functions for HEVC decoding
"SIMD-optimized" - such comments get outdated quickly. Sometimes
even from the start - this file already contains AVX code, which the header does not mention.
> +; * You should have received a copy of the GNU Lesser General Public
> +; * License along with Libav; if not, write to the Free Software
> +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> +; */
> +%include "libavutil/x86/x86util.asm"
Add an empty line between the %include and SECTION_RODATA.
> +SECTION_RODATA
> +max_pixels_10: times 8 dw ((1 << 10)-1)
> +dc_add_10: times 4 dd ((1 << 14-10) + 1)
Stray double space before dw in the max_pixels_10 line.
> +;the idct_dc_add macros and functions were largely inspired by x264
> project's code in the h264_idct.asm file
This should read "inspired by the x264 project's code".
nit: this comment line is too long, please wrap it.
> +;-----------------------------------------------------------------------------
> +; void ff_hevc_idct4_dc_add_10_mmxext(pixel *dst, int coeff, int stride)
> +;-----------------------------------------------------------------------------
> +%macro IDCT_DC_ADD_OP_10 3
> + pxor m5, m5
> + mova m6, [max_pixels_10]
> +%if avx_enabled
> + paddw m1, m0, [%1+0 ]
> + paddw m2, m0, [%1+%2 ]
> + paddw m3, m0, [%1+%2*2]
> + paddw m4, m0, [%1+%3 ]
> +%else
> + mova m1, [%1+0 ]
> + mova m2, [%1+%2 ]
> + mova m3, [%1+%2*2]
> + mova m4, [%1+%3 ]
> + paddw m1, m0
> + paddw m2, m0
> + paddw m3, m0
> + paddw m4, m0
> +%endif
Is there a reason to check for avx_enabled instead of just cpuflag(avx) here?
> --- a/libavcodec/x86/hevcdsp_init.c
> +++ b/libavcodec/x86/hevcdsp_init.c
> @@ -46,27 +46,106 @@ LFC_FUNCS(uint8_t, 10)
> +
> +#if HAVE_SSE2_EXTERNAL
> +static void hevc_idct32_dc_add_8_sse2(uint8_t *dst, int coeff, ptrdiff_t
> stride)
> +{
> + ff_hevc_idct16_dc_add_8_sse2(dst, coeff, stride);
> + ff_hevc_idct16_dc_add_8_sse2(dst + 16, coeff, stride);
> + ff_hevc_idct16_dc_add_8_sse2(dst + 16 * stride, coeff, stride);
> + ff_hevc_idct16_dc_add_8_sse2(dst + 16 * stride + 16, coeff, stride);
> +}
> +
> +static void hevc_idct16_dc_add_10_sse2(uint8_t *dst, int coeff, ptrdiff_t
> stride)
> +{
> + ff_hevc_idct8_dc_add_10_sse2(dst, coeff, stride);
> + ff_hevc_idct8_dc_add_10_sse2(dst + 16, coeff, stride);
> + ff_hevc_idct8_dc_add_10_sse2(dst + 8 * stride, coeff, stride);
> + ff_hevc_idct8_dc_add_10_sse2(dst + 8 * stride + 16, coeff, stride);
> +}
> +
> +static void hevc_idct32_dc_add_10_sse2(uint8_t *dst, int coeff, ptrdiff_t
> stride)
> +{
> + hevc_idct16_dc_add_10_sse2(dst, coeff, stride);
> + hevc_idct16_dc_add_10_sse2(dst + 32, coeff, stride);
> + hevc_idct16_dc_add_10_sse2(dst + 16 * stride, coeff, stride);
> + hevc_idct16_dc_add_10_sse2(dst + 16 * stride + 32, coeff, stride);
> +}
> +#endif //HAVE_SSE2_EXTERNAL
nit: Use /* */ comments like everywhere else.
> +#if HAVE_AVX_EXTERNAL
> +static void hevc_idct16_dc_add_10_avx(uint8_t *dst, int coeff, ptrdiff_t
> stride)
> +{
> + ff_hevc_idct8_dc_add_10_avx(dst, coeff, stride);
> + ff_hevc_idct8_dc_add_10_avx(dst + 16, coeff, stride);
> + ff_hevc_idct8_dc_add_10_avx(dst + 8 * stride, coeff, stride);
> + ff_hevc_idct8_dc_add_10_avx(dst + 8 * stride + 16, coeff, stride);
> +}
> +
> +static void hevc_idct32_dc_add_10_avx(uint8_t *dst, int coeff, ptrdiff_t
> stride)
> +{
> + hevc_idct16_dc_add_10_avx(dst, coeff, stride);
> + hevc_idct16_dc_add_10_avx(dst + 32, coeff, stride);
> + hevc_idct16_dc_add_10_avx(dst + 16 * stride, coeff, stride);
> + hevc_idct16_dc_add_10_avx(dst + 16 * stride + 32, coeff, stride);
> +}
> +#endif //HAVE_AVX_EXTERNAL
Same here: use a /* */ comment.
> void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
> {
> int mm_flags = av_get_cpu_flags();
>
> if (bit_depth == 8) {
> + if (EXTERNAL_MMXEXT(mm_flags)) {
> + c->transform_dc_add[0] = ff_hevc_idct4_dc_add_8_mmxext;
> + c->transform_dc_add[1] = ff_hevc_idct8_dc_add_8_mmxext;
> + }
> if (EXTERNAL_SSE2(mm_flags)) {
> c->hevc_v_loop_filter_chroma =
> ff_hevc_v_loop_filter_chroma_8_sse2;
> c->hevc_h_loop_filter_chroma =
> ff_hevc_h_loop_filter_chroma_8_sse2;
> +
> + c->transform_dc_add[2] = ff_hevc_idct16_dc_add_8_sse2;
> + c->transform_dc_add[3] = hevc_idct32_dc_add_8_sse2;
> }
> if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
> c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
> c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
> }
> } else if (bit_depth == 10) {
> + if (EXTERNAL_MMXEXT(mm_flags)) {
> + c->transform_dc_add[0] = ff_hevc_idct4_dc_add_10_mmxext;
> + }
> if (EXTERNAL_SSE2(mm_flags)) {
> c->hevc_v_loop_filter_chroma =
> ff_hevc_v_loop_filter_chroma_10_sse2;
> c->hevc_h_loop_filter_chroma =
> ff_hevc_h_loop_filter_chroma_10_sse2;
> +
> + c->transform_dc_add[1] = ff_hevc_idct8_dc_add_10_sse2;
> + c->transform_dc_add[2] = hevc_idct16_dc_add_10_sse2;
> + c->transform_dc_add[3] = hevc_idct32_dc_add_10_sse2;
> }
> if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
> c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
> c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
> }
> + if (EXTERNAL_AVX(mm_flags)) {
> + c->transform_dc_add[1] = ff_hevc_idct8_dc_add_10_avx;
> + c->transform_dc_add[2] = hevc_idct16_dc_add_10_avx;
> + c->transform_dc_add[3] = hevc_idct32_dc_add_10_avx;
> + }
> }
> }
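Try compiling with optimizations disabled or pushing to oracle.
This will fail due to missing ifdefs: the static wrappers above are only
defined under HAVE_SSE2_EXTERNAL / HAVE_AVX_EXTERNAL, but they are
referenced here without matching guards.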
Diego
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel