PR #23479 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23479 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23479.patch
>From fe51ab9f64c210909bd224325806772cda4025e1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 14 Jun 2026 13:54:29 +0200 Subject: [PATCH 1/2] avcodec/x86/h264_intrapred: Avoid reg-reg moves Possible if src of the PRED4x4_LOWPASS macro is not used later on. Saves 195B of .text here. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_intrapred.asm | 248 +++++++++++++++--------------- 1 file changed, 125 insertions(+), 123 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index 1074b474f0..ad4e267f03 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -795,7 +795,9 @@ cglobal pred8x8_tm_vp8_8, 2,3,6 mova %5, %2 pavgb %2, %3 pxor %3, %5 +%ifnidn %1, %4 mova %1, %4 +%endif pand %3, [pb_1] psubusb %2, %3 pavgb %1, %2 @@ -828,8 +830,8 @@ cglobal pred8x8l_top_dc_8, 4,4,6 pxor m1, m5 .has_topright: pxor m4, m4 - PRED4x4_LOWPASS m0, m2, m1, m3, m5 - psadbw m4, m0 + PRED4x4_LOWPASS m3, m2, m1, m3, m5 + psadbw m4, m3 paddw m4, [pw_4] psrlw m4, 3 SPLATW m4, m4, 0 @@ -901,9 +903,9 @@ cglobal pred8x8l_dc_8, 4,5 .do_left: movq mm0, mm4 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 + movq mm1, mm0 movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + PRED4x4_LOWPASS mm1, mm3, mm0, mm1, mm5 psllq mm1, 56 PALIGNR mm7, mm1, 7, mm3 movq mm0, [r0-8] @@ -919,12 +921,12 @@ cglobal pred8x8l_dc_8, 4,5 jz .fix_tr_1 .body: lea r1, [r0+r3*2] - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 + PRED4x4_LOWPASS mm3, mm2, mm1, mm3, mm5 pxor mm0, mm0 pxor mm1, mm1 lea r2, [r1+r3*2] psadbw mm0, mm7 - psadbw mm1, mm6 + psadbw mm1, mm3 paddw mm0, [pw_8] paddw mm0, mm1 lea r4, [r2+r3*2] @@ -985,9 +987,9 @@ cglobal pred8x8l_horizontal_8, 4,4 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 movq mm4, mm0 movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 - psllq mm1, 56 - PALIGNR mm7, mm1, 7, mm3 + PRED4x4_LOWPASS mm4, mm3, mm0, mm4, mm5 + psllq mm4, 56 + 
PALIGNR mm7, mm4, 7, mm3 movq mm3, mm7 lea r1, [r0+r3*2] movq mm7, mm3 @@ -1054,14 +1056,14 @@ cglobal pred8x8l_vertical_8, 4,4 psllq mm5, 56 pxor mm1, mm5 .body: - PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 + PRED4x4_LOWPASS mm3, mm2, mm1, mm3, mm5 %rep 3 - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm0 + movq [r0+r3*1], mm3 + movq [r0+r3*2], mm3 lea r0, [r0+r3*2] %endrep - movq [r0+r3*1], mm0 - movq [r0+r3*2], mm0 + movq [r0+r3*1], mm3 + movq [r0+r3*2], mm3 RET %endmacro @@ -1114,14 +1116,14 @@ cglobal pred8x8l_down_left_8, 4,4 movq2dq xmm3, mm4 test r2d, r2d ; top_right jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 + movq mm1, [r0+8] + movq mm5, mm1 + movq mm2, mm1 + movq mm4, mm1 psrlq mm5, 56 PALIGNR mm2, mm3, 7, mm3 PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 + PRED4x4_LOWPASS mm1, mm2, mm5, mm1, mm4 .do_topright: movq2dq xmm4, mm1 psrlq mm1, 56 @@ -1137,24 +1139,24 @@ cglobal pred8x8l_down_left_8, 4,4 movdqa xmm1, xmm3 pslldq xmm1, 1 INIT_XMM cpuname - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 - psrldq xmm0, 1 - movq [r0+r3*1], xmm0 - psrldq xmm0, 1 - movq [r0+r3*2], xmm0 - psrldq xmm0, 1 + PRED4x4_LOWPASS xmm3, xmm1, xmm2, xmm3, xmm4 + psrldq xmm3, 1 + movq [r0+r3*1], xmm3 + psrldq xmm3, 1 + movq [r0+r3*2], xmm3 + psrldq xmm3, 1 lea r0, [r2+r3*2] - movq [r1+r3*1], xmm0 - psrldq xmm0, 1 - movq [r1+r3*2], xmm0 - psrldq xmm0, 1 - movq [r2+r3*1], xmm0 - psrldq xmm0, 1 - movq [r2+r3*2], xmm0 - psrldq xmm0, 1 - movq [r0+r3*1], xmm0 - psrldq xmm0, 1 - movq [r0+r3*2], xmm0 + movq [r1+r3*1], xmm3 + psrldq xmm3, 1 + movq [r1+r3*2], xmm3 + psrldq xmm3, 1 + movq [r2+r3*1], xmm3 + psrldq xmm3, 1 + movq [r2+r3*2], xmm3 + psrldq xmm3, 1 + movq [r0+r3*1], xmm3 + psrldq xmm3, 1 + movq [r0+r3*2], xmm3 RET %endmacro @@ -1222,10 +1224,10 @@ cglobal pred8x8l_down_right_8, 4,5 .do_left: movq mm0, mm4 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 + movq mm1, mm0 movq mm7, mm2 movq2dq xmm3, mm2 - PRED4x4_LOWPASS mm1, 
mm3, mm0, mm4, mm5 + PRED4x4_LOWPASS mm1, mm3, mm0, mm1, mm5 psllq mm1, 56 PALIGNR mm7, mm1, 7, mm3 movq2dq xmm1, mm7 @@ -1241,8 +1243,8 @@ cglobal pred8x8l_down_right_8, 4,5 test r2d, r2d jz .fix_tr_1 .do_top: - PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 - movq2dq xmm4, mm4 + PRED4x4_LOWPASS mm3, mm2, mm1, mm3, mm5 + movq2dq xmm4, mm3 lea r1, [r0+r3*2] movdqa xmm0, xmm3 pslldq xmm4, 8 @@ -1258,22 +1260,22 @@ cglobal pred8x8l_down_right_8, 4,5 movdqa xmm2, xmm3 psrldq xmm2, 1 INIT_XMM cpuname - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 - movdqa xmm1, xmm0 + PRED4x4_LOWPASS xmm3, xmm1, xmm2, xmm3, xmm4 + movdqa xmm1, xmm3 psrldq xmm1, 1 - movq [r0+r3*2], xmm0 + movq [r0+r3*2], xmm3 movq [r0+r3*1], xmm1 - psrldq xmm0, 2 + psrldq xmm3, 2 psrldq xmm1, 2 - movq [r2+r3*2], xmm0 + movq [r2+r3*2], xmm3 movq [r2+r3*1], xmm1 - psrldq xmm0, 2 + psrldq xmm3, 2 psrldq xmm1, 2 - movq [r1+r3*2], xmm0 + movq [r1+r3*2], xmm3 movq [r1+r3*1], xmm1 - psrldq xmm0, 2 + psrldq xmm3, 2 psrldq xmm1, 2 - movq [r4+r3*2], xmm0 + movq [r4+r3*2], xmm3 movq [r4+r3*1], xmm1 RET %endmacro @@ -1340,8 +1342,8 @@ cglobal pred8x8l_vertical_right_8, 4,5,6 jmp .do_top .do_left: movq mm0, mm4 - PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq2dq xmm0, mm2 + PRED4x4_LOWPASS mm3, mm1, mm4, mm3, mm5 + movq2dq xmm0, mm3 movq mm0, [r0-8] movq mm3, [r0] movq mm1, [r0+8] @@ -1354,9 +1356,9 @@ cglobal pred8x8l_vertical_right_8, 4,5,6 test r2d, r2d jz .fix_tr_1 .do_top: - PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 + PRED4x4_LOWPASS mm3, mm2, mm1, mm3, mm5 lea r1, [r0+r3*2] - movq2dq xmm4, mm6 + movq2dq xmm4, mm3 pslldq xmm4, 8 por xmm0, xmm4 movdqa xmm1, xmm0 @@ -1367,19 +1369,19 @@ cglobal pred8x8l_vertical_right_8, 4,5,6 pslldq xmm1, 2 pavgb xmm2, xmm0 INIT_XMM cpuname - PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 - movdqa xmm0, [pw_ff00] - pandn xmm0, xmm4 - movdqa xmm5, xmm4 - psrlw xmm4, 8 - packuswb xmm0, xmm4 - movhlps xmm4, xmm0 + PRED4x4_LOWPASS xmm0, xmm3, xmm1, xmm0, xmm5 + movdqa xmm4, [pw_ff00] + pandn 
xmm4, xmm0 + movdqa xmm5, xmm0 + psrlw xmm0, 8 + packuswb xmm4, xmm0 + movhlps xmm0, xmm4 movhps [r0+r3*2], xmm5 movhps [r0+r3*1], xmm2 psrldq xmm5, 4 - movss xmm5, xmm0 + movss xmm5, xmm4 psrldq xmm2, 4 - movss xmm2, xmm4 + movss xmm2, xmm0 lea r0, [r2+r3*2] psrldq xmm5, 1 psrldq xmm2, 1 @@ -1445,14 +1447,14 @@ cglobal pred8x8l_vertical_left_8, 4,4 movq2dq xmm4, mm4 test r2d, r2d jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 + movq mm1, [r0+8] + movq mm5, mm1 + movq mm2, mm1 + movq mm4, mm1 psrlq mm5, 56 PALIGNR mm2, mm3, 7, mm3 PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 + PRED4x4_LOWPASS mm1, mm2, mm5, mm1, mm4 .do_topright: movq2dq xmm3, mm1 lea r1, [r0+r3*2] @@ -1466,23 +1468,23 @@ cglobal pred8x8l_vertical_left_8, 4,4 pavgb xmm3, xmm2 lea r2, [r1+r3*2] INIT_XMM cpuname - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 - psrldq xmm0, 1 + PRED4x4_LOWPASS xmm4, xmm1, xmm2, xmm4, xmm5 + psrldq xmm4, 1 movq [r0+r3*1], xmm3 - movq [r0+r3*2], xmm0 + movq [r0+r3*2], xmm4 lea r0, [r2+r3*2] psrldq xmm3, 1 - psrldq xmm0, 1 + psrldq xmm4, 1 movq [r1+r3*1], xmm3 - movq [r1+r3*2], xmm0 + movq [r1+r3*2], xmm4 psrldq xmm3, 1 - psrldq xmm0, 1 + psrldq xmm4, 1 movq [r2+r3*1], xmm3 - movq [r2+r3*2], xmm0 + movq [r2+r3*2], xmm4 psrldq xmm3, 1 - psrldq xmm0, 1 + psrldq xmm4, 1 movq [r0+r3*1], xmm3 - movq [r0+r3*2], xmm0 + movq [r0+r3*2], xmm4 RET %endmacro @@ -1527,9 +1529,9 @@ cglobal pred8x8l_horizontal_up_8, 4,4 PALIGNR mm1, mm2, 1, mm2 movq mm0, mm4 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 - movq mm4, mm0 + movq mm1, mm0 movq mm7, mm2 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + PRED4x4_LOWPASS mm1, mm3, mm0, mm1, mm5 psllq mm1, 56 PALIGNR mm7, mm1, 7, mm3 lea r1, [r0+r3*2] @@ -1643,8 +1645,8 @@ cglobal pred8x8l_horizontal_down_8, 4,5 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 movq2dq xmm0, mm2 pslldq xmm0, 8 - movq mm4, mm0 - PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 + movq mm1, mm0 + PRED4x4_LOWPASS mm1, mm3, mm0, mm1, mm5 
movq2dq xmm2, mm1 pslldq xmm2, 15 psrldq xmm2, 8 @@ -1665,14 +1667,14 @@ cglobal pred8x8l_horizontal_down_8, 4,5 movq2dq xmm1, mm4 test r2d, r2d jz .fix_tr_2 - movq mm0, [r0+8] - movq mm5, mm0 - movq mm2, mm0 - movq mm4, mm0 + movq mm1, [r0+8] + movq mm5, mm1 + movq mm2, mm1 + movq mm4, mm1 psrlq mm5, 56 PALIGNR mm2, mm3, 7, mm3 PALIGNR mm5, mm4, 1, mm4 - PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 + PRED4x4_LOWPASS mm1, mm2, mm5, mm1, mm4 .do_topright: movq2dq xmm5, mm1 pslldq xmm5, 8 @@ -1688,23 +1690,23 @@ INIT_XMM cpuname movdqa xmm4, xmm1 pavgb xmm4, xmm3 lea r0, [r1+r3*2] - PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 - punpcklbw xmm4, xmm0 - movhlps xmm0, xmm4 + PRED4x4_LOWPASS xmm3, xmm1, xmm2, xmm3, xmm5 + punpcklbw xmm4, xmm3 + movhlps xmm3, xmm4 movq [r0+r3*2], xmm4 - movq [r2+r3*2], xmm0 + movq [r2+r3*2], xmm3 psrldq xmm4, 2 - psrldq xmm0, 2 + psrldq xmm3, 2 movq [r0+r3*1], xmm4 - movq [r2+r3*1], xmm0 + movq [r2+r3*1], xmm3 psrldq xmm4, 2 - psrldq xmm0, 2 + psrldq xmm3, 2 movq [r1+r3*2], xmm4 - movq [r4+r3*2], xmm0 + movq [r4+r3*2], xmm3 psrldq xmm4, 2 - psrldq xmm0, 2 + psrldq xmm3, 2 movq [r1+r3*1], xmm4 - movq [r4+r3*1], xmm0 + movq [r4+r3*1], xmm3 RET %endmacro @@ -1824,11 +1826,11 @@ cglobal pred4x4_vertical_vp8_8, 3,3 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 lea r1, [r0+r2*2] psrlq m0, 8 ;t1 t2 t3 t4 - PRED4x4_LOWPASS m3, m1, m0, m2, m4 - movd [r0+r2*1], m3 - movd [r0+r2*2], m3 - movd [r1+r2*1], m3 - movd [r1+r2*2], m3 + PRED4x4_LOWPASS m2, m1, m0, m2, m4 + movd [r0+r2*1], m2 + movd [r0+r2*2], m2 + movd [r1+r2*1], m2 + movd [r1+r2*2], m2 RET ;----------------------------------------------------------------------------- @@ -1841,12 +1843,12 @@ cglobal pred4x4_down_left_8, 3,3 movq m1, [r0] punpckldq m1, [r1] movq m2, m1 - movq m3, m1 + movq m0, m1 psllq m1, 8 pxor m2, m1 psrlq m2, 8 - pxor m2, m3 - PRED4x4_LOWPASS m0, m1, m2, m3, m4 + pxor m2, m0 + PRED4x4_LOWPASS m0, m1, m2, m0, m3 lea r1, [r0+r2*2] psrlq m0, 8 movd [r0+r2*1], m0 @@ -1868,13 
+1870,13 @@ cglobal pred4x4_vertical_left_8, 3,3 sub r0, r2 movq m1, [r0] punpckldq m1, [r1] - movq m3, m1 + movq m0, m1 movq m2, m1 - psrlq m3, 8 + psrlq m0, 8 psrlq m2, 16 - movq m4, m3 + movq m4, m0 pavgb m4, m1 - PRED4x4_LOWPASS m0, m1, m2, m3, m5 + PRED4x4_LOWPASS m0, m1, m2, m0, m5 lea r1, [r0+r2*2] movh [r0+r2*1], m4 movh [r0+r2*2], m0 @@ -1908,8 +1910,8 @@ cglobal pred4x4_horizontal_up_8, 3,3 psrlq m2, 16 psrlq m3, 8 pavgb m7, m3 - PRED4x4_LOWPASS m4, m0, m2, m3, m5 - punpcklbw m7, m4 + PRED4x4_LOWPASS m3, m0, m2, m3, m5 + punpcklbw m7, m3 movd [r0+r2*1], m7 psrlq m7, 16 movd [r0+r2*2], m7 @@ -1943,16 +1945,16 @@ cglobal pred4x4_horizontal_down_8, 3,3 psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 pavgb m5, m2 - PRED4x4_LOWPASS m3, m1, m0, m2, m4 - punpcklbw m5, m3 - psrlq m3, 32 - PALIGNR m3, m5, 6, m4 + PRED4x4_LOWPASS m2, m1, m0, m2, m4 + punpcklbw m5, m2 + psrlq m2, 32 + PALIGNR m2, m5, 6, m4 movh [r1+r2*2], m5 psrlq m5, 16 movh [r1+r2*1], m5 psrlq m5, 16 movh [r0+r2*2], m5 - movh [r0+r2*1], m3 + movh [r0+r2*1], m2 RET ;----------------------------------------------------------------------------- @@ -1974,17 +1976,17 @@ cglobal pred4x4_vertical_right_8, 3,3 PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 movq m2, m0 PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 - PRED4x4_LOWPASS m3, m1, m0, m2, m4 - movq m1, m3 - psrlq m3, 16 + PRED4x4_LOWPASS m2, m1, m0, m2, m4 + movq m1, m2 + psrlq m2, 16 psllq m1, 48 movh [r0+r2*1], m5 - movh [r0+r2*2], m3 - PALIGNR m5, m1, 7, m2 + movh [r0+r2*2], m2 + PALIGNR m5, m1, 7, m3 psllq m1, 8 movh [r1+r2*1], m5 - PALIGNR m3, m1, 7, m1 - movh [r1+r2*2], m3 + PALIGNR m2, m1, 7, m1 + movh [r1+r2*2], m2 RET ;----------------------------------------------------------------------------- @@ -2004,9 +2006,9 @@ cglobal pred4x4_down_right_8, 3,3 PALIGNR m3, m1, 5, m1 movq m1, m3 PALIGNR m3, [r1+r2*1-8], 7, m4 - movq m2, m3 + movq m0, m3 PALIGNR m3, [r1+r2*2-8], 7, m4 - PRED4x4_LOWPASS m0, m3, m1, 
m2, m4 + PRED4x4_LOWPASS m0, m3, m1, m0, m4 movh [r1+r2*2], m0 psrlq m0, 8 movh [r1+r2*1], m0 -- 2.52.0 >From 63c3e8b68d855ed77852b1b3e88780e7e24c40cb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 14 Jun 2026 14:27:00 +0200 Subject: [PATCH 2/2] avcodec/x86/h264_intrapred: Add AVX2 horizontal pred versions pred8x8_horizontal_8_c: 6.9 pred8x8_horizontal_8_sse2: 9.9 ( 0.70x) pred8x8_horizontal_8_ssse3: 9.5 ( 0.73x) pred8x8_horizontal_8_avx2: 5.1 ( 1.35x) pred16x16_horizontal_8_c: 10.9 pred16x16_horizontal_8_sse2: 15.0 ( 0.72x) pred16x16_horizontal_8_ssse3: 11.7 ( 0.93x) pred16x16_horizontal_8_avx2: 9.6 ( 1.13x) The new functions are cheap and only occupy 2*48B. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h264_intrapred.asm | 18 ++++++++++++++++-- libavcodec/x86/h264_intrapred_init.c | 5 +++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index ad4e267f03..d3bf6a627f 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -71,10 +71,14 @@ cglobal pred16x16_vertical_8, 2,3 %macro PRED16x16_H 0 cglobal pred16x16_horizontal_8, 2,3 mov r2, 8 -%if cpuflag(ssse3) +%if cpuflag(ssse3) && notcpuflag(avx2) mova m2, [pb_3] %endif .loop: +%if cpuflag(avx2) + vpbroadcastb m0, [r0+r1*0-1] + vpbroadcastb m1, [r0+r1*1-1] +%else movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] @@ -86,6 +90,7 @@ cglobal pred16x16_horizontal_8, 2,3 punpcklbw m1, m1 SPLATW m0, m0, 3 SPLATW m1, m1, 3 +%endif %endif mova [r0+r1*0], m0 @@ -100,6 +105,8 @@ INIT_XMM sse2 PRED16x16_H INIT_XMM ssse3 PRED16x16_H +INIT_XMM avx2 +PRED16x16_H ;----------------------------------------------------------------------------- ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) @@ -586,12 +593,17 @@ cglobal pred8x8_vertical_8, 2,2 %macro PRED8x8_H 0 cglobal pred8x8_horizontal_8, 2,3,3 mov r2, 4 -%if cpuflag(ssse3) +%if cpuflag(ssse3) && 
notcpuflag(avx2) mova m2, [pb_3] %endif .loop: +%if cpuflag(avx2) + vpbroadcastb m0, [r0+r1*0-1] + vpbroadcastb m1, [r0+r1*1-1] +%else SPLATB_LOAD m0, r0+r1*0-1, m2 SPLATB_LOAD m1, r0+r1*1-1, m2 +%endif movq [r0+r1*0], m0 movq [r0+r1*1], m1 lea r0, [r0+r1*2] @@ -604,6 +616,8 @@ INIT_XMM sse2 PRED8x8_H INIT_XMM ssse3 PRED8x8_H +INIT_XMM avx2 +PRED8x8_H ;----------------------------------------------------------------------------- ; void ff_pred8x8_top_dc_8_sse2(uint8_t *src, ptrdiff_t stride) diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 5b308f658f..b5d82694a2 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -102,6 +102,7 @@ PRED16x16(horizontal, 10, sse2) PRED16x16(vertical, 8, sse) PRED16x16(horizontal, 8, sse2) PRED16x16(horizontal, 8, ssse3) +PRED16x16(horizontal, 8, avx2) PRED16x16(dc, 8, sse2) PRED16x16(dc, 8, ssse3) PRED16x16(plane_h264, 8, sse2) @@ -119,6 +120,7 @@ PRED8x8(dc, 8, sse2) PRED8x8(vertical, 8, sse2) PRED8x8(horizontal, 8, sse2) PRED8x8(horizontal, 8, ssse3) +PRED8x8(horizontal, 8, avx2) PRED8x8(plane, 8, sse2) PRED8x8(plane, 8, ssse3) PRED8x8(tm_vp8, 8, sse2) @@ -256,6 +258,9 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, } if(EXTERNAL_AVX2(cpu_flags)){ + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_avx2; + if (chroma_format_idc <= 1) + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_avx2; if (codec_id == AV_CODEC_ID_VP8) { h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2; } -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
