Ping. https://patchwork.ffmpeg.org/project/ffmpeg/patch/[email protected]/
The MSVC CI failure on v1 was an unrelated AVX-512 link issue on the softworkz runner (124 unresolved _avx512icl externals across hevc/vp9/filters), not anything from this patch. v2 CI is clean. Happy to rebase or address any review comments. Kindest, Zane On Tue, Jan 20, 2026 at 11:24 PM ZaneHam <[email protected]> wrote: > Add AVX2 implementations for 10-bit H.264 16x16 intra prediction: > - pred16x16_vertical_10 > - pred16x16_horizontal_10 > - pred16x16_dc_10 > - pred16x16_top_dc_10 > - pred16x16_left_dc_10 > - pred16x16_128_dc_10 > > 10-bit 16x16 blocks are 32 bytes per row, perfectly matching AVX2's > 256-bit YMM registers, allowing single-instruction row operations > versus two XMM operations with SSE2. > > checkasm benchmarks on Zen3 (cycles, lower is better): > C SSE2 AVX2 > pred16x16_dc_10 65.7 40.3 27.3 (1.48x vs SSE2) > pred16x16_128_dc_10 31.1 28.1 21.4 (1.31x vs SSE2) > pred16x16_horizontal_10 67.8 28.1 21.6 (1.30x vs SSE2) > pred16x16_left_dc_10 55.6 35.0 22.9 (1.53x vs SSE2) > pred16x16_top_dc_10 49.5 32.3 21.8 (1.48x vs SSE2) > pred16x16_vertical_10 32.3 28.3 24.1 (1.17x vs SSE2) > --- > libavcodec/x86/h264_intrapred_10bit.asm | 186 ++++++++++++++++++++++++ > libavcodec/x86/h264_intrapred_init.c | 14 ++ > 2 files changed, 200 insertions(+) > > diff --git a/libavcodec/x86/h264_intrapred_10bit.asm > b/libavcodec/x86/h264_intrapred_10bit.asm > index 2f30807332..78e2f263bc 100644 > --- a/libavcodec/x86/h264_intrapred_10bit.asm > +++ b/libavcodec/x86/h264_intrapred_10bit.asm > @@ -1117,3 +1117,189 @@ cglobal pred16x16_128_dc_10, 2,3 > dec r2d > jg .loop > RET > + > > +;----------------------------------------------------------------------------- > +; AVX2 versions of pred16x16 10-bit functions > +; For 10-bit: 16 pixels * 2 bytes = 32 bytes = 1 YMM register (perfect > match!) 
> > +;----------------------------------------------------------------------------- > + > +%if HAVE_AVX2_EXTERNAL > + > > +;----------------------------------------------------------------------------- > +; void ff_pred16x16_vertical_10_avx2(pixel *src, ptrdiff_t stride) > > +;----------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal pred16x16_vertical_10, 2, 4 > + sub r0, r1 > + movu m0, [r0] ; Load all 16 pixels (32 bytes) from > top row > + mov r2d, 4 > + lea r3, [r1*3] > +.loop: > + movu [r0+r1*1], m0 > + movu [r0+r1*2], m0 > + movu [r0+r3 ], m0 > + lea r0, [r0+r1*2] > + movu [r0+r1*2], m0 > + lea r0, [r0+r1*2] > + dec r2d > + jg .loop > + RET > + > > +;----------------------------------------------------------------------------- > +; void ff_pred16x16_horizontal_10_avx2(pixel *src, ptrdiff_t stride) > > +;----------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal pred16x16_horizontal_10, 2, 4 > + lea r2, [r1*3] > + mov r3d, 4 > +.loop: > + vpbroadcastw m0, [r0-2] > + movu [r0], m0 > + vpbroadcastw m0, [r0+r1-2] > + movu [r0+r1], m0 > + vpbroadcastw m0, [r0+r1*2-2] > + movu [r0+r1*2], m0 > + vpbroadcastw m0, [r0+r2-2] > + movu [r0+r2], m0 > + lea r0, [r0+r1*4] > + dec r3d > + jg .loop > + RET > + > > +;----------------------------------------------------------------------------- > +; void ff_pred16x16_dc_10_avx2(pixel *src, ptrdiff_t stride) > +; DC = (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5 > > +;----------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal pred16x16_dc_10, 2, 6 > + mov r5, r0 ; Save dest pointer > + sub r0, r1 > + movu m0, [r0] ; Load top row (32 bytes) > + vextracti128 xm1, m0, 1 ; Get high 128 bits > + paddw xm0, xm1 ; Sum to 8 words > + phaddw xm0, xm0 ; 4 words > + phaddw xm0, xm0 ; 2 words > + phaddw xm0, xm0 ; 1 word (top sum in low word) > + movd r3d, xm0 > + and r3d, 0xFFFF ; Keep 
only low 16 bits > + > + ; Sum left column using lea-based pointer advancement > + lea r0, [r0+r1-2] ; Point to left pixel of row 0 > + movzx r4d, word [r0] > + add r3d, r4d > + movzx r4d, word [r0+r1] > + add r3d, r4d > +%rep 7 > + lea r0, [r0+r1*2] > + movzx r4d, word [r0] > + add r3d, r4d > + movzx r4d, word [r0+r1] > + add r3d, r4d > +%endrep > + add r3d, 16 ; Rounding > + shr r3d, 5 ; Divide by 32 > + > + movd xm0, r3d > + vpbroadcastw m0, xm0 ; Broadcast to all 16 words > + > + ; Fill all 16 rows > + mov r3d, 4 > + lea r4, [r1*3] > +.loop: > + movu [r5+r1*0], m0 > + movu [r5+r1*1], m0 > + movu [r5+r1*2], m0 > + movu [r5+r4 ], m0 > + lea r5, [r5+r1*4] > + dec r3d > + jg .loop > + RET > + > > +;----------------------------------------------------------------------------- > +; void ff_pred16x16_top_dc_10_avx2(pixel *src, ptrdiff_t stride) > +; DC = (sum of 16 top pixels + 8) >> 4 > > +;----------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal pred16x16_top_dc_10, 2, 4 > + sub r0, r1 > + movu m0, [r0] ; Load top row > + vextracti128 xm1, m0, 1 > + paddw xm0, xm1 > + phaddw xm0, xm0 > + phaddw xm0, xm0 > + phaddw xm0, xm0 > + paddw xm0, [pw_8] ; Add 8 for rounding > + psrlw xm0, 4 ; Divide by 16 > + vpbroadcastw m0, xm0 > + > + mov r2d, 4 > + lea r3, [r1*3] > +.loop: > + movu [r0+r1*1], m0 > + movu [r0+r1*2], m0 > + movu [r0+r3 ], m0 > + lea r0, [r0+r1*2] > + movu [r0+r1*2], m0 > + lea r0, [r0+r1*2] > + dec r2d > + jg .loop > + RET > + > > +;----------------------------------------------------------------------------- > +; void ff_pred16x16_left_dc_10_avx2(pixel *src, ptrdiff_t stride) > +; DC = (sum of 16 left pixels + 8) >> 4 > > +;----------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal pred16x16_left_dc_10, 2, 6 ; r5 = scratch (eax aliases r0 on x86-32) > + mov r4, r0 ; Save dest pointer > + > + ; Sum left column using lea-based pointer advancement > + sub r0, 2 ; Point to left pixel of row 0 > + movzx 
r2d, word [r0] > + movzx r3d, word [r0+r1] > +%rep 7 > + lea r0, [r0+r1*2] > + movzx r5d, word [r0] ; even rows accumulate in r2d > + add r2d, r5d > + movzx r5d, word [r0+r1] ; odd rows accumulate in r3d > + add r3d, r5d > +%endrep > + lea r2d, [r2+r3+8] ; Sum with rounding > + shr r2d, 4 ; Divide by 16 > + > + movd xm0, r2d > + vpbroadcastw m0, xm0 > + > + ; Fill all 16 rows > + mov r2d, 4 > + lea r3, [r1*3] > +.loop: > + movu [r4+r1*0], m0 > + movu [r4+r1*1], m0 > + movu [r4+r1*2], m0 > + movu [r4+r3 ], m0 > + lea r4, [r4+r1*4] > + dec r2d > + jg .loop > + RET > + > > +;----------------------------------------------------------------------------- > +; void ff_pred16x16_128_dc_10_avx2(pixel *src, ptrdiff_t stride) > +; Fill with constant 512 (1 << 9 for 10-bit midpoint) > > +;----------------------------------------------------------------------------- > +INIT_YMM avx2 > +cglobal pred16x16_128_dc_10, 2, 4 > + vpbroadcastw m0, [pw_512] > + mov r2d, 4 > + lea r3, [r1*3] > +.loop: > + movu [r0+r1*0], m0 > + movu [r0+r1*1], m0 > + movu [r0+r1*2], m0 > + movu [r0+r3 ], m0 > + lea r0, [r0+r1*4] > + dec r2d > + jg .loop > + RET > + > +%endif ; HAVE_AVX2_EXTERNAL > diff --git a/libavcodec/x86/h264_intrapred_init.c > b/libavcodec/x86/h264_intrapred_init.c > index aa9bc721f0..6918c7f985 100644 > --- a/libavcodec/x86/h264_intrapred_init.c > +++ b/libavcodec/x86/h264_intrapred_init.c > @@ -97,6 +97,12 @@ PRED16x16(128_dc, 10, sse2) > PRED16x16(left_dc, 10, sse2) > PRED16x16(vertical, 10, sse2) > PRED16x16(horizontal, 10, sse2) > +PRED16x16(dc, 10, avx2) > +PRED16x16(top_dc, 10, avx2) > +PRED16x16(128_dc, 10, avx2) > +PRED16x16(left_dc, 10, avx2) > +PRED16x16(vertical, 10, avx2) > +PRED16x16(horizontal, 10, avx2) > > /* 8-bit versions */ > PRED16x16(vertical, 8, sse) > @@ -328,5 +334,13 @@ av_cold void ff_h264_pred_init_x86(H264PredContext > *h, int codec_id, > h->pred8x8l[VERT_RIGHT_PRED ] = > ff_pred8x8l_vertical_right_10_avx; > h->pred8x8l[HOR_UP_PRED ] = > ff_pred8x8l_horizontal_up_10_avx; > } > + if (EXTERNAL_AVX2_FAST(cpu_flags)) { > 
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_avx2; > + h->pred16x16[TOP_DC_PRED8x8 ] = > ff_pred16x16_top_dc_10_avx2; > + h->pred16x16[DC_128_PRED8x8 ] = > ff_pred16x16_128_dc_10_avx2; > + h->pred16x16[LEFT_DC_PRED8x8 ] = > ff_pred16x16_left_dc_10_avx2; > + h->pred16x16[VERT_PRED8x8 ] = > ff_pred16x16_vertical_10_avx2; > + h->pred16x16[HOR_PRED8x8 ] = > ff_pred16x16_horizontal_10_avx2; > + } > } > } > -- > 2.51.0.windows.2 > > _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
