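The gathers are unmasked (the mask register is set to all-ones immediately before each one), but vpgatherdd always merges into its destination register (ymm3/ymm4 in the main loop, xmm3 in the tail loop), so each gather depends on the value of that register from the previous loop iteration. The out-of-order scheduler does not know statically that the instruction is fully unmasked, so this false loop-carried dependency prevents the gathers of successive iterations from executing in parallel. Zero the destination register with pxor before each gather to break the dependency. --- libswscale/x86/scale_avx2.asm | 3 +++ 1 file changed, 3 insertions(+)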
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index b4b852d60b..90ee8b0a0e 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -68,8 +68,10 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, .innerloop: %endif vpcmpeqd m13, m13 + pxor m3, m3 ; break loop-carried dependency vpgatherdd m3,[srcmemq + m1], m13 vpcmpeqd m13, m13 + pxor m4, m4 ; break loop-carried dependency vpgatherdd m4,[srcmemq + m2], m13 vpunpcklbw m5, m3, m0 vpunpckhbw m6, m3, m0 @@ -119,6 +121,7 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, .tail_innerloop: %endif vpcmpeqd xm13, xm13 + pxor m3, m3 ; break loop-carried dependency vpgatherdd xm3,[srcmemq + xm1], xm13 vpunpcklbw xm5, xm3, xm0 vpunpckhbw xm6, xm3, xm0 -- 2.50.1.565.gc32cd1483b-goog _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".