This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 2729c529880cd0d3c8d0925003fd33f694ef0870 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sun Jan 25 00:28:53 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Jan 29 11:54:57 2026 +0100 avcodec/x86/hevc/deblock: Reduce usage of GPRs Don't use two GPRs to store two words from xmm registers; shuffle these words so that they fit into one GPR. This reduces the number of GPRs used and leads to tiny speedups here. Also avoid REX prefixes whenever possible (for lines that needed to be modified anyway). Old benchmarks: hevc_h_loop_filter_luma8_skip_c: 23.8 ( 1.00x) hevc_h_loop_filter_luma8_skip_sse2: 8.5 ( 2.80x) hevc_h_loop_filter_luma8_skip_ssse3: 7.2 ( 3.29x) hevc_h_loop_filter_luma8_skip_avx: 6.4 ( 3.71x) hevc_h_loop_filter_luma8_strong_c: 150.4 ( 1.00x) hevc_h_loop_filter_luma8_strong_sse2: 34.4 ( 4.37x) hevc_h_loop_filter_luma8_strong_ssse3: 34.5 ( 4.36x) hevc_h_loop_filter_luma8_strong_avx: 32.3 ( 4.65x) hevc_h_loop_filter_luma8_weak_c: 103.2 ( 1.00x) hevc_h_loop_filter_luma8_weak_sse2: 34.5 ( 2.99x) hevc_h_loop_filter_luma8_weak_ssse3: 7.3 (14.22x) hevc_h_loop_filter_luma8_weak_avx: 32.4 ( 3.18x) hevc_h_loop_filter_luma10_skip_c: 23.5 ( 1.00x) hevc_h_loop_filter_luma10_skip_sse2: 6.6 ( 3.58x) hevc_h_loop_filter_luma10_skip_ssse3: 6.1 ( 3.86x) hevc_h_loop_filter_luma10_skip_avx: 5.4 ( 4.34x) hevc_h_loop_filter_luma10_strong_c: 161.8 ( 1.00x) hevc_h_loop_filter_luma10_strong_sse2: 32.2 ( 5.03x) hevc_h_loop_filter_luma10_strong_ssse3: 30.4 ( 5.33x) hevc_h_loop_filter_luma10_strong_avx: 30.3 ( 5.33x) hevc_h_loop_filter_luma10_weak_c: 23.5 ( 1.00x) hevc_h_loop_filter_luma10_weak_sse2: 6.6 ( 3.58x) hevc_h_loop_filter_luma10_weak_ssse3: 6.1 ( 3.85x) hevc_h_loop_filter_luma10_weak_avx: 5.4 ( 4.35x) hevc_h_loop_filter_luma12_skip_c: 18.8 ( 1.00x) hevc_h_loop_filter_luma12_skip_sse2: 6.6 ( 2.87x) hevc_h_loop_filter_luma12_skip_ssse3: 6.1 ( 3.08x) hevc_h_loop_filter_luma12_skip_avx: 6.2 ( 3.06x) hevc_h_loop_filter_luma12_strong_c: 
159.0 ( 1.00x) hevc_h_loop_filter_luma12_strong_sse2: 36.3 ( 4.38x) hevc_h_loop_filter_luma12_strong_ssse3: 36.1 ( 4.40x) hevc_h_loop_filter_luma12_strong_avx: 33.5 ( 4.75x) hevc_h_loop_filter_luma12_weak_c: 40.1 ( 1.00x) hevc_h_loop_filter_luma12_weak_sse2: 35.5 ( 1.13x) hevc_h_loop_filter_luma12_weak_ssse3: 36.1 ( 1.11x) hevc_h_loop_filter_luma12_weak_avx: 6.2 ( 6.52x) hevc_v_loop_filter_luma8_skip_c: 25.5 ( 1.00x) hevc_v_loop_filter_luma8_skip_sse2: 10.6 ( 2.40x) hevc_v_loop_filter_luma8_skip_ssse3: 11.4 ( 2.24x) hevc_v_loop_filter_luma8_skip_avx: 8.3 ( 3.07x) hevc_v_loop_filter_luma8_strong_c: 146.8 ( 1.00x) hevc_v_loop_filter_luma8_strong_sse2: 43.9 ( 3.35x) hevc_v_loop_filter_luma8_strong_ssse3: 43.7 ( 3.36x) hevc_v_loop_filter_luma8_strong_avx: 42.3 ( 3.47x) hevc_v_loop_filter_luma8_weak_c: 25.5 ( 1.00x) hevc_v_loop_filter_luma8_weak_sse2: 10.6 ( 2.40x) hevc_v_loop_filter_luma8_weak_ssse3: 44.0 ( 0.58x) hevc_v_loop_filter_luma8_weak_avx: 8.3 ( 3.09x) hevc_v_loop_filter_luma10_skip_c: 20.0 ( 1.00x) hevc_v_loop_filter_luma10_skip_sse2: 11.3 ( 1.77x) hevc_v_loop_filter_luma10_skip_ssse3: 11.0 ( 1.82x) hevc_v_loop_filter_luma10_skip_avx: 9.3 ( 2.15x) hevc_v_loop_filter_luma10_strong_c: 193.5 ( 1.00x) hevc_v_loop_filter_luma10_strong_sse2: 46.1 ( 4.19x) hevc_v_loop_filter_luma10_strong_ssse3: 44.2 ( 4.38x) hevc_v_loop_filter_luma10_strong_avx: 44.4 ( 4.35x) hevc_v_loop_filter_luma10_weak_c: 90.3 ( 1.00x) hevc_v_loop_filter_luma10_weak_sse2: 46.3 ( 1.95x) hevc_v_loop_filter_luma10_weak_ssse3: 10.8 ( 8.37x) hevc_v_loop_filter_luma10_weak_avx: 44.4 ( 2.03x) hevc_v_loop_filter_luma12_skip_c: 16.8 ( 1.00x) hevc_v_loop_filter_luma12_skip_sse2: 11.8 ( 1.42x) hevc_v_loop_filter_luma12_skip_ssse3: 11.7 ( 1.43x) hevc_v_loop_filter_luma12_skip_avx: 8.7 ( 1.93x) hevc_v_loop_filter_luma12_strong_c: 159.3 ( 1.00x) hevc_v_loop_filter_luma12_strong_sse2: 45.3 ( 3.52x) hevc_v_loop_filter_luma12_strong_ssse3: 60.3 ( 2.64x) hevc_v_loop_filter_luma12_strong_avx: 44.1 ( 3.61x) 
hevc_v_loop_filter_luma12_weak_c: 63.6 ( 1.00x) hevc_v_loop_filter_luma12_weak_sse2: 45.3 ( 1.40x) hevc_v_loop_filter_luma12_weak_ssse3: 11.7 ( 5.41x) hevc_v_loop_filter_luma12_weak_avx: 43.9 ( 1.45x) New benchmarks: hevc_h_loop_filter_luma8_skip_c: 24.2 ( 1.00x) hevc_h_loop_filter_luma8_skip_sse2: 8.6 ( 2.82x) hevc_h_loop_filter_luma8_skip_ssse3: 7.0 ( 3.46x) hevc_h_loop_filter_luma8_skip_avx: 6.8 ( 3.54x) hevc_h_loop_filter_luma8_strong_c: 150.4 ( 1.00x) hevc_h_loop_filter_luma8_strong_sse2: 33.3 ( 4.52x) hevc_h_loop_filter_luma8_strong_ssse3: 32.7 ( 4.61x) hevc_h_loop_filter_luma8_strong_avx: 32.7 ( 4.60x) hevc_h_loop_filter_luma8_weak_c: 104.0 ( 1.00x) hevc_h_loop_filter_luma8_weak_sse2: 33.2 ( 3.13x) hevc_h_loop_filter_luma8_weak_ssse3: 7.0 (14.91x) hevc_h_loop_filter_luma8_weak_avx: 31.3 ( 3.32x) hevc_h_loop_filter_luma10_skip_c: 19.2 ( 1.00x) hevc_h_loop_filter_luma10_skip_sse2: 6.2 ( 3.08x) hevc_h_loop_filter_luma10_skip_ssse3: 6.2 ( 3.08x) hevc_h_loop_filter_luma10_skip_avx: 5.0 ( 3.85x) hevc_h_loop_filter_luma10_strong_c: 159.8 ( 1.00x) hevc_h_loop_filter_luma10_strong_sse2: 30.0 ( 5.32x) hevc_h_loop_filter_luma10_strong_ssse3: 29.2 ( 5.48x) hevc_h_loop_filter_luma10_strong_avx: 28.6 ( 5.58x) hevc_h_loop_filter_luma10_weak_c: 19.2 ( 1.00x) hevc_h_loop_filter_luma10_weak_sse2: 6.2 ( 3.09x) hevc_h_loop_filter_luma10_weak_ssse3: 6.2 ( 3.09x) hevc_h_loop_filter_luma10_weak_avx: 5.0 ( 3.88x) hevc_h_loop_filter_luma12_skip_c: 18.7 ( 1.00x) hevc_h_loop_filter_luma12_skip_sse2: 6.2 ( 3.00x) hevc_h_loop_filter_luma12_skip_ssse3: 5.7 ( 3.27x) hevc_h_loop_filter_luma12_skip_avx: 5.2 ( 3.61x) hevc_h_loop_filter_luma12_strong_c: 160.2 ( 1.00x) hevc_h_loop_filter_luma12_strong_sse2: 34.2 ( 4.68x) hevc_h_loop_filter_luma12_strong_ssse3: 29.3 ( 5.48x) hevc_h_loop_filter_luma12_strong_avx: 31.4 ( 5.10x) hevc_h_loop_filter_luma12_weak_c: 40.2 ( 1.00x) hevc_h_loop_filter_luma12_weak_sse2: 35.2 ( 1.14x) hevc_h_loop_filter_luma12_weak_ssse3: 29.3 ( 1.37x) 
hevc_h_loop_filter_luma12_weak_avx: 5.0 ( 8.09x) hevc_v_loop_filter_luma8_skip_c: 25.6 ( 1.00x) hevc_v_loop_filter_luma8_skip_sse2: 10.2 ( 2.52x) hevc_v_loop_filter_luma8_skip_ssse3: 10.5 ( 2.45x) hevc_v_loop_filter_luma8_skip_avx: 8.2 ( 3.11x) hevc_v_loop_filter_luma8_strong_c: 147.1 ( 1.00x) hevc_v_loop_filter_luma8_strong_sse2: 42.6 ( 3.45x) hevc_v_loop_filter_luma8_strong_ssse3: 42.4 ( 3.47x) hevc_v_loop_filter_luma8_strong_avx: 40.1 ( 3.67x) hevc_v_loop_filter_luma8_weak_c: 25.6 ( 1.00x) hevc_v_loop_filter_luma8_weak_sse2: 10.6 ( 2.42x) hevc_v_loop_filter_luma8_weak_ssse3: 42.7 ( 0.60x) hevc_v_loop_filter_luma8_weak_avx: 8.2 ( 3.11x) hevc_v_loop_filter_luma10_skip_c: 16.7 ( 1.00x) hevc_v_loop_filter_luma10_skip_sse2: 11.0 ( 1.52x) hevc_v_loop_filter_luma10_skip_ssse3: 10.5 ( 1.59x) hevc_v_loop_filter_luma10_skip_avx: 9.6 ( 1.74x) hevc_v_loop_filter_luma10_strong_c: 190.0 ( 1.00x) hevc_v_loop_filter_luma10_strong_sse2: 44.8 ( 4.24x) hevc_v_loop_filter_luma10_strong_ssse3: 42.3 ( 4.49x) hevc_v_loop_filter_luma10_strong_avx: 42.5 ( 4.47x) hevc_v_loop_filter_luma10_weak_c: 88.3 ( 1.00x) hevc_v_loop_filter_luma10_weak_sse2: 45.7 ( 1.93x) hevc_v_loop_filter_luma10_weak_ssse3: 10.5 ( 8.40x) hevc_v_loop_filter_luma10_weak_avx: 42.4 ( 2.09x) hevc_v_loop_filter_luma12_skip_c: 16.7 ( 1.00x) hevc_v_loop_filter_luma12_skip_sse2: 11.7 ( 1.42x) hevc_v_loop_filter_luma12_skip_ssse3: 10.5 ( 1.59x) hevc_v_loop_filter_luma12_skip_avx: 8.8 ( 1.90x) hevc_v_loop_filter_luma12_strong_c: 159.4 ( 1.00x) hevc_v_loop_filter_luma12_strong_sse2: 45.2 ( 3.53x) hevc_v_loop_filter_luma12_strong_ssse3: 59.3 ( 2.69x) hevc_v_loop_filter_luma12_strong_avx: 41.7 ( 3.82x) hevc_v_loop_filter_luma12_weak_c: 63.3 ( 1.00x) hevc_v_loop_filter_luma12_weak_sse2: 44.9 ( 1.41x) hevc_v_loop_filter_luma12_weak_ssse3: 10.5 ( 6.02x) hevc_v_loop_filter_luma12_weak_avx: 41.7 ( 1.52x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hevc/deblock.asm | 105 
+++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 56 deletions(-) diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm index 9671415c66..d43d95142a 100644 --- a/libavcodec/x86/hevc/deblock.asm +++ b/libavcodec/x86/hevc/deblock.asm @@ -333,8 +333,8 @@ ALIGN 16 ;compare pcmpgtw m15, m13, m14 - movmskps r12, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1) - test r12, r12 + movmskps r10, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1) + test r10d, r10d je .bypassluma ;weak / strong decision compare to beta_2 @@ -345,45 +345,41 @@ ALIGN 16 ;end weak / strong decision ; weak filter nd_p/q calculation - pshufd m8, m10, 0x31 - psrld m8, 16 + psrlq m8, m10, 48 paddw m8, m10 - movd r7d, m8 - pshufd m8, m8, 0x4E - movd r8d, m8 + pshufd m8, m8, 0x8 + movq r7q, m8 - pshufd m8, m11, 0x31 - psrld m8, 16 + psrlq m8, m11, 48 paddw m8, m11 - movd r9d, m8 - pshufd m8, m8, 0x4E - movd r10d, m8 + pshufd m8, m8, 0x8 + movq r8q, m8 ; end calc for weak filter ; filtering mask - mov r11, r12 - shr r11, 3 - movd m15, r11d - and r12, 1 - movd m11, r12d + mov r9, r10 + shr r9d, 3 + movd m15, r9d + and r10d, 1 + movd m11, r10d shufps m11, m15, 0 - shl r11, 1 - or r12, r11 + shl r9d, 1 + or r10d, r9d pcmpeqd m11, [pd_1]; filtering mask ;decide between strong and weak filtering ;tc25 calculations - mov r11d, [tcq]; + mov r9d, [tcq]; %if %1 > 8 - shl r11, %1 - 8 + shl r9d, %1 - 8 %endif - movd m8, r11d; tc0 + movd m8, r9d; tc0 mov r3d, [tcq+4]; %if %1 > 8 - shl r3, %1 - 8 + shl r3d, %1 - 8 %endif - add r11d, r3d; tc0 + tc1 + add r9d, r3d; tc0 + tc1 jz .bypassluma movd m9, r3d; tc1 punpcklwd m8, m8 @@ -408,8 +404,8 @@ ALIGN 16 psraw m13, 3; beta >> 3 pcmpgtw m13, m12; - movmskps r11, m13; - and r6, r11; strong mask , beta_2 and beta_3 comparisons + movmskps r9d, m13; + and r6d, r9d; strong mask , beta_2 and beta_3 comparisons ;----beta_3 comparison end----- ;----tc25 
comparison--- psubw m12, m3, m4; p0 - q0 @@ -419,24 +415,24 @@ ALIGN 16 pshuflw m12, m12, 0xf0 ;0b11110000; pcmpgtw m8, m12; tc25 comparisons - movmskps r11, m8; - and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons + movmskps r9d, m8; + and r6d, r9d; strong mask, beta_2, beta_3 and tc25 comparisons ;----tc25 comparison end--- - mov r11, r6; - shr r11, 1; - and r6, r11; strong mask, bits 2 and 0 + mov r9d, r6d; + shr r9d, 1; + and r6d, r9d; strong mask, bits 2 and 0 pmullw m14, m9, [pw_m2]; -tc * 2 paddw m9, m9 and r6, 5; 0b101 - mov r11, r6; strong mask + mov r9d, r6d; strong mask shr r6, 2; movd m12, r6d; store to xmm for mask generation shl r6, 1 - and r11, 1 - movd m10, r11d; store to xmm for mask generation - or r6, r11; final strong mask, bits 1 and 0 + and r9d, 1 + movd m10, r9d; store to xmm for mask generation + or r6d, r9d; final strong mask, bits 1 and 0 jz .weakfilter shufps m10, m12, 0 @@ -522,21 +518,21 @@ ALIGN 16 .weakfilter: not r6; strong mask -> weak mask - and r6, r12; final weak filtering mask, bits 0 and 1 + and r6d, r10d; final weak filtering mask, bits 0 and 1 jz .store ; weak filtering mask - mov r11, r6 - shr r11, 1 - movd m12, r11d + mov r9, r6 + shr r9d, 1 + movd m12, r9d and r6, 1 movd m11, r6d shufps m11, m12, 0 pcmpeqd m11, [pd_1]; filtering mask - mov r12, betaq - shr r12, 1; - add betaq, r12 + mov r10d, betad + shr r10d, 1; + add betad, r10d shr betaq, 3; ((beta + (beta >> 1)) >> 3)) psubw m12, m4, m3 ; q0 - p0 @@ -605,11 +601,9 @@ ALIGN 16 movd m10, betad SPLATW m10, m10, 0 - movd m13, r7d; 1dp0 + 1dp3 - movd m8, r8d; 0dp0 + 0dp3 - punpcklwd m8, m8 + movq m13, r7q; 1dp0 + 1dp3, 0dp0 + 0dp3 punpcklwd m13, m13 - shufps m13, m8, 0; + pshufd m13, m13, 10100000b pcmpgtw m8, m10, m13 pand m8, m11 ;end beta calculations @@ -623,11 +617,10 @@ ALIGN 16 pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2) paddw m8, m5; q1' - movd m13, r9d; - movd m15, r10d; - punpcklwd m15, m15 + movq m13, r8q; punpcklwd m13, m13 - shufps m13, m15, 0; dq0 
+ dq3 + movhlps m15, m13 + pshufd m13, m13, 10100000b pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3) pand m10, m11 @@ -742,7 +735,7 @@ LOOP_FILTER_CHROMA ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta, ; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q); ;----------------------------------------------------------------------------- -cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride +cglobal hevc_v_loop_filter_luma_8, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride sub pixq, 4 lea pix0q, [3 * r1] mov src3strideq, pixq @@ -754,7 +747,7 @@ cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3s .bypassluma: RET -cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride +cglobal hevc_v_loop_filter_luma_10, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride sub pixq, 8 lea pix0q, [3 * strideq] mov src3strideq, pixq @@ -766,7 +759,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3 .bypassluma: RET -cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride +cglobal hevc_v_loop_filter_luma_12, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride sub pixq, 8 lea pix0q, [3 * strideq] mov src3strideq, pixq @@ -782,7 +775,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta, ; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q); ;----------------------------------------------------------------------------- -cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride +cglobal hevc_h_loop_filter_luma_8, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride lea src3strideq, [3 * strideq] mov pix0q, pixq sub pix0q, src3strideq @@ -818,7 +811,7 @@ cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3s .bypassluma: RET -cglobal 
hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride +cglobal hevc_h_loop_filter_luma_10, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride lea src3strideq, [3 * strideq] mov pix0q, pixq sub pix0q, src3strideq @@ -849,7 +842,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3 .bypassluma: RET -cglobal hevc_h_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride +cglobal hevc_h_loop_filter_luma_12, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride lea src3strideq, [3 * strideq] mov pix0q, pixq sub pix0q, src3strideq _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
