PR #21582 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21582
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21582.patch
Also reduce the number of GPRs used.

From 4580374e4ad922924a485cc7586f625eaca25482 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 21 Jan 2026 12:50:11 +0100
Subject: [PATCH 1/4] avcodec/x86/vp9lpf: Avoid vmovdqa

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp9lpf.asm | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 4e7ede2235..92e5d03ad7 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -818,8 +818,7 @@ cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride
 
     ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
     ; filter4()
-    mova m4, m2
-    paddsb m2, m4 ; 2 * (q0 - p0)
+    paddsb m4, m2, m2 ; 2 * (q0 - p0)
     paddsb m2, m4 ; 3 * (q0 - p0)
     paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127)
     paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127)
-- 
2.52.0

From f437bab6ccc58d4f2ae7fb1eef380191ebbcda8c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 21 Jan 2026 13:56:05 +0100
Subject: [PATCH 2/4] avcodec/x86/hevc/deblock: Avoid vmovdqa

(It would even be possible to avoid clobbering m10 in MASKED_COPY and
the mask register (%3) in MASKED_COPY2 when VEX encoding is in use.)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/hevc/deblock.asm | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index 61b79f8079..fbb12322e3 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -257,8 +257,7 @@ INIT_XMM sse2
 %macro MASKED_COPY 2
     pand %2, m11 ; and mask
     pandn m10, m11, %1; and -mask
-    por %2, m10
-    mova %1, %2
+    por %1, %2, m10
 %endmacro
 
 ; in: %2 clobbered
@@ -267,8 +266,7 @@ INIT_XMM sse2
 %macro MASKED_COPY2 3
     pand %2, %3 ; and mask
     pandn %3, %1; and -mask
-    por %2, %3
-    mova %1, %2
+    por %1, %2, %3
 %endmacro
 
 ALIGN 16
-- 
2.52.0
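A note on the mova removals in the two patches above (my reading of
x86inc, not part of the patches themselves): x86inc pseudo-instructions
accept a three-operand form; assembled for AVX it becomes the
non-destructive VEX encoding, while for plain SSE2 x86inc synthesizes the
register copy itself. The change is therefore a no-op for the SSE2 path
and removes the vmovdqa only where VEX encoding makes it redundant. A
minimal sketch in plain NASM:

    ; SSE2: the destination is also the first source, so keeping
    ; the original value requires an explicit register copy
    movdqa  xmm4, xmm2          ; xmm4 = q0 - p0
    paddsb  xmm4, xmm2          ; xmm4 = 2 * (q0 - p0)

    ; AVX: non-destructive three-operand form, no copy needed
    vpaddsb xmm4, xmm2, xmm2    ; xmm4 = 2 * (q0 - p0)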
From 37cade3edf22ece324c8784f94637943a255b30f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 21 Jan 2026 14:31:14 +0100
Subject: [PATCH 3/4] avcodec/x86/hevc/deblock: avoid unused GPR

r12 is unused, so use it instead of r13 to reduce the number of
push/pops.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/hevc/deblock.asm | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index fbb12322e3..9671415c66 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -333,8 +333,8 @@ ALIGN 16
 
     ;compare
     pcmpgtw m15, m13, m14
-    movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
-    test r13, r13
+    movmskps r12, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
+    test r12, r12
     je .bypassluma
 
     ;weak / strong decision compare to beta_2
@@ -361,14 +361,14 @@ ALIGN 16
     ; end calc for weak filter
 
     ; filtering mask
-    mov r11, r13
+    mov r11, r12
     shr r11, 3
     movd m15, r11d
-    and r13, 1
-    movd m11, r13d
+    and r12, 1
+    movd m11, r12d
     shufps m11, m15, 0
     shl r11, 1
-    or r13, r11
+    or r12, r11
 
     pcmpeqd m11, [pd_1]; filtering mask
 
@@ -522,7 +522,7 @@ ALIGN 16
 
 .weakfilter:
     not r6; strong mask -> weak mask
-    and r6, r13; final weak filtering mask, bits 0 and 1
+    and r6, r12; final weak filtering mask, bits 0 and 1
     jz .store
 
     ; weak filtering mask
@@ -534,9 +534,9 @@ ALIGN 16
     shufps m11, m12, 0
     pcmpeqd m11, [pd_1]; filtering mask
 
-    mov r13, betaq
-    shr r13, 1;
-    add betaq, r13
+    mov r12, betaq
+    shr r12, 1;
+    add betaq, r12
     shr betaq, 3; ((beta + (beta >> 1)) >> 3))
 
     psubw m12, m4, m3 ; q0 - p0
@@ -742,7 +742,7 @@ LOOP_FILTER_CHROMA
 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
 ;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
     sub pixq, 4
     lea pix0q, [3 * r1]
     mov src3strideq, pixq
@@ -754,7 +754,7 @@ cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3s
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
     sub pixq, 8
     lea pix0q, [3 * strideq]
     mov src3strideq, pixq
@@ -766,7 +766,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
     sub pixq, 8
     lea pix0q, [3 * strideq]
     mov src3strideq, pixq
@@ -782,7 +782,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3
 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
 ;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
     lea src3strideq, [3 * strideq]
     mov pix0q, pixq
     sub pix0q, src3strideq
@@ -818,7 +818,7 @@ cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3s
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
     lea src3strideq, [3 * strideq]
     mov pix0q, pixq
     sub pix0q, src3strideq
@@ -849,7 +849,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
     lea src3strideq, [3 * strideq]
     mov pix0q, pixq
     sub pix0q, src3strideq
-- 
2.52.0
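For reference, where the push/pop saving in patch 3 comes from (a sketch
under my assumptions about x86inc and the SysV x86-64 ABI, not part of
the patch): the second numeric argument of cglobal is the number of GPRs
the function claims, and logical registers beyond the volatile/argument
ones map to callee-saved physical registers that the prologue has to
preserve. Roughly:

    ; cglobal ..., 4, 14, 16, ...  expands to a prologue/epilogue like:
    push rbx
    push rbp
    push r12
    push r13
    push r14    ; needed only because a 14th logical GPR was claimed
    ; ... function body ...
    pop  r14
    pop  r13
    pop  r12
    pop  rbp
    pop  rbx
    ret

Claiming 13 GPRs instead of 14 thus removes one push/pop pair from every
invocation.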
From e24fedee8d141487c11a18cf4a955ff0c0433579 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 25 Jan 2026 00:28:53 +0100
Subject: [PATCH 4/4] avcodec/x86/hevc/deblock: Reduce usage of GPRs

Don't use two GPRs to store two words from xmm registers; shuffle these
words so that they fit into one GPR. This reduces the number of GPRs
used and leads to tiny speedups here.

Also avoid REX prefixes whenever possible (for lines that needed to be
modified anyway).

Old benchmarks:
hevc_h_loop_filter_luma8_skip_c: 23.8 ( 1.00x)
hevc_h_loop_filter_luma8_skip_sse2: 8.5 ( 2.80x)
hevc_h_loop_filter_luma8_skip_ssse3: 7.2 ( 3.29x)
hevc_h_loop_filter_luma8_skip_avx: 6.4 ( 3.71x)
hevc_h_loop_filter_luma8_strong_c: 150.4 ( 1.00x)
hevc_h_loop_filter_luma8_strong_sse2: 34.4 ( 4.37x)
hevc_h_loop_filter_luma8_strong_ssse3: 34.5 ( 4.36x)
hevc_h_loop_filter_luma8_strong_avx: 32.3 ( 4.65x)
hevc_h_loop_filter_luma8_weak_c: 103.2 ( 1.00x)
hevc_h_loop_filter_luma8_weak_sse2: 34.5 ( 2.99x)
hevc_h_loop_filter_luma8_weak_ssse3: 7.3 (14.22x)
hevc_h_loop_filter_luma8_weak_avx: 32.4 ( 3.18x)
hevc_h_loop_filter_luma10_skip_c: 23.5 ( 1.00x)
hevc_h_loop_filter_luma10_skip_sse2: 6.6 ( 3.58x)
hevc_h_loop_filter_luma10_skip_ssse3: 6.1 ( 3.86x)
hevc_h_loop_filter_luma10_skip_avx: 5.4 ( 4.34x)
hevc_h_loop_filter_luma10_strong_c: 161.8 ( 1.00x)
hevc_h_loop_filter_luma10_strong_sse2: 32.2 ( 5.03x)
hevc_h_loop_filter_luma10_strong_ssse3: 30.4 ( 5.33x)
hevc_h_loop_filter_luma10_strong_avx: 30.3 ( 5.33x)
hevc_h_loop_filter_luma10_weak_c: 23.5 ( 1.00x)
hevc_h_loop_filter_luma10_weak_sse2: 6.6 ( 3.58x)
hevc_h_loop_filter_luma10_weak_ssse3: 6.1 ( 3.85x)
hevc_h_loop_filter_luma10_weak_avx: 5.4 ( 4.35x)
hevc_h_loop_filter_luma12_skip_c: 18.8 ( 1.00x)
hevc_h_loop_filter_luma12_skip_sse2: 6.6 ( 2.87x)
hevc_h_loop_filter_luma12_skip_ssse3: 6.1 ( 3.08x)
hevc_h_loop_filter_luma12_skip_avx: 6.2 ( 3.06x)
hevc_h_loop_filter_luma12_strong_c: 159.0 ( 1.00x)
hevc_h_loop_filter_luma12_strong_sse2: 36.3 ( 4.38x)
hevc_h_loop_filter_luma12_strong_ssse3: 36.1 ( 4.40x)
hevc_h_loop_filter_luma12_strong_avx: 33.5 ( 4.75x)
hevc_h_loop_filter_luma12_weak_c: 40.1 ( 1.00x)
hevc_h_loop_filter_luma12_weak_sse2: 35.5 ( 1.13x)
hevc_h_loop_filter_luma12_weak_ssse3: 36.1 ( 1.11x)
hevc_h_loop_filter_luma12_weak_avx: 6.2 ( 6.52x)
hevc_v_loop_filter_luma8_skip_c: 25.5 ( 1.00x)
hevc_v_loop_filter_luma8_skip_sse2: 10.6 ( 2.40x)
hevc_v_loop_filter_luma8_skip_ssse3: 11.4 ( 2.24x)
hevc_v_loop_filter_luma8_skip_avx: 8.3 ( 3.07x)
hevc_v_loop_filter_luma8_strong_c: 146.8 ( 1.00x)
hevc_v_loop_filter_luma8_strong_sse2: 43.9 ( 3.35x)
hevc_v_loop_filter_luma8_strong_ssse3: 43.7 ( 3.36x)
hevc_v_loop_filter_luma8_strong_avx: 42.3 ( 3.47x)
hevc_v_loop_filter_luma8_weak_c: 25.5 ( 1.00x)
hevc_v_loop_filter_luma8_weak_sse2: 10.6 ( 2.40x)
hevc_v_loop_filter_luma8_weak_ssse3: 44.0 ( 0.58x)
hevc_v_loop_filter_luma8_weak_avx: 8.3 ( 3.09x)
hevc_v_loop_filter_luma10_skip_c: 20.0 ( 1.00x)
hevc_v_loop_filter_luma10_skip_sse2: 11.3 ( 1.77x)
hevc_v_loop_filter_luma10_skip_ssse3: 11.0 ( 1.82x)
hevc_v_loop_filter_luma10_skip_avx: 9.3 ( 2.15x)
hevc_v_loop_filter_luma10_strong_c: 193.5 ( 1.00x)
hevc_v_loop_filter_luma10_strong_sse2: 46.1 ( 4.19x)
hevc_v_loop_filter_luma10_strong_ssse3: 44.2 ( 4.38x)
hevc_v_loop_filter_luma10_strong_avx: 44.4 ( 4.35x)
hevc_v_loop_filter_luma10_weak_c: 90.3 ( 1.00x)
hevc_v_loop_filter_luma10_weak_sse2: 46.3 ( 1.95x)
hevc_v_loop_filter_luma10_weak_ssse3: 10.8 ( 8.37x)
hevc_v_loop_filter_luma10_weak_avx: 44.4 ( 2.03x)
hevc_v_loop_filter_luma12_skip_c: 16.8 ( 1.00x)
hevc_v_loop_filter_luma12_skip_sse2: 11.8 ( 1.42x)
hevc_v_loop_filter_luma12_skip_ssse3: 11.7 ( 1.43x)
hevc_v_loop_filter_luma12_skip_avx: 8.7 ( 1.93x)
hevc_v_loop_filter_luma12_strong_c: 159.3 ( 1.00x)
hevc_v_loop_filter_luma12_strong_sse2: 45.3 ( 3.52x)
hevc_v_loop_filter_luma12_strong_ssse3: 60.3 ( 2.64x)
hevc_v_loop_filter_luma12_strong_avx: 44.1 ( 3.61x)
hevc_v_loop_filter_luma12_weak_c: 63.6 ( 1.00x)
hevc_v_loop_filter_luma12_weak_sse2: 45.3 ( 1.40x)
hevc_v_loop_filter_luma12_weak_ssse3: 11.7 ( 5.41x)
hevc_v_loop_filter_luma12_weak_avx: 43.9 ( 1.45x)

New benchmarks:
hevc_h_loop_filter_luma8_skip_c: 24.2 ( 1.00x)
hevc_h_loop_filter_luma8_skip_sse2: 8.6 ( 2.82x)
hevc_h_loop_filter_luma8_skip_ssse3: 7.0 ( 3.46x)
hevc_h_loop_filter_luma8_skip_avx: 6.8 ( 3.54x)
hevc_h_loop_filter_luma8_strong_c: 150.4 ( 1.00x)
hevc_h_loop_filter_luma8_strong_sse2: 33.3 ( 4.52x)
hevc_h_loop_filter_luma8_strong_ssse3: 32.7 ( 4.61x)
hevc_h_loop_filter_luma8_strong_avx: 32.7 ( 4.60x)
hevc_h_loop_filter_luma8_weak_c: 104.0 ( 1.00x)
hevc_h_loop_filter_luma8_weak_sse2: 33.2 ( 3.13x)
hevc_h_loop_filter_luma8_weak_ssse3: 7.0 (14.91x)
hevc_h_loop_filter_luma8_weak_avx: 31.3 ( 3.32x)
hevc_h_loop_filter_luma10_skip_c: 19.2 ( 1.00x)
hevc_h_loop_filter_luma10_skip_sse2: 6.2 ( 3.08x)
hevc_h_loop_filter_luma10_skip_ssse3: 6.2 ( 3.08x)
hevc_h_loop_filter_luma10_skip_avx: 5.0 ( 3.85x)
hevc_h_loop_filter_luma10_strong_c: 159.8 ( 1.00x)
hevc_h_loop_filter_luma10_strong_sse2: 30.0 ( 5.32x)
hevc_h_loop_filter_luma10_strong_ssse3: 29.2 ( 5.48x)
hevc_h_loop_filter_luma10_strong_avx: 28.6 ( 5.58x)
hevc_h_loop_filter_luma10_weak_c: 19.2 ( 1.00x)
hevc_h_loop_filter_luma10_weak_sse2: 6.2 ( 3.09x)
hevc_h_loop_filter_luma10_weak_ssse3: 6.2 ( 3.09x)
hevc_h_loop_filter_luma10_weak_avx: 5.0 ( 3.88x)
hevc_h_loop_filter_luma12_skip_c: 18.7 ( 1.00x)
hevc_h_loop_filter_luma12_skip_sse2: 6.2 ( 3.00x)
hevc_h_loop_filter_luma12_skip_ssse3: 5.7 ( 3.27x)
hevc_h_loop_filter_luma12_skip_avx: 5.2 ( 3.61x)
hevc_h_loop_filter_luma12_strong_c: 160.2 ( 1.00x)
hevc_h_loop_filter_luma12_strong_sse2: 34.2 ( 4.68x)
hevc_h_loop_filter_luma12_strong_ssse3: 29.3 ( 5.48x)
hevc_h_loop_filter_luma12_strong_avx: 31.4 ( 5.10x)
hevc_h_loop_filter_luma12_weak_c: 40.2 ( 1.00x)
hevc_h_loop_filter_luma12_weak_sse2: 35.2 ( 1.14x)
hevc_h_loop_filter_luma12_weak_ssse3: 29.3 ( 1.37x)
hevc_h_loop_filter_luma12_weak_avx: 5.0 ( 8.09x)
hevc_v_loop_filter_luma8_skip_c: 25.6 ( 1.00x)
hevc_v_loop_filter_luma8_skip_sse2: 10.2 ( 2.52x)
hevc_v_loop_filter_luma8_skip_ssse3: 10.5 ( 2.45x)
hevc_v_loop_filter_luma8_skip_avx: 8.2 ( 3.11x)
hevc_v_loop_filter_luma8_strong_c: 147.1 ( 1.00x)
hevc_v_loop_filter_luma8_strong_sse2: 42.6 ( 3.45x)
hevc_v_loop_filter_luma8_strong_ssse3: 42.4 ( 3.47x)
hevc_v_loop_filter_luma8_strong_avx: 40.1 ( 3.67x)
hevc_v_loop_filter_luma8_weak_c: 25.6 ( 1.00x)
hevc_v_loop_filter_luma8_weak_sse2: 10.6 ( 2.42x)
hevc_v_loop_filter_luma8_weak_ssse3: 42.7 ( 0.60x)
hevc_v_loop_filter_luma8_weak_avx: 8.2 ( 3.11x)
hevc_v_loop_filter_luma10_skip_c: 16.7 ( 1.00x)
hevc_v_loop_filter_luma10_skip_sse2: 11.0 ( 1.52x)
hevc_v_loop_filter_luma10_skip_ssse3: 10.5 ( 1.59x)
hevc_v_loop_filter_luma10_skip_avx: 9.6 ( 1.74x)
hevc_v_loop_filter_luma10_strong_c: 190.0 ( 1.00x)
hevc_v_loop_filter_luma10_strong_sse2: 44.8 ( 4.24x)
hevc_v_loop_filter_luma10_strong_ssse3: 42.3 ( 4.49x)
hevc_v_loop_filter_luma10_strong_avx: 42.5 ( 4.47x)
hevc_v_loop_filter_luma10_weak_c: 88.3 ( 1.00x)
hevc_v_loop_filter_luma10_weak_sse2: 45.7 ( 1.93x)
hevc_v_loop_filter_luma10_weak_ssse3: 10.5 ( 8.40x)
hevc_v_loop_filter_luma10_weak_avx: 42.4 ( 2.09x)
hevc_v_loop_filter_luma12_skip_c: 16.7 ( 1.00x)
hevc_v_loop_filter_luma12_skip_sse2: 11.7 ( 1.42x)
hevc_v_loop_filter_luma12_skip_ssse3: 10.5 ( 1.59x)
hevc_v_loop_filter_luma12_skip_avx: 8.8 ( 1.90x)
hevc_v_loop_filter_luma12_strong_c: 159.4 ( 1.00x)
hevc_v_loop_filter_luma12_strong_sse2: 45.2 ( 3.53x)
hevc_v_loop_filter_luma12_strong_ssse3: 59.3 ( 2.69x)
hevc_v_loop_filter_luma12_strong_avx: 41.7 ( 3.82x)
hevc_v_loop_filter_luma12_weak_c: 63.3 ( 1.00x)
hevc_v_loop_filter_luma12_weak_sse2: 44.9 ( 1.41x)
hevc_v_loop_filter_luma12_weak_ssse3: 10.5 ( 6.02x)
hevc_v_loop_filter_luma12_weak_avx: 41.7 ( 1.52x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/hevc/deblock.asm | 105 +++++++++++++++-----------------
 1 file changed, 49 insertions(+), 56 deletions(-)

diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index 9671415c66..d43d95142a 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -333,8 +333,8 @@ ALIGN 16
 
     ;compare
     pcmpgtw m15, m13, m14
-    movmskps r12, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
-    test r12, r12
+    movmskps r10, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
+    test r10d, r10d
     je .bypassluma
 
     ;weak / strong decision compare to beta_2
@@ -345,45 +345,41 @@ ALIGN 16
     ;end weak / strong decision
 
     ; weak filter nd_p/q calculation
-    pshufd m8, m10, 0x31
-    psrld m8, 16
+    psrlq m8, m10, 48
     paddw m8, m10
-    movd r7d, m8
-    pshufd m8, m8, 0x4E
-    movd r8d, m8
+    pshufd m8, m8, 0x8
+    movq r7q, m8
 
-    pshufd m8, m11, 0x31
-    psrld m8, 16
+    psrlq m8, m11, 48
     paddw m8, m11
-    movd r9d, m8
-    pshufd m8, m8, 0x4E
-    movd r10d, m8
+    pshufd m8, m8, 0x8
+    movq r8q, m8
     ; end calc for weak filter
 
     ; filtering mask
-    mov r11, r12
-    shr r11, 3
-    movd m15, r11d
-    and r12, 1
-    movd m11, r12d
+    mov r9, r10
+    shr r9d, 3
+    movd m15, r9d
+    and r10d, 1
+    movd m11, r10d
     shufps m11, m15, 0
-    shl r11, 1
-    or r12, r11
+    shl r9d, 1
+    or r10d, r9d
 
     pcmpeqd m11, [pd_1]; filtering mask
 
     ;decide between strong and weak filtering
     ;tc25 calculations
-    mov r11d, [tcq];
+    mov r9d, [tcq];
 %if %1 > 8
-    shl r11, %1 - 8
+    shl r9d, %1 - 8
 %endif
-    movd m8, r11d; tc0
+    movd m8, r9d; tc0
     mov r3d, [tcq+4];
 %if %1 > 8
-    shl r3, %1 - 8
+    shl r3d, %1 - 8
 %endif
-    add r11d, r3d; tc0 + tc1
+    add r9d, r3d; tc0 + tc1
     jz .bypassluma
     movd m9, r3d; tc1
     punpcklwd m8, m8
@@ -408,8 +404,8 @@ ALIGN 16
 
     psraw m13, 3; beta >> 3
     pcmpgtw m13, m12;
-    movmskps r11, m13;
-    and r6, r11; strong mask , beta_2 and beta_3 comparisons
+    movmskps r9d, m13;
+    and r6d, r9d; strong mask , beta_2 and beta_3 comparisons
     ;----beta_3 comparison end-----
     ;----tc25 comparison---
     psubw m12, m3, m4; p0 - q0
@@ -419,24 +415,24 @@ ALIGN 16
 
     pshuflw m12, m12, 0xf0 ;0b11110000;
     pcmpgtw m8, m12; tc25 comparisons
-    movmskps r11, m8;
-    and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+    movmskps r9d, m8;
+    and r6d, r9d; strong mask, beta_2, beta_3 and tc25 comparisons
     ;----tc25 comparison end---
 
-    mov r11, r6;
-    shr r11, 1;
-    and r6, r11; strong mask, bits 2 and 0
+    mov r9d, r6d;
+    shr r9d, 1;
+    and r6d, r9d; strong mask, bits 2 and 0
 
     pmullw m14, m9, [pw_m2]; -tc * 2
     paddw m9, m9
 
     and r6, 5; 0b101
-    mov r11, r6; strong mask
+    mov r9d, r6d; strong mask
     shr r6, 2;
     movd m12, r6d; store to xmm for mask generation
     shl r6, 1
-    and r11, 1
-    movd m10, r11d; store to xmm for mask generation
-    or r6, r11; final strong mask, bits 1 and 0
+    and r9d, 1
+    movd m10, r9d; store to xmm for mask generation
+    or r6d, r9d; final strong mask, bits 1 and 0
     jz .weakfilter
     shufps m10, m12, 0
@@ -522,21 +518,21 @@ ALIGN 16
 
 .weakfilter:
     not r6; strong mask -> weak mask
-    and r6, r12; final weak filtering mask, bits 0 and 1
+    and r6d, r10d; final weak filtering mask, bits 0 and 1
     jz .store
 
     ; weak filtering mask
-    mov r11, r6
-    shr r11, 1
-    movd m12, r11d
+    mov r9, r6
+    shr r9d, 1
+    movd m12, r9d
     and r6, 1
     movd m11, r6d
     shufps m11, m12, 0
     pcmpeqd m11, [pd_1]; filtering mask
 
-    mov r12, betaq
-    shr r12, 1;
-    add betaq, r12
+    mov r10d, betad
+    shr r10d, 1;
+    add betad, r10d
     shr betaq, 3; ((beta + (beta >> 1)) >> 3))
 
     psubw m12, m4, m3 ; q0 - p0
@@ -605,11 +601,9 @@ ALIGN 16
 
     movd m10, betad
     SPLATW m10, m10, 0
-    movd m13, r7d; 1dp0 + 1dp3
-    movd m8, r8d; 0dp0 + 0dp3
-    punpcklwd m8, m8
+    movq m13, r7q; 1dp0 + 1dp3, 0dp0 + 0dp3
     punpcklwd m13, m13
-    shufps m13, m8, 0;
+    pshufd m13, m13, 10100000b
     pcmpgtw m8, m10, m13
     pand m8, m11
     ;end beta calculations
@@ -623,11 +617,10 @@ ALIGN 16
 
     pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2)
     paddw m8, m5; q1'
-    movd m13, r9d;
-    movd m15, r10d;
-    punpcklwd m15, m15
+    movq m13, r8q;
     punpcklwd m13, m13
-    shufps m13, m15, 0; dq0 + dq3
+    movhlps m15, m13
+    pshufd m13, m13, 10100000b
     pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3)
     pand m10, m11
 
@@ -742,7 +735,7 @@ LOOP_FILTER_CHROMA
 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
 ;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_8, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
     sub pixq, 4
     lea pix0q, [3 * r1]
     mov src3strideq, pixq
@@ -754,7 +747,7 @@ cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3s
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_10, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
    sub pixq, 8
     lea pix0q, [3 * strideq]
     mov src3strideq, pixq
@@ -766,7 +759,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_12, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
     sub pixq, 8
     lea pix0q, [3 * strideq]
     mov src3strideq, pixq
@@ -782,7 +775,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3
 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
 ;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
     lea src3strideq, [3 * strideq]
     mov pix0q, pixq
     sub pix0q, src3strideq
@@ -818,7 +811,7 @@ cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3s
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
     lea src3strideq, [3 * strideq]
     mov pix0q, pixq
     sub pix0q, src3strideq
@@ -849,7 +842,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_12, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
     lea src3strideq, [3 * strideq]
     mov pix0q, pixq
     sub pix0q, src3strideq
-- 
2.52.0
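The central trick of patch 4, spelled out (an illustrative sketch, not
part of the patch): instead of extracting two 16-bit sums into two GPRs
with two movd transfers plus a shuffle, the two dwords holding them are
compacted into the low quadword first, so a single 64-bit movq suffices:

    ; before: two transfers into two GPRs
    movd   r7d,  xmm8          ; dword 0
    pshufd xmm8, xmm8, 0x4E    ; 0b01001110: swap the two qwords
    movd   r8d,  xmm8          ; former dword 2

    ; after: one transfer into one GPR
    pshufd xmm8, xmm8, 0x08    ; 0b00001000: dwords {0,2} -> {0,1}
    movq   r7,   xmm8          ; both values packed into one register

As for the REX remark: with 32-bit operand size the legacy registers
(e.g. betad, i.e. edx on SysV x86-64) need no REX prefix byte at all,
whereas r8d-r15d still require one, so the size saving applies only to
the former.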
_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
