This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 19e377b4b967c73b210c36dcac184f1abdab0326 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sun Apr 12 17:53:41 2026 +0200 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sun Jun 14 22:04:42 2026 +0200 avcodec/x86/hpeldsp: Port mmxext functions to SSE2 The only noticeable changes in benchmarks are for the x2 horizontal no_rnd case where SSE2 and movhps are beneficial: Old benchmarks: avg_pixels_tab[1][1]_c: 42.2 ( 1.00x) avg_pixels_tab[1][1]_mmxext: 10.8 ( 3.89x) avg_pixels_tab[1][2]_c: 18.0 ( 1.00x) avg_pixels_tab[1][2]_mmxext: 6.1 ( 2.96x) put_no_rnd_pixels_tab[1][1]_c: 29.7 ( 1.00x) put_no_rnd_pixels_tab[1][1]_mmxext: 12.3 ( 2.41x) put_no_rnd_pixels_tab[1][2]_c: 20.4 ( 1.00x) put_no_rnd_pixels_tab[1][2]_mmxext: 12.2 ( 1.67x) put_pixels_tab[1][1]_c: 29.9 ( 1.00x) put_pixels_tab[1][1]_mmxext: 7.6 ( 3.92x) put_pixels_tab[1][2]_c: 16.8 ( 1.00x) put_pixels_tab[1][2]_mmxext: 6.4 ( 2.63x) New benchmarks: avg_pixels_tab[1][1]_c: 42.3 ( 1.00x) avg_pixels_tab[1][1]_sse2: 10.7 ( 3.95x) avg_pixels_tab[1][2]_c: 17.8 ( 1.00x) avg_pixels_tab[1][2]_sse2: 6.3 ( 2.83x) put_no_rnd_pixels_tab[1][1]_c: 29.6 ( 1.00x) put_no_rnd_pixels_tab[1][1]_sse2: 10.5 ( 2.81x) put_no_rnd_pixels_tab[1][2]_c: 20.4 ( 1.00x) put_no_rnd_pixels_tab[1][2]_sse2: 12.3 ( 1.67x) put_pixels_tab[1][1]_c: 30.1 ( 1.00x) put_pixels_tab[1][1]_sse2: 7.6 ( 3.93x) put_pixels_tab[1][2]_c: 16.8 ( 1.00x) put_pixels_tab[1][2]_sse2: 6.4 ( 2.64x) Switching to SSE2 unfortunately increased codesize of the relevant functions by 160B. This makes these functions ABI compatible, i.e. they no longer rely on others calling emms_c to fix the fpu state. It also implies that many mpegvideo decoders (the exceptions are MPEG-4, RV30, RV40 and the VC-1 family) now no longer use any mmx registers at all. So one can remove the emms_c from the MPEG-1/2 decoder. The same is true for VP3. 
Reviewed-by: James Almer <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/mpeg12dec.c | 3 - libavcodec/vp3.c | 2 - libavcodec/x86/hpeldsp.asm | 279 +++++++++++++++++++++++++----------------- libavcodec/x86/hpeldsp_init.c | 67 ++++------ tests/checkasm/hpeldsp.c | 2 +- 5 files changed, 197 insertions(+), 156 deletions(-) diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c index 4c83bcfa90..ce3066e4a0 100644 --- a/libavcodec/mpeg12dec.c +++ b/libavcodec/mpeg12dec.c @@ -32,7 +32,6 @@ #include <stdatomic.h> #include "libavutil/attributes.h" -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/internal.h" #include "libavutil/mem_internal.h" @@ -1651,7 +1650,6 @@ static int slice_decode_thread(AVCodecContext *c, void *arg) int ret; ret = mpeg_decode_slice(s, mb_y, &buf, end - buf); - emms_c(); ff_dlog(c, "ret:%d resync:%d/%d mb:%d/%d ts:%d/%d ec:%d\n", ret, s->c.resync_mb_x, s->c.resync_mb_y, s->c.mb_x, s->c.mb_y, s->c.start_mb_y, s->c.end_mb_y, s->c.er.error_count); @@ -2532,7 +2530,6 @@ static int decode_chunks(AVCodecContext *avctx, AVFrame *picture, buf_ptr += 2; // FIXME add minimum number of bytes per slice } else { ret = mpeg_decode_slice(&s->slice, mb_y, &buf_ptr, input_size); - emms_c(); if (ret < 0) { if (avctx->err_recognition & AV_EF_EXPLODE) diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c index 7789252f11..ca468b047c 100644 --- a/libavcodec/vp3.c +++ b/libavcodec/vp3.c @@ -36,7 +36,6 @@ #include <string.h> #include "libavutil/attributes.h" -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" @@ -1924,7 +1923,6 @@ static void vp3_draw_horiz_band(Vp3DecodeContext *s, int y) for (int i = 3; i < AV_NUM_DATA_POINTERS; i++) offset[i] = 0; - emms_c(); s->avctx->draw_horiz_band(s->avctx, s->current_frame.f, offset, y, 3, h); } diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 
8e29a232d7..f79d40a84b 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -34,138 +34,183 @@ cextern pw_8192 SECTION .text ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PIXELS_X2 2 +%macro PIXELS_X2 4 cglobal %1_pixels%2_x2, 4,5,4 lea r4, [r2*2] .loop: - movu m0, [r1+1] - movu m1, [r1+r2+1] -%if cpuflag(sse2) - movu m2, [r1] - movu m3, [r1+r2] - pavgb m0, m2 - pavgb m1, m3 -%else + mov%3 m0, [r1+1] + mov%3 m1, [r1+r2+1] +%if %2 == mmsize && avx_enabled pavgb m0, [r1] pavgb m1, [r1+r2] +%else + mov%3 m2, [r1] + mov%3 m3, [r1+r2] + pavgb m0, m2 + pavgb m1, m3 %endif add r1, r4 %ifidn %1,avg +%if %2 == mmsize pavgb m0, [r0] pavgb m1, [r0+r2] -%endif - mova [r0], m0 - mova [r0+r2], m1 - add r0, r4 - movu m0, [r1+1] - movu m1, [r1+r2+1] -%if cpuflag(sse2) - movu m2, [r1] - movu m3, [r1+r2] +%else + mov%4 m2, [r0] + mov%4 m3, [r0+r2] pavgb m0, m2 pavgb m1, m3 -%else +%endif +%endif + mov%4 [r0], m0 + mov%4 [r0+r2], m1 + add r0, r4 + mov%3 m0, [r1+1] + mov%3 m1, [r1+r2+1] +%if %2 == mmsize && avx_enabled pavgb m0, [r1] pavgb m1, [r1+r2] +%else + mov%3 m2, [r1] + mov%3 m3, [r1+r2] + pavgb m0, m2 + pavgb m1, m3 %endif add r1, r4 %ifidn %1,avg +%if %2 == mmsize pavgb m0, [r0] pavgb m1, [r0+r2] +%else + mov%4 m2, [r0] + mov%4 m3, [r0+r2] + pavgb m0, m2 + pavgb m1, m3 +%endif %endif - mova [r0], m0 - mova [r0+r2], m1 + mov%4 [r0], m0 + mov%4 [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop RET %endmacro -INIT_MMX mmxext -PIXELS_X2 put, 8 -PIXELS_X2 avg, 8 - INIT_XMM sse2 -PIXELS_X2 put, 16 -PIXELS_X2 avg, 16 +PIXELS_X2 put, 8, q, q +PIXELS_X2 avg, 8, q, q + +PIXELS_X2 put, 16, u, a +PIXELS_X2 avg, 16, u, a ; void ff_put_no_rnd_pixels8_x2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -INIT_MMX mmxext -cglobal put_no_rnd_pixels8_x2_approx, 4,5 - mova m6, [pb_1] +INIT_XMM sse2 +cglobal put_no_rnd_pixels8_x2_approx, 4,5,5 + mova m4, [pb_1] lea r4, [r2*2] .loop: - mova m0, [r1] - 
mova m2, [r1+r2] - mova m1, [r1+1] - mova m3, [r1+r2+1] + movq m0, [r1] + movq m1, [r1+1] + movhps m0, [r1+r2] + movhps m1, [r1+r2+1] add r1, r4 - psubusb m0, m6 - psubusb m2, m6 + psubusb m0, m4 pavgb m0, m1 - pavgb m2, m3 - mova [r0], m0 - mova [r0+r2], m2 - mova m0, [r1] - mova m1, [r1+1] - mova m2, [r1+r2] - mova m3, [r1+r2+1] + movq [r0], m0 + movhps [r0+r2], m0 + movq m0, [r1] + movq m1, [r1+1] + movhps m0, [r1+r2] + movhps m1, [r1+r2+1] add r0, r4 add r1, r4 - psubusb m0, m6 - psubusb m2, m6 + psubusb m0, m4 pavgb m0, m1 - pavgb m2, m3 - mova [r0], m0 - mova [r0+r2], m2 + movq [r0], m0 + movhps [r0+r2], m0 add r0, r4 sub r3d, 4 jne .loop RET -%macro NO_RND_PIXELS_X2 2 +%macro NO_RND_PIXELS_X2 4 cglobal %1_no_rnd_pixels%2_x2, 4,5,5 lea r4, [r2*3] pcmpeqb m4, m4 .loop: - movu m0, [r1] - movu m2, [r1+r2] - movu m1, [r1+1] - movu m3, [r1+r2+1] + mov%3 m0, [r1] +%if %2 == mmsize + mov%3 m2, [r1+r2] + mov%3 m1, [r1+1] + mov%3 m3, [r1+r2+1] +%else + movq m1, [r1+1] + movhps m0, [r1+r2] + movhps m1, [r1+r2+1] +%endif pxor m0, m4 +%if %2 == mmsize pxor m2, m4 +%endif pxor m1, m4 +%if %2 == mmsize pxor m3, m4 +%endif pavgb m0, m1 +%if %2 == mmsize pavgb m2, m3 +%endif pxor m0, m4 +%if %2 == mmsize pxor m2, m4 +%endif %ifidn %1, avg pavgb m0, [r0] pavgb m2, [r0+r2] %endif - mova [r0], m0 - mova [r0+r2], m2 - movu m0, [r1+r2*2] - movu m1, [r1+r2*2+1] - movu m2, [r1+r4] - movu m3, [r1+r4+1] + mov%4 [r0], m0 +%if %2 == mmsize + mov%4 [r0+r2], m2 +%else + movhps [r0+r2], m0 +%endif + mov%3 m0, [r1+2*r2] +%if %2 == mmsize + mov%3 m2, [r1+r4] + mov%3 m1, [r1+2*r2+1] + mov%3 m3, [r1+r4+1] +%else + movq m1, [r1+2*r2+1] + movhps m0, [r1+r4] + movhps m1, [r1+r4+1] +%endif pxor m0, m4 - pxor m1, m4 +%if %2 == mmsize pxor m2, m4 +%endif + pxor m1, m4 +%if %2 == mmsize pxor m3, m4 +%endif pavgb m0, m1 +%if %2 == mmsize pavgb m2, m3 +%endif pxor m0, m4 +%if %2 == mmsize pxor m2, m4 +%endif %ifidn %1, avg pavgb m0, [r0+r2*2] pavgb m2, [r0+r4] %endif - mova [r0+r2*2], m0 - mova 
[r0+r4], m2 + mov%4 [r0+2*r2], m0 +%if %2 == mmsize + mov%4 [r0+r4], m2 +%else + movhps [r0+r4], m0 +%endif lea r1, [r1+r2*4] lea r0, [r0+r2*4] sub r3d, 4 @@ -173,95 +218,109 @@ cglobal %1_no_rnd_pixels%2_x2, 4,5,5 RET %endmacro -INIT_MMX mmxext -NO_RND_PIXELS_X2 put, 8 INIT_XMM sse2 -NO_RND_PIXELS_X2 avg, 16 -NO_RND_PIXELS_X2 put, 16 +NO_RND_PIXELS_X2 put, 8, q, q + +NO_RND_PIXELS_X2 avg, 16, u, a +NO_RND_PIXELS_X2 put, 16, u, a ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -%macro PIXELS_Y2 2 -cglobal %1_pixels%2_y2, 4,5,3 +%macro PIXELS_Y2 4 +cglobal %1_pixels%2_y2, 4,5,5 + mov%3 m0, [r1] lea r4, [r2*2] - movu m0, [r1] .loop: - movu m1, [r1+r2] - movu m2, [r1+r4] + mov%3 m1, [r1+r2] + mov%3 m2, [r1+r4] add r1, r4 pavgb m0, m1 pavgb m1, m2 %ifidn %1,avg +%if %2 == mmsize pavgb m0, [r0] pavgb m1, [r0+r2] +%else + mov%4 m3, [r0] + mov%4 m4, [r0+r2] + pavgb m0, m3 + pavgb m1, m4 +%endif %endif - mova [r0], m0 - mova [r0+r2], m1 - movu m1, [r1+r2] - movu m0, [r1+r4] + mov%4 [r0], m0 + mov%4 [r0+r2], m1 + mov%3 m1, [r1+r2] + mov%3 m0, [r1+r4] add r0, r4 add r1, r4 pavgb m2, m1 pavgb m1, m0 %ifidn %1,avg +%if %2 == mmsize pavgb m2, [r0] pavgb m1, [r0+r2] +%else + mov%4 m3, [r0] + mov%4 m4, [r0+r2] + pavgb m2, m3 + pavgb m1, m4 +%endif %endif - mova [r0], m2 - mova [r0+r2], m1 + mov%4 [r0], m2 + mov%4 [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop RET %endmacro -INIT_MMX mmxext -PIXELS_Y2 put, 8 -PIXELS_Y2 avg, 8 INIT_XMM sse2 -PIXELS_Y2 put, 16 -PIXELS_Y2 avg, 16 +PIXELS_Y2 put, 8, q, q +PIXELS_Y2 avg, 8, q, q + +PIXELS_Y2 put, 16, u, a +PIXELS_Y2 avg, 16, u, a ; void ff_put_no_rnd_pixels8_y2_approx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -INIT_MMX mmxext -cglobal put_no_rnd_pixels8_y2_approx, 4,5 - mova m6, [pb_1] +INIT_XMM sse2 +cglobal put_no_rnd_pixels8_y2_approx, 4,5,4 + mova m3, [pb_1] + movq m0, [r1] lea r4, [r2+r2] - mova m0, [r1] .loop: - mova m1, [r1+r2] - mova m2, [r1+r4] + movq m1, 
[r1+r2] + movq m2, [r1+r4] add r1, r4 - psubusb m1, m6 + psubusb m1, m3 pavgb m0, m1 pavgb m1, m2 - mova [r0], m0 - mova [r0+r2], m1 - mova m1, [r1+r2] - mova m0, [r1+r4] + movq [r0], m0 + movq [r0+r2], m1 + movq m1, [r1+r2] + movq m0, [r1+r4] add r0, r4 add r1, r4 - psubusb m1, m6 + psubusb m1, m3 pavgb m2, m1 pavgb m1, m0 - mova [r0], m2 - mova [r0+r2], m1 + movq [r0], m2 + movq [r0+r2], m1 add r0, r4 sub r3d, 4 jne .loop RET -%macro NO_RND_PIXELS_Y2 2 +%macro NO_RND_PIXELS_Y2 4 cglobal %1_no_rnd_pixels%2_y2, 4,5,4 + mov%3 m0, [r1] lea r4, [r2*3] - movu m0, [r1] pcmpeqb m3, m3 add r1, r2 pxor m0, m3 .loop: - movu m1, [r1] - movu m2, [r1+r2] + mov%3 m1, [r1] + mov%3 m2, [r1+r2] pxor m1, m3 pxor m2, m3 pavgb m0, m1 @@ -272,10 +331,10 @@ cglobal %1_no_rnd_pixels%2_y2, 4,5,4 pavgb m0, [r0] pavgb m1, [r0+r2] %endif - mova [r0], m0 - mova [r0+r2], m1 - movu m1, [r1+r2*2] - movu m0, [r1+r4] + mov%4 [r0], m0 + mov%4 [r0+r2], m1 + mov%3 m1, [r1+r2*2] + mov%3 m0, [r1+r4] pxor m1, m3 pxor m0, m3 pavgb m2, m1 @@ -286,8 +345,8 @@ cglobal %1_no_rnd_pixels%2_y2, 4,5,4 pavgb m2,[r0+r2*2] pavgb m1,[r0+r4] %endif - mova [r0+r2*2], m2 - mova [r0+r4], m1 + mov%4 [r0+r2*2], m2 + mov%4 [r0+r4], m1 lea r1, [r1+r2*4] lea r0, [r0+r2*4] sub r3d, 4 @@ -295,11 +354,11 @@ cglobal %1_no_rnd_pixels%2_y2, 4,5,4 RET %endmacro -INIT_MMX mmxext -NO_RND_PIXELS_Y2 put, 8 INIT_XMM sse2 -NO_RND_PIXELS_Y2 avg, 16 -NO_RND_PIXELS_Y2 put, 16 +NO_RND_PIXELS_Y2 put, 8, q, q + +NO_RND_PIXELS_Y2 avg, 16, u, a +NO_RND_PIXELS_Y2 put, 16, u, a ; void ff_put_no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 4e4abd5273..f337e5c85d 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -33,8 +33,8 @@ #include "fpel.h" #include "hpeldsp.h" -void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_pixels8_x2_sse2(uint8_t 
*block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, @@ -43,22 +43,20 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_approx_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_x2_approx_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_approx_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, - const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_pixels8_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_y2_approx_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_no_rnd_pixels8_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, @@ -69,29 +67,10 @@ void ff_put_no_rnd_pixels16_xy2_sse2(uint8_t *block, const 
uint8_t *pixels, ptrdiff_t line_size, int h); void ff_avg_no_rnd_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); -void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); - -static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) -{ -#if HAVE_MMXEXT_EXTERNAL - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; - - c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; - c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; - - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; - - if (!(flags & AV_CODEC_FLAG_BITEXACT)) { - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_approx_mmxext; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_approx_mmxext; - } -#endif /* HAVE_MMXEXT_EXTERNAL */ -} +void ff_avg_pixels8_x2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels8_y2_sse2(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) { @@ -106,19 +85,30 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags) c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2; c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_sse2; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_sse2; c->put_no_rnd_pixels_tab[1][0] = c->put_pixels_tab[1][0] = ff_put_pixels8_sse2; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_sse2; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_sse2; c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; 
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; c->avg_pixels_tab[1][0] = ff_avg_pixels8_sse2; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_sse2; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_sse2; c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2; c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2; c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2; c->avg_no_rnd_pixels_tab[3] = ff_avg_no_rnd_pixels16_xy2_sse2; + + if (!(flags & AV_CODEC_FLAG_BITEXACT)) { + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_approx_sse2; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_approx_sse2; + } #endif /* HAVE_SSE2_EXTERNAL */ } @@ -138,9 +128,6 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMXEXT(cpu_flags)) - hpeldsp_init_mmxext(c, flags); - if (EXTERNAL_SSE2(cpu_flags)) hpeldsp_init_sse2(c, flags); diff --git a/tests/checkasm/hpeldsp.c b/tests/checkasm/hpeldsp.c index bf44a666ca..fd87509ddc 100644 --- a/tests/checkasm/hpeldsp.c +++ b/tests/checkasm/hpeldsp.c @@ -69,7 +69,7 @@ void checkasm_check_hpeldsp(void) TEST(put_no_rnd_pixels_tab, 2), // put_no_rnd_pixels_tab only has two usable blocksizes TEST(avg_no_rnd_pixels_tab, 1), }; - declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); + declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h); ff_hpeldsp_init(&hdsp, AV_CODEC_FLAG_BITEXACT); _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
