x86_64 always has SSE2, so there is no need to compile in SIMD-optimized
functions that will always be overridden by variants of the same function
implemented with more advanced SIMD instruction sets.
---
Updated so that the yasm code is now actually disabled as well, as noted by Justin.
The total saving is about 100kB:
before
$ ls -l libavcodec/libavcodec.a
-rw-rw-r-- 1 biurrun 10006 84458588 Oct 6 19:22 libavcodec/libavcodec.a
after
$ ls -l libavcodec/libavcodec.a
-rw-rw-r-- 1 biurrun 10006 84349546 Oct 6 18:57 libavcodec/libavcodec.a
84458588
- 84349546
----------
109042
libavcodec/x86/ac3dsp_init.c | 4 ++
libavcodec/x86/dct32.asm | 2 +
libavcodec/x86/dct_init.c | 2 +
libavcodec/x86/dsputil.asm | 10 ++++-
libavcodec/x86/dsputil_init.c | 12 +++++-
libavcodec/x86/dsputilenc.asm | 6 +++
libavcodec/x86/dsputilenc_mmx.c | 26 +++++++++---
libavcodec/x86/fmtconvert.asm | 14 +++++++
libavcodec/x86/fmtconvert_init.c | 10 +++++
libavcodec/x86/h264_chromamc.asm | 2 +
libavcodec/x86/h264_idct.asm | 76 ++++++++--------------------------
libavcodec/x86/h264_intrapred.asm | 22 ++++++++++
libavcodec/x86/h264_intrapred_init.c | 20 ++++++++-
libavcodec/x86/h264chroma_init.c | 2 +
libavcodec/x86/h264dsp_init.c | 16 +++++--
libavcodec/x86/hpeldsp.asm | 22 ++++++++++
libavcodec/x86/hpeldsp_init.c | 6 +++
libavcodec/x86/imdct36.asm | 2 +
libavcodec/x86/motion_est.c | 35 +++++++++++-----
libavcodec/x86/mpegaudiodsp.c | 4 ++
libavcodec/x86/mpegvideo.c | 4 ++
libavcodec/x86/mpegvideoenc.c | 6 ++-
libavcodec/x86/rv34dsp.asm | 2 +
libavcodec/x86/rv34dsp_init.c | 2 +
libavcodec/x86/rv40dsp.asm | 2 +
libavcodec/x86/rv40dsp_init.c | 8 ++--
libavcodec/x86/vc1dsp_init.c | 2 +
libavutil/x86/float_dsp_init.c | 4 ++
libswscale/x86/rgb2rgb.c | 26 +++++++-----
29 files changed, 250 insertions(+), 99 deletions(-)
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index ca10864..9d0a221 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -183,11 +183,14 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int
bit_exact)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
+#endif /* ARCH_X86_32 */
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
+#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
@@ -200,6 +203,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int
bit_exact)
if (EXTERNAL_SSE(cpu_flags)) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
+#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 9c147b9..42abb91 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -482,7 +482,9 @@ cglobal dct32_float, 2, 3, 16, out, in, tmp
%endif
%endmacro
+%if ARCH_X86_32
INIT_XMM sse
DCT32_FUNC
+%endif ; ARCH_X86_32
INIT_XMM sse2
DCT32_FUNC
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index 7bda5e8..16050cd 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -30,8 +30,10 @@ av_cold void ff_dct_init_x86(DCTContext *s)
{
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags))
s->dct32 = ff_dct32_float_sse;
+#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags))
s->dct32 = ff_dct32_float_sse2;
if (EXTERNAL_AVX(cpu_flags))
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 5d73ff8..4601954 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -107,8 +107,10 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3,
order, mul
RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
SCALARPRODUCT
+%endif ; ARCH_X86_32
INIT_XMM sse2
SCALARPRODUCT
@@ -316,13 +318,17 @@ cglobal apply_window_int16_round, 4,5,6, output, input,
window, offset, offset2
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
APPLY_WINDOW_INT16 0
+%endif ; ARCH_X86_32
INIT_XMM sse2
APPLY_WINDOW_INT16 0
+%if ARCH_X86_32
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
+%endif ; ARCH_X86_32
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
@@ -526,9 +532,11 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max,
len
REP_RET
%endmacro
-INIT_MMX mmx
%define CLIPD CLIPD_MMX
+%if ARCH_X86_32
+INIT_MMX mmx
VECTOR_CLIP_INT32 0, 1, 0, 0
+%endif ; ARCH_X86_32
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index a38cf24..d0a7cc4 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -540,8 +540,10 @@ static av_cold void dsputil_init_mmx(DSPContext *c,
AVCodecContext *avctx,
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
+#if ARCH_X86_32
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
+#endif /* ARCH_X86_32 */
c->draw_edges = ff_draw_edges_mmx;
switch (avctx->idct_algo) {
@@ -552,11 +554,13 @@ static av_cold void dsputil_init_mmx(DSPContext *c,
AVCodecContext *avctx,
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
break;
+#if ARCH_X86_32
case FF_IDCT_XVIDMMX:
c->idct_put = ff_idct_xvid_mmx_put;
c->idct_add = ff_idct_xvid_mmx_add;
c->idct = ff_idct_xvid_mmx;
break;
+#endif /* ARCH_X86_32 */
}
}
@@ -571,14 +575,16 @@ static av_cold void dsputil_init_mmx(DSPContext *c,
AVCodecContext *avctx,
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
}
+#if ARCH_X86_32
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+#endif /* ARCH_X86_32 */
#endif /* HAVE_MMX_EXTERNAL */
}
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
-#if HAVE_MMXEXT_INLINE
+#if HAVE_MMXEXT_INLINE && ARCH_X86_32
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
@@ -586,7 +592,7 @@ static av_cold void dsputil_init_mmxext(DSPContext *c,
AVCodecContext *avctx,
c->idct_add = ff_idct_xvid_mmxext_add;
c->idct = ff_idct_xvid_mmxext;
}
-#endif /* HAVE_MMXEXT_INLINE */
+#endif /* HAVE_MMXEXT_INLINE && ARCH_X86_32 */
#if HAVE_MMXEXT_EXTERNAL
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
@@ -601,6 +607,7 @@ static av_cold void dsputil_init_mmxext(DSPContext *c,
AVCodecContext *avctx,
if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
+#if ARCH_X86_32
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
@@ -609,6 +616,7 @@ static av_cold void dsputil_init_mmxext(DSPContext *c,
AVCodecContext *avctx,
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
+#endif /* ARCH_X86_32 */
#endif /* HAVE_MMXEXT_EXTERNAL */
}
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 7e4fd81..469c157 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -256,12 +256,15 @@ hadamard8_16_wrapper 0, 14
%endif
%endmacro
+%if ARCH_X86_32 || HAVE_ALIGNED_STACK == 0
INIT_MMX mmx
HADAMARD8_DIFF
INIT_MMX mmxext
HADAMARD8_DIFF
+%endif ; ARCH_X86_32 || HAVE_ALIGNED_STACK == 0
+%if HAVE_ALIGNED_STACK
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
@@ -273,6 +276,7 @@ HADAMARD8_DIFF 10
INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
+%endif ; HAVE_ALIGNED_STACK
INIT_XMM sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
@@ -334,6 +338,7 @@ cglobal sse16, 5, 5, 8
movd eax, m7 ; return value
RET
+%if ARCH_X86_32
INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
cglobal get_pixels, 3,4
@@ -358,6 +363,7 @@ cglobal get_pixels, 3,4
add r3, 32
js .loop
REP_RET
+%endif ; ARCH_X86_32
INIT_XMM sse2
cglobal get_pixels, 3, 4
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index a1f80af..36575a3 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -421,6 +421,7 @@ static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t *
pix2, int line_size, int
else return score1 + FFABS(score2)*8;
}
+#if ARCH_X86_32
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int
line_size, int h) {
int tmp;
@@ -481,6 +482,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t
* dummy, int line_si
return tmp & 0xFFFF;
}
#undef SUM
+#endif /* ARCH_X86_32 */
static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
int line_size, int h)
@@ -524,6 +526,7 @@ static int vsad_intra16_mmxext(void *v, uint8_t *pix,
uint8_t *dummy,
}
#undef SUM
+#if ARCH_X86_32
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size,
int h) {
int tmp;
@@ -601,6 +604,7 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t *
pix2, int line_size, in
return tmp & 0x7FFF;
}
#undef SUM
+#endif /* ARCH_X86_32 */
static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h)
@@ -809,6 +813,7 @@ static int sum_abs_dctelem_##cpu(int16_t *block){\
return sum&0xFFFF;\
}
+#if ARCH_X86_32
#define DCT_SAD DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z) MMABS_MMX(a,z)
@@ -821,9 +826,11 @@ DCT_SAD_FUNC(mmx)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD
+#endif /* ARCH_X86_32 */
#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
+#define MMABS(a,z) MMABS_MMXEXT(a,z)
DCT_SAD_FUNC(sse2)
#undef MMABS
@@ -952,8 +959,10 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c,
AVCodecContext *avctx)
int bit_depth = avctx->bits_per_raw_sample;
if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_mmx;
+#endif /* ARCH_X86_32 */
c->diff_pixels = ff_diff_pixels_mmx;
c->pix_sum = ff_pix_sum16_mmx;
@@ -966,22 +975,25 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c,
AVCodecContext *avctx)
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
+#if ARCH_X86_32
if (avctx->bits_per_raw_sample <= 8 &&
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_mmx;
- c->diff_bytes= diff_bytes_mmx;
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
+ c->vsad[4] = vsad_intra16_mmx;
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT))
+ c->vsad[0] = vsad16_mmx;
+#endif /* ARCH_X86_32 */
+
+ c->diff_bytes = diff_bytes_mmx;
+
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
- c->vsad[4]= vsad_intra16_mmx;
c->nsse[0] = nsse16_mmx;
c->nsse[1] = nsse8_mmx;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->vsad[0] = vsad16_mmx;
- }
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->try_8x8basis= try_8x8basis_mmx;
@@ -999,11 +1011,13 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c,
AVCodecContext *avctx)
}
if (INLINE_MMXEXT(cpu_flags)) {
+#if ARCH_X86_32
if (avctx->bits_per_raw_sample <= 8 &&
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_mmxext;
c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
+#endif /* ARCH_X86_32 */
c->vsad[4] = vsad_intra16_mmxext;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1032,6 +1046,7 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c,
AVCodecContext *avctx)
#endif
#endif /* HAVE_INLINE_ASM */
+#if ARCH_X86_32 || !HAVE_ALIGNED_STACK
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
@@ -1041,6 +1056,7 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c,
AVCodecContext *avctx)
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
}
+#endif /* ARCH_X86_32 || !HAVE_ALIGNED_STACK */
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index e7803df..0123390 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -71,8 +71,10 @@ cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul,
len
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
+%endif ; ARCH_X86_32
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
@@ -112,10 +114,12 @@ cglobal float_to_int16, 3, 3, %1, dst, src, len
INIT_XMM sse2
FLOAT_TO_INT16 2
+%if ARCH_X86_32
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0
+%endif ; ARCH_X86_32
;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long
step);
@@ -190,10 +194,12 @@ cglobal float_to_int16_step, 4, 7, %1, dst, src, len,
step, step3, v1, v2
INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
+%if ARCH_X86_32
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0
+%endif ; ARCH_X86_32
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long
len);
@@ -236,10 +242,12 @@ cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0,
src1, len
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
+%endif ; ARCH_X86_32
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2
@@ -293,10 +301,12 @@ cglobal float_to_int16_interleave6, 2, 8, 0, dst, src,
src1, src2, src3, src4, s
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
+%if ARCH_X86_32
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6
+%endif ; ARCH_X86_32
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
@@ -376,8 +386,10 @@ cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2,
src3, src4, src5, len
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
FLOAT_INTERLEAVE6 0
+%endif ; ARCH_X86_32
INIT_XMM sse
FLOAT_INTERLEAVE6 7
@@ -419,10 +431,12 @@ cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
+%endif ; ARCH_X86_32
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 3d75df9..8230c29 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -71,10 +71,13 @@ static void float_to_int16_interleave_##cpu(int16_t *dst,
const float **src, lon
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
+#if ARCH_X86_32
FLOAT_TO_INT16_INTERLEAVE(3dnow)
FLOAT_TO_INT16_INTERLEAVE(sse)
+#endif /* ARCH_X86_32 */
FLOAT_TO_INT16_INTERLEAVE(sse2)
+#if ARCH_X86_32
static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
long len, int channels)
{
@@ -83,6 +86,7 @@ static void float_to_int16_interleave_3dnowext(int16_t *dst,
const float **src,
else
float_to_int16_interleave_3dnow(dst, src, len, channels);
}
+#endif /* ARCH_X86_32 */
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
@@ -90,6 +94,7 @@ void ff_float_interleave2_sse(float *dst, const float **src,
unsigned int len);
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
+#if ARCH_X86_32
static void float_interleave_mmx(float *dst, const float **src,
unsigned int len, int channels)
{
@@ -100,6 +105,7 @@ static void float_interleave_mmx(float *dst, const float
**src,
else
ff_float_interleave_c(dst, src, len, channels);
}
+#endif /* ARCH_X86_32 */
static void float_interleave_sse(float *dst, const float **src,
unsigned int len, int channels)
@@ -118,6 +124,7 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c,
AVCodecContext *avctx
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->float_interleave = float_interleave_mmx;
}
@@ -132,10 +139,13 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext
*c, AVCodecContext *avctx
c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
}
}
+#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags)) {
+#if ARCH_X86_32
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
+#endif /* ARCH_X86_32 */
c->float_interleave = float_interleave_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index b7b18e0..f7fa1c5 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -455,12 +455,14 @@ chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
+%if ARCH_X86_32
INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
+%endif ; ARCH_X86_32
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 2771291..34b840f 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -347,6 +347,7 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0
RET
%endif
+%if ARCH_X86_32
INIT_MMX mmx
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
@@ -425,16 +426,10 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, stride
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- lea dst2q, [r0+dst2q]
- DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
+ mov r1d, dword [r1+r5*4]
+ lea r1, [r0+r1]
+ DC_ADD_MMXEXT_OP movh, r1, r3, r6
mov r1, r1m
-%endif
inc r5
add r2, 32
cmp r5, 16
@@ -504,16 +499,10 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, s
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- add dst2q, r0
- DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
+ mov r1d, dword [r1+r5*4]
+ add r1, r0
+ DC_ADD_MMXEXT_OP movh, r1, r3, r6
mov r1, r1m
-%endif
.skipblock:
inc r5
add r2, 32
@@ -544,18 +533,12 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, stride
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- lea dst2q, [r0+dst2q]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
- lea dst2q, [dst2q+r3*4]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
+ mov r1d, dword [r1+r5*4]
+ lea r1, [r0+r1]
+ DC_ADD_MMXEXT_OP mova, r1, r3, r6
+ lea r1, [r1+r3*4]
+ DC_ADD_MMXEXT_OP mova, r1, r3, r6
mov r1, r1m
-%endif
add r5, 4
add r2, 128
cmp r5, 16
@@ -581,6 +564,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, stride
ADD rsp, pad
RET
+%endif ; ARCH_X86_32
INIT_XMM sse2
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
@@ -636,6 +620,7 @@ INIT_XMM cpuname
jl .nextblock
REP_RET
+%if ARCH_X86_32
INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
@@ -644,14 +629,9 @@ h264_idct_add8_mmx_plane:
or r6w, word [r2]
test r6, r6
jz .skipblock
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
-%endif
IDCT4_ADD r0, r2, r3
.skipblock:
inc r5
@@ -668,17 +648,10 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1,
block_offset, block, stride,
%ifdef PIC
lea picregq, [scan8_mem]
%endif
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
call h264_idct_add8_mmx_plane
mov r5, 32
add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
add r0mp, gprsize
-%endif
call h264_idct_add8_mmx_plane
RET
@@ -688,14 +661,9 @@ h264_idct_add8_mmxext_plane:
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
-%endif
IDCT4_ADD r0, r2, r3
inc r5
add r2, 32
@@ -708,14 +676,9 @@ h264_idct_add8_mmxext_plane:
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
-%endif
DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
inc r5
@@ -731,23 +694,18 @@ INIT_MMX mmxext
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block,
stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
%ifdef PIC
lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmxext_plane
mov r5, 32
add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
add r0mp, gprsize
-%endif
call h264_idct_add8_mmxext_plane
RET
+%endif ; ARCH_X86_32
+INIT_MMX mmxext
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
movd m0, [r2 ] ; 0 0 X D
@@ -1076,7 +1034,9 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1
RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
IDCT_DC_DEQUANT 0
+%endif ; ARCH_X86_32
INIT_MMX sse2
IDCT_DC_DEQUANT 7
diff --git a/libavcodec/x86/h264_intrapred.asm
b/libavcodec/x86/h264_intrapred.asm
index b9db3f4..feb481a 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -118,8 +118,10 @@ cglobal pred16x16_horizontal_8, 2,3
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
PRED16x16_H
+%endif ; ARCH_X86_32
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
@@ -180,8 +182,10 @@ cglobal pred16x16_dc_8, 2,7
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
PRED16x16_DC
+%endif ; ARCH_X86_32
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
@@ -227,10 +231,12 @@ cglobal pred16x16_tm_vp8_8, 2,5
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM
+%endif ; ARCH_X86_32
INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
@@ -532,6 +538,7 @@ cglobal pred16x16_plane_%1_8, 2,9,7
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
@@ -540,6 +547,7 @@ INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
+%endif ; ARCH_X86_32
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
@@ -714,10 +722,12 @@ ALIGN 16
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
+%endif ; ARCH_X86_32
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
@@ -761,8 +771,10 @@ cglobal pred8x8_horizontal_8, 2,3
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
PRED8x8_H
+%endif ; ARCH_X86_32
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
@@ -939,10 +951,12 @@ cglobal pred8x8_tm_vp8_8, 2,6
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM
+%endif ; ARCH_X86_32
INIT_XMM sse2
cglobal pred8x8_tm_vp8_8, 2,6,4
@@ -1495,6 +1509,7 @@ PRED8x8L_DOWN_LEFT
;void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int
has_topright, int stride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_32
INIT_MMX mmxext
cglobal pred8x8l_down_right_8, 4,5
sub r0, r3
@@ -1626,6 +1641,7 @@ cglobal pred8x8l_down_right_8, 4,5
por mm0, mm1
movq [r0+r3*1], mm0
RET
+%endif ; ARCH_X86_32
%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
@@ -1746,6 +1762,7 @@ PRED8x8L_DOWN_RIGHT
; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int
has_topright, int stride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_32
INIT_MMX mmxext
cglobal pred8x8l_vertical_right_8, 4,5
sub r0, r3
@@ -1852,6 +1869,7 @@ cglobal pred8x8l_vertical_right_8, 4,5
PALIGNR mm5, mm0, 7, mm1
movq [r4+r3*2], mm5
RET
+%endif ; ARCH_X86_32
%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
@@ -2149,6 +2167,7 @@ PRED8x8L_HORIZONTAL_UP
;void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int
has_topright, int stride)
;-----------------------------------------------------------------------------
+%if ARCH_X86_32
INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8, 4,5
sub r0, r3
@@ -2263,6 +2282,7 @@ cglobal pred8x8l_horizontal_down_8, 4,5
PALIGNR mm3, mm4, 6, mm4
movq [r0+r3*1], mm3
RET
+%endif ; ARCH_X86_32
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
@@ -2463,8 +2483,10 @@ cglobal pred4x4_tm_vp8_8, 3,6
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
PRED4x4_TM
+%endif ; ARCH_X86_32
INIT_MMX mmxext
PRED4x4_TM
diff --git a/libavcodec/x86/h264_intrapred_init.c
b/libavcodec/x86/h264_intrapred_init.c
index 6dd98aa..35dfb59 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -190,11 +190,16 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h,
int codec_id,
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
+#if ARCH_X86_32
h->pred16x16[HOR_PRED8x8 ] =
ff_pred16x16_horizontal_8_mmx;
+#endif /* ARCH_X86_32 */
if (chroma_format_idc == 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
+#if ARCH_X86_32
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
+#endif /* ARCH_X86_32 */
}
+#if ARCH_X86_32
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
@@ -211,22 +216,27 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h,
int codec_id,
h->pred16x16[PLANE_PRED8x8] =
ff_pred16x16_plane_h264_8_mmx;
}
}
+#endif /* ARCH_X86_32 */
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] =
ff_pred16x16_horizontal_8_mmxext;
+#if ARCH_X86_32
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
+#endif /* ARCH_X86_32 */
if (chroma_format_idc == 1)
h->pred8x8[HOR_PRED8x8 ] =
ff_pred8x8_horizontal_8_mmxext;
h->pred8x8l [TOP_DC_PRED ] =
ff_pred8x8l_top_dc_8_mmxext;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
h->pred8x8l [HOR_PRED ] =
ff_pred8x8l_horizontal_8_mmxext;
h->pred8x8l [VERT_PRED ] =
ff_pred8x8l_vertical_8_mmxext;
+#if ARCH_X86_32
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] =
ff_pred8x8l_down_right_8_mmxext;
h->pred8x8l [VERT_RIGHT_PRED ] =
ff_pred8x8l_vertical_right_8_mmxext;
- h->pred8x8l [HOR_UP_PRED ] =
ff_pred8x8l_horizontal_up_8_mmxext;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] =
ff_pred8x8l_down_left_8_mmxext;
h->pred8x8l [HOR_DOWN_PRED ] =
ff_pred8x8l_horizontal_down_8_mmxext;
+#endif /* ARCH_X86_32 */
+ h->pred8x8l [HOR_UP_PRED ] =
ff_pred8x8l_horizontal_up_8_mmxext;
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] =
ff_pred4x4_down_right_8_mmxext;
h->pred4x4 [VERT_RIGHT_PRED ] =
ff_pred4x4_vertical_right_8_mmxext;
h->pred4x4 [HOR_DOWN_PRED ] =
ff_pred4x4_horizontal_down_8_mmxext;
@@ -247,12 +257,15 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h,
int codec_id,
}
}
if (codec_id == AV_CODEC_ID_VP8) {
+#if ARCH_X86_32
h->pred16x16[PLANE_PRED8x8 ] =
ff_pred16x16_tm_vp8_8_mmxext;
- h->pred8x8 [DC_PRED8x8 ] =
ff_pred8x8_dc_rv40_8_mmxext;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
+#endif /* ARCH_X86_32 */
+ h->pred8x8 [DC_PRED8x8 ] =
ff_pred8x8_dc_rv40_8_mmxext;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
h->pred4x4 [VERT_PRED ] =
ff_pred4x4_vertical_vp8_8_mmxext;
} else {
+#if ARCH_X86_32
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
if (codec_id == AV_CODEC_ID_SVQ3) {
@@ -262,6 +275,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int
codec_id,
} else {
h->pred16x16[PLANE_PRED8x8 ] =
ff_pred16x16_plane_h264_8_mmxext;
}
+#endif /* ARCH_X86_32 */
}
}
@@ -327,6 +341,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int
codec_id,
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
h->pred4x4[HOR_UP_PRED ] =
ff_pred4x4_horizontal_up_10_mmxext;
+#if ARCH_X86_32
if (chroma_format_idc == 1)
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
@@ -338,6 +353,7 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int
codec_id,
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
h->pred16x16[HOR_PRED8x8 ] =
ff_pred16x16_horizontal_10_mmxext;
+#endif /* ARCH_X86_32 */
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index eec1653..504be95 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -73,6 +73,7 @@ av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int
bit_depth)
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -82,6 +83,7 @@ av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int
bit_depth)
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
+#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 4164b83..824872f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -223,6 +223,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const
int bit_depth,
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
+#if ARCH_X86_32
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc == 1)
@@ -230,15 +231,18 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const
int bit_depth,
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct =
ff_h264_luma_dc_dequant_idct_mmx;
+#endif /* ARCH_X86_32 */
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+#if ARCH_X86_32
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
+#endif /* ARCH_X86_32 */
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra =
ff_deblock_v_chroma_intra_8_mmxext;
@@ -246,18 +250,22 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const
int bit_depth,
c->h264_h_loop_filter_chroma =
ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra =
ff_deblock_h_chroma_intra_8_mmxext;
}
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+#if ARCH_X86_32
+#if HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
+#endif /* HAVE_MMXEXT_EXTERNAL */
+ c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_h_loop_filter_luma_intra =
ff_deblock_h_luma_intra_8_mmxext;
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
+#endif /* ARCH_X86_32 */
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
+#if ARCH_X86_32
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+#endif /* ARCH_X86_32 */
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
@@ -293,7 +301,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const
int bit_depth,
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if ARCH_X86_32
+#if ARCH_X86_32 || !HAVE_ALIGNED_STACK
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra =
ff_deblock_v_chroma_intra_10_mmxext;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index ec04d99..2cd89e6 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -53,8 +53,10 @@ cglobal put_pixels8_x2, 4,5
INIT_MMX mmxext
PUT_PIXELS8_X2
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_PIXELS8_X2
+%endif ; ARCH_X86_32
; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size,
int h)
@@ -97,8 +99,10 @@ cglobal put_pixels16_x2, 4,5
INIT_MMX mmxext
PUT_PIXELS_16
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_PIXELS_16
+%endif ; ARCH_X86_32
; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
@@ -138,8 +142,10 @@ cglobal put_no_rnd_pixels8_x2, 4,5
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
+%endif ; ARCH_X86_32
; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
@@ -185,8 +191,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
+%endif ; ARCH_X86_32
; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size,
int h)
@@ -219,8 +227,10 @@ cglobal put_pixels8_y2, 4,5
INIT_MMX mmxext
PUT_PIXELS8_Y2
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_PIXELS8_Y2
+%endif ; ARCH_X86_32
; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
@@ -256,8 +266,10 @@ cglobal put_no_rnd_pixels8_y2, 4,5
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
+%endif ; ARCH_X86_32
; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
@@ -298,8 +310,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
+%if ARCH_X86_32
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT
+%endif ; ARCH_X86_32
; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int
h)
@@ -328,8 +342,10 @@ cglobal avg_pixels8, 4,5
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX 3dnow
AVG_PIXELS8
+%endif ; ARCH_X86_32
; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size,
int h)
@@ -364,8 +380,10 @@ cglobal avg_pixels8_x2, 4,5
INIT_MMX mmxext
AVG_PIXELS8_X2
+%if ARCH_X86_32
INIT_MMX 3dnow
AVG_PIXELS8_X2
+%endif ; ARCH_X86_32
; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size,
int h)
@@ -406,8 +424,10 @@ cglobal avg_pixels8_y2, 4,5
INIT_MMX mmxext
AVG_PIXELS8_Y2
+%if ARCH_X86_32
INIT_MMX 3dnow
AVG_PIXELS8_Y2
+%endif ; ARCH_X86_32
; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size,
int h)
@@ -450,5 +470,7 @@ cglobal avg_pixels8_xy2, 4,5
INIT_MMX mmxext
AVG_PIXELS8_XY2
+%if ARCH_X86_32
INIT_MMX 3dnow
AVG_PIXELS8_XY2
+%endif ; ARCH_X86_32
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 3bc5601..1d88a21 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -145,7 +145,9 @@ PIXELS16(static, put, , _y2, _mmx)
PIXELS16(static, avg, ff_, _y2, CPUEXT) \
PIXELS16(static, avg, ff_, _xy2, CPUEXT)
+#if ARCH_X86_32
HPELDSP_AVG_PIXELS16(_3dnow)
+#endif /* ARCH_X86_32 */
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_YASM */
@@ -205,6 +207,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int
flags, int cpu_flags)
#endif /* HAVE_MMXEXT_EXTERNAL */
}
+#if ARCH_X86_32
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
@@ -238,6 +241,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int
flags, int cpu_flags)
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
+#endif /* ARCH_X86_32 */
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
{
@@ -258,8 +262,10 @@ void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
if (INLINE_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags, cpu_flags);
+#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags))
hpeldsp_init_3dnow(c, flags, cpu_flags);
+#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags, cpu_flags);
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index 633fcd9..71dfc25 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -358,8 +358,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
RET
%endmacro
+%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
+%endif ; ARCH_X86_32
INIT_XMM sse2
DEFINE_IMDCT
diff --git a/libavcodec/x86/motion_est.c b/libavcodec/x86/motion_est.c
index 41b9c5c..12fd953 100644
--- a/libavcodec/x86/motion_est.c
+++ b/libavcodec/x86/motion_est.c
@@ -329,7 +329,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t
*blk2, int stride, int h)
}
-#define PIX_SAD(suf)\
+#define SAD8_16(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int
h)\
{\
assert(h==8);\
@@ -340,6 +340,23 @@ static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t
*blk1, int stride, int h
\
return sum_ ## suf();\
}\
+static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride,
int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t":);\
+\
+ sad8_1_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}
+
+#if ARCH_X86_32
+SAD8_16(mmx)
+#endif /* ARCH_X86_32 */
+SAD8_16(mmxext)
+
+#define PIX_SAD(suf)\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride,
int h)\
{\
assert(h==8);\
@@ -380,16 +397,6 @@ static int sad8_xy2_ ## suf(void *v, uint8_t *blk2,
uint8_t *blk1, int stride, i
return sum_ ## suf();\
}\
\
-static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride,
int h)\
-{\
- __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t":);\
-\
- sad8_1_ ## suf(blk1 , blk2 , stride, h);\
- sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
-\
- return sum_ ## suf();\
-}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride,
int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
@@ -439,17 +446,23 @@ av_cold void ff_dsputil_init_pix_mmx(DSPContext *c,
AVCodecContext *avctx)
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
+#if ARCH_X86_32
c->pix_abs[0][0] = sad16_mmx;
+#endif /* ARCH_X86_32 */
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
+#if ARCH_X86_32
c->pix_abs[1][0] = sad8_mmx;
+#endif /* ARCH_X86_32 */
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
+#if ARCH_X86_32
c->sad[0]= sad16_mmx;
c->sad[1]= sad8_mmx;
+#endif /* ARCH_X86_32 */
}
if (INLINE_MMXEXT(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmxext;
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 3f0943c..5b143b7 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -217,7 +217,9 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf,
float *in, \
} \
}
+#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
+#endif /* ARCH_X86_32 */
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
@@ -249,9 +251,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
#endif /* HAVE_SSE2_INLINE */
#if HAVE_YASM
+#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
+#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 07fd1e5..4271d99 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -443,6 +443,7 @@ __asm__ volatile(
);
}
+#if ARCH_X86_32
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
@@ -496,6 +497,7 @@ static void denoise_dct_mmx(MpegEncContext *s, int16_t
*block){
: "r"(block+64)
);
}
+#endif /* ARCH_X86_32 */
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
@@ -568,7 +570,9 @@ av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
if(!(s->flags & CODEC_FLAG_BITEXACT))
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
+#if ARCH_X86_32
s->denoise_dct = denoise_dct_mmx;
+#endif /* ARCH_X86_32 */
}
if (INLINE_SSE2(cpu_flags)) {
s->denoise_dct = denoise_dct_sse2;
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index ca39a3b..6904eff 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -30,6 +30,7 @@
extern uint16_t ff_inv_zigzag_direct16[64];
+#if ARCH_X86_32
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
@@ -52,6 +53,7 @@ extern uint16_t ff_inv_zigzag_direct16[64];
#define RENAMEl(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */
+#endif /* ARCH_X86_32 */
#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
@@ -86,8 +88,9 @@ av_cold void ff_MPV_encode_init_x86(MpegEncContext *s)
const int dct_algo = s->avctx->dct_algo;
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
+#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags))
s->dct_quantize = dct_quantize_MMX;
#endif
@@ -95,6 +98,7 @@ av_cold void ff_MPV_encode_init_x86(MpegEncContext *s)
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_MMXEXT;
#endif
+#endif /* ARCH_X86_32 */
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags))
s->dct_quantize = dct_quantize_SSE2;
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b..f1b84f6 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -63,6 +63,7 @@ rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround
+%if ARCH_X86_32
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
@@ -97,6 +98,7 @@ cglobal rv34_idct_dc_add, 3, 3
movh [r2], m4
movh [r2+r1], m5
RET
+%endif ; ARCH_X86_32
; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 5f284b8..91d0280 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -34,8 +34,10 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
+#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 7ec72be..694d07d 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -486,11 +486,13 @@ cglobal rv40_weight_func_%1_%2, 6, 7, 8
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
+%endif ; ARCH_X86_32
INIT_XMM sse2
RV40_WEIGHT rnd, 8, 3
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 781f467..0d75a00 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -232,23 +232,23 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
QPEL_MC_SET(put_, _mmx)
#endif
}
+#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
-#if ARCH_X86_32
QPEL_MC_SET(avg_, _3dnow)
-#endif
}
+#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
+#if ARCH_X86_32
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
-#if ARCH_X86_32
QPEL_MC_SET(avg_, _mmxext)
-#endif
+#endif /* ARCH_X86_32 */
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 9f18131..5f16659 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -103,9 +103,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] =
ff_put_vc1_chroma_mc8_nornd_mmx;
}
+#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] =
ff_avg_vc1_chroma_mc8_nornd_3dnow;
}
+#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] =
ff_avg_vc1_chroma_mc8_nornd_mmxext;
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index a04d91c..82d8e4b 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -57,6 +57,7 @@ float ff_scalarproduct_float_sse(const float *v1, const float
*v2, int order);
void ff_butterflies_float_sse(float *src0, float *src1, int len);
#if HAVE_6REGS && HAVE_INLINE_ASM
+#if ARCH_X86_32
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
const float *src1, const float *win,
int len)
@@ -88,6 +89,7 @@ static void vector_fmul_window_3dnowext(float *dst, const
float *src0,
: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
);
}
+#endif /* ARCH_X86_32 */
static void vector_fmul_window_sse(float *dst, const float *src0,
const float *src1, const float *win, int
len)
@@ -127,9 +129,11 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
int cpu_flags = av_get_cpu_flags();
#if HAVE_6REGS && HAVE_INLINE_ASM
+#if ARCH_X86_32
if (INLINE_AMD3DNOWEXT(cpu_flags)) {
fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
}
+#endif /* ARCH_X86_32 */
if (INLINE_SSE(cpu_flags)) {
fdsp->vector_fmul_window = vector_fmul_window_sse;
}
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index d4f2580..5b4f073 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -90,11 +90,24 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) =
0x0000001f0000001fULL;
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
+#if ARCH_X86_32
//MMX versions
#undef RENAME
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"
+//3DNOW versions
+#undef RENAME
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AMD3DNOW 1
+#define RENAME(a) a ## _3DNOW
+#include "rgb2rgb_template.c"
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_AMD3DNOW
+#endif /* ARCH_X86_32 */
+
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
@@ -109,17 +122,6 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) =
0x0000001f0000001fULL;
#define RENAME(a) a ## _SSE2
#include "rgb2rgb_template.c"
-//3DNOW versions
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#undef COMPILE_TEMPLATE_SSE2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_SSE2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNOW
-#include "rgb2rgb_template.c"
-
/*
RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
@@ -134,10 +136,12 @@ av_cold void rgb2rgb_init_x86(void)
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (INLINE_MMX(cpu_flags))
rgb2rgb_init_MMX();
if (INLINE_AMD3DNOW(cpu_flags))
rgb2rgb_init_3DNOW();
+#endif /* ARCH_X86_32 */
if (INLINE_MMXEXT(cpu_flags))
rgb2rgb_init_MMXEXT();
if (INLINE_SSE2(cpu_flags))
--
1.7.9.5
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel