Tested using this command:
./ffmpeg -pix_fmt yuv420p -s 1920*1080 -i ArashRawYuv420.yuv \
-vcodec rawvideo -s 1920*1080 -pix_fmt rgb24 -f null /dev/null
The fps increases from 389 to 640 on my local machine.
Signed-off-by: Ting Fu
---
libswscale/x86/yuv2rgb.c | 8 +-
libswscale/x86/yuv2rgb_template.c | 58 ++-
libswscale/x86/yuv_2_rgb.asm | 162 +++---
3 files changed, 209 insertions(+), 19 deletions(-)
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index ed9b613cab..b83dd7089a 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -61,13 +61,19 @@ DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
#define COMPILE_TEMPLATE_MMXEXT 1
#endif /* HAVE_MMXEXT */
+//SSSE3 versions
+#if HAVE_SSSE3
+#define COMPILE_TEMPLATE_SSSE3 1
+#endif
+
#include "yuv2rgb_template.c"
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
-if (EXTERNAL_MMX(cpu_flags) || EXTERNAL_MMXEXT(cpu_flags)) {
+if (EXTERNAL_MMX(cpu_flags) || EXTERNAL_MMXEXT(cpu_flags) ||
+EXTERNAL_SSSE3(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index efe6356f30..fe586047f0 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -40,6 +40,30 @@
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
+extern void ff_yuv_420_rgb24_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+ const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+ const uint8_t *py_2index);
+extern void ff_yuv_420_bgr24_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+ const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+ const uint8_t *py_2index);
+extern void ff_yuv_420_rgb15_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+ const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+ const uint8_t *py_2index);
+extern void ff_yuv_420_rgb16_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+ const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+ const uint8_t *py_2index);
+extern void ff_yuv_420_rgb32_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+ const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+ const uint8_t *py_2index);
+extern void ff_yuv_420_bgr32_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+ const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+ const uint8_t *py_2index);
+extern void ff_yuva_420_rgb32_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+const uint8_t *py_2index, const uint8_t
*pa_2index);
+extern void ff_yuva_420_bgr32_ssse3(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
+const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
+const uint8_t *py_2index, const uint8_t
*pa_2index);
extern void ff_yuv_420_rgb24_mmxext(x86_reg index, uint8_t *image, const
uint8_t *pu_index,
const uint8_t *pv_index, const uint8_t
*pointer_c_dither,
const uint8_t *py_2index);
@@ -84,7 +108,12 @@ static inline int yuv420_rgb15(SwsContext *c, const uint8_t *src[],
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
+
+#if COMPILE_TEMPLATE_SSSE3
+ff_yuv_420_rgb15_ssse3(index, image, pu - index, pv - index,
&(c->redDither), py - 2 * index);
+#else
ff_yuv_420_rgb15_mmx(index, image, pu - index, pv - index,
&(c->redDither), py - 2 * index);
+#endif
}
return srcSliceH;
}
@@ -102,7 +131,12 @@ static inline int yuv420_rgb16(SwsContext *c, const uint8_t *src[],
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
+
+#if COMPILE_TEMPLATE_SSSE3
+ff_yuv_420_rgb16_ssse3(index, image, pu - index, pv - index,
&(c->redDither), py - 2 * index);
+#else
ff_yuv_420_rgb16_mmx(index, image, pu - index, pv - index,
&(c->redDither), py - 2 * index);
+#endif
}
return srcSliceH;
}
@@ -115,7 +149,9 @@ static inline int yuv420_rgb24(SwsContext *c, const uint8_t *src[],
int