On 8/16/17, James Almer <jamr...@gmail.com> wrote: > On 8/15/2017 5:25 PM, Paul B Mahol wrote: >> From f25f0022fbc675affd65b95f097fa62e55788a37 Mon Sep 17 00:00:00 2001 >> From: Paul B Mahol <one...@gmail.com> >> Date: Tue, 15 Aug 2017 20:12:32 +0200 >> Subject: [PATCH] avfilter/vf_transpose: rewrite for x86 SIMD >> >> Transpose first in chunks of 8x8 blocks. >> 15% faster overall. >> --- >> libavfilter/vf_transpose.c | 184 >> +++++++++++++++++++++++++++++++++++---------- >> 1 file changed, 143 insertions(+), 41 deletions(-) >> >> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c >> index 75b4dda41f..2fa751c925 100644 >> --- a/libavfilter/vf_transpose.c >> +++ b/libavfilter/vf_transpose.c >> @@ -58,6 +58,12 @@ typedef struct TransContext { >> >> int passthrough; ///< PassthroughType, landscape passthrough mode >> enabled >> int dir; ///< TransposeDir >> + >> + void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize); >> + void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize, >> + int w, int h); >> } TransContext; >> >> static int query_formats(AVFilterContext *ctx) >> @@ -79,6 +85,109 @@ static int query_formats(AVFilterContext *ctx) >> return ff_set_common_formats(ctx, pix_fmts); >> } >> >> +static av_always_inline void transpose_block_8_c(uint8_t *src, ptrdiff_t >> src_linesize, > > Is always_inline needed? Shouldn't inline be enough for the 8x8 functions?
Changed. > >> + uint8_t *dst, ptrdiff_t >> dst_linesize, >> + int w, int h) >> +{ >> + int x, y; >> + for (y = 0; y < h; y++, dst += dst_linesize, src++) >> + for (x = 0; x < w; x++) >> + dst[x] = src[x*src_linesize]; >> +} >> + >> +static void transpose_8x8_8_c(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize) >> +{ >> + transpose_block_8_c(src, src_linesize, dst, dst_linesize, 8, 8); >> +} >> + >> +static av_always_inline void transpose_block_16_c(uint8_t *src, ptrdiff_t >> src_linesize, >> + uint8_t *dst, ptrdiff_t >> dst_linesize, >> + int w, int h) >> +{ >> + int x, y; >> + for (y = 0; y < h; y++, dst += dst_linesize, src += 2) >> + for (x = 0; x < w; x++) >> + *((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src + >> x*src_linesize)); > > Use local uint16_t* pointers instead of casting inside the loop. It will > probably make no difference for compilers but it helps readability. > > Same for the cases below. > For the src case it's not trivial; feel free to do it in another commit. 
>> +} >> + >> +static void transpose_8x8_16_c(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize) >> +{ >> + transpose_block_16_c(src, src_linesize, dst, dst_linesize, 8, 8); >> +} >> + >> +static av_always_inline void transpose_block_24_c(uint8_t *src, ptrdiff_t >> src_linesize, >> + uint8_t *dst, ptrdiff_t >> dst_linesize, >> + int w, int h) >> +{ >> + int x, y; >> + for (y = 0; y < h; y++, dst += dst_linesize) { >> + for (x = 0; x < w; x++) { >> + int32_t v = AV_RB24(src + x*src_linesize + y*3); >> + AV_WB24(dst + 3*x, v); >> + } >> + } >> +} >> + >> +static void transpose_8x8_24_c(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize) >> +{ >> + transpose_block_24_c(src, src_linesize, dst, dst_linesize, 8, 8); >> +} >> + >> +static av_always_inline void transpose_block_32_c(uint8_t *src, ptrdiff_t >> src_linesize, >> + uint8_t *dst, ptrdiff_t >> dst_linesize, >> + int w, int h) >> +{ >> + int x, y; >> + for (y = 0; y < h; y++, dst += dst_linesize, src += 4) { >> + for (x = 0; x < w; x++) >> + *((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src + >> x*src_linesize)); >> + } >> +} >> + >> +static void transpose_8x8_32_c(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize) >> +{ >> + transpose_block_32_c(src, src_linesize, dst, dst_linesize, 8, 8); >> +} >> + >> +static av_always_inline void transpose_block_48_c(uint8_t *src, ptrdiff_t >> src_linesize, >> + uint8_t *dst, ptrdiff_t >> dst_linesize, >> + int w, int h) >> +{ >> + int x, y; >> + for (y = 0; y < h; y++, dst += dst_linesize, src += 6) { >> + for (x = 0; x < w; x++) { >> + int64_t v = AV_RB48(src + x*src_linesize); >> + AV_WB48(dst + 6*x, v); >> + } >> + } >> +} >> + >> +static void transpose_8x8_48_c(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize) >> +{ >> + transpose_block_48_c(src, src_linesize, dst, dst_linesize, 8, 8); >> +} >> + >> +static av_always_inline void 
transpose_block_64_c(uint8_t *src, ptrdiff_t >> src_linesize, >> + uint8_t *dst, ptrdiff_t >> dst_linesize, >> + int w, int h) >> +{ >> + int x, y; >> + for (y = 0; y < h; y++, dst += dst_linesize, src += 8) >> + for (x = 0; x < w; x++) >> + *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + >> x*src_linesize)); >> +} >> + >> +static void transpose_8x8_64_c(uint8_t *src, ptrdiff_t src_linesize, >> + uint8_t *dst, ptrdiff_t dst_linesize) >> +{ >> + transpose_block_64_c(src, src_linesize, dst, dst_linesize, 8, 8); >> +} >> + > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel