Re: [FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders
> v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1 > -v210_enc_chroma_shuf2_8: db 4,-1,5,-1,6,-1,7,-1,12,-1,13,-1,14,-1,15,-1 > +v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1 > > v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0 Thanks Th ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders
On Wed, Nov 26, 2014 at 05:44:37PM +0100, Michael Niedermayer wrote: > On Wed, Nov 26, 2014 at 03:59:14PM +, Kieran Kunhya wrote: > > --- > > libavcodec/v210enc.c | 191 > > +- > > libavcodec/v210enc.h | 33 > > libavcodec/x86/Makefile | 2 + > > libavcodec/x86/v210enc.asm| 146 > > libavcodec/x86/v210enc_init.c | 37 > > libavutil/x86/x86util.asm | 5 ++ > > 6 files changed, 373 insertions(+), 41 deletions(-) > > create mode 100644 libavcodec/v210enc.h > > create mode 100644 libavcodec/x86/v210enc.asm > > create mode 100644 libavcodec/x86/v210enc_init.c > > breaks fate, also judging purely from the numbers it would be worse > quality wise > > --- ./tests/ref/vsynth/vsynth1-v210 2014-11-26 16:30:08.642024147 +0100 > +++ tests/data/fate/vsynth1-v210 2014-11-26 17:41:43.134114620 +0100 > @@ -1,4 +1,4 @@ > -895d30660eb4da017568141a8d1df4e8 *tests/data/fate/vsynth1-v210.avi > +a98ae536d6362c6841744ef42e9ea8e1 *tests/data/fate/vsynth1-v210.avi > 14752448 tests/data/fate/vsynth1-v210.avi > -50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo > -stddev:1.85 PSNR: 42.78 MAXDIFF: 29 bytes: 7603200/ 7603200 > +fc1d9531a1d2c05b0099fb93a47f4e05 *tests/data/fate/vsynth1-v210.out.rawvideo > +stddev:9.56 PSNR: 28.52 MAXDIFF: 196 bytes: 7603200/ 7603200 this fixes it (only remaining is some rounding difference from sws and v210 converting a bit differently) diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm index 9442bb2..3245de3 100644 --- a/libavcodec/x86/v210enc.asm +++ b/libavcodec/x86/v210enc.asm @@ -39,7 +39,7 @@ v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1 -v210_enc_chroma_shuf2_8: db 4,-1,5,-1,6,-1,7,-1,12,-1,13,-1,14,-1,15,-1 +v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1 v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0 > > PS: Thanks for posting a patch which applied cleanly!
> > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > No human being will ever know the Truth, for even if they happen to say it > by chance, they would not even know they had done so. -- Xenophanes > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB When you are offended at any man's fault, turn to yourself and study your own failings. Then you will forget your anger. -- Epictetus signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders
On Wed, Nov 26, 2014 at 03:59:14PM +, Kieran Kunhya wrote: > --- > libavcodec/v210enc.c | 191 > +- > libavcodec/v210enc.h | 33 > libavcodec/x86/Makefile | 2 + > libavcodec/x86/v210enc.asm| 146 > libavcodec/x86/v210enc_init.c | 37 > libavutil/x86/x86util.asm | 5 ++ > 6 files changed, 373 insertions(+), 41 deletions(-) > create mode 100644 libavcodec/v210enc.h > create mode 100644 libavcodec/x86/v210enc.asm > create mode 100644 libavcodec/x86/v210enc_init.c breaks fate, also judging purely from the numbers it would be worse quality wise --- ./tests/ref/vsynth/vsynth1-v210 2014-11-26 16:30:08.642024147 +0100 +++ tests/data/fate/vsynth1-v210 2014-11-26 17:41:43.134114620 +0100 @@ -1,4 +1,4 @@ -895d30660eb4da017568141a8d1df4e8 *tests/data/fate/vsynth1-v210.avi +a98ae536d6362c6841744ef42e9ea8e1 *tests/data/fate/vsynth1-v210.avi 14752448 tests/data/fate/vsynth1-v210.avi -50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo -stddev:1.85 PSNR: 42.78 MAXDIFF: 29 bytes: 7603200/ 7603200 +fc1d9531a1d2c05b0099fb93a47f4e05 *tests/data/fate/vsynth1-v210.out.rawvideo +stddev:9.56 PSNR: 28.52 MAXDIFF: 196 bytes: 7603200/ 7603200 PS: Thanks for posting a patch which applied cleanly! [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB No human being will ever know the Truth, for even if they happen to say it by chance, they would not even know they had done so. -- Xenophanes signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders
--- libavcodec/v210enc.c | 191 +- libavcodec/v210enc.h | 33 libavcodec/x86/Makefile | 2 + libavcodec/x86/v210enc.asm| 146 libavcodec/x86/v210enc_init.c | 37 libavutil/x86/x86util.asm | 5 ++ 6 files changed, 373 insertions(+), 41 deletions(-) create mode 100644 libavcodec/v210enc.h create mode 100644 libavcodec/x86/v210enc.asm create mode 100644 libavcodec/x86/v210enc_init.c diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c index 1e53bdb..0d40f99 100644 --- a/libavcodec/v210enc.c +++ b/libavcodec/v210enc.c @@ -24,82 +24,190 @@ #include "avcodec.h" #include "bytestream.h" #include "internal.h" +#include "v210enc.h" + +#define CLIP(v) av_clip(v, 4, 1019) +#define CLIP8(v) av_clip(v, 1, 254) + +#define WRITE_PIXELS(a, b, c) \ +do {\ +val = CLIP(*a++); \ +val |= (CLIP(*b++) << 10) | \ + (CLIP(*c++) << 20); \ +AV_WL32(dst, val); \ +dst += 4; \ +} while (0) + +#define WRITE_PIXELS8(a, b, c) \ +do {\ +val = (CLIP8(*a++) << 2); \ +val |= (CLIP8(*b++) << 12) | \ + (CLIP8(*c++) << 22); \ +AV_WL32(dst, val); \ +dst += 4; \ +} while (0) + +static void v210_planar_pack_8_c(const uint8_t *y, const uint8_t *u, + const uint8_t *v, uint8_t *dst, ptrdiff_t width) +{ +uint32_t val; +int i; + +/* unroll this to match the assembly */ +for( i = 0; i < width-11; i += 12 ){ +WRITE_PIXELS8(u, y, v); +WRITE_PIXELS8(y, u, y); +WRITE_PIXELS8(v, y, u); +WRITE_PIXELS8(y, v, y); +WRITE_PIXELS8(u, y, v); +WRITE_PIXELS8(y, u, y); +WRITE_PIXELS8(v, y, u); +WRITE_PIXELS8(y, v, y); +} +} + +static void v210_planar_pack_10_c(const uint16_t *y, const uint16_t *u, + const uint16_t *v, uint8_t *dst, ptrdiff_t width) +{ +uint32_t val; +int i; + +for( i = 0; i < width-5; i += 6 ){ +WRITE_PIXELS(u, y, v); +WRITE_PIXELS(y, u, y); +WRITE_PIXELS(v, y, u); +WRITE_PIXELS(y, v, y); +} +} static av_cold int encode_init(AVCodecContext *avctx) { +V210EncContext *s = avctx->priv_data; + if (avctx->width & 1) { av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n"); return AVERROR(EINVAL); } -if 
(avctx->bits_per_raw_sample != 10) -av_log(avctx, AV_LOG_WARNING, "bits per raw sample: %d != 10-bit\n", - avctx->bits_per_raw_sample); - avctx->coded_frame = av_frame_alloc(); if (!avctx->coded_frame) return AVERROR(ENOMEM); avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I; +s->pack_line_8 = v210_planar_pack_8_c; +s->pack_line_10 = v210_planar_pack_10_c; + +if (ARCH_X86) +ff_v210enc_init_x86(s); + return 0; } static int encode_frame(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *pic, int *got_packet) { +V210EncContext *s = avctx->priv_data; + int aligned_width = ((avctx->width + 47) / 48) * 48; int stride = aligned_width * 8 / 3; int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4; int h, w, ret; -const uint16_t *y = (const uint16_t*)pic->data[0]; -const uint16_t *u = (const uint16_t*)pic->data[1]; -const uint16_t *v = (const uint16_t*)pic->data[2]; -PutByteContext p; +uint8_t *dst; -if ((ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride)) < 0) +if ((ret = ff_alloc_packet(pkt, avctx->height * stride)) < 0) { +av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n"); return ret; +} -bytestream2_init_writer(&p, pkt->data, pkt->size); +dst = pkt->data; + +if (pic->format == AV_PIX_FMT_YUV422P10) { +const uint16_t *y = (const uint16_t*)pic->data[0]; +const uint16_t *u = (const uint16_t*)pic->data[1]; +const uint16_t *v = (const uint16_t*)pic->data[2]; +for (h = 0; h < avctx->height; h++) { +uint32_t val; +w = (avctx->width / 6) * 6; +s->pack_line_10(y, u, v, dst, w); + +y += w; +u += w >> 1; +v += w >> 1; +dst += (w / 6) * 16; +if (w < avctx->width - 1) { +WRITE_PIXELS(u, y, v); + +val = CLIP(*y++); +if (w == avctx->width - 2) { +AV_WL32(dst, val); +dst += 4; +} +} +if (w < avctx->width - 3) { +val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20); +AV_WL32(dst, val); +