Re: [FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders

2014-11-26 Thread Kieran Kunhya
>  v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
> -v210_enc_chroma_shuf2_8: db 4,-1,5,-1,6,-1,7,-1,12,-1,13,-1,14,-1,15,-1
> +v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
>
>  v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0

Thanks

Th
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders

2014-11-26 Thread Michael Niedermayer
On Wed, Nov 26, 2014 at 05:44:37PM +0100, Michael Niedermayer wrote:
> On Wed, Nov 26, 2014 at 03:59:14PM +0000, Kieran Kunhya wrote:
> > ---
> >  libavcodec/v210enc.c  | 191 +-
> >  libavcodec/v210enc.h  |  33 
> >  libavcodec/x86/Makefile   |   2 +
> >  libavcodec/x86/v210enc.asm| 146 
> >  libavcodec/x86/v210enc_init.c |  37 
> >  libavutil/x86/x86util.asm |   5 ++
> >  6 files changed, 373 insertions(+), 41 deletions(-)
> >  create mode 100644 libavcodec/v210enc.h
> >  create mode 100644 libavcodec/x86/v210enc.asm
> >  create mode 100644 libavcodec/x86/v210enc_init.c
> 
> breaks fate, also judging purely from the numbers it would be worse
> quality wise
> 
> --- ./tests/ref/vsynth/vsynth1-v210 2014-11-26 16:30:08.642024147 +0100
> +++ tests/data/fate/vsynth1-v210 2014-11-26 17:41:43.134114620 +0100
> @@ -1,4 +1,4 @@
> -895d30660eb4da017568141a8d1df4e8 *tests/data/fate/vsynth1-v210.avi
> +a98ae536d6362c6841744ef42e9ea8e1 *tests/data/fate/vsynth1-v210.avi
>  14752448 tests/data/fate/vsynth1-v210.avi
> -50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo
> -stddev:1.85 PSNR: 42.78 MAXDIFF:   29 bytes:  7603200/  7603200
> +fc1d9531a1d2c05b0099fb93a47f4e05 *tests/data/fate/vsynth1-v210.out.rawvideo
> +stddev:9.56 PSNR: 28.52 MAXDIFF:  196 bytes:  7603200/  7603200

this fixes it (the only thing remaining is some rounding difference from sws and
v210 converting a bit differently)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 9442bb2..3245de3 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -39,7 +39,7 @@ v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
 v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0

 v210_enc_chroma_shuf1_8: db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
-v210_enc_chroma_shuf2_8: db 4,-1,5,-1,6,-1,7,-1,12,-1,13,-1,14,-1,15,-1
+v210_enc_chroma_shuf2_8: db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1

 v210_enc_chroma_mult_8: dw 4,16,64,0,64,4,16,0







> 
> PS: Thanks for posting a patch which applied cleanly!
> 
> [...]
> -- 
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> No human being will ever know the Truth, for even if they happen to say it
> by chance, they would not even know they had done so. -- Xenophanes



> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

When you are offended at any man's fault, turn to yourself and study your
own failings. Then you will forget your anger. -- Epictetus


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders

2014-11-26 Thread Michael Niedermayer
On Wed, Nov 26, 2014 at 03:59:14PM +0000, Kieran Kunhya wrote:
> ---
>  libavcodec/v210enc.c  | 191 +-
>  libavcodec/v210enc.h  |  33 
>  libavcodec/x86/Makefile   |   2 +
>  libavcodec/x86/v210enc.asm| 146 
>  libavcodec/x86/v210enc_init.c |  37 
>  libavutil/x86/x86util.asm |   5 ++
>  6 files changed, 373 insertions(+), 41 deletions(-)
>  create mode 100644 libavcodec/v210enc.h
>  create mode 100644 libavcodec/x86/v210enc.asm
>  create mode 100644 libavcodec/x86/v210enc_init.c

breaks fate, also judging purely from the numbers it would be worse
quality wise

--- ./tests/ref/vsynth/vsynth1-v210 2014-11-26 16:30:08.642024147 +0100
+++ tests/data/fate/vsynth1-v210 2014-11-26 17:41:43.134114620 +0100
@@ -1,4 +1,4 @@
-895d30660eb4da017568141a8d1df4e8 *tests/data/fate/vsynth1-v210.avi
+a98ae536d6362c6841744ef42e9ea8e1 *tests/data/fate/vsynth1-v210.avi
 14752448 tests/data/fate/vsynth1-v210.avi
-50973792d3f1abe04a51ee0121f077f2 *tests/data/fate/vsynth1-v210.out.rawvideo
-stddev:1.85 PSNR: 42.78 MAXDIFF:   29 bytes:  7603200/  7603200
+fc1d9531a1d2c05b0099fb93a47f4e05 *tests/data/fate/vsynth1-v210.out.rawvideo
+stddev:9.56 PSNR: 28.52 MAXDIFF:  196 bytes:  7603200/  7603200

PS: Thanks for posting a patch which applied cleanly!

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No human being will ever know the Truth, for even if they happen to say it
by chance, they would not even know they had done so. -- Xenophanes


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] v210enc: Add SIMD optimised 8-bit and 10-bit encoders

2014-11-26 Thread Kieran Kunhya
---
 libavcodec/v210enc.c  | 191 +-
 libavcodec/v210enc.h  |  33 
 libavcodec/x86/Makefile   |   2 +
 libavcodec/x86/v210enc.asm| 146 
 libavcodec/x86/v210enc_init.c |  37 
 libavutil/x86/x86util.asm |   5 ++
 6 files changed, 373 insertions(+), 41 deletions(-)
 create mode 100644 libavcodec/v210enc.h
 create mode 100644 libavcodec/x86/v210enc.asm
 create mode 100644 libavcodec/x86/v210enc_init.c

diff --git a/libavcodec/v210enc.c b/libavcodec/v210enc.c
index 1e53bdb..0d40f99 100644
--- a/libavcodec/v210enc.c
+++ b/libavcodec/v210enc.c
@@ -24,82 +24,190 @@
 #include "avcodec.h"
 #include "bytestream.h"
 #include "internal.h"
+#include "v210enc.h"
+
+#define CLIP(v) av_clip(v, 4, 1019)
+#define CLIP8(v) av_clip(v, 1, 254)
+
+#define WRITE_PIXELS(a, b, c)   \
+do {\
+val =   CLIP(*a++); \
+val |= (CLIP(*b++) << 10) | \
+   (CLIP(*c++) << 20);  \
+AV_WL32(dst, val);  \
+dst += 4;   \
+} while (0)
+
+#define WRITE_PIXELS8(a, b, c)  \
+do {\
+val =  (CLIP8(*a++) << 2);   \
+val |= (CLIP8(*b++) << 12) | \
+   (CLIP8(*c++) << 22);  \
+AV_WL32(dst, val);  \
+dst += 4;   \
+} while (0)
+
+static void v210_planar_pack_8_c(const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, uint8_t *dst, ptrdiff_t width)
+{
+uint32_t val;
+int i;
+
+/* unroll this to match the assembly */
+for( i = 0; i < width-11; i += 12 ){
+WRITE_PIXELS8(u, y, v);
+WRITE_PIXELS8(y, u, y);
+WRITE_PIXELS8(v, y, u);
+WRITE_PIXELS8(y, v, y);
+WRITE_PIXELS8(u, y, v);
+WRITE_PIXELS8(y, u, y);
+WRITE_PIXELS8(v, y, u);
+WRITE_PIXELS8(y, v, y);
+}
+}
+
+static void v210_planar_pack_10_c(const uint16_t *y, const uint16_t *u,
+  const uint16_t *v, uint8_t *dst, ptrdiff_t width)
+{
+uint32_t val;
+int i;
+
+for( i = 0; i < width-5; i += 6 ){
+WRITE_PIXELS(u, y, v);
+WRITE_PIXELS(y, u, y);
+WRITE_PIXELS(v, y, u);
+WRITE_PIXELS(y, v, y);
+}
+}
 
 static av_cold int encode_init(AVCodecContext *avctx)
 {
+V210EncContext *s = avctx->priv_data;
+
 if (avctx->width & 1) {
 av_log(avctx, AV_LOG_ERROR, "v210 needs even width\n");
 return AVERROR(EINVAL);
 }
 
-if (avctx->bits_per_raw_sample != 10)
-av_log(avctx, AV_LOG_WARNING, "bits per raw sample: %d != 10-bit\n",
-   avctx->bits_per_raw_sample);
-
 avctx->coded_frame = av_frame_alloc();
 if (!avctx->coded_frame)
 return AVERROR(ENOMEM);
 
 avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
 
+s->pack_line_8  = v210_planar_pack_8_c;
+s->pack_line_10 = v210_planar_pack_10_c;
+
+if (ARCH_X86)
+ff_v210enc_init_x86(s);
+
 return 0;
 }
 
 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 const AVFrame *pic, int *got_packet)
 {
+V210EncContext *s = avctx->priv_data;
+
 int aligned_width = ((avctx->width + 47) / 48) * 48;
 int stride = aligned_width * 8 / 3;
 int line_padding = stride - ((avctx->width * 8 + 11) / 12) * 4;
 int h, w, ret;
-const uint16_t *y = (const uint16_t*)pic->data[0];
-const uint16_t *u = (const uint16_t*)pic->data[1];
-const uint16_t *v = (const uint16_t*)pic->data[2];
-PutByteContext p;
+uint8_t *dst;
 
-if ((ret = ff_alloc_packet2(avctx, pkt, avctx->height * stride)) < 0)
+if ((ret = ff_alloc_packet(pkt, avctx->height * stride)) < 0) {
+av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
 return ret;
+}
 
-bytestream2_init_writer(&p, pkt->data, pkt->size);
+dst = pkt->data;
+
+if (pic->format == AV_PIX_FMT_YUV422P10) {
+const uint16_t *y = (const uint16_t*)pic->data[0];
+const uint16_t *u = (const uint16_t*)pic->data[1];
+const uint16_t *v = (const uint16_t*)pic->data[2];
+for (h = 0; h < avctx->height; h++) {
+uint32_t val;
+w = (avctx->width / 6) * 6;
+s->pack_line_10(y, u, v, dst, w);
+
+y += w;
+u += w >> 1;
+v += w >> 1;
+dst += (w / 6) * 16;
+if (w < avctx->width - 1) {
+WRITE_PIXELS(u, y, v);
+
+val = CLIP(*y++);
+if (w == avctx->width - 2) {
+AV_WL32(dst, val);
+dst += 4;
+}
+}
+if (w < avctx->width - 3) {
+val |= (CLIP(*u++) << 10) | (CLIP(*y++) << 20);
+AV_WL32(dst, val);
+