Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-27 Thread Hendrik Leppkes
On Wed, Sep 27, 2017 at 12:04 PM, Mateusz  wrote:
>
> OK, so this fight with possible overflow is even more needed.
>

Luckily x86 SIMD has saturating instructions which don't overflow, so
if we devise a way to properly optimize this in yasm/nasm assembly,
then this should be pretty simple to do.

- Hendrik
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-27 Thread Mateusz
W dniu 2017-09-26 o 13:31, Carl Eugen Hoyos pisze:
> 2017-09-26 1:33 GMT+02:00 Mateusz :
> 
>> I've sent C code patch 2017-09-06 (and nothing) so I thought that the
>> problem is with speed. For simplicity I've attached this patch.
> 
> You could (wait a day or two and) either add an option to
> select your dithering code or put it under #ifdef so more
> people can test it.

I've attached a patch that does nothing unless you specify
--extra-cflags="-DNEW_DITHER_COPY" or export CFLAGS="-DNEW_DITHER_COPY"

>> In theory it is enough to make only dst = (src + dither)>>shift;
>> -- white in limited range has 0 on bits to remove (235*4 for example)
>> so overflow is impossible. For files with full range not marked as
>> full range overflow is possible (for dither > 0) and white goes
>> to black. tmp - (tmp>>dst_depth) undoing this overflow.
> 
> (Not necessarily related, sorry if I misunderstand:)
> Valid limited-range frames can contain some pixels with peak
> values outside of the defined range.
> 
> Carl Eugen

OK, so guarding against possible overflow is even more necessary.

Mateusz
From 2c0adb2d9a0fc0fbbffc643d27860fbb779c08fc Mon Sep 17 00:00:00 2001
From: Mateusz 
Date: Tue, 26 Sep 2017 22:20:10 +0200
Subject: [PATCH] swscale: new precise DITHER_COPY macro (if
 "-DNEW_DITHER_COPY" CFLAGS)

---
 libswscale/swscale_unscaled.c | 47 ++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index ef36aec..0d41695 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -110,6 +110,7 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
   { 112, 16,104,  8,118, 22,110, 14,},
 }};
 
+#ifndef NEW_DITHER_COPY
 static const uint16_t dither_scale[15][16]={
 {2,3,3,5,5,5,5,5,5,5,5,5,
5,5,5,5,},
 {2,3,7,7,   13,   13,   25,   25,   25,   25,   25,   25,   
25,   25,   25,   25,},
@@ -127,7 +128,7 @@ static const uint16_t dither_scale[15][16]={
 {3,5,7,9,   10,   12,   14,   14,   14,   14,   14,   14,   
14,   15,32767,32767,},
 {3,5,7,9,   11,   12,   14,   15,   15,   15,   15,   15,   
15,   15,   16,65535,},
 };
-
+#endif
 
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
   uint8_t val)
@@ -1501,6 +1502,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t 
*src[],
 return srcSliceH;
 }
 
+#ifndef NEW_DITHER_COPY
 #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
 uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
 int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
@@ -1521,6 +1523,49 @@ static int packedCopyWrapper(SwsContext *c, const 
uint8_t *src[],
 dst += dstStride;\
 src += srcStride;\
 }
+#else
+#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
+unsigned shift= src_depth-dst_depth, tmp;\
+if (shiftonly) {\
+for (i = 0; i < height; i++) {\
+const uint8_t *dither= dithers[shift-1][i&7];\
+for (j = 0; j < length-7; j+=8){\
+tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = 
dbswap(tmp - (tmp>>dst_depth));\
+}\
+for (; j < length; j++){\
+tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = 
dbswap(tmp - (tmp>>dst_depth));\
+}\
+dst += dstStride;\
+src += srcStride;\
+}\
+} else {\
+for (i = 0; i < height; i++) {\
+const uint8_t *dither= dithers[shift-1][i&7];\
+for (j = 0; j < length-7; j+=8){\
+tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - 
(tmp>>dst_depth) + dither[0])>>shift);\
+tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - 
(tmp>>dst_depth) + dither[1])>>shift);\
+tmp = bswap(src[j+2]); dst[j+2] = dbswap((tmp - 
(tmp>>dst_depth) + dither[2])>>shift);\
+tmp = bswap(src[j+3]); dst[j+3] = dbswap((tmp - 
(tmp>>dst_depth) + dither[3])>>shift);\
+tmp = bswap(src[j+4]); dst[j+4] = dbswap((tmp - 
(tmp>>dst_depth) + dither[

Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-26 Thread Carl Eugen Hoyos
2017-09-26 1:33 GMT+02:00 Mateusz :

> I've sent C code patch 2017-09-06 (and nothing) so I thought that the
> problem is with speed. For simplicity I've attached this patch.

You could (wait a day or two and) either add an option to
select your dithering code or put it under #ifdef so more
people can test it.

[...]

> In theory it is enough to make only dst = (src + dither)>>shift;
> -- white in limited range has 0 on bits to remove (235*4 for example)
> so overflow is impossible. For files with full range not marked as
> full range overflow is possible (for dither > 0) and white goes
> to black. tmp - (tmp>>dst_depth) undoing this overflow.

(Not necessarily related, sorry if I misunderstand:)
Valid limited-range frames can contain some pixels with peak
values outside of the defined range.

Carl Eugen
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-25 Thread Mateusz
W dniu 2017-09-25 o 22:53, Carl Eugen Hoyos pisze:
> 2017-09-23 19:18 GMT+02:00 Mateusz :
>> W dniu 2017-09-23 o 17:01, Michael Niedermayer pisze:
>>> On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote:
 To reduce bit depth in planar YUV or gray pixel formats
 ffmpeg uses DITHER_COPY macro.
 Now it makes images greener and with visible dither pattern.

 In my opinion there is no point to use dither tables for
 destination bit depth >= 9, we can use simple down-shift
 which is neutral in full and limited range -- result images
 are with the same brightness and with the same colors.
>>>
>>> Theres no reason why dither should mess up the average color tone.
>>
>> In theory -- yes, I agree.
>> In reality -- current version of DITHER_COPY mess up the
>> average color tone.
>> It's one of the reasons why I sending these patches.
>>
>>> And if the user asks for >= 9 bit depth and has >= 10 bit
>>> input the user likely wants to get the best quality.
>>> Thats more so in a world where computers get faster
>>> every few years, this isnt 1995 where shaving off a add
>>> or a multiply per pixel was actually making a difference
>>> in being able to play something in realtime
>>> More so coverting between bit depths might be memory
>>> speed limited and not limited by arithmetic computations
>>> once its done in SIMD
>>
>> Yes, I agree. Now I can't write patches in NASM syntax, but
>> I started reading and learning.
>> I hope I'll back in a few months...
> 
> I strongly suspect there should be agreement over the C code
> first, asm optimizations can be done once C code is agreed
> upon.
> 
> Thank you for the samples, Carl Eugen

I sent a C code patch on 2017-09-06 (and got no response), so I thought that
the problem was with speed. For simplicity I've attached this patch.

This code fulfills all of the following rules (for both limited and full range):
white -> white
black -> black
white-1 -> gray (it means white-1 -> white usually but sometimes white-1 -> 
white-1)
black+1 -> gray (it means black+1 -> black usually but sometimes black+1 -> 
black+1)
increase bitdepth & decrease bitdepth = identity

About code:
limited range (shiftonly == 1)
tmp = (src + dither)>>shift; dst = tmp - (tmp>>dst_depth);

In theory it is enough to compute only dst = (src + dither)>>shift;
-- white in limited range has 0 in the bits to be removed (235*4 for example)
so overflow is impossible. For full-range files that are not marked as
full range, overflow is possible (for dither > 0) and white goes
to black. The expression tmp - (tmp>>dst_depth) undoes this overflow.

full range (shiftonly == 0)
dst = (src - (src>>dst_depth) + dither)>>shift;

If we want to remove 2 bits from source (for example 10-bit -> 8-bit)
white in full range: ...111|11
white-1 in full range: ...111|10
so we should subtract dither for rule 'white-1 -> gray'
black in full range: ...000|00
black+1 in full range: ...000|01
so we should add dither for rule 'black+1 -> gray'
(src>>dst_depth) is 0 for small values (close to black) and equals the maximum
possible dither value for large values (close to white)

For the rule 'increase bitdepth & decrease bitdepth = identity' in full range,
increasing the bitdepth is done as: (v<<(dst_depth-src_depth)) | 
(v>>(2*src_depth-dst_depth))
so (src - (src>>dst_depth))>>shift is exactly the inverse operation when we
are decreasing the bitdepth.
For full range, when 'src' comes from an increased 'v',
(src - (src>>dst_depth)) is equal to 'v' shifted only, so we are in the same
situation as in limited range.
And overflow is impossible in the operation (src - (src>>dst_depth) + dither).

Mateusz
From fd9e271ea531d25bc5a708d0dfeb1be5415b90d0 Mon Sep 17 00:00:00 2001
From: Mateusz 
Date: Wed, 6 Sep 2017 09:05:02 +0200
Subject: [PATCH] fix DITHER_COPY macro according to shiftonly state

---
 libswscale/swscale_unscaled.c | 73 ++-
 1 file changed, 38 insertions(+), 35 deletions(-)

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index ef36aec..e3e375a 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -110,24 +110,6 @@ DECLARE_ALIGNED(8, static const uint8_t, 
dithers)[8][8][8]={
   { 112, 16,104,  8,118, 22,110, 14,},
 }};
 
-static const uint16_t dither_scale[15][16]={
-{2,3,3,5,5,5,5,5,5,5,5,5,
5,5,5,5,},
-{2,3,7,7,   13,   13,   25,   25,   25,   25,   25,   25,   
25,   25,   25,   25,},
-{3,3,4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  
113,  113,  113,  113,},
-{3,4,4,5,   31,   31,   61,  121,  241,  241,  241,  241,  
481,  481,  481,  481,},
-{3,4,5,5,6,   63,   63,  125,  249,  497,  993,  993,  
993,  993,  993, 1985,},
-{3,5,6,6,6,7,  127,  127,  253,  505, 1009, 2017, 
4033, 4033, 4033, 4033,},
-{3,5,6,7,7,7,8,  255,  255,  509, 1017, 2033, 
4065, 8129,16257,16257,},
-{3,5,6,8,8,8,8,9,  511,  511, 1021, 2041, 
4081

Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-25 Thread Carl Eugen Hoyos
2017-09-23 19:18 GMT+02:00 Mateusz :
> W dniu 2017-09-23 o 17:01, Michael Niedermayer pisze:
>> On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote:
>>> To reduce bit depth in planar YUV or gray pixel formats
>>> ffmpeg uses DITHER_COPY macro.
>>> Now it makes images greener and with visible dither pattern.
>>>
>>> In my opinion there is no point to use dither tables for
>>> destination bit depth >= 9, we can use simple down-shift
>>> which is neutral in full and limited range -- result images
>>> are with the same brightness and with the same colors.
>>
>> Theres no reason why dither should mess up the average color tone.
>
> In theory -- yes, I agree.
> In reality -- current version of DITHER_COPY mess up the
> average color tone.
> It's one of the reasons why I sending these patches.
>
>> And if the user asks for >= 9 bit depth and has >= 10 bit
>> input the user likely wants to get the best quality.
>> Thats more so in a world where computers get faster
>> every few years, this isnt 1995 where shaving off a add
>> or a multiply per pixel was actually making a difference
>> in being able to play something in realtime
>> More so coverting between bit depths might be memory
>> speed limited and not limited by arithmetic computations
>> once its done in SIMD
>
> Yes, I agree. Now I can't write patches in NASM syntax, but
> I started reading and learning.
> I hope I'll back in a few months...

I strongly suspect there should be agreement over the C code
first, asm optimizations can be done once C code is agreed
upon.

Thank you for the samples, Carl Eugen
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-25 Thread Mateusz
W dniu 2017-09-25 o 01:42, Carl Eugen Hoyos pisze:
> 2017-09-23 19:18 GMT+02:00 Mateusz :
> 
>> In reality -- current version of DITHER_COPY mess
>> up the average color tone.
> 
> You could explain how we can reproduce this.

Please take any video with colors in which a shift toward green is easy to see --
it could be one with human faces, for example the freely available
http://media.xiph.org/video/derf/y4m/KristenAndSara_1280x720_60.y4m

Copy the video to o.y4m and run several iterations of the 8-bit -> 10-bit -> 8-bit 
conversions:
ffmpeg -i o.y4m -y -strict -1 -pix_fmt yuv420p10 t10.y4m
ffmpeg -i t10.y4m -y -strict -1 -pix_fmt yuv420p o.y4m

Please watch the resulting video -- it should be greener, darker, and show a visible dither pattern.

You can watch the 'KristenAndSara' video after 100 iterations:
http://www.msystem.waw.pl/x265/limited.mkv

And the 100th iteration of 'KristenAndSara' with the '-color_range 2' option:
http://www.msystem.waw.pl/x265/full.mkv

Currently DITHER_COPY works incorrectly in both cases (full and limited range).

Mateusz

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-24 Thread Carl Eugen Hoyos
2017-09-23 19:18 GMT+02:00 Mateusz :

> In reality -- current version of DITHER_COPY mess
> up the average color tone.

You could explain how we can reproduce this.

Carl Eugen
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-23 Thread Mateusz
W dniu 2017-09-23 o 17:01, Michael Niedermayer pisze:
> On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote:
>> To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses 
>> DITHER_COPY macro.
>> Now it makes images greener and with visible dither pattern.
>>
>> In my opinion there is no point to use dither tables for destination bit 
>> depth >= 9,
>> we can use simple down-shift which is neutral in full and limited range -- 
>> result images
>> are with the same brightness and with the same colors.
> 
> Theres no reason why dither should mess up the average color tone.

In theory -- yes, I agree.
In reality -- the current version of DITHER_COPY messes up the average color tone.
That is one of the reasons why I am sending these patches.

> And if the user asks for >= 9 bit depth and has >= 10 bit input the
> user likely wants to get the best quality.
> Thats more so in a world where computers get faster every few years,
> this isnt 1995 where shaving off a add or a multiply per pixel was
> actually making a difference in being able to play something in
> realtime
> More so coverting between bit depths might be memory speed limited and
> not limited by arithmetic computations once its done in SIMD

Yes, I agree. Right now I can't write patches in NASM syntax, but I have started reading 
and learning.
I hope I'll be back in a few months...

Mateusz

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-23 Thread Michael Niedermayer
On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote:
> To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses 
> DITHER_COPY macro.
> Now it makes images greener and with visible dither pattern.
> 
> In my opinion there is no point to use dither tables for destination bit 
> depth >= 9,
> we can use simple down-shift which is neutral in full and limited range -- 
> result images
> are with the same brightness and with the same colors.

There's no reason why dither should mess up the average color tone.
And if the user asks for >= 9 bit depth and has >= 10 bit input, the
user likely wants to get the best quality.
That's even more so in a world where computers get faster every few years;
this isn't 1995, where shaving off an add or a multiply per pixel was
actually making a difference in being able to play something in
realtime.
Moreover, converting between bit depths might be memory-speed limited and
not limited by arithmetic computations once it's done in SIMD.


[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I am the wisest man alive, for I know one thing, and that is that I know
nothing. -- Socrates


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-21 Thread Hendrik Leppkes
On Fri, Sep 22, 2017 at 2:10 AM, Mateusz  wrote:
> To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses 
> DITHER_COPY macro.
> Now it makes images greener and with visible dither pattern.
>
> In my opinion there is no point to use dither tables for destination bit 
> depth >= 9,
> we can use simple down-shift which is neutral in full and limited range -- 
> result images
> are with the same brightness and with the same colors.
>
> For destination bit depth == 8 we could use new bit exact precise DITHER_COPY 
> macro
> (which is slower).
>

Why would the target bitdepth matter? Both 8 and 9 bit (and any other)
should follow the same rules for a result, there is full and limited
range minimum and maximum in all of them.
In my mind, the logic should be independent of the target bitdepth,
perhaps minus optimizations for speed for the more common cases.

- Hendrik
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8

2017-09-21 Thread Mateusz
To reduce bit depth in planar YUV or gray pixel formats, ffmpeg uses the DITHER_COPY 
macro.
Currently it makes images greener and leaves a visible dither pattern.

In my opinion there is no point in using dither tables for destination bit depth 
>= 9;
we can use a simple down-shift, which is neutral in full and limited range -- 
the resulting images
have the same brightness and the same colors.

For destination bit depth == 8 we could use a new bit-exact, precise DITHER_COPY 
macro
(which is slower).

If the problem is with speed only, I've attached a second patch with Intel 
Intrinsics for x86_64
that makes the code faster (I don't see any Intel Intrinsics in ffmpeg, so it's 
probably for testing only).

Please review.

Mateusz

From a52417a3817ac774eb364bbef20c954a3d278d45 Mon Sep 17 00:00:00 2001
From: Mateusz 
Date: Fri, 22 Sep 2017 01:22:59 +0200
Subject: [PATCH] swscale_unscaled: fix and speed up DITHER_COPY macro for
 x86_64, use it only for dst_depth == 8

---
 libswscale/swscale_unscaled.c | 185 ++
 1 file changed, 150 insertions(+), 35 deletions(-)

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index ef36aec..7d1cbed 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -35,6 +35,10 @@
 #include "libavutil/avassert.h"
 #include "libavutil/avconfig.h"
 
+#if ARCH_X86_64
+#include 
+#endif
+
 DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
 {
   {   0,  1,  0,  1,  0,  1,  0,  1,},
@@ -110,24 +114,6 @@ DECLARE_ALIGNED(8, static const uint8_t, 
dithers)[8][8][8]={
   { 112, 16,104,  8,118, 22,110, 14,},
 }};
 
-static const uint16_t dither_scale[15][16]={
-{2,3,3,5,5,5,5,5,5,5,5,5,
5,5,5,5,},
-{2,3,7,7,   13,   13,   25,   25,   25,   25,   25,   25,   
25,   25,   25,   25,},
-{3,3,4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  
113,  113,  113,  113,},
-{3,4,4,5,   31,   31,   61,  121,  241,  241,  241,  241,  
481,  481,  481,  481,},
-{3,4,5,5,6,   63,   63,  125,  249,  497,  993,  993,  
993,  993,  993, 1985,},
-{3,5,6,6,6,7,  127,  127,  253,  505, 1009, 2017, 
4033, 4033, 4033, 4033,},
-{3,5,6,7,7,7,8,  255,  255,  509, 1017, 2033, 
4065, 8129,16257,16257,},
-{3,5,6,8,8,8,8,9,  511,  511, 1021, 2041, 
4081, 8161,16321,32641,},
-{3,5,7,8,9,9,9,9,   10, 1023, 1023, 2045, 
4089, 8177,16353,32705,},
-{3,5,7,8,   10,   10,   10,   10,   10,   11, 2047, 2047, 
4093, 8185,16369,32737,},
-{3,5,7,8,   10,   11,   11,   11,   11,   11,   12, 4095, 
4095, 8189,16377,32753,},
-{3,5,7,9,   10,   12,   12,   12,   12,   12,   12,   13, 
8191, 8191,16381,32761,},
-{3,5,7,9,   10,   12,   13,   13,   13,   13,   13,   13,   
14,16383,16383,32765,},
-{3,5,7,9,   10,   12,   14,   14,   14,   14,   14,   14,   
14,   15,32767,32767,},
-{3,5,7,9,   11,   12,   14,   15,   15,   15,   15,   15,   
15,   15,   16,65535,},
-};
-
 
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
   uint8_t val)
@@ -1502,22 +1488,127 @@ static int packedCopyWrapper(SwsContext *c, const 
uint8_t *src[],
 }
 
 #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
-uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
-int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
+unsigned shift= src_depth-dst_depth, tmp;\
+if (shiftonly) {\
+for (i = 0; i < height; i++) {\
+const uint8_t *dither= dithers[shift-1][i&7];\
+for (j = 0; j < length-7; j+=8) {\
+tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = 
dbswap(tmp - (tmp>>dst_depth));\
+tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = 
dbswap(tmp - (tmp>>dst_depth));\
+}\
+for (; j < length; j++) {\
+tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = 
dbswap(tmp - (tmp>>dst_depth));\
+}\
+dst += dstStride;\
+src += srcStride;\
+}\
+} else {\
+for (i = 0; i < heigh