Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
On Wed, Sep 27, 2017 at 12:04 PM, Mateusz wrote: > > OK, so this fight with possible overflow is even more needed. > Luckily x86 SIMD has saturating instructions which don't overflow, so if we devise a way to properly optimize this in yasm/nasm assembly, then this should be pretty simple to do. - Hendrik ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
W dniu 2017-09-26 o 13:31, Carl Eugen Hoyos pisze: > 2017-09-26 1:33 GMT+02:00 Mateusz : > >> I've sent C code patch 2017-09-06 (and nothing) so I thought that the >> problem is with speed. For simplicity I've attached this patch. > > You could (wait a day or two and) either add an option to > select your dithering code or put it under #ifdef so more > people can test it. I've attached patch that do nothing unless you specify --extra-cflags="-DNEW_DITHER_COPY" or export CFLAGS="-DNEW_DITHER_COPY" >> In theory it is enough to make only dst = (src + dither)>>shift; >> -- white in limited range has 0 on bits to remove (235*4 for example) >> so overflow is impossible. For files with full range not marked as >> full range overflow is possible (for dither > 0) and white goes >> to black. tmp - (tmp>>dst_depth) undoing this overflow. > > (Not necessarily related, sorry if I misunderstand:) > Valid limited-range frames can contain some pixels with peak > values outside of the defined range. > > Carl Eugen OK, so this fight with possible overflow is even more needed. 
Mateusz From 2c0adb2d9a0fc0fbbffc643d27860fbb779c08fc Mon Sep 17 00:00:00 2001 From: Mateusz Date: Tue, 26 Sep 2017 22:20:10 +0200 Subject: [PATCH] swscale: new precise DITHER_COPY macro (if "-DNEW_DITHER_COPY" CFLAGS) --- libswscale/swscale_unscaled.c | 47 ++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index ef36aec..0d41695 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -110,6 +110,7 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={ { 112, 16,104, 8,118, 22,110, 14,}, }}; +#ifndef NEW_DITHER_COPY static const uint16_t dither_scale[15][16]={ {2,3,3,5,5,5,5,5,5,5,5,5, 5,5,5,5,}, {2,3,7,7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,}, @@ -127,7 +128,7 @@ static const uint16_t dither_scale[15][16]={ {3,5,7,9, 10, 12, 14, 14, 14, 14, 14, 14, 14, 15,32767,32767,}, {3,5,7,9, 11, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16,65535,}, }; - +#endif static void fillPlane(uint8_t *plane, int stride, int width, int height, int y, uint8_t val) @@ -1501,6 +1502,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], return srcSliceH; } +#ifndef NEW_DITHER_COPY #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\ uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\ int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\ @@ -1521,6 +1523,49 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], dst += dstStride;\ src += srcStride;\ } +#else +#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\ +unsigned shift= src_depth-dst_depth, tmp;\ +if (shiftonly) {\ +for (i = 0; i < height; i++) {\ +const uint8_t *dither= dithers[shift-1][i&7];\ +for (j = 0; j < length-7; j+=8){\ +tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+2]) + 
dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\ +}\ +for (; j < length; j++){\ +tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = dbswap(tmp - (tmp>>dst_depth));\ +}\ +dst += dstStride;\ +src += srcStride;\ +}\ +} else {\ +for (i = 0; i < height; i++) {\ +const uint8_t *dither= dithers[shift-1][i&7];\ +for (j = 0; j < length-7; j+=8){\ +tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - (tmp>>dst_depth) + dither[0])>>shift);\ +tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - (tmp>>dst_depth) + dither[1])>>shift);\ +tmp = bswap(src[j+2]); dst[j+2] = dbswap((tmp - (tmp>>dst_depth) + dither[2])>>shift);\ +tmp = bswap(src[j+3]); dst[j+3] = dbswap((tmp - (tmp>>dst_depth) + dither[3])>>shift);\ +tmp = bswap(src[j+4]); dst[j+4] = dbswap((tmp - (tmp>>dst_depth) + dither[
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
2017-09-26 1:33 GMT+02:00 Mateusz : > I've sent C code patch 2017-09-06 (and nothing) so I thought that the > problem is with speed. For simplicity I've attached this patch. You could (wait a day or two and) either add an option to select your dithering code or put it under #ifdef so more people can test it. [...] > In theory it is enough to make only dst = (src + dither)>>shift; > -- white in limited range has 0 on bits to remove (235*4 for example) > so overflow is impossible. For files with full range not marked as > full range overflow is possible (for dither > 0) and white goes > to black. tmp - (tmp>>dst_depth) undoing this overflow. (Not necessarily related, sorry if I misunderstand:) Valid limited-range frames can contain some pixels with peak values outside of the defined range. Carl Eugen ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
W dniu 2017-09-25 o 22:53, Carl Eugen Hoyos pisze: > 2017-09-23 19:18 GMT+02:00 Mateusz : >> W dniu 2017-09-23 o 17:01, Michael Niedermayer pisze: >>> On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote: To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses DITHER_COPY macro. Now it makes images greener and with visible dither pattern. In my opinion there is no point to use dither tables for destination bit depth >= 9, we can use simple down-shift which is neutral in full and limited range -- result images are with the same brightness and with the same colors. >>> >>> Theres no reason why dither should mess up the average color tone. >> >> In theory -- yes, I agree. >> In reality -- current version of DITHER_COPY mess up the >> average color tone. >> It's one of the reasons why I sending these patches. >> >>> And if the user asks for >= 9 bit depth and has >= 10 bit >>> input the user likely wants to get the best quality. >>> Thats more so in a world where computers get faster >>> every few years, this isnt 1995 where shaving off a add >>> or a multiply per pixel was actually making a difference >>> in being able to play something in realtime >>> More so coverting between bit depths might be memory >>> speed limited and not limited by arithmetic computations >>> once its done in SIMD >> >> Yes, I agree. Now I can't write patches in NASM syntax, but >> I started reading and learning. >> I hope I'll back in a few months... > > I strongly suspect there should be agreement over the C code > first, asm optimizations can be done once C code is agreed > upon. > > Thank you for the samples, Carl Eugen I've sent C code patch 2017-09-06 (and nothing) so I thought that the problem is with speed. For simplicity I've attached this patch. 
This code fulfills all rules (for limited and for full range): white -> white black -> black white-1 -> gray (it means white-1 -> white usually but sometimes white-1 -> white-1) black+1 -> gray (it means black+1 -> black usually but sometimes black+1 -> black+1) increase bitdepth & decrease bitdepth = identity About the code: limited range (shiftonly == 1) tmp = (src + dither)>>shift; dst = tmp - (tmp>>dst_depth); In theory it is enough to compute only dst = (src + dither)>>shift; -- white in limited range has zeros in the bits to be removed (235*4 for example) so overflow is impossible. For files with full range not marked as full range, overflow is possible (for dither > 0) and white goes to black. tmp - (tmp>>dst_depth) undoes this overflow. full range (shiftonly == 0) dst = (src - (src>>dst_depth) + dither)>>shift; If we want to remove 2 bits from source (for example 10-bit -> 8-bit) white in full range: ...111|11 white-1 in full range: ...111|10 so we should subtract dither for rule 'white-1 -> gray' black in full range: ...000|00 black+1 in full range: ...000|01 so we should add dither for rule 'black+1 -> gray' (src>>dst_depth) is 0 for small values (close to black) and the maximum possible dither for big values (close to white) For rule 'increase bitdepth & decrease bitdepth = identity' in full range, we have increase bitdepth: (v<<(dst_depth-src_depth)) | (v>>(2*src_depth-dst_depth)) so (src - (src>>dst_depth))>>shift is exactly the inverse operation when we decrease the bit depth. For full range, when 'src' comes from an increased 'v', (src - (src>>dst_depth)) is equal to 'v' shifted only, so we are in the same situation as in limited range. And overflow is impossible in the operation (src - (src>>dst_depth) + dither). 
Mateusz From fd9e271ea531d25bc5a708d0dfeb1be5415b90d0 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Wed, 6 Sep 2017 09:05:02 +0200 Subject: [PATCH] fix DITHER_COPY macro according to shiftonly state --- libswscale/swscale_unscaled.c | 73 ++- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index ef36aec..e3e375a 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -110,24 +110,6 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={ { 112, 16,104, 8,118, 22,110, 14,}, }}; -static const uint16_t dither_scale[15][16]={ -{2,3,3,5,5,5,5,5,5,5,5,5, 5,5,5,5,}, -{2,3,7,7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,}, -{3,3,4, 15, 15, 29, 57, 57, 57, 113, 113, 113, 113, 113, 113, 113,}, -{3,4,4,5, 31, 31, 61, 121, 241, 241, 241, 241, 481, 481, 481, 481,}, -{3,4,5,5,6, 63, 63, 125, 249, 497, 993, 993, 993, 993, 993, 1985,}, -{3,5,6,6,6,7, 127, 127, 253, 505, 1009, 2017, 4033, 4033, 4033, 4033,}, -{3,5,6,7,7,7,8, 255, 255, 509, 1017, 2033, 4065, 8129,16257,16257,}, -{3,5,6,8,8,8,8,9, 511, 511, 1021, 2041, 4081
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
2017-09-23 19:18 GMT+02:00 Mateusz : > W dniu 2017-09-23 o 17:01, Michael Niedermayer pisze: >> On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote: >>> To reduce bit depth in planar YUV or gray pixel formats >>> ffmpeg uses DITHER_COPY macro. >>> Now it makes images greener and with visible dither pattern. >>> >>> In my opinion there is no point to use dither tables for >>> destination bit depth >= 9, we can use simple down-shift >>> which is neutral in full and limited range -- result images >>> are with the same brightness and with the same colors. >> >> Theres no reason why dither should mess up the average color tone. > > In theory -- yes, I agree. > In reality -- current version of DITHER_COPY mess up the > average color tone. > It's one of the reasons why I sending these patches. > >> And if the user asks for >= 9 bit depth and has >= 10 bit >> input the user likely wants to get the best quality. >> Thats more so in a world where computers get faster >> every few years, this isnt 1995 where shaving off a add >> or a multiply per pixel was actually making a difference >> in being able to play something in realtime >> More so coverting between bit depths might be memory >> speed limited and not limited by arithmetic computations >> once its done in SIMD > > Yes, I agree. Now I can't write patches in NASM syntax, but > I started reading and learning. > I hope I'll back in a few months... I strongly suspect there should be agreement over the C code first, asm optimizations can be done once C code is agreed upon. Thank you for the samples, Carl Eugen ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
W dniu 2017-09-25 o 01:42, Carl Eugen Hoyos pisze: > 2017-09-23 19:18 GMT+02:00 Mateusz : > >> In reality -- current version of DITHER_COPY mess >> up the average color tone. > > You could explain how we can reproduce this. Please take any video with colors in which you can see a shift towards green -- it could be one with human faces, for example the free http://media.xiph.org/video/derf/y4m/KristenAndSara_1280x720_60.y4m Copy the video to o.y4m and run several iterations of 8-bit -> 10-bit -> 8-bit conversions: ffmpeg -i o.y4m -y -strict -1 -pix_fmt yuv420p10 t10.y4m ffmpeg -i t10.y4m -y -strict -1 -pix_fmt yuv420p o.y4m Please watch the resulting video -- it should be greener, darker and show a visible pattern. You can watch the 'KristenAndSara' video after 100 iterations http://www.msystem.waw.pl/x265/limited.mkv And the 100th iteration of 'KristenAndSara' with the '-color_range 2' option http://www.msystem.waw.pl/x265/full.mkv Currently DITHER_COPY works incorrectly in both cases (full and limited range). Mateusz ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
2017-09-23 19:18 GMT+02:00 Mateusz : > In reality -- current version of DITHER_COPY mess > up the average color tone. You could explain how we can reproduce this. Carl Eugen ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
W dniu 2017-09-23 o 17:01, Michael Niedermayer pisze: > On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote: >> To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses >> DITHER_COPY macro. >> Now it makes images greener and with visible dither pattern. >> >> In my opinion there is no point to use dither tables for destination bit >> depth >= 9, >> we can use simple down-shift which is neutral in full and limited range -- >> result images >> are with the same brightness and with the same colors. > > Theres no reason why dither should mess up the average color tone. In theory -- yes, I agree. In reality -- the current version of DITHER_COPY messes up the average color tone. It's one of the reasons why I am sending these patches. > And if the user asks for >= 9 bit depth and has >= 10 bit input the > user likely wants to get the best quality. > Thats more so in a world where computers get faster every few years, > this isnt 1995 where shaving off a add or a multiply per pixel was > actually making a difference in being able to play something in > realtime > More so coverting between bit depths might be memory speed limited and > not limited by arithmetic computations once its done in SIMD Yes, I agree. Right now I can't write patches in NASM syntax, but I have started reading and learning. I hope I'll be back in a few months... Mateusz ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
On Fri, Sep 22, 2017 at 02:10:01AM +0200, Mateusz wrote: > To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses > DITHER_COPY macro. > Now it makes images greener and with visible dither pattern. > > In my opinion there is no point to use dither tables for destination bit > depth >= 9, > we can use simple down-shift which is neutral in full and limited range -- > result images > are with the same brightness and with the same colors. There's no reason why dither should mess up the average color tone. And if the user asks for >= 9 bit depth and has >= 10 bit input the user likely wants to get the best quality. That's more so in a world where computers get faster every few years, this isn't 1995 where shaving off an add or a multiply per pixel was actually making a difference in being able to play something in realtime More so converting between bit depths might be memory speed limited and not limited by arithmetic computations once it's done in SIMD [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB I am the wisest man alive, for I know one thing, and that is that I know nothing. -- Socrates signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
On Fri, Sep 22, 2017 at 2:10 AM, Mateusz wrote: > To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses > DITHER_COPY macro. > Now it makes images greener and with visible dither pattern. > > In my opinion there is no point to use dither tables for destination bit > depth >= 9, > we can use simple down-shift which is neutral in full and limited range -- > result images > are with the same brightness and with the same colors. > > For destination bit depth == 8 we could use new bit exact precise DITHER_COPY > macro > (which is slower). > Why would the target bitdepth matter? Both 8 and 9 bit (and any other) should follow the same rules for a result, there is full and limited range minimum and maximum in all of them. In my mind, the logic should be independent of the target bitdepth, perhaps minus optimizations for speed for the more common cases. - Hendrik ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] swscale_unscaled: fix DITHER_COPY macro, use it only for dst_depth == 8
To reduce bit depth in planar YUV or gray pixel formats ffmpeg uses DITHER_COPY macro. Now it makes images greener and with visible dither pattern. In my opinion there is no point to use dither tables for destination bit depth >= 9, we can use simple down-shift which is neutral in full and limited range -- result images are with the same brightness and with the same colors. For destination bit depth == 8 we could use new bit exact precise DITHER_COPY macro (which is slower). If the problem is with speed only, I've attached second patch with Intel Intrinsics for x86_64 that makes code faster (I don't see any Intel Intrinsics in ffmpeg so it's probably for testing only). Please review. Mateusz From a52417a3817ac774eb364bbef20c954a3d278d45 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Fri, 22 Sep 2017 01:22:59 +0200 Subject: [PATCH] swscale_unscaled: fix and speed up DITHER_COPY macro for x86_64, use it only for dst_depth == 8 --- libswscale/swscale_unscaled.c | 185 ++ 1 file changed, 150 insertions(+), 35 deletions(-) diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index ef36aec..7d1cbed 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -35,6 +35,10 @@ #include "libavutil/avassert.h" #include "libavutil/avconfig.h" +#if ARCH_X86_64 +#include +#endif + DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={ { { 0, 1, 0, 1, 0, 1, 0, 1,}, @@ -110,24 +114,6 @@ DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={ { 112, 16,104, 8,118, 22,110, 14,}, }}; -static const uint16_t dither_scale[15][16]={ -{2,3,3,5,5,5,5,5,5,5,5,5, 5,5,5,5,}, -{2,3,7,7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,}, -{3,3,4, 15, 15, 29, 57, 57, 57, 113, 113, 113, 113, 113, 113, 113,}, -{3,4,4,5, 31, 31, 61, 121, 241, 241, 241, 241, 481, 481, 481, 481,}, -{3,4,5,5,6, 63, 63, 125, 249, 497, 993, 993, 993, 993, 993, 1985,}, -{3,5,6,6,6,7, 127, 127, 253, 505, 1009, 2017, 4033, 4033, 4033, 4033,}, -{3,5,6,7,7,7,8, 255, 255, 509, 
1017, 2033, 4065, 8129,16257,16257,}, -{3,5,6,8,8,8,8,9, 511, 511, 1021, 2041, 4081, 8161,16321,32641,}, -{3,5,7,8,9,9,9,9, 10, 1023, 1023, 2045, 4089, 8177,16353,32705,}, -{3,5,7,8, 10, 10, 10, 10, 10, 11, 2047, 2047, 4093, 8185,16369,32737,}, -{3,5,7,8, 10, 11, 11, 11, 11, 11, 12, 4095, 4095, 8189,16377,32753,}, -{3,5,7,9, 10, 12, 12, 12, 12, 12, 12, 13, 8191, 8191,16381,32761,}, -{3,5,7,9, 10, 12, 13, 13, 13, 13, 13, 13, 14,16383,16383,32765,}, -{3,5,7,9, 10, 12, 14, 14, 14, 14, 14, 14, 14, 15,32767,32767,}, -{3,5,7,9, 11, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16,65535,}, -}; - static void fillPlane(uint8_t *plane, int stride, int width, int height, int y, uint8_t val) @@ -1502,22 +1488,127 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], } #define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\ -uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\ -int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\ +unsigned shift= src_depth-dst_depth, tmp;\ +if (shiftonly) {\ +for (i = 0; i < height; i++) {\ +const uint8_t *dither= dithers[shift-1][i&7];\ +for (j = 0; j < length-7; j+=8) {\ +tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+2]) + dither[2])>>shift; dst[j+2] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+3]) + dither[3])>>shift; dst[j+3] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+4]) + dither[4])>>shift; dst[j+4] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+5]) + dither[5])>>shift; dst[j+5] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+6]) + dither[6])>>shift; dst[j+6] = dbswap(tmp - (tmp>>dst_depth));\ +tmp = (bswap(src[j+7]) + dither[7])>>shift; dst[j+7] = dbswap(tmp - (tmp>>dst_depth));\ +}\ +for (; j < length; j++) {\ +tmp = (bswap(src[j]) + dither[j&7])>>shift; dst[j] = dbswap(tmp - (tmp>>dst_depth));\ +}\ +dst += 
dstStride;\ +src += srcStride;\ +}\ +} else {\ +for (i = 0; i < heigh