[libav-devel] [RFC] fix swscale for 9/10bit

Ronald S. Bultje Wed, 11 May 2011 08:26:18 -0700

Hi,

attached is an RFC for fixing swscale (SWS) for the 9/10-bit formats output
by H.264. It takes endianness into account and makes a few other changes.
As you can see, the code quickly gets hideous (e.g. planarCopyWrapper is a
mess), so I'd like some advice on how people think we should do this.


We need this fix so that we can convert 10-bit BE to 10-bit LE and thereby
fix 10-bit H.264 on big-endian systems (e.g. PPC), whose output is currently
correct except that the byte order within each 16-bit word is swapped.

Ronald

diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 533eb9f..d88775f 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -128,10 +128,10 @@ enum PixelFormat {
     PIX_FMT_VDPAU_MPEG4,  ///< MPEG4 HW decoding with VDPAU, data[0] contains 
a vdpau_render_state struct which contains the bitstream of the slices as well 
as various fields extracted from headers
     PIX_FMT_DXVA2_VLD,    ///< HW decoding through DXVA2, Picture.data[3] 
contains a LPDIRECT3DSURFACE9 pointer
 
-    PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), 
big-endian, most significant bits to 0
     PIX_FMT_RGB444LE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), 
little-endian, most significant bits to 0
-    PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), 
big-endian, most significant bits to 1
+    PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), 
big-endian, most significant bits to 0
     PIX_FMT_BGR444LE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), 
little-endian, most significant bits to 1
+    PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), 
big-endian, most significant bits to 1
     PIX_FMT_Y400A,     ///< 8bit gray, 8bit alpha
     PIX_FMT_BGR48BE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 
2-byte value for each R/G/B component is stored as big-endian
     PIX_FMT_BGR48LE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 
2-byte value for each R/G/B component is stored as little-endian
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 2830f26..7a0a9a7 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1669,25 +1669,124 @@ static int planarCopyWrapper(SwsContext *c, const 
uint8_t* src[], int srcStride[
                 length*=2;
             fillPlane(dst[plane], dstStride[plane], length, height, y, 
(plane==3) ? 255 : 128);
         } else {
-            if(isNBPS(c->srcFormat)) {
-                const int depth = 
av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1+1;
-                uint16_t *srcPtr2 = (uint16_t*)srcPtr;
+            if(is9_OR_10BPS(c->srcFormat)) {
+                const int src_depth = 
av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1+1;
+                const int dst_depth = 
av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
+                const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
 
                 if (is16BPS(c->dstFormat)) {
                     uint16_t *dstPtr2 = (uint16_t*)dstPtr;
-                    for (i = 0; i < height; i++) {
-                        for (j = 0; j < length; j++)
-                            dstPtr2[j] = (srcPtr2[j]<<(16-depth)) | 
(srcPtr2[j]>>(2*depth-16));
-                        dstPtr2 += dstStride[plane]/2;
-                        srcPtr2 += srcStride[plane]/2;
+#define COPY9_OR_10TO16(rfunc, wfunc) /* widen every src sample (read via rfunc) to 16 bits and store it via wfunc */ \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            int srcpx = rfunc(&srcPtr2[j]); \
+                            wfunc(&dstPtr2[j], (srcpx<<(16-src_depth)) | 
(srcpx>>(2*src_depth-16))); /* move the sample to the top bits, then fill the freed low bits with its own high bits */ \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; /* both pointers are uint16_t*, hence the halved strides (strides presumably in bytes) */ \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO16(AV_RB16, AV_WB16);
+                        } else {
+                            COPY9_OR_10TO16(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO16(AV_RB16, AV_WL16);
+                        } else {
+                            COPY9_OR_10TO16(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else if (is9_OR_10BPS(c->dstFormat)) {
+                    uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+#define COPY9_OR_10TO9_OR_10(loop) /* execute `loop` once for every sample index j of every row i */ \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            loop; \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; /* step both uint16_t row pointers to the next row */ \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) /* 9/10-bit to 9/10-bit plane copy; rfunc/wfunc select the src/dst byte order */ \
+                    if (dst_depth > src_depth) { /* 9 -> 10 bit */ \
+                        COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
+                            wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 8))); /* new LSB = MSB (bit 8) of the 9-bit sample; ">> 9" would always yield 0 */ \
+                    } else if (dst_depth < src_depth) { /* 10 -> 9 bit: drop the LSB */ \
+                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], 
rfunc(&srcPtr2[j]) >> 1)); \
+                    } else { /* equal depth: values are copied unchanged, only the byte order may differ */ \
+                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], 
rfunc(&srcPtr2[j]))); \
+                    }
+                    if (isBE(c->dstFormat)) { 
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
+                        } else {
+                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
+                        } else {
+                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
+                        }
                     }
                 } else {
                     // FIXME Maybe dither instead.
-                    for (i = 0; i < height; i++) {
-                        for (j = 0; j < length; j++)
-                            dstPtr[j] = srcPtr2[j]>>(depth-8);
-                        dstPtr  += dstStride[plane];
-                        srcPtr2 += srcStride[plane]/2;
+#define COPY9_OR_10TO8(rfunc) /* reduce every src sample (read via rfunc) to 8 bits, no dithering */ \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            dstPtr[j] = rfunc(&srcPtr2[j])>>(src_depth-8); /* drop the (src_depth-8) low bits */ \
+                        } \
+                        dstPtr  += dstStride[plane]; /* dst is addressed per byte, src per uint16_t (halved stride) */ \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->srcFormat)) {
+                        COPY9_OR_10TO8(AV_RB16);
+                    } else {
+                        COPY9_OR_10TO8(AV_RL16);
+                    }
+                }
+            } else if(is9_OR_10BPS(c->dstFormat)) {
+                const int dst_depth = 
av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
+                uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+
+                if (is16BPS(c->srcFormat)) {
+                    const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
+#define COPY16TO9_OR_10(rfunc, wfunc) /* reduce 16-bit src samples to dst_depth bits; rfunc/wfunc select the src/dst byte order */ \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            wfunc(&dstPtr2[j], 
rfunc(&srcPtr2[j])>>(16-dst_depth)); /* keep only the top dst_depth bits */ \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; /* both pointers are uint16_t*, hence the halved strides */ \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY16TO9_OR_10(AV_RB16, AV_WB16);
+                        } else {
+                            COPY16TO9_OR_10(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY16TO9_OR_10(AV_RB16, AV_WL16);
+                        } else {
+                            COPY16TO9_OR_10(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else /* 8bit */ {
+#define COPY8TO9_OR_10(wfunc) /* expand 8-bit src samples to dst_depth bits and store them as 16-bit words via wfunc */ \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            const int srcpx = srcPtr[j]; \
+                            wfunc(&dstPtr2[j], (srcpx<<(dst_depth-8)) | (srcpx 
>> (16-dst_depth))); /* store via the uint16_t row pointer: dstPtr is the byte pointer and is not advanced in this loop */ \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; /* dst rows are stepped in uint16_t units, src rows in bytes */ \
+                        srcPtr  += srcStride[plane]; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        COPY8TO9_OR_10(AV_WB16);
+                    } else {
+                        COPY8TO9_OR_10(AV_WL16);
                     }
                 }
             } else if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 2d40215..2369546 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -354,7 +354,7 @@ const char *sws_format_name(enum PixelFormat format);
         || (x)==PIX_FMT_YUV422P16BE \
         || (x)==PIX_FMT_YUV444P16BE \
     )
-#define isNBPS(x)       (           \
+#define is9_OR_10BPS(x) (           \
            (x)==PIX_FMT_YUV420P9LE  \
         || (x)==PIX_FMT_YUV420P9BE  \
         || (x)==PIX_FMT_YUV420P10LE \
diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c
index 81a8d66..d28ad64 100644
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -164,6 +164,8 @@ static inline void LEToUV_c(uint8_t *dstU, uint8_t *dstV, 
const uint8_t *src1,
                             const uint8_t *src2, long width, uint32_t *unused)
 {
     int i;
+    // FIXME I don't think this code is right for YUV444/422, since then h is 
not subsampled so
+    // we need to skip each second pixel. Same for BEToUV.
     for (i=0; i<width; i++) {
         dstU[i]= src1[2*i + 1];
         dstV[i]= src2[2*i + 1];
@@ -226,8 +228,8 @@ static inline void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 
 // FIXME Maybe dither instead.
-#define YUV_NBPS(depth) \
-static inline void yuv ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
+#define YUV_NBPS(depth, endianness, rfunc) /* rfunc: 16-bit reader for that endianness (AV_RL16/AV_RB16); it is given the address of the sample */ \
+static inline void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, 
\
                                           const uint8_t *_srcU, const uint8_t 
*_srcV, \
                                           long width, uint32_t *unused) \
 { \
@@ -235,21 +237,23 @@ static inline void yuv ## depth ## ToUV_c(uint8_t *dstU, 
uint8_t *dstV, \
     const uint16_t *srcU = (const uint16_t*)_srcU; \
     const uint16_t *srcV = (const uint16_t*)_srcV; \
     for (i = 0; i < width; i++) { \
-        dstU[i] = srcU[i]>>(depth-8); \
-        dstV[i] = srcV[i]>>(depth-8); \
+        dstU[i] = rfunc(&srcU[i])>>(depth-8); \
+        dstV[i] = rfunc(&srcV[i])>>(depth-8); \
     } \
 } \
 \
-static inline void yuv ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, 
long width, uint32_t *unused) \
+static inline void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t 
*_srcY, long width, uint32_t *unused) \
 { \
     int i; \
     const uint16_t *srcY = (const uint16_t*)_srcY; \
     for (i = 0; i < width; i++) \
-        dstY[i] = srcY[i]>>(depth-8); \
+        dstY[i] = rfunc(&srcY[i])>>(depth-8); \
 } \
 
-YUV_NBPS( 9)
-YUV_NBPS(10)
+YUV_NBPS( 9, LE, AV_RL16)
+YUV_NBPS( 9, BE, AV_RB16)
+YUV_NBPS(10, LE, AV_RL16)
+YUV_NBPS(10, BE, AV_RB16)
 
 static inline void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
                               long width, uint32_t *unused)
@@ -816,10 +820,10 @@ static void sws_init_swScale_c(SwsContext *c)
         case PIX_FMT_PAL8     :
         case PIX_FMT_BGR4_BYTE:
         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
-        case PIX_FMT_YUV420P9BE:
-        case PIX_FMT_YUV420P9LE: c->chrToYV12 = yuv9ToUV_c; break;
-        case PIX_FMT_YUV420P10BE:
-        case PIX_FMT_YUV420P10LE: c->chrToYV12 = yuv10ToUV_c; break;
+        case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
+        case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
+        case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
+        case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
         case PIX_FMT_YUV420P16BE:
         case PIX_FMT_YUV422P16BE:
         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
@@ -866,10 +870,10 @@ static void sws_init_swScale_c(SwsContext *c)
     c->lumToYV12 = NULL;
     c->alpToYV12 = NULL;
     switch (srcFormat) {
-    case PIX_FMT_YUV420P9BE:
-    case PIX_FMT_YUV420P9LE: c->lumToYV12 = yuv9ToY_c; break;
-    case PIX_FMT_YUV420P10BE:
-    case PIX_FMT_YUV420P10LE: c->lumToYV12 = yuv10ToY_c; break;
+    case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
+    case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
+    case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
+    case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
     case PIX_FMT_YUYV422  :
     case PIX_FMT_YUV420P16BE:
     case PIX_FMT_YUV422P16BE:

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [RFC] fix swscale for 9/10bit

Reply via email to