This shader only handles BGR->NV12, is that right? Is there a way to reuse this shader for other RGB formats, for example by passing the RGB format to the thread?
+ mul (16) REG2(r, nTEMP4, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub + mul (16) REG2(r, nTEMP5, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub Could you name the coefficients such as REG2(r, nTEMP0, 0) so it is more readable ? Thanks Haihao > --- > .../gen5_6/Common/RGBX_Load_16x8.asm | 57 ++ > .../gen5_6/Common/RGBX_Load_16x8.inc | 48 ++ > .../gen5_6/Common/RGBX_Save_YUV_Fix.asm | 115 ++++ > .../gen5_6/Common/RGBX_Save_YUV_Float.asm | 152 +++++ > .../gen5_6/Common/RGBX_to_YUV_Coef.asm | 43 ++ > src/shaders/post_processing/gen5_6/Makefile.am | 8 + > .../post_processing/gen5_6/rgbx_load_save_nv12.asm | 26 + > .../gen5_6/rgbx_load_save_nv12.g4b.gen5 | 562 +++++++++++++++++ > .../post_processing/gen5_6/rgbx_load_save_nv12.g6b | 635 > ++++++++++++++++++++ > 9 files changed, 1646 insertions(+), 0 deletions(-) > create mode 100755 > src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm > create mode 100755 > src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc > create mode 100755 > src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm > create mode 100755 > src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm > create mode 100755 > src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm > create mode 100755 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm > create mode 100644 > src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.g4b.gen5 > create mode 100644 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.g6b > > diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm > b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm > new file mode 100755 > index 0000000..958308a > --- /dev/null > +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm > @@ -0,0 +1,57 @@ > +/* > + * All Video Processing kernels > + * Copyright © <2010>, Intel Corporation. 
> + * > + * This program is licensed under the terms and conditions of the > + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at > + * http://www.opensource.org/licenses/eclipse-1.0.php. > + * > + */ > + > +// Module name: RGBA_Load_16x8.asm (copied from AYUV_Load_16x8.asm) > +//---------------------------------------------------------------- > + > + > +#include "RGBX_Load_16x8.inc" > + > +// In order to load 64x8 RGBA data (16x8 pixels), we need to divide the data > +// into two regions and load them separately. > +// > +// 32 byte 32 byte > +//|----------------|----------------| > +//| | | > +//| A | B |8 > +//| | | > +//| | | > +//|----------------|----------------| > + > +// Load the first 32x8 data block > +// Packed data block should be loaded as 32x8 pixel block > + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w > // Source Block origin > + shl (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:w 2:w > { NoDDClr } // H. block origin need to be four times larger > + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_RGBA:ud { NoDDChk } > // Block width and height (32x8) > + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud > + send (8) udSRC_RGBA(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ > nDPMR_MSGDSC+nDPR_MSG_SIZE_RGBA+nBI_CURRENT_SRC_YUV:ud > + > +//Load the second 32x8 data block > +// Offset the origin X - move to next 32 colomns > + add (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 32:w > // Increase X origin by 8 > + > +// Size stays the same - 32x8 > + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud > // Copy message description to message header > + send (8) udSRC_RGBA(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ > nDPMR_MSGDSC+nDPR_MSG_SIZE_RGBA+nBI_CURRENT_SRC_YUV:ud > + > +// Give AYUV region addresses to address register > + // a0.0 is 0x38*32, a0.1 is 0x40*32. 
0x40-0x38=8 (pixel) > + mov (1) SRC_RGBA_OFFSET<1>:ud 0x00400038*32:ud > //Address registers contain starting addresses of two halves > + > +#if !defined(FIX_POINT_CONVERSION) && !defined(FLOAT_POINT_CONVERSION) > + //Directly move the data to destination > + $for(0; <nY_NUM_OF_ROWS; 1) { > + // 8 means 8 elements, not 2=8/2 element per row. > + mov (16) uwDEST_Y(%1)<1> r[SRC_RGBA_OFFSET,%1*32+3]<8,4>:ub // A/R > + mov (16) uwDEST_U(%1)<1> r[SRC_RGBA_OFFSET,%1*32+2]<8,4>:ub // Y/G > + mov (16) uwDEST_V(%1)<1> r[SRC_RGBA_OFFSET,%1*32+1]<8,4>:ub // U/B > + } > +#endif > + > diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc > b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc > new file mode 100755 > index 0000000..7199d64 > --- /dev/null > +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc > @@ -0,0 +1,48 @@ > +/* > + * All Video Processing kernels > + * Copyright © <2010>, Intel Corporation. > + * > + * This program is licensed under the terms and conditions of the > + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at > + * http://www.opensource.org/licenses/eclipse-1.0.php. 
> + * > + */ > +#ifndef RGBA_LOAD_16X8_INC > +#define RGBA_LOAD_16X8_INC > + > +// Module name: RGBA_Load_16x8.inc > +// > +// RGBA data are first loaded to bottom I/O REGION_2, then does color > conversion from RGB to YUV > +// finally, YUV data are stored in top I/O REGION_1 with planar format > + > +#undef nY_NUM_OF_ROWS > + > +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block > + > +#define nDPR_BLOCK_SIZE_RGBA nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // > RGBA block size 32x8 (it is half size) > +#define nDPR_MSG_SIZE_RGBA nRESLEN_8 // > # of MRF's to hold RGBA block data (8) > + > +//Temporary storage for unpacked AYUV data > +#define rUNPACK_TEMP REG(r,nTEMP0) > +.declare udUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=4 > SrcRegion=<8;8,1> Type=ud //1 GRF > +.declare ubUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=1 > SrcRegion=<32;32,1> Type=ub //1 GRF > + > +.declare ubBOT_Y_IO Base=REG(r,nBOT_Y) ElementSize=1 > SrcRegion=REGION(32,1) Type=ub > + > + > +#define udSRC_RGBA udBOT_Y_IO > +#define ubSRC_RGBA ubBOT_Y_IO > +#define nSRC_RGBA_REG nBOT_Y > + > +#define uwDEST_Y uwTOP_Y > +#define uwDEST_U uwTOP_U > +#define uwDEST_V uwTOP_V > + > +#define SRC_RGBA_OFFSET a0.0 > +#define SRC_RGBA_OFFSET_1 a0.0 > +#define SRC_RGBA_OFFSET_2 a0.1 > + > +#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for > first kernel > + > +// End of RGBA_Load_16x8.inc > +#endif > \ No newline at end of file > diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm > b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm > new file mode 100755 > index 0000000..f60a2a0 > --- /dev/null > +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm > @@ -0,0 +1,115 @@ > +/* > + * All Video Processing kernels > + * Copyright © <2010>, Intel Corporation. > + * > + * This program is licensed under the terms and conditions of the > + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at > + * http://www.opensource.org/licenses/eclipse-1.0.php. > + * > + * Authors: > + * Halley Zhao <[email protected]> > + */ > + > +// Module name: PL16x8_PL8x4.asm > +//---------------------------------------------------------------- > + > +#include "RGBX_Load_16x8.inc" > + > +#if (0) > + #define nTEMP0 34 // transformation coefficient > + #define nTEMP1 35 // one row of Y (first half register > is used) > + #define nTEMP2 36 // first half of one row > + #define nTEMP3 37 // second half of one row > + #define nTEMP4 38 // mul and add > + #define nTEMP5 39 // mul and add > + #define nTEMP6 40 // mul and add > + #define nTEMP7 41 // mul and add > + #define nTEMP8 42 // sum of mul > + #define nTEMP10 44 > + #define nTEMP12 46 > + #define nTEMP14 48 > + #define nTEMP16 50 > + #define nTEMP17 51 > + #define nTEMP18 52 > + > + #define nTEMP24 58 > +#endif > + > +$for(0; <nY_NUM_OF_ROWS; 1) { > + // BGRX | B | G | R | X | > + // ###### do on row for Y > + // #### mul and add > + mul (16) REG2(r, nTEMP4, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + > 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub > + mul (16) REG2(r, nTEMP5, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + > 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub > + mul (16) REG2(r, nTEMP6, 0)<1>:uw r[SRC_RGBA_OFFSET_2, %1*32 + > 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub > + mul (16) REG2(r, nTEMP7, 0)<1>:uw r[SRC_RGBA_OFFSET_2, %1*32 + > 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub > + > + add (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw > REG2(r, nTEMP4, 1)<0;4,4>:uw > + add (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw > REG2(r, nTEMP4, 2)<0;4,4>:uw > + add (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw > REG2(r, nTEMP5, 1)<0;4,4>:uw > + add (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw > REG2(r, nTEMP5, 2)<0;4,4>:uw > + add (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:uw > REG2(r, nTEMP6, 1)<0;4,4>:uw > + add (4) REG2(r, nTEMP6, 0)<4>:uw 
REG2(r, nTEMP6, 0)<0;4,4>:uw > REG2(r, nTEMP6, 2)<0;4,4>:uw > + add (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw > REG2(r, nTEMP7, 1)<0;4,4>:uw > + add (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw > REG2(r, nTEMP7, 2)<0;4,4>:uw > + > + // #### write Y to the 1 row > + mov (4) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP4, 0)<0; 4, 4>:uw > + mov (4) REG2(r, nTEMP8, 4)<1>:uw REG2(r, nTEMP5, 0)<0; 4, 4>:uw > + mov (4) REG2(r, nTEMP8, 8)<1>:uw REG2(r, nTEMP6, 0)<0; 4, 4>:uw > + mov (4) REG2(r, nTEMP8, 12)<1>:uw REG2(r, nTEMP7, 0)<0; 4, 4>:uw > + add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw > 0x1080:uw > + mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub > + mov (16) uwDEST_Y(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub > + > + // ###### do one row for U > + // #### mul and add > + mul (16) REG2(r, nTEMP4, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + > 0]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b > + mul (16) REG2(r, nTEMP5, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + > 16]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b > + mul (16) REG2(r, nTEMP6, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + > 0]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b > + mul (16) REG2(r, nTEMP7, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + > 16]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b > + > + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w > REG2(r, nTEMP4, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w > REG2(r, nTEMP4, 2)<0;4,4>:w > + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w > REG2(r, nTEMP5, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w > REG2(r, nTEMP5, 2)<0;4,4>:w > + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w > REG2(r, nTEMP6, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w > REG2(r, nTEMP6, 2)<0;4,4>:w > + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w > REG2(r, nTEMP7, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 
0)<0;4,4>:w > REG2(r, nTEMP7, 2)<0;4,4>:w > + > + // #### write U to the 1 row > + mov (4) REG2(r, nTEMP8, 0)<1>:w REG2(r, nTEMP4, 0)<0; 4, 4>:w > + mov (4) REG2(r, nTEMP8, 4)<1>:w REG2(r, nTEMP5, 0)<0; 4, 4>:w > + mov (4) REG2(r, nTEMP8, 8)<1>:w REG2(r, nTEMP6, 0)<0; 4, 4>:w > + mov (4) REG2(r, nTEMP8, 12)<1>:w REG2(r, nTEMP7, 0)<0; 4, 4>:w > + add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:w > 0x8080:uw // ok? > + mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub > + mov (16) uwDEST_U(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub > + > + // ###### do one row for V > + // #### mul and add > + mul (16) REG2(r, nTEMP4, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + > 0]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b > + mul (16) REG2(r, nTEMP5, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + > 16]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b > + mul (16) REG2(r, nTEMP6, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + > 0]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b > + mul (16) REG2(r, nTEMP7, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + > 16]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b > + > + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w > REG2(r, nTEMP4, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w > REG2(r, nTEMP4, 2)<0;4,4>:w > + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w > REG2(r, nTEMP5, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w > REG2(r, nTEMP5, 2)<0;4,4>:w > + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w > REG2(r, nTEMP6, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w > REG2(r, nTEMP6, 2)<0;4,4>:w > + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w > REG2(r, nTEMP7, 1)<0;4,4>:w > + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w > REG2(r, nTEMP7, 2)<0;4,4>:w > + > + // #### write V to the 1 row > + mov (4) REG2(r, nTEMP8, 0)<1>:w REG2(r, nTEMP4, 0)<0; 4, 4>:w > + mov (4) REG2(r, nTEMP8, 4)<1>:w REG2(r, nTEMP5, 0)<0; 4, 4>:w > + mov 
(4) REG2(r, nTEMP8, 8)<1>:w REG2(r, nTEMP6, 0)<0; 4, 4>:w > + mov (4) REG2(r, nTEMP8, 12)<1>:w REG2(r, nTEMP7, 0)<0; 4, 4>:w > + add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:w > 0x8080:uw // ok? > + mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub > + mov (16) uwDEST_V(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub > +} > + > diff --git > a/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm > b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm > new file mode 100755 > index 0000000..a771187 > --- /dev/null > +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm > @@ -0,0 +1,152 @@ > +/* > + * All Video Processing kernels > + * Copyright © <2010>, Intel Corporation. > + * > + * This program is licensed under the terms and conditions of the > + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at > + * http://www.opensource.org/licenses/eclipse-1.0.php. > + * > + * Authors: > + * Halley Zhao <[email protected]> > + */ > + > +// Module name: RGBX_Save_YUV_Float.asm > +//---------------------------------------------------------------- > + > +#include "RGBX_Load_16x8.inc" > + > +#if (0) > + // 8 grf reg for one row of pixel (2 pixel per grf) > + #define nTEMP0 34 > + #define nTEMP1 35 > + #define nTEMP2 36 > + #define nTEMP3 37 > + #define nTEMP4 38 > + #define nTEMP5 39 > + #define nTEMP6 40 > + #define nTEMP7 41 > + > + #define nTEMP8 42 // transformation coefficient > + #define nTEMP10 44 // transformation coefficient > + > + #define nTEMP12 46 // save Y/U/V in ub format > + #define nTEMP14 48 // save YUV in ud format > + #define nTEMP16 50 // dp4 result > + #define nTEMP17 51 > + #define nTEMP18 52 > + > + #define nTEMP24 58 > +#endif > + > +$for(0; <nY_NUM_OF_ROWS; 1) { > + // BGRX | B | G | R | X | > + // ###### save one row of pixel to temp grf with float format (required > by dp4) > + // mov (8) doesn't work, puzzle > + mov (4) REG(r, nTEMP0)<1>:f 
r[SRC_RGBA_OFFSET_1,%1*32 + 0]<4,1>:ub > + mov (4) REG(r, nTEMP1)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 8]<4,1>:ub > + mov (4) REG(r, nTEMP2)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 16]<4,1>:ub > + mov (4) REG(r, nTEMP3)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 24]<4,1>:ub > + mov (4) REG(r, nTEMP4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 0]<4,1>:ub > + mov (4) REG(r, nTEMP5)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 8]<4,1>:ub > + mov (4) REG(r, nTEMP6)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 16]<4,1>:ub > + mov (4) REG(r, nTEMP7)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 24]<4,1>:ub > + mov (4) REG2(r, nTEMP0, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 4]<4,1>:ub > + mov (4) REG2(r, nTEMP1, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 12]<4,1>:ub > + mov (4) REG2(r, nTEMP2, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 20]<4,1>:ub > + mov (4) REG2(r, nTEMP3, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 28]<4,1>:ub > + mov (4) REG2(r, nTEMP4, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 4]<4,1>:ub > + mov (4) REG2(r, nTEMP5, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 12]<4,1>:ub > + mov (4) REG2(r, nTEMP6, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 20]<4,1>:ub > + mov (4) REG2(r, nTEMP7, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 24]<4,1>:ub > + > + // ###### do one row for Y > + // ##### dp4(nTEMP16) and save result to uw format(nTEMP12) > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 2)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 4)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f 
> REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 6)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 10)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 12)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f > REG2(r, nTEMP8, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 14)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub > + > + // #### write Y to the 1 row > + mov (16) uwDEST_Y(%1)<1> REG2(r,nTEMP12, 0)<0;16,1>:ub > + > + // ###### do one row for U > + // ##### dp4(nTEMP16) and save result to uw format(nTEMP12) > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 2)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 4)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 
0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 6)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 8)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 10)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 12)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f > REG2(r, nTEMP8, 4)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 14)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + add (16) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP12, 0)<0;16,1>:w > 128:w > + // #### write U to the 1 row > + mov (16) uwDEST_U(%1)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub > + > + // ###### do one row for V > + // ##### dp4(nTEMP16) and save result to uw format(nTEMP12) > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 2)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + 
mov (2) REG2(r, nTEMP12, 4)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 6)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 8)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 10)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 12)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f > REG2(r, nTEMP10, 0)<0;4,1>:f > + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f > + mov (2) REG2(r, nTEMP12, 14)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w > + add (16) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP12, 0)<0;16,1>:w > 128:w > + > + // #### write V to the 1 row > + mov (16) uwDEST_V(%1)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub > +} > diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm > b/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm > new file mode 100755 > index 0000000..1f58643 > --- /dev/null > +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm > @@ -0,0 +1,43 @@ > +/* > + * All Video Processing kernels > + * Copyright © <2010>, Intel Corporation. > + * > + * This program is licensed under the terms and conditions of the > + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at > + * http://www.opensource.org/licenses/eclipse-1.0.php. 
> + * > + * Authors: > + * Halley Zhao <[email protected]> > + */ > + > +// Module name: RGB_to_YUV_Coef.asm > +//---------------------------------------------------------------- > + > +#ifdef FIX_POINT_CONVERSION > + // Y = ( ( 66 * R + 129 * G + 25 * B + 128 ) >> 8) + 16 > + mov (1) REG2(r, nTEMP0, 0):ud 0x00428119:ud // used as > unsigned byte > + // U = ( ( -38 * R - 74 * G + 112 * B + 128 ) >> 8) + 128 > + mov (1) REG2(r, nTEMP0, 1):ud 0x00DAB670:ud // used as signed > byte > + // V = ( ( 112 * R - 94 * G - 18 * B + 128 ) >> 8) + 128 > + mov (1) REG2(r, nTEMP0, 2):ud 0x0070A2EEud // used as signed > byte > +#else > + // Y = 0.299R + 0.587G + 0.114B > + mov (1) REG2(r, nTEMP8, 0):f 0.114f // B coef > + mov (1) REG2(r, nTEMP8, 1):f 0.587f // G coef > + mov (1) REG2(r, nTEMP8, 2):f 0.299f // R coef > + mov (1) REG2(r, nTEMP8, 3):f 0.000f // A coef > + > + // Cb= -0.169R - 0.331G + 0.499B + 128 > + // U = -0.147R - 0.289G + 0.436B + 128 > + mov (1) REG2(r, nTEMP8, 4):f 0.436f // B coef > + mov (1) REG2(r, nTEMP8, 5):f -0.289f // G coef > + mov (1) REG2(r, nTEMP8, 6):f -0.147f // R coef > + mov (1) REG2(r, nTEMP8, 7):f 0.000f // A coef > + // Cr= 0.499R - 0.418G - 0.0813B+ 128 > + // V = 0.615R - 0.515G - 0.100B + 128 > + mov (1) REG2(r, nTEMP10, 0):f -0.100f // B coef > + mov (1) REG2(r, nTEMP10, 1):f -0.515f // G coef > + mov (1) REG2(r, nTEMP10, 2):f 0.615f // R coef > + mov (1) REG2(r, nTEMP10, 3):f 0.000f // A coef > +#endif > + > diff --git a/src/shaders/post_processing/gen5_6/Makefile.am > b/src/shaders/post_processing/gen5_6/Makefile.am > index 1cc1ecb..8658938 100755 > --- a/src/shaders/post_processing/gen5_6/Makefile.am > +++ b/src/shaders/post_processing/gen5_6/Makefile.am > @@ -20,6 +20,7 @@ INTEL_PP_G4B_GEN5 = \ > pl3_load_save_pa.g4b.gen5 \ > pa_load_save_nv12.g4b.gen5 \ > pa_load_save_pl3.g4b.gen5 \ > + rgbx_load_save_nv12.g4b.gen5 \ > $(NULL) > > INTEL_PP_G6B = \ > @@ -35,6 +36,7 @@ INTEL_PP_G6B = \ > pl3_load_save_pa.g6b \ > pa_load_save_nv12.g6b \ 
> pa_load_save_pl3.g6b \ > + rgbx_load_save_nv12.g6b \ > $(NULL) > > INTEL_PP_ASM = \ > @@ -50,6 +52,7 @@ INTEL_PP_ASM = \ > pl3_load_save_pa.asm \ > pa_load_save_nv12.asm \ > pa_load_save_pl3.asm \ > + rgbx_load_save_nv12.asm \ > $(NULL) > > INTEL_PP_ASM += \ > @@ -86,6 +89,10 @@ INTEL_PP_ASM += \ > Common/RGB16x8_Save_RGB16.asm \ > Common/RGB16x8_Save_Y416.asm \ > Common/RGB_Pack.asm \ > + Common/RGBX_Load_16x8.asm \ > + Common/RGBX_to_YUV_Coef.asm \ > + Common/RGBX_Save_YUV_Fix.asm \ > + Common/RGBX_Save_YUV_Float.asm \ > Common/SetupVPKernel.asm \ > Common/readSampler16x1.asm \ > Core_Kernels/AVS_SetupFirstBlock.asm \ > @@ -145,6 +152,7 @@ INTEL_PP_INC = \ > Common/RGB16x8_Save_RGB.inc \ > Common/RGB16x8_Save_RGB16.inc \ > Common/RGB16x8_Save_Y416.inc \ > + Common/RGBX_Load_16x8.inc \ > Common/common.inc \ > Common/undefall.inc \ > Core_Kernels/AVS_IEF.inc \ > diff --git a/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm > b/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm > new file mode 100755 > index 0000000..4922cc7 > --- /dev/null > +++ b/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm > @@ -0,0 +1,26 @@ > +// Module name: RGBX_LOAD_SAVE_NV12 > +.kernel RGBX_LOAD_SAVE_NV12 > +.code > +#define FIX_POINT_CONVERSION > +// #define FLOAT_POINT_CONVERSION > + > +#include "SetupVPKernel.asm" > +#include "RGBX_to_YUV_Coef.asm" > +#include "Multiple_Loop_Head.asm" > +#include "RGBX_Load_16x8.asm" > +#ifdef FIX_POINT_CONVERSION > + #include "RGBX_Save_YUV_Fix.asm" > +#else > + #include "RGBX_Save_YUV_Float.asm" > +#endif > +#include "PL16x8_PL8x4.asm" > +#include "PL8x4_Save_NV12.asm" > +#include "Multiple_Loop.asm" > + > +END_THREAD // End of Thread > + > +.end_code > + > +.end_kernel > + > +// end of rgbx_load_save_nv12.asm _______________________________________________ Libva mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libva
