vlc | branch: master | Sébastien Toque <[email protected]> | Tue Mar 12 18:20:00 2013 +0100 | [afff7f0aca4dd56360db3b0c39c8eb5e8ae18ba7] | committer: Jean-Baptiste Kempf
i420->rv32 neon: improve scheduling & registers usage Signed-off-by: Jean-Baptiste Kempf <[email protected]> > http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=afff7f0aca4dd56360db3b0c39c8eb5e8ae18ba7 --- modules/arm_neon/i420_rgb.S | 112 ++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/modules/arm_neon/i420_rgb.S b/modules/arm_neon/i420_rgb.S index db955e9..a512b5f 100644 --- a/modules/arm_neon/i420_rgb.S +++ b/modules/arm_neon/i420_rgb.S @@ -50,16 +50,20 @@ #define u D24 #define v D25 -#define y1 D28 -#define y2 D29 +#define y1 D18 +#define y2 D19 #define chro_r Q6 #define chro_g Q7 #define chro_b Q8 -#define red Q9 -#define green Q10 -#define blue Q11 -#define lumi Q15 +#define lumi1 Q15 +#define lumi2 Q10 +#define red16_1 Q9 +#define green16_1 Q10 +#define blue16_1 Q11 +#define red16_2 Q12 +#define green16_2 Q13 +#define blue16_2 Q14 #define red1 D24 #define green1 D25 @@ -123,69 +127,69 @@ loop_col: vld1.u8 {u}, [U,:64]! vld1.u8 {v}, [V,:64]! - vmull.u8 chro_r, v, coefRV - vmull.u8 chro_g, u, coefGU - vmlal.u8 chro_g, v, coefGV - vmull.u8 chro_b, u, coefBU + /* Y Top Row */ + vld2.u8 {y1,y2}, [Y1,:128]! - vadd.s16 chro_r, Rc, chro_r - vsub.s16 chro_g, Gc, chro_g - vadd.s16 chro_b, Bc, chro_b + vmull.u8 Q14, v, coefRV + vmull.u8 Q11, u, coefGU + vmull.u8 Q13, u, coefBU + vmlal.u8 Q11, v, coefGV + + vmull.u8 lumi2, y2, coefY + vmull.u8 lumi1, y1, coefY + vadd.s16 chro_r, Rc, Q14 + vadd.s16 chro_b, Bc, Q13 + vsub.s16 chro_g, Gc, Q11 pld [U] pld [V] - /* Y Top Row */ - vld2.u8 {y1,y2}, [Y1,:128]! 
- - /* y1 : chrominance + luminance, then clamp (divide by 64) */ - vmull.u8 lumi, y1, coefY - vqadd.s16 red, lumi, chro_r - vqadd.s16 green, lumi, chro_g - vqadd.s16 blue, lumi, chro_b - vqrshrun.s16 red1, red, #6 - vqrshrun.s16 green1, green, #6 - vqrshrun.s16 blue1, blue, #6 - - /* y2 : chrominance + luminance, then clamp (divide by 64) */ - vmull.u8 lumi, y2, coefY - vqadd.s16 red, lumi, chro_r - vqadd.s16 green, lumi, chro_g - vqadd.s16 blue, lumi, chro_b - vqrshrun.s16 red2, red, #6 - vqrshrun.s16 green2, green, #6 - vqrshrun.s16 blue2, blue, #6 + /* chrominance + luminance */ + vqadd.s16 red16_2, lumi2, chro_r + vqadd.s16 blue16_2, lumi2, chro_b + vqadd.s16 green16_2, lumi2, chro_g + vqadd.s16 red16_1, lumi1, chro_r + vqadd.s16 green16_1, lumi1, chro_g + vqadd.s16 blue16_1, lumi1, chro_b + + /* clamp (divide by 64) */ + vqrshrun.s16 blue2, blue16_2, #6 + vqrshrun.s16 red2, red16_2, #6 + vqrshrun.s16 green2, green16_2, #6 + vqrshrun.s16 red1, red16_1, #6 + vqrshrun.s16 green1, green16_1, #6 + vqrshrun.s16 blue1, blue16_1, #6 pld [Y1] - vmov.u8 alpha2, #255 + /* Y Bottom Row */ + vld2.u8 {y1,y2}, [Y2,:128]! + + vmov.u8 alpha1, #255 vzip.u8 red1, red2 vzip.u8 green1, green2 vzip.u8 blue1, blue2 + vmull.u8 lumi2, y2, coefY vst4.u8 {red1,green1,blue1,alpha1}, [O1,:128]! vst4.u8 {red2,green2,blue2,alpha2}, [O1,:128]! - /* Y Bottom Row */ - vld2.u8 {y1,y2}, [Y2,:128]! 
- - /* y1 : chrominance + luminance, then clamp (divide by 64) */ - vmull.u8 lumi, y1, coefY - vqadd.s16 red, lumi, chro_r - vqadd.s16 green, lumi, chro_g - vqadd.s16 blue, lumi, chro_b - vqrshrun.s16 red1, red, #6 - vqrshrun.s16 green1, green, #6 - vqrshrun.s16 blue1, blue, #6 - - /* y2 : chrominance + luminance, then clamp (divide by 64) */ - vmull.u8 lumi, y2, coefY - vqadd.s16 red, lumi, chro_r - vqadd.s16 green, lumi, chro_g - vqadd.s16 blue, lumi, chro_b - vqrshrun.s16 red2, red, #6 - vqrshrun.s16 green2, green, #6 - vqrshrun.s16 blue2, blue, #6 + /* chrominance + luminance */ + vmull.u8 lumi1, y1, coefY + vqadd.s16 red16_2, lumi2, chro_r + vqadd.s16 green16_2, lumi2, chro_g + vqadd.s16 blue16_2, lumi2, chro_b + vqadd.s16 red16_1, lumi1, chro_r + vqadd.s16 green16_1, lumi1, chro_g + vqadd.s16 blue16_1, lumi1, chro_b + + /* clamp (divide by 64) */ + vqrshrun.s16 blue2, blue16_2, #6 + vqrshrun.s16 red2, red16_2, #6 + vqrshrun.s16 green2, green16_2, #6 + vqrshrun.s16 red1, red16_1, #6 + vqrshrun.s16 green1, green16_1, #6 + vqrshrun.s16 blue1, blue16_1, #6 pld [Y2] _______________________________________________ vlc-commits mailing list [email protected] http://mailman.videolan.org/listinfo/vlc-commits
