It just does that part in scalar form; I doubt that using a vector store
over two arrays would speed it up particularly.
The function should eventually be rewritten so that it does not need a scratch buffer at all.
---
libswscale/ppc/swscale_altivec.c | 15 +++++++++++++--
1 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 369e93b..27545be 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -101,17 +101,23 @@ yuv2yuvX_altivec_real(SwsContext *c,
uint8_t *dest[4], int dstW, int chrDstW)
{
uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2];
- const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 <<
18)};
+ //const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 <<
18)};
+ const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
register int i, j;
{
DECLARE_ALIGNED(16, int, val)[dstW];
+ for (i=0; i<dstW; i++)
+ val[i] = lumDither[i & 7] << 12;
+
+ /* XXX properly vectorize
for (i = 0; i < (dstW -7); i+=4) {
vec_st(vini, i << 2, val);
}
for (; i < dstW; i++) {
val[i] = (1 << 18);
}
+ */
for (j = 0; j < lumFilterSize; j++) {
vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter);
@@ -155,6 +161,11 @@ yuv2yuvX_altivec_real(SwsContext *c,
DECLARE_ALIGNED(16, int, u)[chrDstW];
DECLARE_ALIGNED(16, int, v)[chrDstW];
+ for (i=0; i<chrDstW; i++) {
+ u[i] = chrDither[i & 7] << 12;
+ v[i] = chrDither[(i + 3) & 7] << 12;
+ }
+ /*
for (i = 0; i < (chrDstW -7); i+=4) {
vec_st(vini, i << 2, u);
vec_st(vini, i << 2, v);
@@ -163,7 +174,7 @@ yuv2yuvX_altivec_real(SwsContext *c,
u[i] = (1 << 18);
v[i] = (1 << 18);
}
-
+ */
for (j = 0; j < chrFilterSize; j++) {
vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1,
chrFilter);
vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter);
--
1.7.6
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel