On Wed, 5 Feb 2014 15:32:59 +0100, Diego Biurrun <[email protected]> wrote:
> On Wed, Feb 05, 2014 at 09:59:45AM +0100, Anton Khirnov wrote:
> > --- a/libavcodec/ppc/vp8dsp_altivec.c
> > +++ b/libavcodec/ppc/vp8dsp_altivec.c
> > @@ -269,9 +269,44 @@ EPEL_HV(4,  4,6)
> >  
> > +static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
> >  {
> > -    ff_put_pixels16_altivec(dst, src, stride, h);
> > +    register vector unsigned char pixelsv1, pixelsv2;
> > +    register vector unsigned char pixelsv1B, pixelsv2B;
> > +    register vector unsigned char pixelsv1C, pixelsv2C;
> > +    register vector unsigned char pixelsv1D, pixelsv2D;
> > +
> > +    register vector unsigned char perm = vec_lvsl(0, src);
> > +    int i;
> > +    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
> > +    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
> > +    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;
> > +
> > +// hand-unrolling the loop by 4 gains about 15%
> > +// minimum execution time goes from 74 to 60 cycles
> > +// it's faster than -funroll-loops, but using
> > +// -funroll-loops w/ this is bad - 74 cycles again.
> > +// all this is on a 7450, tuning for the 7450
> > +    for (i = 0; i < h; i += 4) {
> > +        pixelsv1  = vec_ld( 0, src);
> > +        pixelsv2  = vec_ld(15, src);
> > +        pixelsv1B = vec_ld(sstride, src);
> > +        pixelsv2B = vec_ld(15 + sstride, src);
> > +        pixelsv1C = vec_ld(sstride2, src);
> > +        pixelsv2C = vec_ld(15 + sstride2, src);
> > +        pixelsv1D = vec_ld(sstride3, src);
> > +        pixelsv2D = vec_ld(15 + sstride3, src);
> > +        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
> > +               0, (unsigned char*)dst);
> > +        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
> > +               dstride, (unsigned char*)dst);
> > +        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
> > +               dstride2, (unsigned char*)dst);
> > +        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
> > +               dstride3, (unsigned char*)dst);
> > +        src += sstride4;
> > +        dst += dstride4;
> > +    }
> >  }
> 
> This duplicates the ff_put_pixels16_altivec() function from
> libavcodec/ppc/hpeldsp_altivec.c.
> 

Yes I know.
Got a better solution?

-- 
Anton Khirnov
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to