On Tue, Oct 8, 2013 at 4:20 AM, <[email protected]> wrote:
> # HG changeset patch > # User Yuvaraj Venkatesh <[email protected]> > # Date 1381223945 -19800 > # Tue Oct 08 14:49:05 2013 +0530 > # Node ID 23f4e0a507a6be19fceb4a2525aeb2a5fae5e1ab > # Parent 1a62566488b7ece9bbfb665e37ac402a08ce156e > pixel: replace getResidual64 from vector class to intrinsic > > diff -r 1a62566488b7 -r 23f4e0a507a6 source/common/vec/pixel8.inc > --- a/source/common/vec/pixel8.inc Tue Oct 08 14:33:26 2013 +0530 > +++ b/source/common/vec/pixel8.inc Tue Oct 08 14:49:05 2013 +0530 > @@ -112,46 +112,41 @@ > } > } > > -void getResidual64(pixel *fenc, pixel *pred, short *resi, int stride) > -{ > - Vec16uc f, p; > - Vec8s r; > +void getResidual64(pixel *fenc, pixel *pred, short *resi, int stride) > +{ > + __m128i T00, T01, T02, T03, T04; > + > +#define RESIDUAL_64x4(BASE, OFFSET) \ > + T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 0) * > stride)); \ > + T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 0) * > stride)); \ > + T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \ > + T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \ > + T04 = _mm_sub_epi16(T02, T03); \ > + _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 0) * stride), > T04); \ > + T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \ > + T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \ > + T04 = _mm_sub_epi16(T02, T03); \ > + _mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 0) * stride), > T04); \ > + T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 1) * > stride)); \ > + T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 1) * > stride)); \ > + T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \ > + T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \ > + T04 = _mm_sub_epi16(T02, T03); \ > + _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 1) * stride), > T04); \ > + T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \ > + T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \ > + T04 = _mm_sub_epi16(T02, T03); \ > + 
_mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 1) * stride), > T04) > + > + for (int i = 0; i < 64; i += 2) > + { > + RESIDUAL_64x4(i, 0); > + RESIDUAL_64x4(i, 16); > + RESIDUAL_64x4(i, 32); > + RESIDUAL_64x4(i, 48); > This one can reuse the same RESIDUAL_2x16 macro as the 32x32 primitive (despite its name, RESIDUAL_64x4 also processes two rows of 16 pixels per invocation). > + } > +} > > - for (int y = 0; y < 64; y++) > - { > - f.load_a(fenc); > - p.load_a(pred); > - r = extend_low(f) - extend_low(p); > - r.store(resi); > - r = extend_high(f) - extend_high(p); > - r.store(resi + 8); > - > - f.load_a(fenc + 16); > - p.load_a(pred + 16); > - r = extend_low(f) - extend_low(p); > - r.store(resi + 16); > - r = extend_high(f) - extend_high(p); > - r.store(resi + 24); > - > - f.load_a(fenc + 32); > - p.load_a(pred + 32); > - r = extend_low(f) - extend_low(p); > - r.store(resi + 32); > - r = extend_high(f) - extend_high(p); > - r.store(resi + 40); > - > - f.load_a(fenc + 48); > - p.load_a(pred + 48); > - r = extend_low(f) - extend_low(p); > - r.store(resi + 48); > - r = extend_high(f) - extend_high(p); > - r.store(resi + 56); > - > - fenc += stride; > - pred += stride; > - resi += stride; > - } > -} > > void calcRecons4(pixel* pPred, short* pResi, pixel* pReco, short* pRecQt, > pixel* pRecIPred, int stride, int recstride, int ipredstride) > { > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
