On Mon, Oct 7, 2013 at 8:45 AM, <[email protected]> wrote:
> # HG changeset patch > # User Yuvaraj Venkatesh <[email protected]> > # Date 1381153465 -19800 > # Mon Oct 07 19:14:25 2013 +0530 > # Node ID 52ee436b58f9aa48757063bd678672d0ab56be01 > # Parent c010342f7605c86867824f5b525a8f84c0d2de1c > Replacing Residual4 from vector class to intrinsic. > These two patches have tab-stops. I've queued them for default after changing tabs to spaces and improving the commit messages. > diff -r c010342f7605 -r 52ee436b58f9 source/common/vec/pixel8.inc > --- a/source/common/vec/pixel8.inc Sun Oct 06 02:09:00 2013 -0500 > +++ b/source/common/vec/pixel8.inc Mon Oct 07 19:14:25 2013 +0530 > @@ -29,19 +29,35 @@ > > void getResidual4(pixel *fenc, pixel *pred, short *resi, int stride) > { > - for (int y = 0; y < 4; y++) > - { > - Vec16uc f; > - f.fromUint32(*(uint32_t*)fenc); > - Vec16uc p; > - p.fromUint32(*(uint32_t*)pred); > - Vec8s r = extend_low(f) - extend_low(p); > - store_partial(const_int(8), resi, r); > + __m128i T00, T01, T02; > > - fenc += stride; > - pred += stride; > - resi += stride; > - } > + T00 = _mm_cvtsi32_si128(*(uint32_t*)fenc); > + T01 = _mm_cvtsi32_si128(*(uint32_t*)pred); > + T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); > + T01 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + T02 = _mm_sub_epi16(T00, T01); > + _mm_storel_epi64((__m128i*)resi, T02); > + > + T00 = _mm_cvtsi32_si128(*(uint32_t*)(fenc + stride)); > + T01 = _mm_cvtsi32_si128(*(uint32_t*)(pred + stride)); > + T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); > + T01 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + T02 = _mm_sub_epi16(T00, T01); > + _mm_storel_epi64((__m128i*)(resi + stride), T02); > + > + T00 = _mm_cvtsi32_si128(*(uint32_t*)(fenc + (2) * stride)); > + T01 = _mm_cvtsi32_si128(*(uint32_t*)(pred + (2) * stride)); > + T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); > + T01 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + T02 = _mm_sub_epi16(T00, T01); > + _mm_storel_epi64((__m128i*)(resi + (2) * stride), T02); > + > + T00 = 
_mm_cvtsi32_si128(*(uint32_t*)(fenc + (3) * stride)); > + T01 = _mm_cvtsi32_si128(*(uint32_t*)(pred + (3) * stride)); > + T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); > + T01 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + T02 = _mm_sub_epi16(T00, T01); > + _mm_storel_epi64((__m128i*)(resi + (3) * stride), T02); > } > > void getResidual8(pixel *fenc, pixel *pred, short *resi, int stride) > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
