On Thu, Oct 10, 2013 at 2:05 AM, <[email protected]> wrote:
> # HG changeset patch > # User Yuvaraj Venkatesh <[email protected]> > # Date 1381388695 -19800 > # Thu Oct 10 12:34:55 2013 +0530 > # Node ID f0df51b8dc1c16c7a0dbbe1adacdb488b12cbde2 > # Parent 283fbedb6265f82ee62651edf25ec5ce767d8f4f > pixel: replace sse_sp64 vector class with intrinsic > Nice, only sse_pp4 remains of the 8bpp vector pixel functions. I've moved it into pixel-sse41.cpp and deleted sse.inc > > diff -r 283fbedb6265 -r f0df51b8dc1c source/common/vec/sse.inc > --- a/source/common/vec/sse.inc Thu Oct 10 12:29:30 2013 +0530 > +++ b/source/common/vec/sse.inc Thu Oct 10 12:34:55 2013 +0530 > @@ -543,57 +543,65 @@ > template<int ly> > int sse_sp64(short* fenc, intptr_t strideFenc, pixel* fref, intptr_t > strideFref) > { > - int rows = ly; > - Vec8s m1; > - Vec16uc n1; > + __m128i sum0 = _mm_setzero_si128(); > + __m128i sum1 = _mm_setzero_si128(); > > - Vec8us diff_low(0), diff_high(0); > - Vec4i sum_low(0), sum_high(0); > - for (; rows != 0; rows--) > + for(int i = 0; i < ly; i++) > { > - n1.load(fref); > - m1.load(fenc); > - diff_low = m1 - extend_low(n1); > - m1.load(fenc + 8); > - diff_high = m1 - extend_high(n1); > - diff_low = diff_low * diff_low; > - diff_high = diff_high * diff_high; > - sum_low += (extend_low(diff_low) + extend_low(diff_high)); > - sum_high += (extend_high(diff_low) + extend_high(diff_high)); > + __m128i T00, T01, T02; > + __m128i T10, T11, T12, T13; > > - n1.load(fref + 16); > - m1.load(fenc + 16); > - diff_low = m1 - extend_low(n1); > - m1.load(fenc + 24); > - diff_high = m1 - extend_high(n1); > - diff_low = diff_low * diff_low; > - diff_high = diff_high * diff_high; > - sum_low += (extend_low(diff_low) + extend_low(diff_high)); > - sum_high += (extend_high(diff_low) + extend_high(diff_high)); > + T00 = _mm_loadu_si128((__m128i*)(fenc)); > + T01 = _mm_loadu_si128((__m128i*)(fref)); > + T02 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > > - n1.load(fref + 32); > - m1.load(fenc + 32); > - diff_low = m1 - extend_low(n1); > - 
m1.load(fenc + 40); > - diff_high = m1 - extend_high(n1); > - diff_low = diff_low * diff_low; > - diff_high = diff_high * diff_high; > - sum_low += (extend_low(diff_low) + extend_low(diff_high)); > - sum_high += (extend_high(diff_low) + extend_high(diff_high)); > + SSE_SP8x1; > > - n1.load(fref + 48); > - m1.load(fenc + 48); > - diff_low = m1 - extend_low(n1); > - m1.load(fenc + 56); > - diff_high = m1 - extend_high(n1); > - diff_low = diff_low * diff_low; > - diff_high = diff_high * diff_high; > - sum_low += (extend_low(diff_low) + extend_low(diff_high)); > - sum_high += (extend_high(diff_low) + extend_high(diff_high)); > + T00 = _mm_loadu_si128((__m128i*)(fenc + 8)); > + T02 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > + > + T00 = _mm_loadu_si128((__m128i*)(fenc + 16)); > + T01 = _mm_loadu_si128((__m128i*)(fref + 16)); > + T02 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > + > + T00 = _mm_loadu_si128((__m128i*)(fenc + 24)); > + T02 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > + > + T00 = _mm_loadu_si128((__m128i*)(fenc + 32)); > + T01 = _mm_loadu_si128((__m128i*)(fref + 32)); > + T02 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > + > + T00 = _mm_loadu_si128((__m128i*)(fenc + 40)); > + T02 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > + > + T00 = _mm_loadu_si128((__m128i*)(fenc + 48)); > + T01 = _mm_loadu_si128((__m128i*)(fref + 48)); > + T02 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > + > + T00 = _mm_loadu_si128((__m128i*)(fenc + 56)); > + T02 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); > + > + SSE_SP8x1; > > fenc += strideFenc; > fref += strideFref; > } > + sum0 = _mm_add_epi32(sum0, sum1); > > - return horizontal_add(sum_low) + horizontal_add(sum_high); > + sum0 = _mm_hadd_epi32(sum0, _mm_setzero_si128()); > + sum0 = _mm_hadd_epi32(sum0, _mm_setzero_si128()); > + > + return _mm_cvtsi128_si32(sum0); > } > 
_______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel >

--
Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
