On Tue, Oct 8, 2013 at 12:08 PM, Steve Borho <[email protected]> wrote:
> > > > On Tue, Oct 8, 2013 at 4:05 AM, <[email protected]> wrote: > >> # HG changeset patch >> # User Yuvaraj Venkatesh <[email protected]> >> # Date 1381223006 -19800 >> # Tue Oct 08 14:33:26 2013 +0530 >> # Node ID 1a62566488b7ece9bbfb665e37ac402a08ce156e >> # Parent 96e30370f4d96c7fed69f432027ed3be8e01dcf6 >> pixel: replace getResidual32 from vector class to intrinsic >> >> diff -r 96e30370f4d9 -r 1a62566488b7 source/common/vec/pixel8.inc >> --- a/source/common/vec/pixel8.inc Tue Oct 08 14:16:23 2013 +0530 >> +++ b/source/common/vec/pixel8.inc Tue Oct 08 14:33:26 2013 +0530 >> @@ -79,31 +79,37 @@ >> RESIDUAL_16x4(12); >> } >> >> -void getResidual32(pixel *fenc, pixel *pred, short *resi, int stride) >> -{ >> - Vec16uc f, p; >> - Vec8s r; >> - >> - for (int y = 0; y < 32; y++) >> - { >> - f.load_a(fenc); >> - p.load_a(pred); >> - r = extend_low(f) - extend_low(p); >> - r.store(resi); >> - r = extend_high(f) - extend_high(p); >> - r.store(resi + 8); >> - >> - f.load_a(fenc + 16); >> - p.load_a(pred + 16); >> - r = extend_low(f) - extend_low(p); >> - r.store(resi + 16); >> - r = extend_high(f) - extend_high(p); >> - r.store(resi + 24); >> - >> - fenc += stride; >> - pred += stride; >> - resi += stride; >> - } >> +void getResidual32(pixel *fenc, pixel *pred, short *resi, int stride) >> +{ >> + __m128i T00, T01, T02, T03, T04; >> + >> +#define RESIDUAL_32x4(BASE, OFFSET) \ >> + T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 0) * >> stride)); \ >> + T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 0) * >> stride)); \ >> + T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \ >> + T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \ >> + T04 = _mm_sub_epi16(T02, T03); \ >> + _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 0) * stride), >> T04); \ >> + T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \ >> + T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \ >> + T04 = _mm_sub_epi16(T02, T03); \ >> + _mm_store_si128((__m128i*)(resi + 8 + OFFSET + 
(BASE + 0) * stride), >> T04); \ >> + T00 = _mm_load_si128((__m128i*)(fenc + OFFSET + (BASE + 1) * >> stride)); \ >> + T01 = _mm_load_si128((__m128i*)(pred + OFFSET + (BASE + 1) * >> stride)); \ >> + T02 = _mm_unpacklo_epi8(T00, _mm_setzero_si128()); \ >> + T03 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); \ >> + T04 = _mm_sub_epi16(T02, T03); \ >> + _mm_store_si128((__m128i*)(resi + OFFSET + (BASE + 1) * stride), >> T04); \ >> + T02 = _mm_unpackhi_epi8(T00, _mm_setzero_si128()); \ >> + T03 = _mm_unpackhi_epi8(T01, _mm_setzero_si128()); \ >> + T04 = _mm_sub_epi16(T02, T03); \ >> + _mm_store_si128((__m128i*)(resi + 8 + OFFSET + (BASE + 1) * stride), >> T04) >> + >> + for (int i = 0; i < 32; i += 2) >> + { >> + RESIDUAL_32x4(i, 0); >> + RESIDUAL_32x4(i, 16); >> > > I assume this macro should be named RESIDUAL_32x16; changing it, then queueing. > > Actually, it should be RESIDUAL_2x16: each invocation handles 2 rows (BASE + 0 and BASE + 1) of 16 pixels. -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
