On Tue, Oct 8, 2013 at 8:26 AM, <[email protected]> wrote:
> # HG changeset patch > # User Dnyaneshwar Gorade <[email protected]> > # Date 1381238689 -19800 > # Tue Oct 08 18:54:49 2013 +0530 > # Node ID 70927cb4bb4cc12d2dbb4a65590a92dc77b2b545 > # Parent 41e5e72e2a4688642f7a46041c50fcc30972c4ab > pixel8.inc: replace weightUnidirPixel vector class function with intrinsic. > > diff -r 41e5e72e2a46 -r 70927cb4bb4c source/common/vec/pixel8.inc > --- a/source/common/vec/pixel8.inc Mon Oct 07 16:51:18 2013 -0500 > +++ b/source/common/vec/pixel8.inc Tue Oct 08 18:54:49 2013 +0530 > @@ -240,31 +240,52 @@ > } > } > > -void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, > intptr_t dstStride, int width, int height, int w0, int round, int shift, > int offset) > +void weightUnidirPixel(pixel *arg_src, pixel *arg_dst, intptr_t > srcStride, intptr_t dstStride, int width, int height, int w0, int > arg_round, int shift, int offset) > { > int x, y; > - Vec16uc tmp; > + __m128i temp; > + __m128i vw0 = _mm_set1_epi32(w0); // broadcast > (32-bit integer) w0 to all elements of vw0 > + __m128i iofs = _mm_set1_epi32(IF_INTERNAL_OFFS); > + __m128i ofs = _mm_set1_epi32(offset); > + __m128i round = _mm_set1_epi32(arg_round); > + __m128i src, dst; > > - Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), > vround(round), vdst; > for (y = height - 1; y >= 0; y--) > { > for (x = 0; x <= width - 4; x += 4) > { > - tmp = load_partial(const_int(4), src + x); > // The intermediate results would outgrow 16 bits because > internal offset is too high > - vsrc = extend_low(extend_low(tmp)); > - vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs; > - store_partial(const_int(4), dst + x, > compress_unsafe(compress_saturated(vdst, vdst), 0)); > + temp = _mm_cvtsi32_si128(*(uint32_t*) (arg_src + x)); > + // extend the low 4 elements to 32 bits with zero extension > + src = _mm_unpacklo_epi16(_mm_unpacklo_epi16(temp, > _mm_setzero_si128()), _mm_setzero_si128()); > + dst = _mm_add_epi32((_mm_mul_epi32(vw0, _mm_add_epi32(src, > iofs))), 
round); > + dst = _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift)); > + dst = _mm_add_epi32(dst, ofs); > + __m128i tmp = _mm_shuffle_epi32(dst, 2); > + dst = _mm_add_epi64(dst, tmp); > + *(uint32_t*)(arg_dst + x) = > _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst, dst), > _mm_setzero_si128())); > } > - > if (width > x) > { > - tmp = load_partial(const_int(4), src + x); > - vsrc = extend_low(extend_low(tmp)); > - vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs; > - compress_unsafe(compress_saturated(vdst, vdst), > 0).store_partial(2, dst + x); > + temp = _mm_cvtsi32_si128(*(uint32_t*)(arg_src + x)); > + src = _mm_unpacklo_epi16(_mm_unpacklo_epi16(temp, > _mm_setzero_si128()), _mm_setzero_si128()); > + dst = _mm_add_epi32((_mm_mul_epi32(vw0, _mm_add_epi32(src, > iofs))), round); > _mm_mul_epi32 (pmuldq) is an SSE4.1 instruction, so this function needs to be moved to pixel-sse41.cpp. > + dst = _mm_add_epi32(dst, ofs); > + __m128i tmp = _mm_shuffle_epi32(dst, 2); > + dst = _mm_add_epi64(dst, tmp); > + dst = _mm_sra_epi32(dst, _mm_cvtsi32_si128(shift)); > + temp = _mm_packus_epi16(_mm_packs_epi32(dst,dst), > _mm_setzero_si128()); > + > + union > + { > + int8_t c[16]; > + int16_t s[8]; > + } u; > + > + _mm_storeu_si128((__m128i*)u.c, temp); > + ((int16_t*)(arg_dst + x))[0] = u.s[0]; > } > - src += srcStride; > - dst += dstStride; > + arg_src += srcStride; > + arg_dst += dstStride; > } > } > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > -- Steve Borho
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
