# HG changeset patch
# User Yuvaraj Venkatesh <[email protected]>
# Date 1381315230 -19800
# Wed Oct 09 16:10:30 2013 +0530
# Node ID 1d3760e10f643954edb5dd8dd953c2511ff9a90f
# Parent fc7fbdd18bc0d6d7f98180332e065d83c054fe02
pixel: Replace weightUnidir vector class function with intrinsic.
diff -r fc7fbdd18bc0 -r 1d3760e10f64 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Wed Oct 09 00:00:10 2013 -0500
+++ b/source/common/vec/pixel8.inc Wed Oct 09 16:10:30 2013 +0530
@@ -27,33 +27,45 @@
/* intrinsics for when pixel type is uint8_t */
-void weightUnidir(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t
dstStride, int width, int height, int w0, int round, int shift, int offset)
-{
- int x, y;
- Vec8s tmp;
+// Weighted uni-directional prediction: for every sample computes
+// ((w0 * (src + IF_INTERNAL_OFFS) + round) >> shift) + offset, saturates the
+// result to 0..255 and stores it to dst. Strides are in elements of their
+// respective buffers. Full groups of four samples are handled per iteration;
+// a leftover tail is written as exactly two pixels.
+void weightUnidir(short *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+{
+    __m128i w00, roundoff, ofs, fs, tmpsrc, tmpdst, tmp;
+    int x, y, two;
+
+    w00 = _mm_set1_epi32(w0);
+    ofs = _mm_set1_epi32(IF_INTERNAL_OFFS);
+    fs = _mm_set1_epi32(offset);
+    roundoff = _mm_set1_epi32(round);
+    for (y = height - 1; y >= 0; y--)
+    {
+        for (x = 0; x <= width - 4; x += 4)
+        {
+            // src holds signed 16-bit samples: widen with sign extension, not zero extension
+            tmpsrc = _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)(src + x)));
+            tmpdst = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(w00, _mm_add_epi32(tmpsrc, ofs)), roundoff), shift), fs);
+            *(uint32_t*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(tmpdst, tmpdst), _mm_setzero_si128()));
+        }
+
+        if (width > x)
+        {
+            // read only the two samples that are stored (no 8-byte over-read); int promotion sign-extends them
+            tmpsrc = _mm_set_epi32(0, 0, src[x + 1], src[x]);
+            tmpdst = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(w00, _mm_add_epi32(tmpsrc, ofs)), roundoff), shift), fs);
+            tmp = _mm_packus_epi16(_mm_packs_epi32(tmpdst, tmpdst), _mm_setzero_si128());
+            two = _mm_extract_epi16(tmp, 0); // low 16 bits = the two packed 8-bit results
+            dst[x] = (pixel)two;
+            dst[x + 1] = (pixel)(two >> 8);
+        }
+        src += srcStride;
+        dst += dstStride;
+    }
+}
- Vec4i vw0(w0), vsrc, iofs(IF_INTERNAL_OFFS), ofs(offset), vround(round),
vdst;
- for (y = height - 1; y >= 0; y--)
- {
- for (x = 0; x <= width - 4; x += 4)
- {
- tmp = load_partial(const_int(8), src + x);
- vsrc = extend_low(tmp);
- vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
- store_partial(const_int(4), dst + x,
compress_unsafe(compress_saturated(vdst, vdst), 0));
- }
-
- if (width > x)
- {
- tmp = load_partial(const_int(4), src + x);
- vsrc = extend_low(tmp);
- vdst = ((vw0 * (vsrc + iofs) + vround) >> shift) + ofs;
- compress_unsafe(compress_saturated(vdst, vdst),
0).store_partial(2, dst + x);
- }
- src += srcStride;
- dst += dstStride;
- }
-}
void weightUnidirPixel(pixel *src, pixel *dst, intptr_t srcStride, intptr_t
dstStride, int width, int height, int w0, int round, int shift, int offset)
{
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel