From: Matt Turner <[email protected]>

Use unaligned-safe load helpers (ldq_u/ldl_u) for source and mask pointers
that may be misaligned in the MMX fast paths, and add aligning lead-in loops
so the vectorized loops only operate on aligned destinations.

Signed-off-by: Matt Turner <[email protected]>
---
 pixman/pixman-mmx.c |  109 +++++++++++++++++++++++++++++++++++---------------
 1 files changed, 76 insertions(+), 33 deletions(-)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 71fa18e..b9b09b6 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -298,6 +298,22 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
 
 #endif
 
+/* Elemental unaligned loads */
+
+static __inline__ uint64_t ldq_u(uint64_t *p)
+{
+    struct __una_u64 { uint64_t x __attribute__((packed)); };
+    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
+    return ptr->x;
+}
+
+static __inline__ uint32_t ldl_u(uint32_t *p)
+{
+    struct __una_u32 { uint32_t x __attribute__((packed)); };
+    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
+    return ptr->x;
+}
+
 static force_inline __m64
 load8888 (uint32_t v)
 {
@@ -1369,7 +1385,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
 
        while (w >= 2)
        {
-           __m64 vs = *(__m64 *)src;
+           __m64 vs = (__m64)ldq_u((uint64_t *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);
@@ -1454,14 +1470,14 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);
 
-           __m64 vs0 = *(__m64 *)(src + 0);
-           __m64 vs1 = *(__m64 *)(src + 2);
-           __m64 vs2 = *(__m64 *)(src + 4);
-           __m64 vs3 = *(__m64 *)(src + 6);
-           __m64 vs4 = *(__m64 *)(src + 8);
-           __m64 vs5 = *(__m64 *)(src + 10);
-           __m64 vs6 = *(__m64 *)(src + 12);
-           __m64 vs7 = *(__m64 *)(src + 14);
+           __m64 vs0 = (__m64)ldq_u((uint64_t *)(src + 0));
+           __m64 vs1 = (__m64)ldq_u((uint64_t *)(src + 2));
+           __m64 vs2 = (__m64)ldq_u((uint64_t *)(src + 4));
+           __m64 vs3 = (__m64)ldq_u((uint64_t *)(src + 6));
+           __m64 vs4 = (__m64)ldq_u((uint64_t *)(src + 8));
+           __m64 vs5 = (__m64)ldq_u((uint64_t *)(src + 10));
+           __m64 vs6 = (__m64)ldq_u((uint64_t *)(src + 12));
+           __m64 vs7 = (__m64)ldq_u((uint64_t *)(src + 14));
 
            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
@@ -2569,20 +2585,31 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp,
        src_line += src_stride;
        w = width;
 
-       if ((((unsigned long)dest_image & 3) == 0) &&
-           (((unsigned long)src_image & 3) == 0))
+       while (w && (unsigned long)dst & 3)
        {
-           while (w >= 4)
-           {
-               uint32_t *s = (uint32_t *)src;
-               uint32_t *d = (uint32_t *)dst;
+           uint8_t s, d;
+           uint16_t tmp;
+
+           s = *src;
+           d = *dst;
 
-               *d = store8888 (in (load8888 (*s), load8888 (*d)));
+           *dst = MUL_UN8 (s, d, tmp);
 
-               w -= 4;
-               dst += 4;
-               src += 4;
-           }
+           src++;
+           dst++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           uint32_t *s = (uint32_t *)src;
+           uint32_t *d = (uint32_t *)dst;
+
+           *d = store8888 (in (load8888 (*s), load8888 (*d)));
+
+           w -= 4;
+           dst += 4;
+           src += 4;
        }
 
        while (w--)
@@ -2637,20 +2664,36 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
        mask_line += mask_stride;
        w = width;
 
-       if ((((unsigned long)mask_image & 3) == 0) &&
-           (((unsigned long)dest_image  & 3) == 0))
+       while (w && (unsigned long)dst & 3)
        {
-           while (w >= 4)
-           {
-               __m64 vmask = load8888 (*(uint32_t *)mask);
-               __m64 vdest = load8888 (*(uint32_t *)dst);
+           uint16_t tmp;
+           uint16_t a;
+           uint32_t m, d;
+           uint32_t r;
 
-               *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
+           a = *mask++;
+           d = *dst;
 
-               w -= 4;
-               dst += 4;
-               mask += 4;
-           }
+           m = MUL_UN8 (sa, a, tmp);
+           r = ADD_UN8 (m, d, tmp);
+
+           *dst++ = r;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           __m64 vmask;
+           __m64 vdest;
+
+           vmask = load8888 ((uint32_t)ldl_u((uint32_t *)mask));
+           vdest = load8888 (*(uint32_t *)dst);
+
+           *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
+
+           dst += 4;
+           mask += 4;
+           w -= 4;
        }
 
        while (w--)
@@ -2713,7 +2756,7 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp,
 
        while (w >= 8)
        {
-           *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
+           *(__m64*)dst = _mm_adds_pu8 ((__m64)ldq_u((uint64_t *)src), *(__m64*)dst);
            dst += 8;
            src += 8;
            w -= 8;
@@ -2771,7 +2814,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
 
        while (w >= 2)
        {
-           dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
+           dst64 = _mm_adds_pu8 ((__m64)ldq_u((uint64_t *)src), *(__m64*)dst);
            *(uint64_t*)dst = to_uint64 (dst64);
            dst += 2;
            src += 2;
-- 
1.7.3.4

_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman

Reply via email to