Here is an SSSE3 optimization example.

We have not figured out why it shows no gain with cairo-perf, but we did see
some improvement when playing Flash content.

So it is provided here for reference.
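
For reviewers: the operation being accelerated is just a copy that forces
the alpha byte to 0xff. Below is a minimal scalar sketch of what
composite_line_src_x888_8888_ssse3 computes (count is in bytes, matching
the w*4 call site in pixman-ssse3.c; this sketch is our illustration, not
part of the patch):

#include <stdint.h>

static void
composite_line_src_x888_8888_c (uint32_t *dest, uint32_t *src, int32_t count)
{
    int32_t i;

    for (i = 0; i < count / 4; i++)
	dest[i] = src[i] | 0xff000000;	/* force alpha to 0xff */
}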

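For copies shorter than 48 bytes, the assembly advances both pointers to
the end of the run and indexes the jump table L(table_48bytes_fwd) by
count/4, so each fwd_write_* label falls through to the ones below it.
Roughly the same control flow in C (a sketch; the helper name and the
fall-through switch are ours):

#include <stdint.h>

static void
fwd_write_sketch (uint32_t *dest, const uint32_t *src, int32_t count)
{
    dest += count / 4;
    src  += count / 4;

    switch (count)	/* a multiple of 4, below 48 */
    {
    case 44: dest[-11] = src[-11] | 0xff000000;	/* fall through... */
    case 40: dest[-10] = src[-10] | 0xff000000;
    case 36: dest[-9]  = src[-9]  | 0xff000000;
    case 32: dest[-8]  = src[-8]  | 0xff000000;
    case 28: dest[-7]  = src[-7]  | 0xff000000;
    case 24: dest[-6]  = src[-6]  | 0xff000000;
    case 20: dest[-5]  = src[-5]  | 0xff000000;
    case 16: dest[-4]  = src[-4]  | 0xff000000;
    case 12: dest[-3]  = src[-3]  | 0xff000000;
    case 8:  dest[-2]  = src[-2]  | 0xff000000;
    case 4:  dest[-1]  = src[-1]  | 0xff000000;
    case 0:  break;
    }
}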

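The only SSSE3-specific instruction is palignr, which the shl_4/shl_8/shl_12
paths use to stitch two aligned 16-byte loads back into the unaligned data
between them. An intrinsics sketch of a single 16-byte step of the shl_4
case (again our illustration; the patch itself is pure assembly):

#include <stdint.h>
#include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8 is palignr */

/* src_aligned points 4 bytes below the real (unaligned) source and is
 * 16-byte aligned, as is dest. */
static void
copy16_shift4 (uint32_t *dest, const uint32_t *src_aligned)
{
    const __m128i alpha = _mm_set1_epi32 (0xff000000);
    __m128i prev = _mm_load_si128 ((const __m128i *) src_aligned);
    __m128i cur  = _mm_load_si128 ((const __m128i *) src_aligned + 1);
    /* take bytes 4..19 of the concatenation cur:prev */
    __m128i v    = _mm_alignr_epi8 (cur, prev, 4);

    _mm_store_si128 ((__m128i *) dest, _mm_or_si128 (v, alpha));
}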
From 39b7a01f019cb0494af2a3cec7d094efb9b0a2ce Mon Sep 17 00:00:00 2001
From: Xu, Samuel <[email protected]>
Date: Wed, 8 Dec 2010 22:19:50 +0800
Subject: [PATCH] [ssse3] Optimization for fetch_scanline_x8r8g8b8

Add an x8888 SSSE3 optimization.

Signed-off-by: Xu Samuel <[email protected]>
Signed-off-by: Ma Ling <[email protected]>
Signed-off-by: Zhao Yakui <[email protected]>
---
 pixman/Makefile.am            |    4 +-
 pixman/pixman-ssse3-x86-asm.S |  255 +++++++++++++++++++
 pixman/pixman-ssse3-x86-asm.h |  552 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-ssse3.c         |   47 ++++
 4 files changed, 857 insertions(+), 1 deletions(-)
 create mode 100755 pixman/pixman-ssse3-x86-asm.S
 create mode 100755 pixman/pixman-ssse3-x86-asm.h

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index ba6810c..c6a731c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -98,7 +98,8 @@ endif
 if USE_SSSE3
 noinst_LTLIBRARIES += libpixman-ssse3.la
 libpixman_ssse3_la_SOURCES = \
-       pixman-ssse3.c
+       pixman-ssse3.c \
+       pixman-ssse3-x86-asm.S
 libpixman_ssse3_la_CFLAGS = $(DEP_CFLAGS) $(SSSE3_CFLAGS)
 libpixman_ssse3_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
@@ -106,6 +107,7 @@ libpixman_1_la_LIBADD += libpixman-ssse3.la
 
 ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
 endif
+
 # arm simd code
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/pixman/pixman-ssse3-x86-asm.S b/pixman/pixman-ssse3-x86-asm.S
new file mode 100755
index 0000000..c9b187e
--- /dev/null
+++ b/pixman/pixman-ssse3-x86-asm.S
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Ma Ling ([email protected])
+ * Author:  Xu, Samuel ([email protected])
+ * Author:  Yakui, Zhao ([email protected])
+ */
+#include "pixman-ssse3-x86-asm.h"
+
+
+    .section .note.GNU-stack
+    .previous
+
+#if (!defined(__amd64__) && !defined(__x86_64__))
+
+       .section        .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+       .globl  __i686.get_pc_thunk.bx
+       .hidden __i686.get_pc_thunk.bx
+       ALIGN (4)
+       .type   __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+       movl    (%esp), %ebx
+       ret
+#endif
+       .section .text.ssse3,"ax",@progbits
+
+ENTRY(composite_line_src_x888_8888_ssse3)
+       /* This is a no-op on 64-bit. On 32-bit systems it
+        * saves the EBX register and fetches the input arguments.
+        */
+       ENTRANCE;
+       /* Check whether the copy count is >= 48.
+        * If it is, jump to 48bytesormore and copy the data
+        * with the XMM registers. Otherwise the general-purpose
+        * registers are used.
+        */
+       CMP_COPY_LENGTH $48;
+       jae L(48bytesormore);
+       /*
+        * When the copy length is less than 48, the pixel data is
+        * copied with general-purpose registers.
+        */
+       GOTO_FWD_COPY;
+
+       ALIGN (4)
+/* ECX (length) is >= 48 and EDX (dest) is 4-byte aligned.  */
+L(48bytesormore):
+       /* Check whether the source address and the destination
+        * address are mutually aligned on a 16-byte boundary. If they
+        * are, try the 128-byte loop copy mode. If they are not, use
+        * the packed-align copy mode that matches the relative
+        * misalignment, which can be 4, 8 or 12 bytes.
+        * Note that src/dest are also regarded as aligned once the
+        * first 16 bytes have been handled, for example:
+        * SRC: 0x48000005, DEST: 0x49000025
+        */
+       SHL_COPY_PREPROCESS;
+       /* If they are aligned, use the 128-byte loop copy mode */
+       jz L(shl_0);
+       /* Calculate the relative misalignment and dispatch to the
+        * matching unaligned copy mode. The possible misalignments
+        * are 4, 8 and 12 bytes.
+        */
+       GOTO_UNALIGNED_COPY;
+
+       ALIGN (4)
+L(shl_0):
+       /* Check whether the length is above 127 bytes. If not,
+        * run the 32-byte copy mode several times and then handle
+        * the remainder. For example, a length of 100 uses the
+        * 32-byte copy three times and then copies the remaining
+        * 4 bytes with general-purpose registers in forward-copy mode.
+        */
+       SHL0_COPY_PREPROCESS;
+       CMP_COPY_LENGTH $127
+       /* If the length is >= 128 bytes, use the 128-byte loop-copy mode */
+       ja L(shl_0_gobble);
+       DEC_COPY_LENGTH 32;
+
+       /* Copy 32 bytes at a time */
+       SHL0_COPY_32BYTES;
+       jb      L(shl_0_end)
+
+       SHL0_COPY_32BYTES;
+       jb      L(shl_0_end)
+
+       SHL0_COPY_32BYTES;
+       jb      L(shl_0_end)
+
+       SHL0_COPY_32BYTES;
+L(shl_0_end):
+       /* Handle the remaining bytes, which the 32-byte
+        * copy mode cannot cover.
+        */
+       SHL0_COPY_POSTPROCESS;
+
+L(shl_0_gobble):
+       SHL0_GOBBLE_PREPROCESS;
+       DEC_COPY_LENGTH 128
+       /* Use the 128-byte loop-copy mode; si/di/cx are updated as it
+        * runs. Each iteration copies 128 bytes with XMM registers */
+L(shl_0_gobble_cache_loop):
+       SHL0_COPY_128BYTES;
+
+       jae     L(shl_0_gobble_cache_loop)
+
+       /* Once fewer than 128 bytes remain, compare the length with
+        * 64/32/16 and handle the corresponding length of data */
+       CMP_COPY_LENGTH $-0x40;
+       INC_COPY_LENGTH 0x80;
+       jl L(shl_0_cache_less_64bytes);
+       /* Copy 64 bytes when the length is >= 64 */
+       HANDLE_64BYTES;
+L(shl_0_cache_less_64bytes):
+       CMP_COPY_LENGTH $32;
+       jb L(shl_0_cache_less_32bytes);
+       /* Copy 32 bytes when the length is >= 32 */
+       HANDLE_32BYTES;
+L(shl_0_cache_less_32bytes):
+       CMP_COPY_LENGTH $16;
+       /* Copy 16 bytes when the length is >= 16 */
+       jb L(shl_0_cache_less_16bytes);
+       HANDLE_16BYTES;
+
+L(shl_0_cache_less_16bytes):
+       /* Use the general-purpose registers to copy the remainder
+        * when fewer than 16 bytes are left
+        */
+       HANDLE_LESS_16BYTES;
+
+       ALIGN(4)
+       /* The src/dest misalignment is 4 bytes */
+L(shl_4):
+       SHL_PREPROCESS;
+       ALIGN(4)
+       /* Use the two-stage 32-byte copy, which accounts for the
+        * 4-byte misalignment while copying. Note that two
+        * 16-byte XMM registers are packed into one 16-byte
+        * XMM register.
+        */
+L(shl_4_loop):
+       SHL_COPY_STAGE_ONE $4
+       jb      L(shl_4_end)
+
+       SHL_COPY_STAGE_TWO $4
+       jae     L(shl_4_loop)
+L(shl_4_end):
+       /* Copy the remaining bytes */
+       SHL_POSTPROCESS(4);
+
+       ALIGN (4)
+       /* The src/dest misalignment is 8 bytes */
+L(shl_8):
+       SHL_PREPROCESS;
+       ALIGN(4)
+L(shl_8_loop):
+       SHL_COPY_STAGE_ONE $8
+       jb      L(shl_8_end)
+
+       SHL_COPY_STAGE_TWO $8
+       jae     L(shl_8_loop)
+L(shl_8_end):
+       SHL_POSTPROCESS(8);
+
+       /* The src/dest misalignment is 12 bytes */
+       ALIGN(4)
+L(shl_12):
+       SHL_PREPROCESS;
+       ALIGN(4)
+
+L(shl_12_loop):
+       SHL_COPY_STAGE_ONE $12
+       jb      L(shl_12_end)
+
+       SHL_COPY_STAGE_TWO $12
+
+       jae     L(shl_12_loop)
+L(shl_12_end):
+       SHL_POSTPROCESS(12);
+
+/* Forward copy for lengths < 48 */
+       ALIGN (4)
+L(fwd_write_44bytes):
+       fwd_write_bytes 44
+L(fwd_write_40bytes):
+       fwd_write_bytes 40
+L(fwd_write_36bytes):
+       fwd_write_bytes 36
+L(fwd_write_32bytes):
+       fwd_write_bytes 32
+L(fwd_write_28bytes):
+       fwd_write_bytes 28
+L(fwd_write_24bytes):
+       fwd_write_bytes 24
+L(fwd_write_20bytes):
+       fwd_write_bytes 20
+L(fwd_write_16bytes):
+       fwd_write_bytes 16
+L(fwd_write_12bytes):
+       fwd_write_bytes 12
+L(fwd_write_8bytes):
+       fwd_write_bytes 8
+L(fwd_write_4bytes):
+       fwd_write_bytes 4
+L(fwd_write_0bytes):
+       FWD_WRITE_0BYTES;
+
+
+/* The jump table for the forward copy, indexed by length. */
+       .pushsection .rodata.ssse3,"a",@progbits
+       ALIGN (2)
+L(table_48bytes_fwd):
+       .int    JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+       .int    JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+
+/* The jump table for the copies with different misalignments. */
+       ALIGN (2)
+L(shl_table):
+       .int    JMPTBL (L(shl_0), L(shl_table))
+       .int    JMPTBL (L(shl_4), L(shl_table))
+       .int    JMPTBL (L(shl_8), L(shl_table))
+       .int    JMPTBL (L(shl_12), L(shl_table))
+
+       .popsection
+
+END(composite_line_src_x888_8888_ssse3)
diff --git a/pixman/pixman-ssse3-x86-asm.h b/pixman/pixman-ssse3-x86-asm.h
new file mode 100755
index 0000000..d4a50ef
--- /dev/null
+++ b/pixman/pixman-ssse3-x86-asm.h
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Ma Ling ([email protected])
+ * Author:  Xu, Samuel ([email protected])
+ * Author:  Yakui, Zhao ([email protected])
+ *
+ */
+
+#ifndef L
+#define L(label)                       .L##label
+#endif
+
+/* The align macro: align to a 2^n-byte boundary */
+#ifndef ALIGN
+# define ALIGN(n)                      .p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc                 .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc                   .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)      .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)              .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)    .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state            .cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state             .cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)                   \
+       .type name,  @function;         \
+       .globl name;                    \
+       .p2align 4;                     \
+name:                                  \
+       cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)                     \
+       cfi_endproc;                    \
+       .size name, .-name
+#endif
+
+#define JMPTBL(I, B)   I - B
+
+/* Macro for the 128-byte copy. It is only for the aligned case.
+ * The si/di/cx registers are updated after the copy.
+ * Note that si/di must be aligned on a 16-byte boundary when
+ * the movdqa/movaps instructions are used.
+ */
+.macro ALIGN_COPY_128BYTES si, di, cx
+       movdqa  (\si), %xmm0
+       sub     $128, \cx
+       movaps  0x10(\si), %xmm1
+       por     %xmm6,  %xmm0
+       movaps  0x20(\si), %xmm2
+       por     %xmm6,  %xmm1
+       movaps  0x30(\si), %xmm3
+       por     %xmm6,  %xmm2
+       movdqa  %xmm0, (\di)
+       por     %xmm6,  %xmm3
+       movaps  %xmm1, 0x10(\di)
+       movaps  %xmm2, 0x20(\di)
+       movaps  %xmm3, 0x30(\di)
+       movaps  0x40(\si), %xmm0
+       lea     0x80(\di), \di
+       movaps  0x50(\si), %xmm1
+       por     %xmm6,  %xmm0
+       movaps  0x60(\si), %xmm2
+       por     %xmm6,  %xmm1
+       movaps  0x70(\si), %xmm3
+       por     %xmm6,  %xmm2
+       lea     0x80(\si), \si
+       movaps  %xmm0, -0x40(\di)
+       por     %xmm6,  %xmm3
+       movaps  %xmm1, -0x30(\di)
+       movaps  %xmm2, -0x20(\di)
+       movaps  %xmm3, -0x10(\di)
+.endm
+
+/* Macro for the 64-byte copy. It is only for the aligned case.
+ * The si/di/cx registers are updated after the copy.
+ */
+.macro ALIGN_COPY_64BYTES si, di, cx
+       movdqa  (\si), %xmm0
+       sub     $0x40, \cx
+       movdqa  0x10(\si), %xmm1
+       por     %xmm6,  %xmm0
+       movdqa  0x20(\si), %xmm2
+       por     %xmm6,  %xmm1
+       movdqa  0x30(\si), %xmm3
+       por     %xmm6,  %xmm2
+       movdqa  %xmm0, (\di)
+       lea     0x40(\si), \si
+       movdqa  %xmm1, 0x10(\di)
+       por     %xmm6,  %xmm3
+       movdqa  %xmm2, 0x20(\di)
+       movdqa  %xmm3, 0x30(\di)
+       lea     0x40(\di), \di
+.endm
+
+/* Macro for the 32-byte copy. It is only for the aligned case.
+ * The si/di/cx registers are updated after the copy.
+ */
+.macro ALIGN_COPY_32BYTES si, di, cx
+       movdqa  (\si), %xmm0
+       sub     $0x20, \cx
+       movdqa  0x10(\si), %xmm1
+       por     %xmm6,  %xmm0
+       lea     0x20(\si), \si
+       por     %xmm6,  %xmm1
+       movdqa  %xmm0, (\di)
+       movdqa  %xmm1, 0x10(\di)
+       lea     0x20(\di), \di
+.endm
+
+/* Macro for the 16-byte copy. It is only for the aligned case.
+ * The si/di/cx registers are updated after the copy.
+ */
+.macro ALIGN_COPY_16BYTES si, di, cx
+       movdqa  (\si), %xmm0
+       sub     $0x10, \cx
+       add     $0x10, \si
+       por     %xmm6,  %xmm0
+       movdqa  %xmm0, (\di)
+       add     $0x10, \di
+.endm
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
+#define ENTRANCE
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register containing the index into
+   the jump table.  SCALE is the scale of INDEX.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+       shr             $2, INDEX;                              \
+       lea             TABLE(%rip), %r11;                      \
+       movslq  (%r11, INDEX, SCALE), INDEX;                    \
+       lea             (%r11, INDEX), INDEX;                   \
+       jmp             *INDEX;                                 \
+       ud2
+
+/* Macro for the forward copy of the remaining few bytes. */
+.macro fwd_write_bytes x
+       movl    $0xff000000, %ecx
+       or      -\x(%rsi), %ecx
+       movl    %ecx, -\x(%rdi)
+.endm
+
+/* Macro for stage one of the shift copy, 32 bytes at a time.
+ * Two 16-byte XMM registers are packed into one 16-byte result
+ * with the palignr instruction. */
+.macro SHL_COPY_STAGE_ONE x
+       movaps  16(%rsi), %xmm2
+       sub     $32, %rdx
+       movaps  32(%rsi), %xmm3
+       lea     32(%rsi), %rsi
+       movdqa  %xmm3, %xmm4
+       palignr \x, %xmm2, %xmm3
+       lea     32(%rdi), %rdi
+       palignr \x, %xmm1, %xmm2
+       por     %xmm6, %xmm2
+       movaps  %xmm2, -32(%rdi)
+       por     %xmm6, %xmm3
+       movaps  %xmm3, -16(%rdi)
+.endm
+
+/* Macro for stage two of the shift copy, 32 bytes at a time.
+ * Two 16-byte XMM registers are packed into one 16-byte result
+ * with the palignr instruction. */
+.macro SHL_COPY_STAGE_TWO x
+       movaps  16(%rsi), %xmm2
+       sub     $32, %rdx
+       movaps  32(%rsi), %xmm3
+       lea     32(%rsi), %rsi
+       movdqa  %xmm3, %xmm1
+       palignr \x, %xmm2, %xmm3
+       lea     32(%rdi), %rdi
+       palignr \x, %xmm4, %xmm2
+       por     %xmm6, %xmm2
+       movaps  %xmm2, -32(%rdi)
+       por     %xmm6, %xmm3
+       movaps  %xmm3, -16(%rdi)
+.endm
+
+/* Macros to increment/decrement the copy-length register */
+.macro DEC_COPY_LENGTH x
+       lea     -\x(%rdx), %rdx;
+.endm
+
+.macro INC_COPY_LENGTH x
+       lea     \x(%rdx), %rdx;
+.endm
+
+/* Compare the copy length with the given value */
+.macro CMP_COPY_LENGTH x
+       cmp     \x, %rdx
+.endm
+
+/* The definition of the forward-copy mode.
+ * It jumps to the handler that matches the copy length.
+ */
+
+
+#define GOTO_FWD_COPY                  \
+       add     %rdx, %rsi;             \
+       add     %rdx, %rdi;             \
+       BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
+
+/* Round the dest up to a 16-byte-aligned address boundary and then
+ * calculate the misalignment of the src relative to the dest
+ */
+#define SHL_COPY_PREPROCESS            \
+       movdqu  (%rsi), %xmm0;          \
+       mov     $0xff000000, %ecx;      \
+       movd    %ecx, %xmm6;            \
+       mov     %rdi, %r8;              \
+       and     $-16, %rdi;             \
+       add     $16, %rdi;              \
+       mov     %r8, %rcx;              \
+       sub     %rdi, %rcx;             \
+       add     %rcx, %rdx;             \
+       sub     %rcx, %rsi;             \
+                                       \
+       mov     %rsi, %rcx;             \
+       pshufd  $0, %xmm6, %xmm6;       \
+       and     $0xf, %rcx;             \
+       por     %xmm6, %xmm0;
+
+/* Calculate the misalignment and dispatch to the unaligned-copy mode.
+ * The misalignment determines the shift used when packing two 16-byte
+ * blocks into another 16 bytes with the palignr instruction */
+#define GOTO_UNALIGNED_COPY            \
+       sub     %rcx, %rsi;             \
+       lea     L(shl_table)(%rip), %r11;       \
+       shr     $2, %rcx;               \
+       movaps  (%rsi), %xmm1;          \
+       movdqu  %xmm0, (%r8);           \
+       movslq  (%r11, %rcx, 4), %rcx;  \
+       lea     (%r11, %rcx), %rcx;     \
+       jmp     *%rcx;                  \
+       ud2;                            \
+
+/* This macro prepares for the aligned copy.
+ * It writes the first 16 bytes (possibly not on a 16-byte boundary).
+ */
+#define SHL0_COPY_PREPROCESS           \
+       movdqu  %xmm0, (%r8);           \
+       xor     %ecx, %ecx;
+
+/* The following two macro definitions are used to copy the
+ * data when the length is less than 128, trying to copy
+ * 32 bytes at a time.
+ */
+#define SHL0_COPY_32BYTES                      \
+       movdqa  (%rsi, %rcx), %xmm0;            \
+       sub     $32, %rdx;                      \
+       movdqa  16(%rsi, %rcx), %xmm1;          \
+       por     %xmm6,  %xmm0;                  \
+       por     %xmm6,  %xmm1;                  \
+       movdqa  %xmm0, (%rdi, %rcx);            \
+       movdqa  %xmm1, 16(%rdi, %rcx);          \
+       lea     32(%rcx), %rcx;
+
+/* When fewer than 32 bytes remain, use the forward copy */
+#define SHL0_COPY_POSTPROCESS                  \
+       lea     32(%rdx), %rdx;                 \
+       add     %rdx, %rcx;                     \
+       add     %rcx, %rsi;                     \
+       add     %rcx, %rdi;                     \
+       BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
+
+
+#define SHL0_COPY_128BYTES                     \
+       ALIGN_COPY_128BYTES %rsi, %rdi, %rdx
+
+#define SHL0_GOBBLE_PREPROCESS
+
+#define HANDLE_64BYTES                 \
+       ALIGN_COPY_64BYTES %rsi, %rdi, %rdx
+
+#define HANDLE_32BYTES                 \
+       ALIGN_COPY_32BYTES %rsi, %rdi, %rdx
+
+#define HANDLE_16BYTES                 \
+       ALIGN_COPY_16BYTES %rsi, %rdi, %rdx
+
+/* When fewer than 16 bytes remain */
+#define HANDLE_LESS_16BYTES                    \
+       add     %rdx, %rsi;                     \
+       add     %rdx, %rdi;                     \
+       BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4)
+
+/* The PREPROCESS definition for the unaligned cases: 4, 8, 12 */
+#define SHL_PREPROCESS                 \
+       lea     -32(%rdx), %rdx;        \
+
+/* The POSTPROCESS definition for the unaligned cases: 4, 8, 12 */
+#define SHL_POSTPROCESS(x)                     \
+       lea     32(%rdx), %rdx;                 \
+       lea     x(%rsi, %rdx), %rsi;            \
+       add     %rdx, %rdi;                     \
+       BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4)
+
+/* This is used when the copy is already finished */
+#define FWD_WRITE_0BYTES                       \
+       ret
+
+#else
+/* The following are the macro definitions for 32-bit */
+# define PARMS         8               /* Preserve EBX.  */
+# define DEST          PARMS
+# define SRC           DEST+4
+# define LEN           SRC+4
+
+#define CFI_PUSH(REG)                                          \
+  cfi_adjust_cfa_offset (4);                                   \
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)                                           \
+  cfi_adjust_cfa_offset (-4);                                  \
+  cfi_restore (REG)
+
+#define PUSH(REG)      pushl REG; CFI_PUSH (REG)
+#define POP(REG)       popl REG; CFI_POP (REG)
+
+# define RETURN_END    POP (%ebx); ret
+# define RETURN                RETURN_END; CFI_PUSH (%ebx)
+
+#define ENTRANCE                       \
+       PUSH (%ebx);                    \
+       movl    LEN(%esp), %ecx;        \
+       movl    SRC(%esp), %eax;        \
+       movl    DEST(%esp), %edx;
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing
+   the index into the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)           \
+    /* We first load PC into EBX.  */                          \
+    call       __i686.get_pc_thunk.bx;                         \
+    /* Get the address of the jump table.  */                  \
+    addl       $(TABLE - .), %ebx;                             \
+    shr                $2, INDEX;                              \
+    /* Get the entry and convert the relative offset to the    \
+       absolute address.  */                                   \
+    addl       (%ebx,INDEX,SCALE), %ebx;                       \
+    /* We loaded the jump table.  Go.  */                      \
+    jmp                *%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)                   \
+    addl       $(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)      \
+    shr                $2, INDEX;                                      \
+    addl       (%ebx,INDEX,SCALE), %ebx;                       \
+    /* We loaded the jump table.  Go.  */                      \
+    jmp                *%ebx
+
+.macro DEC_COPY_LENGTH x
+       lea     -\x(%ecx), %ecx;
+.endm
+
+.macro INC_COPY_LENGTH x
+       lea     \x(%ecx), %ecx
+.endm
+
+.macro CMP_COPY_LENGTH x
+       cmp     \x, %ecx
+.endm
+
+
+#define GOTO_FWD_COPY                  \
+       add     %ecx, %edx;             \
+       add     %ecx, %eax;             \
+       BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+
+#define SHL_COPY_PREPROCESS            \
+       movdqu  (%eax), %xmm0;          \
+       PUSH (%edi);                    \
+       mov     $0xff000000, %edi;      \
+       movd    %edi, %xmm6;            \
+       movl    %edx, %edi;             \
+       and     $-16, %edx;             \
+       PUSH (%esi);                    \
+       add     $16, %edx;              \
+       movl    %edi, %esi;             \
+       sub     %edx, %edi;             \
+       add     %edi, %ecx;             \
+       sub     %edi, %eax;             \
+                                       \
+       mov     %eax, %edi;             \
+       pshufd  $0, %xmm6, %xmm6;       \
+       and     $0xf, %edi;             \
+       por     %xmm6, %xmm0;           \
+
+#define GOTO_UNALIGNED_COPY            \
+       sub     %edi, %eax;             \
+       call    __i686.get_pc_thunk.bx; \
+       addl    $(L(shl_table)- .), %ebx;       \
+       shr     $2, %edi;               \
+       movaps  (%eax), %xmm1;          \
+       addl    (%ebx,%edi,4), %ebx;    \
+       movdqu  %xmm0, (%esi);          \
+       jmp     *%ebx;
+
+#define SHL0_COPY_PREPROCESS           \
+       movdqu  %xmm0, (%esi);          \
+       xor     %edi, %edi;             \
+       POP (%esi);
+
+#define SHL0_COPY_32BYTES              \
+       movdqa  (%eax, %edi), %xmm0;    \
+       sub     $32, %ecx;              \
+       movdqa  16(%eax, %edi), %xmm1;  \
+       por     %xmm6,  %xmm0;          \
+       por     %xmm6,  %xmm1;          \
+       movdqa  %xmm0, (%edx, %edi);    \
+       movdqa  %xmm1, 16(%edx, %edi);  \
+       lea     32(%edi), %edi;         \
+
+#define SHL0_COPY_POSTPROCESS                  \
+       lea     32(%ecx), %ecx;                 \
+       add     %ecx, %edi;                     \
+       add     %edi, %edx;                     \
+       add     %edi, %eax;                     \
+       add     $4, %ecx;                       \
+       and     $60, %ecx;                      \
+       POP (%edi);                             \
+       BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+#define SHL0_GOBBLE_PREPROCESS                 \
+       POP(%edi)
+
+#define SHL0_COPY_128BYTES                     \
+       ALIGN_COPY_128BYTES %eax, %edx, %ecx
+
+#define HANDLE_64BYTES                 \
+       ALIGN_COPY_64BYTES  %eax, %edx, %ecx
+
+#define HANDLE_32BYTES                 \
+       ALIGN_COPY_32BYTES  %eax, %edx, %ecx
+
+#define HANDLE_16BYTES                 \
+       ALIGN_COPY_16BYTES  %eax, %edx, %ecx
+
+#define HANDLE_LESS_16BYTES                    \
+       add     %ecx, %edx;                     \
+       add     %ecx, %eax;                     \
+       add     $4, %ecx;                       \
+       and     $60, %ecx;                      \
+       BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+/* The PREPROCESS definition for the unaligned cases: 4, 8, 12 */
+#define SHL_PREPROCESS                 \
+       BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd));     \
+       lea     -32(%ecx), %ecx;                                \
+       POP (%esi);                                             \
+       POP (%edi);                                             \
+
+/* The POSTPROCESS definition for the unaligned cases: 4, 8, 12 */
+#define SHL_POSTPROCESS(x)                     \
+       lea     32(%ecx), %ecx;                 \
+       add     %ecx, %edx;                     \
+       lea     x(%ecx, %eax), %eax;            \
+       BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+/* Macro for stage one of the shift copy, 32 bytes at a time */
+.macro SHL_COPY_STAGE_ONE x
+       movaps  16(%eax), %xmm2
+       sub     $32, %ecx
+       movaps  32(%eax), %xmm3
+       lea     32(%eax), %eax
+       movdqa  %xmm3, %xmm4
+       palignr \x, %xmm2, %xmm3
+       lea     32(%edx), %edx
+       palignr \x, %xmm1, %xmm2
+       por     %xmm6, %xmm2
+       movaps  %xmm2, -32(%edx)
+       por     %xmm6, %xmm3
+       movaps  %xmm3, -16(%edx)
+.endm
+
+/* Macro for stage two of the shift copy, 32 bytes at a time */
+.macro SHL_COPY_STAGE_TWO x
+       movaps  16(%eax), %xmm2
+       sub     $32, %ecx
+       movaps  32(%eax), %xmm3
+       lea     32(%eax), %eax
+       movdqa  %xmm3, %xmm1
+       palignr \x, %xmm2, %xmm3
+       lea     32(%edx), %edx
+       palignr \x, %xmm4, %xmm2
+       por     %xmm6, %xmm2
+       movaps  %xmm2, -32(%edx)
+       por     %xmm6, %xmm3
+       movaps  %xmm3, -16(%edx)
+.endm
+
+/* Macro for the forward copy of the remaining few bytes. */
+.macro fwd_write_bytes x
+       movl    $0xff000000, %ecx
+       or      -\x(%eax), %ecx
+       movl    %ecx, -\x(%edx)
+.endm
+
+#define FWD_WRITE_0BYTES                       \
+       movl    DEST(%esp), %eax;               \
+       RETURN
+
+#endif
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 8025ced..69d7a4a 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -32,7 +32,54 @@
 #include "pixman-private.h"
 
 #ifdef USE_SSSE3
+
+/*---------------------------------------------------------------------
+ * src_x888_8888
+ */
+extern void *composite_line_src_x888_8888_ssse3(uint32_t * dest,
+                                               uint32_t *src,
+                                               int32_t count);
+static void
+ssse3_composite_src_x888_8888(pixman_implementation_t  *imp,
+                             pixman_op_t               op,
+                             pixman_image_t            *src_image,
+                             pixman_image_t            *mask_image,
+                             pixman_image_t            *dst_image,
+                             int32_t                   src_x,
+                             int32_t                   src_y,
+                             int32_t                   mask_x,
+                             int32_t                   mask_y,
+                             int32_t                   dest_x,
+                             int32_t                   dest_y,
+                             int32_t                   width,
+                             int32_t                   height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int32_t w;
+    int dst_stride, src_stride;
+
+
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+       composite_line_src_x888_8888_ssse3(dst, src, w*4);
+    }
+}
+
 static const pixman_fast_path_t ssse3_fast_paths[] = {
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, ssse3_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, ssse3_composite_src_x888_8888),
     { PIXMAN_OP_NONE },
 };
 
-- 
1.7.0.4
