Here is an SSSE3 optimization example. I am not sure why it shows no measurable effect under cairo-perf, but we did see some improvement when playing Flash content.
So just provided here. >From 39b7a01f019cb0494af2a3cec7d094efb9b0a2ce Mon Sep 17 00:00:00 2001 From: Xu, Samuel <[email protected]> Date: Wed, 8 Dec 2010 22:19:50 +0800 Subject: [PATCH] [ssse3] Optimization for fetch_scanline_x8r8g8b8 Add x8888 ssse3 optimization. Signed-off-by: Xu Samuel <[email protected]> Signed-off-by: Ma Ling <[email protected]> Signed-off-by: Zhao Yakui <[email protected]> --- pixman/Makefile.am | 4 +- pixman/pixman-ssse3-x86-asm.S | 255 +++++++++++++++++++ pixman/pixman-ssse3-x86-asm.h | 552 +++++++++++++++++++++++++++++++++++++++++ pixman/pixman-ssse3.c | 47 ++++ 4 files changed, 857 insertions(+), 1 deletions(-) create mode 100755 pixman/pixman-ssse3-x86-asm.S create mode 100755 pixman/pixman-ssse3-x86-asm.h diff --git a/pixman/Makefile.am b/pixman/Makefile.am index ba6810c..c6a731c 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -98,7 +98,8 @@ endif if USE_SSSE3 noinst_LTLIBRARIES += libpixman-ssse3.la libpixman_ssse3_la_SOURCES = \ - pixman-ssse3.c + pixman-ssse3.c \ + pixman-ssse3-x86-asm.S libpixman_ssse3_la_CFLAGS = $(DEP_CFLAGS) $(SSSE3_CFLAGS) libpixman_ssse3_la_LIBADD = $(DEP_LIBS) libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS) @@ -106,6 +107,7 @@ libpixman_1_la_LIBADD += libpixman-ssse3.la ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS) endif + # arm simd code if USE_ARM_SIMD noinst_LTLIBRARIES += libpixman-arm-simd.la diff --git a/pixman/pixman-ssse3-x86-asm.S b/pixman/pixman-ssse3-x86-asm.S new file mode 100755 index 0000000..c9b187e --- /dev/null +++ b/pixman/pixman-ssse3-x86-asm.S @@ -0,0 +1,255 @@ +/* + * Copyright 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software 
is furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Ma Ling ([email protected]) + * Author: Xu, Samuel ([email protected]) + * Author: Yakui, Zhao ([email protected]) + */ +#include "pixman-ssse3-x86-asm.h" + + + .section .note.GNU-stack + .previous + +#if (!defined(__amd64__) && !defined(__x86_64__)) + + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + ALIGN (4) + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret +#endif + .section .text.ssse3,"ax",@progbits + +ENTRY(composite_line_src_x888_8888_ssse3) + /* This is meaningless on 64-bit. But on 32-bit system, + * it saves EBX register and get the input argument. + */ + ENTRANCE; + /* check whether the copy count is >= 48. + * if the copy count is >=48, goto 48bytesormore and use + * the XMM register to copy data. Otherwise the general + * purpose register is used. + */ + CMP_COPY_LENGTH $48; + jae L(48bytesormore); + /* + * When the copy length is less than 48, we will use the general-purpose + * register to copy the pixel data. + */ + GOTO_FWD_COPY; + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. 
*/ +L(48bytesormore): + /* Check whether the source address is aligned with the destination + * address at 16-bytes boundardy. If it is aligned, we will try to + * use 128-bytes loop copy mode. If they are not aligned, the + * packed-align copy mode will be used for the different unaligned + * length. For example: 4/8/12 + * It is noted that the following src/dest will also be regarded as + * aligned after we handle the first 16-bytes. + * SRC: 0x48000005 , DEST: 0x49000025 + */ + SHL_COPY_PREPROCESS; + /* If it is aligned, use the 128-bytes loop copy mode */ + jz L(shl_0); + /* calculate the different unaligned length and then use the + * unaligned copy mode. The possible unaligned length + * is 4/8/12 + */ + GOTO_UNALIGNED_COPY; + + ALIGN (4) +L(shl_0): + /* Compare whether the length is above 127bytes. If not, + * try to use 32-bytes copy mode several times and then handle the + * left length. For example: if the length is 100, it will use 32-bytes + * copy three times and then handle the left 4 bytes by using + * general-purpose register to do forward-copy mode. + */ + SHL0_COPY_PREPROCESS; + CMP_COPY_LENGTH $127 + /* If the length is >= 128bytes, we will use 128-byte loop-copy mode */ + ja L(shl_0_gobble); + DEC_COPY_LENGTH 32; + + /* Copy 32-bytes */ + SHL0_COPY_32BYTES; + jb L(shl_0_end) + + SHL0_COPY_32BYTES; + jb L(shl_0_end) + + SHL0_COPY_32BYTES; + jb L(shl_0_end) + + SHL0_COPY_32BYTES; +L(shl_0_end): + /* handle the left length, which can't be handled by 32-byte + * copy mode. + */ + SHL0_COPY_POSTPROCESS; + +L(shl_0_gobble): + SHL0_GOBBLE_PREPROCESS; + DEC_COPY_LENGTH 128 + /* Use the 128-byte loop-copy mode. The si/di/cx will be updated. 
Every + * loop will try to copy 128bytes by using XMM register */ +L(shl_0_gobble_cache_loop): + SHL0_COPY_128BYTES; + + jae L(shl_0_gobble_cache_loop) + + /* If the left length is < 128, it will compare the length with 64/32/16 + * then handle the corresponding length data */ + CMP_COPY_LENGTH $-0x40; + INC_COPY_LENGTH 0x80; + jl L(shl_0_cache_less_64bytes); + /* Copy 64bytes when length is >=64 */ + HANDLE_64BYTES; +L(shl_0_cache_less_64bytes): + CMP_COPY_LENGTH $32; + jb L(shl_0_cache_less_32bytes); + /* Copy 32bytes when length is >=32 */ + HANDLE_32BYTES; +L(shl_0_cache_less_32bytes): + CMP_COPY_LENGTH $16; + /* Copy 16bytes when length is >=16 */ + jb L(shl_0_cache_less_16bytes); + HANDLE_16BYTES; + +L(shl_0_cache_less_16bytes): + /* Use the general-purpose register to copy when the left length + * is < 16 + */ + HANDLE_LESS_16BYTES; + + ALIGN(4) + /* The unaligned length is 4 */ +L(shl_4): + SHL_PREPROCESS; + ALIGN(4) + /* Use the two-stage 32bytes copy. And the 4-unaligned length + * is considered in course of copy. It is noted that two + * 16-bytes XMM register will be packed into one 16-byte + * XMM register. 
+ */ +L(shl_4_loop): + SHL_COPY_STAGE_ONE $4 + jb L(shl_4_end) + + SHL_COPY_STAGE_TWO $4 + jae L(shl_4_loop) +L(shl_4_end): + /* Copy the left length */ + SHL_POSTPROCESS(4); + + ALIGN (4) + /* The unaligned length is 8 */ +L(shl_8): + SHL_PREPROCESS; + ALIGN(4) +L(shl_8_loop): + SHL_COPY_STAGE_ONE $8 + jb L(shl_8_end) + + SHL_COPY_STAGE_TWO $8 + jae L(shl_8_loop) +L(shl_8_end): + SHL_POSTPROCESS(8); + + /* The unaligned length is 12 */ + ALIGN(4) +L(shl_12): + SHL_PREPROCESS; + ALIGN(4) + +L(shl_12_loop): + SHL_COPY_STAGE_ONE $12 + jb L(shl_12_end) + + SHL_COPY_STAGE_TWO $12 + + jae L(shl_12_loop) +L(shl_12_end): + SHL_POSTPROCESS(12); + +/* Forward copy for the length < 48 */ + ALIGN (4) +L(fwd_write_44bytes): + fwd_write_bytes 44 +L(fwd_write_40bytes): + fwd_write_bytes 40 +L(fwd_write_36bytes): + fwd_write_bytes 36 +L(fwd_write_32bytes): + fwd_write_bytes 32 +L(fwd_write_28bytes): + fwd_write_bytes 28 +L(fwd_write_24bytes): + fwd_write_bytes 24 +L(fwd_write_20bytes): + fwd_write_bytes 20 +L(fwd_write_16bytes): + fwd_write_bytes 16 +L(fwd_write_12bytes): + fwd_write_bytes 12 +L(fwd_write_8bytes): + fwd_write_bytes 8 +L(fwd_write_4bytes): + fwd_write_bytes 4 +L(fwd_write_0bytes): + FWD_WRITE_0BYTES; + + +/* the Jump table for the forward copy with different length.*/ + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), 
L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + +/* the Jump table for the copy with different unalign case.*/ + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + + .popsection + +END(composite_line_src_x888_8888_ssse3) diff --git a/pixman/pixman-ssse3-x86-asm.h b/pixman/pixman-ssse3-x86-asm.h new file mode 100755 index 0000000..d4a50ef --- /dev/null +++ b/pixman/pixman-ssse3-x86-asm.h @@ -0,0 +1,552 @@ +/* + * Copyright 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Author: Ma Ling ([email protected]) + * Author: Xu, Samuel ([email protected]) + * Author: Yakui, Zhao ([email protected]) + * + */ + +#ifndef L +#define L(label) .L##label +#endif + +/* the align macro, which will be aligned to 2^n bytes boundary */ +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef cfi_remember_state +# define cfi_remember_state .cfi_remember_state +#endif + +#ifndef cfi_restore_state +# define cfi_restore_state .cfi_restore_state +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define JMPTBL(I, B) I - B + +/* the macro definition of 128-bytes copy. It is only for the aligned case. + * The si/di/cx register will also be updated after copy + * It is noted that the si/di should be algined at 16-bytes boundardy when + * the movdqa/movaps instruction is used. 
+ */ +.macro ALIGN_COPY_128BYTES si, di, cx + movdqa (\si), %xmm0 + sub $128, \cx + movaps 0x10(\si), %xmm1 + por %xmm6, %xmm0 + movaps 0x20(\si), %xmm2 + por %xmm6, %xmm1 + movaps 0x30(\si), %xmm3 + por %xmm6, %xmm2 + movdqa %xmm0, (\di) + por %xmm6, %xmm3 + movaps %xmm1, 0x10(\di) + movaps %xmm2, 0x20(\di) + movaps %xmm3, 0x30(\di) + movaps 0x40(\si), %xmm0 + lea 0x80(\di), \di + movaps 0x50(\si), %xmm1 + por %xmm6, %xmm0 + movaps 0x60(\si), %xmm2 + por %xmm6, %xmm1 + movaps 0x70(\si), %xmm3 + por %xmm6, %xmm2 + lea 0x80(\si), \si + movaps %xmm0, -0x40(\di) + por %xmm6, %xmm3 + movaps %xmm1, -0x30(\di) + movaps %xmm2, -0x20(\di) + movaps %xmm3, -0x10(\di) +.endm + +/* the macro definition of 64-bytes copy. It is only for the aligned case. + * The si/di/cx register will also be updated after copy + */ +.macro ALIGN_COPY_64BYTES si, di, cx + movdqa (\si), %xmm0 + sub $0x40, \cx + movdqa 0x10(\si), %xmm1 + por %xmm6, %xmm0 + movdqa 0x20(\si), %xmm2 + por %xmm6, %xmm1 + movdqa 0x30(\si), %xmm3 + por %xmm6, %xmm2 + movdqa %xmm0, (\di) + lea 0x40(\si), \si + movdqa %xmm1, 0x10(\di) + por %xmm6, %xmm3 + movdqa %xmm2, 0x20(\di) + movdqa %xmm3, 0x30(\di) + lea 0x40(\di), \di +.endm + +/* the macro definition of 32-bytes copy. It is only for the aligned case. + * The si/di/cx register will also be updated after copy + */ +.macro ALIGN_COPY_32BYTES si, di, cx + movdqa (\si), %xmm0 + sub $0x20, \cx + movdqa 0x10(\si), %xmm1 + por %xmm6, %xmm0 + lea 0x20(\si), \si + por %xmm6, %xmm1 + movdqa %xmm0, (\di) + movdqa %xmm1, 0x10(\di) + lea 0x20(\di), \di +.endm + +/* the macro definition of 16-bytes copy. It is only for the aligned case. + * The si/di/cx register will also be updated after copy + */ +.macro ALIGN_COPY_16BYTES si, di, cx + movdqa (\si), %xmm0 + sub $0x10, \cx + add $0x10, \si + por %xmm6, %xmm0 + movdqa %xmm0, (\di) + add $0x10, \di +.endm + +#if (defined(__amd64__) || defined(__x86_64__)) + +#define ENTRANCE + +/* Branch to an entry in a jump table. 
TABLE is a jump table with + relative offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + shr $2, INDEX; \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + jmp *INDEX; \ + ud2 + +/* the macro definition of forward copy for the remaining small bytes.*/ +.macro fwd_write_bytes x + movl $0xff000000, %ecx + or -\x(%rsi), %ecx + movl %ecx, -\x(%rdi) +.endm + +/* the macro definition of shift copy in stage one . 32bytes. + * Two 16-bytes XMM register will be packed into another 16-byte by using palignr + * instruction. */ +.macro SHL_COPY_STAGE_ONE x + movaps 16(%rsi), %xmm2 + sub $32, %rdx + movaps 32(%rsi), %xmm3 + lea 32(%rsi), %rsi + movdqa %xmm3, %xmm4 + palignr \x, %xmm2, %xmm3 + lea 32(%rdi), %rdi + palignr \x, %xmm1, %xmm2 + por %xmm6, %xmm2 + movaps %xmm2, -32(%rdi) + por %xmm6, %xmm3 + movaps %xmm3, -16(%rdi) +.endm + +/* the macro definition of shift copy in stage two . 32bytes. + * Two 16-bytes XMM register will be packed into another 16-byte by using palignr + * instruction. */ +.macro SHL_COPY_STAGE_TWO x + movaps 16(%rsi), %xmm2 + sub $32, %rdx + movaps 32(%rsi), %xmm3 + lea 32(%rsi), %rsi + movdqa %xmm3, %xmm1 + palignr \x, %xmm2, %xmm3 + lea 32(%rdi), %rdi + palignr \x, %xmm4, %xmm2 + por %xmm6, %xmm2 + movaps %xmm2, -32(%rdi) + por %xmm6, %xmm3 + movaps %xmm3, -16(%rdi) +.endm + +/* the macro definition of INC/DEC copy-length register */ +.macro DEC_COPY_LENGTH x + lea -\x(%rdx), %rdx; +.endm + +.macro INC_COPY_LENGTH x + lea \x(%rdx), %rdx; +.endm + +/* Compare the length with the given length */ +.macro CMP_COPY_LENGTH x + cmp \x, %rdx +.endm + +/* The definition of forward-copy mode. 
+ * It will jump to the corresponding function based on the copy-length + */ + + +#define GOTO_FWD_COPY \ + add %rdx, %rsi; \ + add %rdx, %rdi; \ + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4) + +/* Round up the dest to 16-aligned address boundary and then + * calculate whether the src is aligned with the dest + */ +#define SHL_COPY_PREPROCESS \ + movdqu (%rsi), %xmm0; \ + mov $0xff000000, %ecx; \ + movd %ecx, %xmm6; \ + mov %rdi, %r8; \ + and $-16, %rdi; \ + add $16, %rdi; \ + mov %r8, %rcx; \ + sub %rdi, %rcx; \ + add %rcx, %rdx; \ + sub %rcx, %rsi; \ + \ + mov %rsi, %rcx; \ + pshufd $0, %xmm6, %xmm6; \ + and $0xf, %rcx; \ + por %xmm6, %xmm0; + +/* Caculate the unaligned-length and use the unaligned-copy mode. + * The unaligned-length should be considered when packing two 16-bytes into + * another 16-bytes by using palignr instruction */ +#define GOTO_UNALIGNED_COPY \ + sub %rcx, %rsi; \ + lea L(shl_table)(%rip), %r11; \ + shr $2, %rcx; \ + movaps (%rsi), %xmm1; \ + movdqu %xmm0, (%r8); \ + movslq (%r11, %rcx, 4), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx; \ + ud2; \ + +/* The macro definition is to make preparation for aligned-copy. + * Write the first 16-byte.(Maybe it is not on the 16-byte boundary). + */ +#define SHL0_COPY_PREPROCESS \ + movdqu %xmm0, (%r8); \ + xor %ecx, %ecx; + +/* the following two macro definitions are used to copy the + * data when the length is less than 128. Every time it will + * try to copy 32-bytes. 
+ */ +#define SHL0_COPY_32BYTES \ + movdqa (%rsi, %rcx), %xmm0; \ + sub $32, %rdx; \ + movdqa 16(%rsi, %rcx), %xmm1; \ + por %xmm6, %xmm0; \ + por %xmm6, %xmm1; \ + movdqa %xmm0, (%rdi, %rcx); \ + movdqa %xmm1, 16(%rdi, %rcx); \ + lea 32(%rcx), %rcx; + +/* when the left length is less than 32, Use the forward-copy */ +#define SHL0_COPY_POSTPROCESS \ + lea 32(%rdx), %rdx; \ + add %rdx, %rcx; \ + add %rcx, %rsi; \ + add %rcx, %rdi; \ + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4) + + +#define SHL0_COPY_128BYTES \ + ALIGN_COPY_128BYTES %rsi, %rdi, %rdx + +#define SHL0_GOBBLE_PREPROCESS + +#define HANDLE_64BYTES \ + ALIGN_COPY_64BYTES %rsi, %rdi, %rdx + +#define HANDLE_32BYTES \ + ALIGN_COPY_32BYTES %rsi, %rdi, %rdx + +#define HANDLE_16BYTES \ + ALIGN_COPY_16BYTES %rsi, %rdi, %rdx + +/* When the left length is < 16 */ +#define HANDLE_LESS_16BYTES \ + add %rdx, %rsi; \ + add %rdx, %rdi; \ + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %rdx, 4) + +/* the definition of PREPROCESS for unaligned case. 4, 8, 12 */ +#define SHL_PREPROCESS \ + lea -32(%rdx), %rdx; \ + +/* the definition of POSTPROCESS for unaligned case. 4, 8, 12 */ +#define SHL_POSTPROCESS(x) \ + lea 32(%rdx), %rdx; \ + lea x(%rsi, %rdx), %rsi; \ + add %rdx, %rdi; \ + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %rdx, 4) + +/* This is used when the copy is alread finished */ +#define FWD_WRITE_0BYTES \ + ret + +#else +/* the following is the macro definition on 32-bits */ +# define PARMS 8 /* Preserve EBX. 
*/ +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) + +#define ENTRANCE \ + PUSH (%ebx); \ + movl LEN(%esp), %ecx; \ + movl SRC(%esp), %eax; \ + movl DEST(%esp), %edx; + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + shr $2, INDEX; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ + addl $(TABLE - .), %ebx + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + shr $2, INDEX; \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. 
*/ \ + jmp *%ebx + +.macro DEC_COPY_LENGTH x + lea -\x(%ecx), %ecx; +.endm + +.macro INC_COPY_LENGTH x + lea \x(%ecx), %ecx +.endm + +.macro CMP_COPY_LENGTH x + cmp \x, %ecx +.endm + + +#define GOTO_FWD_COPY \ + add %ecx, %edx; \ + add %ecx, %eax; \ + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + + +#define SHL_COPY_PREPROCESS \ + movdqu (%eax), %xmm0; \ + PUSH (%edi); \ + mov $0xff000000, %edi; \ + movd %edi, %xmm6; \ + movl %edx, %edi; \ + and $-16, %edx; \ + PUSH (%esi); \ + add $16, %edx; \ + movl %edi, %esi; \ + sub %edx, %edi; \ + add %edi, %ecx; \ + sub %edi, %eax; \ + \ + mov %eax, %edi; \ + pshufd $0, %xmm6, %xmm6; \ + and $0xf, %edi; \ + por %xmm6, %xmm0; \ + +#define GOTO_UNALIGNED_COPY \ + sub %edi, %eax; \ + call __i686.get_pc_thunk.bx; \ + addl $(L(shl_table)- .), %ebx; \ + shr $2, %edi; \ + movaps (%eax), %xmm1; \ + addl (%ebx,%edi,4), %ebx; \ + movdqu %xmm0, (%esi); \ + jmp *%ebx; + +#define SHL0_COPY_PREPROCESS \ + movdqu %xmm0, (%esi); \ + xor %edi, %edi; \ + POP (%esi); + +#define SHL0_COPY_32BYTES \ + movdqa (%eax, %edi), %xmm0; \ + sub $32, %ecx; \ + movdqa 16(%eax, %edi), %xmm1; \ + por %xmm6, %xmm0; \ + por %xmm6, %xmm1; \ + movdqa %xmm0, (%edx, %edi); \ + movdqa %xmm1, 16(%edx, %edi); \ + lea 32(%edi), %edi; \ + +#define SHL0_COPY_POSTPROCESS \ + lea 32(%ecx), %ecx; \ + add %ecx, %edi; \ + add %edi, %edx; \ + add %edi, %eax; \ + add $4, %ecx; \ + and $60, %ecx; \ + POP (%edi); \ + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + +#define SHL0_GOBBLE_PREPROCESS \ + POP(%edi) + +#define SHL0_COPY_128BYTES \ + ALIGN_COPY_128BYTES %eax, %edx, %ecx + +#define HANDLE_64BYTES \ + ALIGN_COPY_64BYTES %eax, %edx, %ecx + +#define HANDLE_32BYTES \ + ALIGN_COPY_32BYTES %eax, %edx, %ecx + +#define HANDLE_16BYTES \ + ALIGN_COPY_16BYTES %eax, %edx, %ecx + +#define HANDLE_LESS_16BYTES \ + add %ecx, %edx; \ + add %ecx, %eax; \ + add $4, %ecx; \ + and $60, %ecx; \ + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + +/* the 
definition of PREPROCESS for unaligned case. 4, 8, 12 */ +#define SHL_PREPROCESS \ + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)); \ + lea -32(%ecx), %ecx; \ + POP (%esi); \ + POP (%edi); \ + +/* the definition of POSTPROCESS for unaligned case. 4, 8, 12 */ +#define SHL_POSTPROCESS(x) \ + lea 32(%ecx), %ecx; \ + add %ecx, %edx; \ + lea x(%ecx, %eax), %eax; \ + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + +/* the macro definition of shift copy in stage one . 32bytes */ +.macro SHL_COPY_STAGE_ONE x + movaps 16(%eax), %xmm2 + sub $32, %ecx + movaps 32(%eax), %xmm3 + lea 32(%eax), %eax + movdqa %xmm3, %xmm4 + palignr \x, %xmm2, %xmm3 + lea 32(%edx), %edx + palignr \x, %xmm1, %xmm2 + por %xmm6, %xmm2 + movaps %xmm2, -32(%edx) + por %xmm6, %xmm3 + movaps %xmm3, -16(%edx) +.endm + +/* the macro definition of shift copy in stage two . 32bytes */ +.macro SHL_COPY_STAGE_TWO x + movaps 16(%eax), %xmm2 + sub $32, %ecx + movaps 32(%eax), %xmm3 + lea 32(%eax), %eax + movdqa %xmm3, %xmm1 + palignr \x, %xmm2, %xmm3 + lea 32(%edx), %edx + palignr \x, %xmm4, %xmm2 + por %xmm6, %xmm2 + movaps %xmm2, -32(%edx) + por %xmm6, %xmm3 + movaps %xmm3, -16(%edx) +.endm + +/* the macro definition of forward copy for the remaining small bytes.*/ +.macro fwd_write_bytes x + movl $0xff000000, %ecx + or -\x(%eax), %ecx + movl %ecx, -\x(%edx) +.endm + +#define FWD_WRITE_0BYTES \ + movl DEST(%esp), %eax; \ + RETURN + +#endif diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c index 8025ced..69d7a4a 100644 --- a/pixman/pixman-ssse3.c +++ b/pixman/pixman-ssse3.c @@ -32,7 +32,54 @@ #include "pixman-private.h" #ifdef USE_SSSE3 + +/*--------------------------------------------------------------------- + * src_x888_8888 + */ +extern void *composite_line_src_x888_8888_ssse3(uint32_t * dest, + uint32_t *src, + int32_t count); +static void +ssse3_composite_src_x888_8888(pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t *src_image, + pixman_image_t *mask_image, + 
pixman_image_t *dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; + + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + composite_line_src_x888_8888_ssse3(dst, src, w*4); + } +} + static const pixman_fast_path_t ssse3_fast_paths[] = { + /* PIXMAN_OP_OVER */ + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, ssse3_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, ssse3_composite_src_x888_8888), { PIXMAN_OP_NONE }, }; -- 1.7.0.4 _______________________________________________ Pixman mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/pixman
