From: Xianju Diao <xianjud...@gmail.com>

make check: when USE_OPENMP is enabled, the 'glyph-test' and 'cover-test'
tests fail on Loongson-3A3000. Neither of the two tests passes even without
the optimized code, so this may be a multi-core synchronization bug in the
CPU; I will continue to debug this problem. For now I wrap the test call in
an OpenMP critical section, and 'glyph-test' and 'cover-test' pass.
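[Editor's illustration, not part of the patch: the workaround mentioned above
is the standard OpenMP pattern of serializing a suspect call with
"#pragma omp critical", exactly as the test/utils.c hunk at the end of this
patch does. A minimal self-contained sketch follows; unsafe_checksum() is a
hypothetical stand-in for pixman's call_test_function(). Build with
"gcc -fopenmp".]

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a call that misbehaves when run
     * concurrently; only the guarding pattern matters here. */
    static uint32_t
    unsafe_checksum (int seed)
    {
        return (uint32_t)seed * 2654435761u; /* multiplicative hash */
    }

    int
    main (void)
    {
        uint32_t checksum = 0;
        int i;

    #pragma omp parallel for reduction(+:checksum)
        for (i = 0; i < 1000; i++)
        {
            uint32_t crc;
            /* Only one thread at a time may execute this block, which
             * masks the suspected multi-core synchronization problem. */
    #pragma omp critical
            {
                crc = unsafe_checksum (i);
            }
            checksum += crc;
        }

        printf ("%08X\n", checksum);
        return 0;
    }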
benchmark: Running the cairo-perf-trace benchmarks on Loongson-3A
(times before -> after this patch, image and image16 backends):

                            image                image16
  gvim                      5.425 ->   5.069      5.531 ->   5.236
  poppler-reseau            2.149 ->   2.130      2.152 ->   2.139
  swfdec-giant-steps-full  18.672 ->   8.215     33.167 ->  18.280
  swfdec-giant-steps        7.014 ->   2.455     12.480 ->   5.982
  xfce4-terminal-a1        13.695 ->   5.241     15.703 ->   5.859
  gnome-system-monitor     12.783 ->   7.058     12.780 ->   7.104
  grads-heat-map            0.482 ->   0.486      0.516 ->   0.514
  firefox-talos-svg       141.138 -> 134.621    152.495 -> 159.069
  firefox-talos-gfx        23.119 ->  14.437     24.870 ->  15.161
  firefox-world-map        32.018 ->  27.139     33.817 ->  28.085
  firefox-periodic-table   12.305 ->  12.443     12.876 ->  12.913
  evolution                 7.071 ->   3.564      8.550 ->   3.784
  firefox-planet-gnome     77.926 ->  67.526     81.554 ->  65.840
  ocitysmap                 4.934 ->   1.702      4.937 ->   1.701
---
 configure.ac                    |    7 +-
 pixman/Makefile.am              |    4 +-
 pixman/loongson-mmintrin.h      |   46 ++
 pixman/pixman-combine32.h       |    6 +
 pixman/pixman-mips-dspr2-asm.h  |    2 +-
 pixman/pixman-mips-memcpy-asm.S |  324 +++++------
 pixman/pixman-mmx.c             | 1088 ++++++++++++++++++++++++++++++++++++++-
 pixman/pixman-private.h         |   32 +-
 pixman/pixman-solid-fill.c      |   49 +-
 pixman/pixman-utils.c           |   65 ++-
 test/Makefile.am                |    2 +-
 test/utils.c                    |    8 +
 12 files changed, 1418 insertions(+), 215 deletions(-)

diff --git a/configure.ac b/configure.ac
index e833e45..3e3dde5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,9 +154,9 @@ AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
 # has set CFLAGS.
 if test $SUNCC = yes && \
    test "x$test_CFLAGS" = "x" && \
-   test "$CFLAGS" = "-g"
+   test "$CFLAGS" = "-g -mabi=n64"
 then
-  CFLAGS="-O -g"
+  CFLAGS="-O -g -mabi=n64"
 fi

 #
@@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
 # Check for dependencies

 PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-mabi=n64])
 PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
 PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
 PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
@@ -273,7 +274,7 @@ dnl ===========================================================================
 dnl Check for Loongson Multimedia Instructions

 if test "x$LS_CFLAGS" = "x" ; then
-    LS_CFLAGS="-march=loongson2f"
+    LS_CFLAGS="-march=loongson3a"
 fi

 have_loongson_mmi=no

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..e3a080c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -122,7 +122,7 @@ libpixman_mips_dspr2_la_SOURCES = \
         pixman-mips-dspr2.h \
         pixman-mips-dspr2-asm.S \
         pixman-mips-dspr2-asm.h \
-        pixman-mips-memcpy-asm.S
+        #pixman-mips-memcpy-asm.S
 libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
 ASM_CFLAGS_mips_dspr2=
@@ -131,7 +131,7 @@ endif
 # loongson code
 if USE_LOONGSON_MMI
 noinst_LTLIBRARIES += libpixman-loongson-mmi.la
-libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h pixman-mips-memcpy-asm.S
 libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
 libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
 libpixman_1_la_LIBADD += libpixman-loongson-mmi.la

diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..f049463 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -89,6 +89,17 @@ _mm_adds_pu8 (__m64 __m1, __m64 __m2)
 }

 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andn_si64 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pandn %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_and_si64 (__m64 __m1, __m64 __m2)
 {
     __m64 ret;
@@ -100,6 +111,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 }

 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pcmpeqh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 {
     __m64 ret;
@@ -110,6 +132,30 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
     return ret;
 }

+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_fand (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("fand %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 ret;
+    asm("pcmpgth %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+    );
+    return ret;
+}
+
+
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_empty (void)
 {

diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..27f62d9 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -14,6 +14,12 @@
 #define RB_ONE_HALF 0x800080
 #define RB_MASK_PLUS_ONE 0x10000100

+#define RGB_MASK  0xffffff
+#define RGB_DMASK 0xffffffffffffULL
+#define R_DMASK   0x0000ffff00000000ULL
+#define G_DMASK   0x00000000ffff0000ULL
+#define B_DMASK   0x000000000000ffffULL
+
 #define ALPHA_8(x) ((x) >> A_SHIFT)
 #define RED_8(x) (((x) >> R_SHIFT) & MASK)
 #define GREEN_8(x) (((x) >> G_SHIFT) & MASK)

diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index e238566..63d7d96 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -77,7 +77,7 @@
         .ent    symbol, 0;       \
 symbol: .frame  sp, 0, ra;       \
         .set    push;            \
-        .set    arch=mips32r2;   \
+        .set    arch=mips64r2;   \
         .set    noreorder;       \
         .set    noat;

diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S
index 9ad6da5..a140191 100644
--- a/pixman/pixman-mips-memcpy-asm.S
+++ b/pixman/pixman-mips-memcpy-asm.S
@@ -54,19 +54,20 @@ LEAF_MIPS32R2(pixman_mips_fast_memcpy)
 /* Test if the src and dst are word-aligned, or can be made word-aligned */
     xor    t8, a1, a0
-    andi   t8, t8, 0x3    /* t8 is a0/a1 word-displacement */
+    andi   t8, t8, 0x7    /* t8 is a0/a1 word-displacement */
     bne    t8, zero, $unaligned
     negu   a3, a0
-    andi   a3, a3, 0x3    /* we need to copy a3 bytes to make a0/a1 aligned */
+    andi   a3, a3, 0x7    /* we need to copy a3 bytes to make a0/a1 aligned */
     beq    a3, zero, $chk16w    /* when a3=0 then the dst (a0) is word-aligned */
     subu   a2, a2, a3    /* now a2 is the remining bytes count */

-    LWHI   t8, 0(a1)
-    addu   a1, a1, a3
-    SWHI   t8, 0(a0)
-    addu   a0, a0, a3
+    ld     t8, 0(a1)
+    daddu  a1, a1, a3
+    sdl    t8, 7(a0)
+    sdr    t8, 0(a0)
+    daddu  a0, a0, a3

 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
 $chk16w:    andi   t8, a2, 0x3f    /* any whole 64-byte chunks? */
                            /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8      /* subtract from a2 the reminder */
                            /* Here a3 counts bytes in 16w chunks */
-    addu   a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
+    daddu  a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */

-    addu   t0, a0, a2      /* t0 is the "past the end" address */
+    daddu  t0, a0, a2      /* t0 is the "past the end" address */

 /*
  * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
@@ -89,119 +90,98 @@ $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
  */
     subu   t9, t0, 160     /* t9 is the "last safe pref 30, 128(a0)" address */

-    pref   0, 0(a1)        /* bring the first line of src, addr 0 */
-    pref   0, 32(a1)       /* bring the second line of src, addr 32 */
-    pref   0, 64(a1)       /* bring the third line of src, addr 64 */
-    pref   30, 32(a0)      /* safe, as we have at least 64 bytes ahead */
+    lw     $0, 0(a1)       /* bring the first line of src, addr 0 */
+    lw     $0, 32(a1)      /* bring the second line of src, addr 32 */
+    lw     $0, 64(a1)      /* bring the third line of src, addr 64 */
+    lw     $0, 32(a0)      /* safe, as we have at least 64 bytes ahead */
 /* In case the a0 > t9 don't use "pref 30" at all */
     sgtu   v1, a0, t9
     bgtz   v1, $loop16w    /* skip "pref 30, 64(a0)" for too short arrays */
     nop
 /* otherwise, start with using pref30 */
-    pref   30, 64(a0)
+    lw     $0, 64(a0)
 $loop16w:
-    pref   0, 96(a1)
-    lw     t0, 0(a1)
+    lw     $0, 96(a1)
+    ld     t0, 0(a1)
     bgtz   v1, $skip_pref30_96    /* skip "pref 30, 96(a0)" */
-    lw     t1, 4(a1)
-    pref   30, 96(a0)      /* continue setting up the dest, addr 96 */
+    lw     $0, 96(a0)      /* continue setting up the dest, addr 96 */
 $skip_pref30_96:
-    lw     t2, 8(a1)
-    lw     t3, 12(a1)
-    lw     t4, 16(a1)
-    lw     t5, 20(a1)
-    lw     t6, 24(a1)
-    lw     t7, 28(a1)
-    pref   0, 128(a1)      /* bring the next lines of src, addr 128 */
-
-    sw     t0, 0(a0)
-    sw     t1, 4(a0)
-    sw     t2, 8(a0)
-    sw     t3, 12(a0)
-    sw     t4, 16(a0)
-    sw     t5, 20(a0)
-    sw     t6, 24(a0)
-    sw     t7, 28(a0)
-
-    lw     t0, 32(a1)
+    ld     t2, 8(a1)
+    ld     t4, 16(a1)
+    ld     t6, 24(a1)
+    lw     $0, 128(a1)     /* bring the next lines of src, addr 128 */
+    lw     $0, 0x0(a0)
+
+    sd     t0, 0(a0)
+    sd     t2, 8(a0)
+    sd     t4, 16(a0)
+    sd     t6, 24(a0)
+
+    ld     t0, 32(a1)
     bgtz   v1, $skip_pref30_128    /* skip "pref 30, 128(a0)" */
-    lw     t1, 36(a1)
-    pref   30, 128(a0)     /* continue setting up the dest, addr 128 */
+    lw     $0, 128(a0)     /* continue setting up the dest, addr 128 */
 $skip_pref30_128:
-    lw     t2, 40(a1)
-    lw     t3, 44(a1)
-    lw     t4, 48(a1)
-    lw     t5, 52(a1)
-    lw     t6, 56(a1)
-    lw     t7, 60(a1)
-    pref   0, 160(a1)      /* bring the next lines of src, addr 160 */
-
-    sw     t0, 32(a0)
-    sw     t1, 36(a0)
-    sw     t2, 40(a0)
-    sw     t3, 44(a0)
-    sw     t4, 48(a0)
-    sw     t5, 52(a0)
-    sw     t6, 56(a0)
-    sw     t7, 60(a0)
-
-    addiu  a0, a0, 64      /* adding 64 to dest */
+    ld     t2, 40(a1)
+    ld     t4, 48(a1)
+    ld     t6, 56(a1)
+    lw     $0, 160(a1)     /* bring the next lines of src, addr 160 */
+    lw     $0, 0x32(a0)
+
+    sd     t0, 32(a0)
+    sd     t2, 40(a0)
+    sd     t4, 48(a0)
+    sd     t6, 56(a0)
+
+    daddiu a0, a0, 64      /* adding 64 to dest */
     sgtu   v1, a0, t9
     bne    a0, a3, $loop16w
-    addiu  a1, a1, 64      /* adding 64 to src */
+    daddiu a1, a1, 64      /* adding 64 to src */
     move   a2, t8

 /* Here we have src and dest word-aligned but less than 64-bytes to go */
 $chk8w:
-    pref   0, 0x0(a1)
+    lw     $0, 0x0(a1)
     andi   t8, a2, 0x1f    /* is there a 32-byte chunk? */
                           /* the t8 is the reminder count past 32-bytes */
     beq    a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */
     nop

-    lw     t0, 0(a1)
-    lw     t1, 4(a1)
-    lw     t2, 8(a1)
-    lw     t3, 12(a1)
-    lw     t4, 16(a1)
-    lw     t5, 20(a1)
-    lw     t6, 24(a1)
-    lw     t7, 28(a1)
-    addiu  a1, a1, 32
-
-    sw     t0, 0(a0)
-    sw     t1, 4(a0)
-    sw     t2, 8(a0)
-    sw     t3, 12(a0)
-    sw     t4, 16(a0)
-    sw     t5, 20(a0)
-    sw     t6, 24(a0)
-    sw     t7, 28(a0)
-    addiu  a0, a0, 32
+    ld     t0, 0(a1)
+    ld     t2, 8(a1)
+    ld     t4, 16(a1)
+    ld     t6, 24(a1)
+    lw     $0, 0x0(a0)
+    daddiu a1, a1, 32
+
+    sd     t0, 0(a0)
+    sd     t2, 8(a0)
+    sd     t4, 16(a0)
+    sd     t6, 24(a0)
+    daddiu a0, a0, 32

 $chk1w:
     andi   a2, t8, 0x3    /* now a2 is the reminder past 1w chunks */
     beq    a2, t8, $last8
     subu   a3, t8, a2     /* a3 is count of bytes in 1w chunks */
-    addu   a3, a0, a3     /* now a3 is the dst address past the 1w chunks */
+    daddu  a3, a0, a3     /* now a3 is the dst address past the 1w chunks */

 /* copying in words (4-byte chunks) */
 $wordCopy_loop:
     lw     t3, 0(a1)      /* the first t3 may be equal t0 ... optimize? */
-    addiu  a1, a1, 4
-    addiu  a0, a0, 4
+    daddiu a1, a1, 4
+    daddiu a0, a0, 4
     bne    a0, a3, $wordCopy_loop
     sw     t3, -4(a0)

 /* For the last (<8) bytes */
 $last8:
     blez   a2, leave
-    addu   a3, a0, a2     /* a3 is the last dst address */
+    daddu  a3, a0, a2     /* a3 is the last dst address */
 $last8loop:
     lb     v1, 0(a1)
-    addiu  a1, a1, 1
-    addiu  a0, a0, 1
+    daddiu a1, a1, 1
+    daddiu a0, a0, 1
     bne    a0, a3, $last8loop
     sb     v1, -1(a0)

@@ -214,15 +194,16 @@ leave:  j  ra

 $unaligned:
     /* got here with a3="negu a0" */
-    andi   a3, a3, 0x3    /* test if the a0 is word aligned */
+    andi   a3, a3, 0x7    /* test if the a0 is word aligned */
     beqz   a3, $ua_chk16w
     subu   a2, a2, a3     /* bytes left after initial a3 bytes */

-    LWHI   v1, 0(a1)
-    LWLO   v1, 3(a1)
-    addu   a1, a1, a3     /* a3 may be here 1, 2 or 3 */
-    SWHI   v1, 0(a0)
-    addu   a0, a0, a3     /* below the dst will be word aligned (NOTE1) */
+    ldl    v1, 7(a1)
+    ldr    v1, 0(a1)
+    daddu  a1, a1, a3     /* a3 may be here 1, 2 or 3 */
+    sdl    v1, 7(a0)
+    sdr    v1, 0(a0)
+    daddu  a0, a0, a3     /* below the dst will be word aligned (NOTE1) */

 $ua_chk16w:    andi   t8, a2, 0x3f    /* any whole 64-byte chunks? */
                           /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8     /* subtract from a2 the reminder */
                           /* Here a3 counts bytes in 16w chunks */
-    addu   a3, a0, a3     /* Now a3 is the final dst after 64-byte chunks */
+    daddu  a3, a0, a3     /* Now a3 is the final dst after 64-byte chunks */

-    addu   t0, a0, a2     /* t0 is the "past the end" address */
+    daddu  t0, a0, a2     /* t0 is the "past the end" address */

     subu   t9, t0, 160    /* t9 is the "last safe pref 30, 128(a0)" address */

-    pref   0, 0(a1)       /* bring the first line of src, addr 0 */
-    pref   0, 32(a1)      /* bring the second line of src, addr 32 */
-    pref   0, 64(a1)      /* bring the third line of src, addr 64 */
-    pref   30, 32(a0)     /* safe, as we have at least 64 bytes ahead */
+    lw     $0, 0(a1)      /* bring the first line of src, addr 0 */
+    lw     $0, 32(a1)     /* bring the second line of src, addr 32 */
+    lw     $0, 64(a1)     /* bring the third line of src, addr 64 */
+    lw     $0, 32(a0)     /* safe, as we have at least 64 bytes ahead */
 /* In case the a0 > t9 don't use "pref 30" at all */
     sgtu   v1, a0, t9
     bgtz   v1, $ua_loop16w    /* skip "pref 30, 64(a0)" for too short arrays */
     nop
 /* otherwise, start with using pref30 */
-    pref   30, 64(a0)
+    lw     $0, 64(a0)
 $ua_loop16w:
-    pref   0, 96(a1)
-    LWHI   t0, 0(a1)
-    LWLO   t0, 3(a1)
-    LWHI   t1, 4(a1)
+    lw     $0, 96(a1)
+    ldl    t0, 7(a1)
+    ldr    t0, 0(a1)
     bgtz   v1, $ua_skip_pref30_96
-    LWLO   t1, 7(a1)
-    pref   30, 96(a0)     /* continue setting up the dest, addr 96 */
+    lw     $0, 96(a0)     /* continue setting up the dest, addr 96 */
 $ua_skip_pref30_96:
-    LWHI   t2, 8(a1)
-    LWLO   t2, 11(a1)
-    LWHI   t3, 12(a1)
-    LWLO   t3, 15(a1)
-    LWHI   t4, 16(a1)
-    LWLO   t4, 19(a1)
-    LWHI   t5, 20(a1)
-    LWLO   t5, 23(a1)
-    LWHI   t6, 24(a1)
-    LWLO   t6, 27(a1)
-    LWHI   t7, 28(a1)
-    LWLO   t7, 31(a1)
-    pref   0, 128(a1)     /* bring the next lines of src, addr 128 */
-
-    sw     t0, 0(a0)
-    sw     t1, 4(a0)
-    sw     t2, 8(a0)
-    sw     t3, 12(a0)
-    sw     t4, 16(a0)
-    sw     t5, 20(a0)
-    sw     t6, 24(a0)
-    sw     t7, 28(a0)
-
-    LWHI   t0, 32(a1)
-    LWLO   t0, 35(a1)
-    LWHI   t1, 36(a1)
+    ldl    t2, 15(a1)
+    ldr    t2, 8(a1)
+    ldl    t4, 23(a1)
+    ldr    t4, 16(a1)
+    ldl    t6, 31(a1)
+    ldr    t6, 24(a1)
+    lw     $0, 128(a1)    /* bring the next lines of src, addr 128 */
+    lw     $0, 0(a0)
+
+    sd     t0, 0(a0)
+    sd     t2, 8(a0)
+    sd     t4, 16(a0)
+    sd     t6, 24(a0)
+
+    ldl    t0, 39(a1)
+    ldr    t0, 32(a1)
     bgtz   v1, $ua_skip_pref30_128
-    LWLO   t1, 39(a1)
-    pref   30, 128(a0)    /* continue setting up the dest, addr 128 */
+    lw     $0, 128(a0)    /* continue setting up the dest, addr 128 */
 $ua_skip_pref30_128:
-    LWHI   t2, 40(a1)
-    LWLO   t2, 43(a1)
-    LWHI   t3, 44(a1)
-    LWLO   t3, 47(a1)
-    LWHI   t4, 48(a1)
-    LWLO   t4, 51(a1)
-    LWHI   t5, 52(a1)
-    LWLO   t5, 55(a1)
-    LWHI   t6, 56(a1)
-    LWLO   t6, 59(a1)
-    LWHI   t7, 60(a1)
-    LWLO   t7, 63(a1)
-    pref   0, 160(a1)     /* bring the next lines of src, addr 160 */
-
-    sw     t0, 32(a0)
-    sw     t1, 36(a0)
-    sw     t2, 40(a0)
-    sw     t3, 44(a0)
-    sw     t4, 48(a0)
-    sw     t5, 52(a0)
-    sw     t6, 56(a0)
-    sw     t7, 60(a0)
-
-    addiu  a0, a0, 64     /* adding 64 to dest */
+    ldl    t2, 47(a1)
+    ldr    t2, 40(a1)
+    ldl    t4, 55(a1)
+    ldr    t4, 48(a1)
+    ldl    t6, 63(a1)
+    ldr    t6, 56(a1)
+    lw     $0, 32(a0)
+    lw     $0, 160(a1)    /* bring the next lines of src, addr 160 */
+
+    sd     t0, 32(a0)
+    sd     t2, 40(a0)
+    sd     t4, 48(a0)
+    sd     t6, 56(a0)
+
+    daddiu a0, a0, 64     /* adding 64 to dest */
     sgtu   v1, a0, t9
     bne    a0, a3, $ua_loop16w
-    addiu  a1, a1, 64     /* adding 64 to src */
+    daddiu a1, a1, 64     /* adding 64 to src */
     move   a2, t8

 /* Here we have src and dest word-aligned but less than 64-bytes to go */
 $ua_chk8w:
-    pref   0, 0x0(a1)
+    lw     $0, 0x0(a1)
     andi   t8, a2, 0x1f   /* is there a 32-byte chunk? */
                          /* the t8 is the reminder count */
     beq    a2, t8, $ua_chk1w    /* when a2=t8, no 32-byte chunk */

-    LWHI   t0, 0(a1)
-    LWLO   t0, 3(a1)
-    LWHI   t1, 4(a1)
-    LWLO   t1, 7(a1)
-    LWHI   t2, 8(a1)
-    LWLO   t2, 11(a1)
-    LWHI   t3, 12(a1)
-    LWLO   t3, 15(a1)
-    LWHI   t4, 16(a1)
-    LWLO   t4, 19(a1)
-    LWHI   t5, 20(a1)
-    LWLO   t5, 23(a1)
-    LWHI   t6, 24(a1)
-    LWLO   t6, 27(a1)
-    LWHI   t7, 28(a1)
-    LWLO   t7, 31(a1)
-    addiu  a1, a1, 32
-
-    sw     t0, 0(a0)
-    sw     t1, 4(a0)
-    sw     t2, 8(a0)
-    sw     t3, 12(a0)
-    sw     t4, 16(a0)
-    sw     t5, 20(a0)
-    sw     t6, 24(a0)
-    sw     t7, 28(a0)
-    addiu  a0, a0, 32
+    ldl    t0, 7(a1)
+    ldr    t0, 0(a1)
+    ldl    t2, 15(a1)
+    ldr    t2, 8(a1)
+    ldl    t4, 23(a1)
+    ldr    t4, 16(a1)
+    ldl    t6, 31(a1)
+    ldr    t6, 24(a1)
+    lw     $0, 0x0(a0)
+    daddiu a1, a1, 32
+
+    sd     t0, 0(a0)
+    sd     t2, 8(a0)
+    sd     t4, 16(a0)
+    sd     t6, 24(a0)
+    daddiu a0, a0, 32

 $ua_chk1w:
     andi   a2, t8, 0x3    /* now a2 is the reminder past 1w chunks */
     beq    a2, t8, $ua_smallCopy
     subu   a3, t8, a2     /* a3 is count of bytes in 1w chunks */
-    addu   a3, a0, a3     /* now a3 is the dst address past the 1w chunks */
+    daddu  a3, a0, a3     /* now a3 is the dst address past the 1w chunks */

 /* copying in words (4-byte chunks) */
 $ua_wordCopy_loop:
     LWHI   v1, 0(a1)
     LWLO   v1, 3(a1)
-    addiu  a1, a1, 4
-    addiu  a0, a0, 4      /* note: dst=a0 is word aligned here, see NOTE1 */
+    daddiu a1, a1, 4
+    daddiu a0, a0, 4      /* note: dst=a0 is word aligned here, see NOTE1 */
     bne    a0, a3, $ua_wordCopy_loop
     sw     v1, -4(a0)

 /* Now less than 4 bytes (value in a2) left to copy */
 $ua_smallCopy:
     beqz   a2, leave
-    addu   a3, a0, a2     /* a3 is the last dst address */
+    daddu  a3, a0, a2     /* a3 is the last dst address */
 $ua_smallCopy_loop:
     lb     v1, 0(a1)
-    addiu  a1, a1, 1
-    addiu  a0, a0, 1
+    daddiu a1, a1, 1
+    daddiu a0, a0, 1
     bne    a0, a3, $ua_smallCopy_loop
     sb     v1, -1(a0)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index dec3974..edbf16b 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -59,6 +59,71 @@ _mm_empty (void)
 }
 #endif

+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN 2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN 8
+
+#define COMBINE_CLEAR 0
+#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* no SIMD instructions for div, so leave it alone
+ * portion covered by a but not b
+ * min (1, (1-b) / a)
+ */
+static uint8_t
+combine_disjoint_out_part (uint8_t a, uint8_t b)
+{
+    b = ~b;
+    if (b >= a)
+        return MASK;
+    return DIV_UN8 (b, a);
+}
+
+/* portion covered by both a and b
+ * max (1-(1-b)/a, 0)
+ */
+static uint8_t
+combine_disjoint_in_part (uint8_t a, uint8_t b)
+{
+    b = ~b;
+    if (b >= a)
+        return 0;
+    return ~DIV_UN8 (b, a);
+}
+
+/* portion covered by a but not b
+ * max (1-b/a, 0)
+ */
+static uint8_t
+combine_conjoint_out_part (uint8_t a, uint8_t b)
+{
+    if (b >= a)
+        return 0x00;
+    return ~DIV_UN8 (b, a);
+}
+
+/* portion covered by both a and b
+ * min (1, b/a)
+ */
+static uint8_t
+combine_conjoint_in_part (uint8_t a, uint8_t b)
+{
+    if (b >= a)
+        return MASK;
+    return DIV_UN8 (b, a);
+}
+
 #ifdef USE_X86_MMX
 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
 # include <xmmintrin.h>
@@ -78,7 +143,8 @@ _mm_movemask_pi8 (__m64 __A)
     return ret;
 }

-
+#define __OPTIMIZE__
+#ifdef __OPTIMIZE__
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
 {
@@ -88,7 +154,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     );
     return __A;
 }
-
+#else
 # define _mm_shuffle_pi16(A, N)    \
     ({                             \
         __m64 ret;                 \
@@ -102,7 +168,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     })
 # endif
 #endif
-
+#endif
 #ifndef _MSC_VER
 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
@@ -710,6 +776,34 @@ combine (const uint32_t *src, const uint32_t *mask)
     return vsrc;
 }

+static force_inline void
+mmx_combine_mask_ca (const uint32_t *src, const uint32_t *mask, __m64 *s64, __m64 *m64)
+{
+    __m64 res, tmp;
+
+    if (!(*mask))
+    {
+        *s64 = 0;
+        *m64 = 0;
+        return;
+    }
+
+    *s64 = load8888 (src);
+
+    if (*mask == ~0)
+    {
+        *m64 = expand_alpha (*s64);
+        return;
+    }
+
+    *m64 = load8888 (mask);
+
+    res = pix_multiply (*s64, *m64);
+    tmp = expand_alpha (*s64);
+    *s64 = res;
+    *m64 = pix_multiply (*m64, tmp);
+}
+
 static force_inline __m64
 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 {
@@ -729,6 +823,39 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 }

 static void
+mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    uint32_t *end = dest + width;
+    uint32_t s32;
+    uint64_t sa64;
+    __m64 s64, d64;
+
+    while (dest < end)
+    {
+        s64 = combine (src, mask);
+
+        if (s64)
+        {
+            store8888 (&s32, s64);
+            sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> A_SHIFT);
+            d64 = pix_add (pix_multiply (load8888 (dest), expand_alpha_rev (*(__m64 *)&sa64)), s64);
+            store8888 (dest, d64);
+        }
+
+        ++dest;
+        ++src;
+        if (mask)
+            ++mask;
+    }
+}
+
+static void
 mmx_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t *dest,
@@ -1062,7 +1189,294 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
     }
     _mm_empty ();
 }
+/* In functions such as combine_conjoint_general_u there are multiple
+ * branches, selected by the parameter 'combine'.  This value does not change
+ * while the function runs, so it is not necessary to test it for every pixel
+ * as the original code does: it can be tested once at the function entrance,
+ * setting a function pointer that is called directly later.
+ */
+#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res) \
+    static type inline combine_joint_ ##zm## _ ##suffix (type sa, type da, type io_flag) \
+    { \
+        return res; \
+    }
+
+/* 'conjoint' has the same code structure as 'disjoint'; only the function
+ * names differ, so this macro generates the corresponding function.  The
+ * order of the parameters is determined by 'io_flag': '0' for 'in_part'
+ * and '1' for 'out_part'.
+ */
+#define DEF_FUNC_COMBINE_JOINT_U(cd, io) \
+    static uint8_t inline combine_ ##cd## joint_ ##io## _part_u (uint8_t sa, uint8_t da, uint8_t io_flag) \
+    { \
+        uint8_t parm[2]; \
+        parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0); \
+        parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1); \
+        return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]); \
+    }
+
+/* Sets up the array of function pointers; the correct handler is stored at
+ * the function entrance. */
+#define DEF_COMB_FUNC_ARR(cd, SUFFIX, suffix) \
+    COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] = { \
+        combine_joint_zero_ ##suffix, \
+        combine_ ##cd## joint_out_part_ ##suffix, \
+        combine_ ##cd## joint_in_part_ ##suffix, \
+        combine_joint_mask_ ##suffix \
+    };
+
+typedef uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint8_t, zero, u, 0x0)
+DEF_FUNC_ZERO_MASK(uint8_t, mask, u, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_U(dis, in);
+DEF_FUNC_COMBINE_JOINT_U(dis, out);
+DEF_COMB_FUNC_ARR(dis, U, u)
+
+DEF_FUNC_COMBINE_JOINT_U(con, in);
+DEF_FUNC_COMBINE_JOINT_U(con, out);
+DEF_COMB_FUNC_ARR(con, U, u)
+
+/* One underlying function through which all the 'conjoint' and 'disjoint'
+ * variants can be called. */
+static void
+mmx_combine_joint_general_u (uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width,
+                             uint8_t comb,
+                             COMBINE_JOINT_FUNC_U *cjf)
+{
+    COMBINE_JOINT_FUNC_U combine_joint_u[2];
+    combine_joint_u[0] = cjf[comb & COMBINE_A];        /* in_part  */
+    combine_joint_u[1] = cjf[(comb & COMBINE_B) >> 2]; /* out_part */
+
+    uint32_t *end = dest + width;
+    while (dest < end)
+    {
+        __m64 s64 = combine (src, mask);
+        __m64 d64, sa64, da64;
+        uint8_t sa, da;
+        uint32_t tmp;
+        uint64_t Fa, Fb;
+
+        /* Because these functions contain division instructions,
+         * multimedia instructions are not used to optimize them.
+         */
+        store8888 (&tmp, s64);
+        sa = tmp >> A_SHIFT;
+        da = *dest >> A_SHIFT;
+
+        Fa = combine_joint_u[0] (sa, da, 0);
+        Fb = combine_joint_u[1] (sa, da, 1);
+
+        d64 = load8888 (dest);
+        sa64 = expand_alpha_rev (*(__m64 *)&Fa);
+        da64 = expand_alpha_rev (*(__m64 *)&Fb);
+
+        d64 = pix_add_mul (s64, sa64, d64, da64);
+
+        store8888 (dest, d64);
+
+        ++dest;
+        ++src;
+        if (mask)
+            ++mask;
+    }
+}
+
+
+static void
+mmx_combine_disjoint_general_u (uint32_t *dest,
+                                const uint32_t *src,
+                                const uint32_t *mask,
+                                int width,
+                                uint8_t comb)
+{
+    mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_disjoint_u);
+}
+
+static void
+mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
+                           pixman_op_t op,
+                           uint32_t *dest,
+                           const uint32_t *src,
+                           const uint32_t *mask,
+                           int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+                                   pixman_op_t op,
+                                   uint32_t *dest,
+                                   const uint32_t *src,
+                                   const uint32_t *mask,
+                                   int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
+                            pixman_op_t op,
+                            uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+                                    pixman_op_t op,
+                                    uint32_t *dest,
+                                    const uint32_t *src,
+                                    const uint32_t *mask,
+                                    int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                     pixman_op_t op,
+                                     uint32_t *dest,
+                                     const uint32_t *src,
+                                     const uint32_t *mask,
+                                     int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
+                            pixman_op_t op,
+                            uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/* Conjoint */
+static void
+mmx_combine_conjoint_general_u (uint32_t *dest,
+                                const uint32_t *src,
+                                const uint32_t *mask,
+                                int width,
+                                uint8_t comb)
+{
+    mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_conjoint_u);
+}
+
+static void
+mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+                                     pixman_op_t op,
+                                     uint32_t *dest,
+                                     const uint32_t *src,
+                                     const uint32_t *mask,
+                                     int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
+                           pixman_op_t op,
+                           uint32_t *dest,
+                           const uint32_t *src,
+                           const uint32_t *mask,
+                           int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+                                   pixman_op_t op,
+                                   uint32_t *dest,
+                                   const uint32_t *src,
+                                   const uint32_t *mask,
+                                   int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
+                            pixman_op_t op,
+                            uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+                                    pixman_op_t op,
+                                    uint32_t *dest,
+                                    const uint32_t *src,
+                                    const uint32_t *mask,
+                                    int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                     pixman_op_t op,
+                                     uint32_t *dest,
+                                     const uint32_t *src,
+                                     const uint32_t *mask,
+                                     int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
+                            pixman_op_t op,
+                            uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/* Component alpha combiners */
 static void
 mmx_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t *dest,
@@ -1089,6 +1503,410 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
 }

 static void
+mmx_combine_saturate_ca (pixman_implementation_t *imp,
+                         pixman_op_t op,
+                         uint32_t *dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int width)
+{
+    uint32_t *end = dest + width;
+    while (dest < end)
+    {
+        uint16_t sa, sr, sg, sb;
+        uint32_t sa32, m32;
+        __m64 m64, s64, d64, sa64, da64, cmpf, res;
+
+        mmx_combine_mask_ca (src, mask, &s64, &m64);
+
+        d64 = load8888 (dest);
+        da64 = expand_alpha (negate (d64));
+        cmpf = _mm_cmpgt_pi16 (m64, da64);
+        if (cmpf)
+        {
+            store8888 (&m32, m64);
+            sa = (m32 >> (A_SHIFT));
+            sr = (m32 >> (R_SHIFT)) & MASK;
+            sg = (m32 >> (G_SHIFT)) & MASK;
+            sb = m32 & MASK;
+            sa32 = (~(*dest) >> A_SHIFT) & MASK;
+
+            sa = (sa) ? sa : 0x1;
+            sr = (sr) ? sr : 0x1;
+            sg = (sg) ? sg : 0x1;
+            sb = (sb) ? sb : 0x1;
+
+            sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
+                   ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
+                   ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
+                   ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
+            sa64 = load8888 (&sa32);
+            da64 = MC (4x00ff);
+            res = pix_multiply (s64, sa64);
+            s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (s64, negate (cmpf)));
+            res = pix_multiply (d64, da64);
+            d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (d64, negate (cmpf)));
+        }
+        res = _mm_adds_pu8 (s64, d64);
+        store8888 (dest, res);
+
+        ++dest;
+        ++src;
+        if (mask)
+            ++mask;
+    }
+}
+
+#define DEF_FUNC_COMBINE_JOINT_CA(cd, io) \
+    static uint32_t inline combine_ ##cd## joint_ ##io## _part_ca (uint32_t sa, uint32_t da, uint32_t io_flag) \
+    { \
+        uint8_t da8 = da >> A_SHIFT; \
+        uint32_t m, n, o, p, res; \
+        uint8_t i, parm[2][4], shift = 0; \
+        for (i = 0; i < 4; i++) \
+        { \
+            parm[0][i] = (uint8_t)(sa >> shift) * (io_flag ^ 0x1) + da8 * (io_flag ^ 0x0); \
+            parm[1][i] = (uint8_t)(sa >> shift) * (io_flag ^ 0x0) + da8 * (io_flag ^ 0x1); \
+            shift += G_SHIFT; \
+        } \
+        m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], parm[1][0]); \
+        n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], parm[1][1]) << G_SHIFT; \
+        o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], parm[1][2]) << R_SHIFT; \
+        p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], parm[1][3]) << A_SHIFT; \
+        res = m | n | o | p; \
+        return res; \
+    }
+
+typedef uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da, uint32_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
+DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_CA(dis, in);
+DEF_FUNC_COMBINE_JOINT_CA(dis, out);
+DEF_COMB_FUNC_ARR(dis, CA, ca)
+
+DEF_FUNC_COMBINE_JOINT_CA(con, in);
+DEF_FUNC_COMBINE_JOINT_CA(con, out);
+DEF_COMB_FUNC_ARR(con, CA, ca)
+
+static void
+mmx_combine_joint_general_ca (uint32_t *dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int width,
+                              uint8_t comb,
+                              COMBINE_JOINT_FUNC_CA *cjf)
+{
+    COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
+    combine_joint_ca[0] = cjf[comb & COMBINE_A];
+    combine_joint_ca[1] = cjf[(comb & COMBINE_B) >> 2];
+
+    uint32_t *end = dest + width;
+    while (dest < end)
+    {
+        __m64 m64, s64, sa64, da64, d64;
+        uint32_t m32, Fa, Fb;
+
+        mmx_combine_mask_ca (src, mask, &s64, &m64);
+        store8888 (&m32, m64);
+
+        Fa = combine_joint_ca[0] (m32, *dest, 0);
+        Fb = combine_joint_ca[1] (m32, *dest, 1);
+
+        sa64 = load8888 (&Fa);
+        da64 = load8888 (&Fb);
+
+        d64 = load8888 (dest);
+        d64 = pix_add_mul (s64, sa64, d64, da64);
+
+        store8888 (dest, d64);
+
+        ++dest;
+        ++src;
+        if (mask)
+            ++mask;
+    }
+}
+
+static void
+mmx_combine_disjoint_general_ca (uint32_t *dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int width,
+                                 uint8_t comb)
+{
+    mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_disjoint_ca);
+}
+
+static void
+mmx_combine_disjoint_over_ca (pixman_implementation_t *imp,
+                              pixman_op_t op,
+                              uint32_t *dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_disjoint_in_ca (pixman_implementation_t *imp,
+                            pixman_op_t op,
+                            uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                    pixman_op_t op,
+                                    uint32_t *dest,
+                                    const uint32_t *src,
+                                    const uint32_t *mask,
+                                    int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_ca (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                     pixman_op_t op,
+                                     uint32_t *dest,
+                                     const uint32_t *src,
+                                     const uint32_t *mask,
+                                     int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp,
+                              pixman_op_t op,
+                              uint32_t *dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                      pixman_op_t op,
+                                      uint32_t *dest,
+                                      const uint32_t *src,
+                                      const uint32_t *mask,
+                                      int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_ca (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+mmx_combine_conjoint_general_ca (uint32_t *dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int width,
+                                 uint8_t comb)
+{
+    mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_conjoint_ca);
+}
+
+/*
+ * Multiply
+ * B(Dca, ad, Sca, as) = Dca.Sca
+ */
+
+static void
+mmx_combine_multiply_u (pixman_implementation_t *imp,
+                        pixman_op_t op,
+                        uint32_t *dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+        __m64 dia, d, sia;
+        __m64 s = combine (src, mask);
+        __m64 ss = s;
+        d = load8888 (dest);
+        sia = negate (expand_alpha (s));
+        dia = negate (expand_alpha (d));
+        ss = pix_add_mul (ss, dia, d, sia);
+        d = pix_multiply (d, s);
+        d = pix_add (d, ss);
+        store8888 (dest, d);
+
+        ++dest;
+        ++src;
+        if (mask)
+            mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_multiply_ca (pixman_implementation_t *imp,
+                         pixman_op_t op,
+                         uint32_t *dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+        __m64 a = load8888 (mask);
+        __m64 s = load8888 (src);
+        __m64 d = load8888 (dest);
+        __m64 r = d;
+        __m64 da = negate (expand_alpha (d));
+        __m64 sa = expand_alpha (s);
+
+        s = pix_multiply (s, a);
+        a = pix_multiply (a, sa);
+        a = negate (a);
+        r = pix_add_mul (r, a, s, da);
+        d = pix_multiply (d, s);
+        r = pix_add (r, d);
+        store8888 (dest, r);
+
+        ++src;
+        ++dest;
+        ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_conjoint_over_ca (pixman_implementation_t *imp,
+                              pixman_op_t op,
+                              uint32_t *dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
+                                      pixman_op_t op,
+                                      uint32_t *dest,
+                                      const uint32_t *src,
+                                      const uint32_t *mask,
+                                      int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_ca (pixman_implementation_t *imp,
+                            pixman_op_t op,
+                            uint32_t *dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                    pixman_op_t op,
+                                    uint32_t *dest,
+                                    const uint32_t *src,
+                                    const uint32_t *mask,
+                                    int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_ca (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                     pixman_op_t op,
+                                     uint32_t *dest,
+                                     const uint32_t *src,
+                                     const uint32_t *mask,
+                                     int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_ca (pixman_implementation_t *imp,
+                              pixman_op_t op,
+                              uint32_t *dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                      pixman_op_t op,
+                                      uint32_t *dest,
+                                      const uint32_t *src,
+                                      const uint32_t *mask,
+                                      int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_ca (pixman_implementation_t *imp,
+                             pixman_op_t op,
+                             uint32_t *dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
 mmx_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t op,
                      uint32_t *dest,
@@ -2089,23 +2907,34 @@ mmx_fill (pixman_implementation_t *imp,
         stride = stride * (int) sizeof (uint32_t) / 1;
         byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
         byte_width = width;
-        stride *= 1;
+        /* non necessary??? */
+        /* stride *= 1; */
         filler = (filler & 0xff) * 0x01010101;
     }
     else if (bpp == 16)
     {
         stride = stride * (int) sizeof (uint32_t) / 2;
         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+#if 0
         byte_width = 2 * width;
         stride *= 2;
+#else
+        byte_width = width << 1;
+        stride <<= 1;
+#endif
         filler = (filler & 0xffff) * 0x00010001;
     }
     else
     {
         stride = stride * (int) sizeof (uint32_t) / 4;
         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+#if 0
         byte_width = 4 * width;
         stride *= 4;
+#else
+        byte_width = width << 2;
+        stride <<= 2;
+#endif
     }

     fill = ((uint64_t)filler << 32) | filler;
@@ -3274,9 +4103,15 @@ mmx_blt (pixman_implementation_t *imp,
         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
         src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+#if 0
         byte_width = 2 * width;
         src_stride *= 2;
         dst_stride *= 2;
+#else
+        byte_width = width << 1;
+        src_stride <<= 1;
+        dst_stride <<= 1;
+#endif
     }
     else if (src_bpp == 32)
     {
@@ -3284,9 +4119,15 @@ mmx_blt (pixman_implementation_t *imp,
         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+#if 0
         byte_width = 4 * width;
         src_stride *= 4;
         dst_stride *= 4;
+#else
+        byte_width = width << 2;
+        src_stride <<= 2;
+        dst_stride <<= 2;
+#endif
     }
     else
     {
@@ -4003,6 +4844,186 @@ static const pixman_iter_info_t mmx_iters[] =
     { PIXMAN_null },
 };

+#define MMX_PDF_SEPARABLE_BLEND_MODE(name) \
+static void \
+mmx_combine_ ## name ## _u (pixman_implementation_t *imp, \
+                            pixman_op_t op, \
+                            uint32_t *dest, \
+                            const uint32_t *src, \
+                            const uint32_t *mask, \
+                            int width) \
+{ \
+    int i; \
+    for (i = 0; i < width; ++i) { \
+        __m64 s = load8888 (src + i); \
+        __m64 d = load8888 (dest + i); \
+        __m64 da = expand_alpha (d); \
+ \
+        if (mask) \
+        { \
+            __m64 m = load8888 (mask + i); \
+            __m64 ma = expand_alpha (m); \
+            s = pix_multiply (s, ma); \
+        } \
+        __m64 sa = expand_alpha (s); \
+ \
+        __m64 isa = negate (sa); \
+        __m64 ida = negate (da); \
+ \
+        uint32_t result, sada, res; \
+        __m64 temp; \
+        store8888 (&result, pix_add_mul (d, isa, s, ida)); \
+        store8888 (&sada, pix_multiply (sa, da)); \
+        store8888 (&res, mmx_blend_ ## name (d, da, s, sa)); \
+ \
+        sada &= A_MASK; \
+        res &= RGB_MASK; \
+        temp = pix_add (pix_add (load8888 (&result), load8888 (&sada)), \
+                        load8888 (&res)); \
+        store8888 (dest + i, temp); \
+    } \
+} \
+static void \
+mmx_combine_ ## name ## _ca (pixman_implementation_t *imp, \
+                             pixman_op_t op, \
+                             uint32_t *dest, \
+                             const uint32_t *src, \
+                             const uint32_t *mask, \
+                             int width) \
+{ \
+    int i; \
+    for (i = 0; i < width; ++i) { \
+        __m64 m = load8888 (mask + i); \
+        __m64 s = load8888 (src + i); \
+        __m64 d = load8888 (dest + i); \
+        __m64 sa = expand_alpha (s); \
+        __m64 da = expand_alpha (d); \
+        __m64 ida = negate (da); \
+ \
+        s = pix_multiply (s, m); \
+        m = pix_multiply (m, sa); \
+        __m64 im = negate (m); \
+        __m64 ima = expand_alpha (m); \
+ \
+        uint32_t result, mada, res; \
+        __m64 temp; \
+        store8888 (&result, pix_add_mul (d, im, s, ida)); \
+        store8888 (&mada, pix_multiply (ima, da)); \
+        store8888 (&res, mmx_blend_ ## name (d, da, s, m)); \
+ \
+        mada &= A_MASK; \
+        res &= RGB_MASK; \
+        temp = pix_add (pix_add (load8888 (&result), load8888 (&mada)), \
+                        load8888 (&res)); \
+        store8888 (dest + i, temp); \
+    } \
+}
+
+static inline __m64
+_emulate_pminuh (__m64 s, __m64 d)
+{
+    uint64_t tmp_s = to_uint64 (s);
+    uint64_t tmp_d = to_uint64 (d);
+
+    __m64 res = to_m64 (MIN ((tmp_s & R_DMASK), (tmp_d & R_DMASK))
+                      | MIN ((tmp_s & G_DMASK), (tmp_d & G_DMASK))
+                      | MIN ((tmp_s & B_DMASK), (tmp_d & B_DMASK)));
+
+    return res;
+}
+
+static inline __m64
+_emulate_pmaxuh (__m64 s, __m64 d)
+{
+    uint64_t tmp_s = to_uint64 (s);
+    uint64_t tmp_d = to_uint64 (d);
+
+    __m64 res = to_m64 (MAX ((tmp_s & R_DMASK), (tmp_d & R_DMASK))
+                      | MAX ((tmp_s & G_DMASK), (tmp_d & G_DMASK))
+                      | MAX ((tmp_s & B_DMASK), (tmp_d & B_DMASK)));
+
+    return res;
+}
+
+#define R_GREATER(a, b) ((a > b) ? 0x0000ffff00000000ULL : 0)
+#define G_GREATER(a, b) ((a > b) ? 0x00000000ffff0000ULL : 0)
+#define B_GREATER(a, b) ((a > b) ? 0x000000000000ffffULL : 0)
+
+static inline __m64
+_emulate_pcmpgtuh (__m64 s, __m64 d)
+{
+    uint64_t tmp_s = to_uint64 (s);
+    uint64_t tmp_d = to_uint64 (d);
+
+    __m64 res = to_m64 (R_GREATER ((tmp_s & R_DMASK), (tmp_d & R_DMASK))
+                      | G_GREATER ((tmp_s & G_DMASK), (tmp_d & G_DMASK))
+                      | B_GREATER ((tmp_s & B_DMASK), (tmp_d & B_DMASK)));
+
+    return res;
+}
+
+static inline __m64
+_emulate_paddcmpgtuh (__m64 s, __m64 d1, __m64 d2)
+{
+    uint64_t tmp_s = to_uint64 (s);
+    uint64_t tmp_d1 = to_uint64 (d1);
+    uint64_t tmp_d2 = to_uint64 (d2);
+
+    __m64 res = to_m64 (R_GREATER ((tmp_s & R_DMASK), (tmp_d1 & R_DMASK) + (tmp_d2 & R_DMASK))
+                      | G_GREATER ((tmp_s & G_DMASK), (tmp_d1 & G_DMASK) + (tmp_d2 & G_DMASK))
+                      | B_GREATER ((tmp_s & B_DMASK), (tmp_d1 & B_DMASK) + (tmp_d2 & B_DMASK)));
+
+    return res;
+}
+
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ */
+static inline __m64
+mmx_blend_darken (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+    __m64 res;
+
+    __m64 s = _mm_mullo_pi16 (sca, da);
+    __m64 d = _mm_mullo_pi16 (dca, sa);
+
+    res = _emulate_pminuh (s, d);
+    res = _mm_adds_pu16 (res, MC (4x0080));
+    res = _mm_mulhi_pu16 (res, MC (4x0101));
+
+    return res;
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ */
+static inline __m64
+mmx_blend_lighten (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+    __m64 res;
+
+    __m64 s = _mm_mullo_pi16 (sca, da);
+    __m64 d = _mm_mullo_pi16 (dca, sa);
+
+    res = _emulate_pmaxuh (s, d);
+    res = _mm_adds_pu16 (res, MC (4x0080));
+    res = _mm_mulhi_pu16 (res, MC (4x0101));
+
+    return res;
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (lighten)
+
+
+#undef MMX_PDF_SEPARABLE_BLEND_MODE
+
+
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
@@ -4114,8 +5135,37 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

+    /* Unified alpha */
     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+
+    /* Disjoint, unified */
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_u;
+
+    /* Multiply, unified */
+    imp->combine_32[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_u;
+    imp->combine_32[PIXMAN_OP_DARKEN] = mmx_combine_darken_u;
+    imp->combine_32[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_u;
+
+    /* Component alpha combiners */
     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
@@ -4137,7 +5187,35 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
-
+    imp->combine_32_ca[PIXMAN_OP_SATURATE] = mmx_combine_saturate_ca;
+
+    /* Disjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_ca;
+
+    /* Conjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_ca;
+
+    /* Multiply CA */
+    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_ca;
+    imp->combine_32_ca[PIXMAN_OP_DARKEN] = mmx_combine_darken_ca;
+    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_ca;

     imp->blt = mmx_blt;
     imp->fill = mmx_fill;

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 73a5414..93660b6 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -916,9 +916,39 @@ convert_8888_to_0565 (uint32_t s)
 static force_inline uint32_t
 convert_0565_to_0888 (uint16_t s)
 {
+    uint32_t ret;
+#if USE_LOONGSON_MMI
+    asm(".set noreorder\r\n"
+        "sll $8, %1, 3\r\n"
+        "andi $8, 0xf8\r\n"
+        "sll $6, %1, 5\r\n"
+        "andi $6, 0xfc00\r\n"
+        "sll $4, %1, 8\r\n"
+        "li $2, 0xf80000\r\n"
+        "and $4, $2\r\n"
+        "or $6, $6, $4\r\n"
+        "or $8, $6\r\n"
+        "srl $4, %1, 2\r\n"
+        "andi $4, 0x7\r\n"
+        "srl $6, %1, 1\r\n"
+        "andi $6, 0x300\r\n"
+        "or $6, $6, $4\r\n"
+        "or $8, $6\r\n"
+        "sll $6, %1, 3\r\n"
+        "li $2, 0x70000\r\n"
+        "and $6, $2\r\n"
+        "or %0, $8, $6\r\n"
+        ".set reorder\r\n"
+        : "=r" (ret)
+        : "r" (s)
+        : "$8", "$6", "$4", "$2"
+    );
+#else
     return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |
             ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |
             ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)));
+#endif
+    return ret;
 }

 static force_inline uint32_t
@@ -991,7 +1021,7 @@ unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
     {                                            \
         result |= result >> from_bits;           \
                                                  \
-        from_bits *= 2;                          \
+        from_bits <<= 1;                         \
     }                                            \
 }                                                \
 while (0)

diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
index 4694ebc..c0ca417 100644
--- a/pixman/pixman-solid-fill.c
+++ b/pixman/pixman-solid-fill.c
@@ -40,12 +40,53 @@ static argb_t
 color_to_float (const pixman_color_t *color)
 {
     argb_t result;
+#ifdef USE_LOONGSON_MMI
+    uint32_t a = color->alpha;
+    uint32_t r = color->red;
+    uint32_t g = color->green;
+    uint32_t b = color->blue;
+    uint32_t m;
+    float tmp;
+    float counta, countr, countg, countb;
+    /* m = ((1 << 16) - 1) */
+    m = 65535;
+    /* tmp = 1.f / (float)m; */
+    float data = 65535.f;
+    asm(".set noreorder\r\n"
+        "recip.s %4, %5\r\n"

-    result.a = pixman_unorm_to_float (color->alpha, 16);
-    result.r = pixman_unorm_to_float (color->red, 16);
-    result.g = pixman_unorm_to_float (color->green, 16);
-    result.b = pixman_unorm_to_float (color->blue, 16);
+        "mtc1 %6, $f0\r\n"
+        "cvt.s.w $f2, $f0\r\n"
+        "mul.s %0, $f2, %4\r\n"
+
+        "mtc1 %7, $f10\r\n"
+        "cvt.s.w $f4, $f10\r\n"
+        "mul.s %1, $f4, %4\r\n"
+
+        "mtc1 %8, $f12\r\n"
+        "cvt.s.w $f6, $f12\r\n"
+        "mul.s %2, $f6, %4\r\n"
+
+        "mtc1 %9, $f14\r\n"
+        "cvt.s.w $f8, $f14\r\n"
+        "mul.s %3, $f8, %4\r\n"
+
+        ".set reorder\r\n"
+        : "=f" (counta), "=f" (countr), "=f" (countg), "=f" (countb), "=f" (tmp)
+        : "f" (data), "r" (a), "r" (r), "r" (g), "r" (b)
+        : "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+    );
+
+    result.a = counta;
+    result.r = countr;
+    result.g = countg;
+    result.b = countb;
+#else
+    result.a = pixman_unorm_to_float (color->alpha, 16);
+    result.r = pixman_unorm_to_float (color->red, 16);
+    result.g = pixman_unorm_to_float (color->green, 16);
+    result.b = pixman_unorm_to_float (color->blue, 16);
+#endif

     return result;
 }

diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 4a3a835..51f5cd8 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -80,28 +80,73 @@ pixman_malloc_abc (unsigned int a,
     return malloc (a * b * c);
 }

+
 static force_inline uint16_t
 float_to_unorm (float f, int n_bits)
 {
     uint32_t u;

-    if (f > 1.0)
-        f = 1.0;
-    if (f < 0.0)
-        f = 0.0;
-
-    u = f * (1 << n_bits);
-    u -= (u >> n_bits);
-
+    if (f >= 1.0)
+    {
+        u = 1 << (n_bits);
+        u--;
+        return u;
+    }
+    else if (f <= 0.0)
+    {
+        return 0.0;
+    }
+    else
+    {
+#ifdef USE_LOONGSON_MMI
+        asm(".set noreorder\r\n"
+            "li $8, 0x1\r\n"
+            "sll $8, %2\r\n"
+            "mtc1 $8, $f2\r\n"
+            "cvt.s.w $f0, $f2\r\n"
+            "mul.s $f0, $f0, %1\r\n"
+            "floor.w.s %0, $f0\r\n"
+            ".set reorder\r\n"
+            : "=f" (u)
+            : "f" (f), "r" (n_bits)
+            : "$8", "$f0", "$f2"
+        );
+#else
+        u = f * (1 << n_bits);
+        u -= (u >> n_bits);
+#endif
+    }
     return u;
 }

 static force_inline float
 unorm_to_float (uint16_t u, int n_bits)
 {
+    float result;
+#ifdef USE_LOONGSON_MMI
+    asm(".set noreorder\r\n"
+        "li $8, 0x1\r\n"
+        "sll $8, %2\r\n"
+        "addu $8, -1\r\n"
+        "mtc1 $8, $f8\r\n"
+        "cvt.s.w $f2, $f8\r\n"
+        "and $8, %1\r\n"
+        "mtc1 $8, $f6\r\n"
+        "cvt.s.w $f4, $f6\r\n"
+        "recip.s $f0, $f2\r\n"
+        "mul.s %0, $f0, $f4\r\n"
+
+        ".set reorder\r\n"
+        : "=f" (result)
+        : "r" (u), "r" (n_bits)
+        : "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+    );
+    return result;
+#else
     uint32_t m = ((1 << n_bits) - 1);

     return (u & m) * (1.f / (float)m);
+#endif
 }

 /*
@@ -206,8 +251,8 @@ pixman_contract_from_float (uint32_t *dst,

     for (i = 0; i < width; ++i)
     {
-        uint8_t a, r, g, b;
-
+        uint8_t a, r, g, b;
+
         a = float_to_unorm (src[i].a, 8);
         r = float_to_unorm (src[i].r, 8);
         g = float_to_unorm (src[i].g, 8);

diff --git a/test/Makefile.am b/test/Makefile.am
index 88dc36d..43cafb8 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -2,7 +2,7 @@ include $(top_srcdir)/test/Makefile.sources

 AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
 AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm $(PNG_LIBS) $(PTHREAD_LIBS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la $(top_builddir)/pixman/libpixman-loongson-mmi.la -lm $(PNG_LIBS) $(PTHREAD_LIBS)

 AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)

 libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)

diff --git a/test/utils.c b/test/utils.c
index f8e42a5..73ddb6f 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -877,7 +877,15 @@ fuzzer_test_main (const char *test_name,
 #endif
     for (i = n1; i <= n2; i++)
     {
+#ifdef USE_LOONGSON_MMI
+        uint32_t crc;
+        #pragma omp critical
+        {
+            crc = call_test_function (test_function, i, 0);
+        }
+#else
         uint32_t crc = call_test_function (test_function, i, 0);
+#endif
         if (verbose)
             printf ("%d: %08X\n", i, crc);
         checksum += crc;
--
2.1.0

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman