Hi there,

GCC seems to generate sub-optimal code for the following loop in radeon_accel.c:

 for (; nwords > 0; --nwords, ++d, ++s)
            *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);

The body of the loop compiles to:

    lwz 9,40(31)
    lwz 9,0(9)
    rotlwi 10,9,16
    lwz 9,36(31)
    stw 10,0(9)
    lwz 9,44(31)
    addi 9,9,-1
    stw 9,44(31)
    lwz 9,36(31)
    addi 9,9,4
    stw 9,36(31)
    lwz 9,40(31)
    addi 9,9,4
    stw 9,40(31)

This patch replaces the loop body with (hopefully optimal) inline assembler code when building for PowerPC, keeping the C version as the fallback and bringing it in line with the other cases in the switch:

diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index 1def2a3..580fa33 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -138,7 +138,16 @@ void RADEONCopySwap(uint8_t *dst, uint8_t *src, unsigned int size, int swap)
            unsigned int nwords = size >> 2;

            for (; nwords > 0; --nwords, ++d, ++s)
-               *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#ifdef __powerpc__
+                       __asm__ volatile ("rlwinm %0,%1,%2,%3,%4\n\t"
+                                                "rlwimi %0,%1,%5,%6,%7\n\t"
+                                                 : "=&r" (*d)
+ : "r" (*s),"i" (16), "i" (16),"i" (31) ,"i" (16), "i" (0),"i" (15)
+                                                 :);
+
+#else
+                       *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#endif
            return;
         }
     case RADEON_HOST_DATA_SWAP_32BIT:

_______________________________________________
xorg-driver-ati mailing list
[email protected]
https://lists.x.org/mailman/listinfo/xorg-driver-ati

Reply via email to