Hi there,
gcc seems to create some sub-optimal code for the following code
sequence in radeon_accel.c:
for (; nwords > 0; --nwords, ++d, ++s)
*d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
the body of the loop compiles to
lwz 9,40(31)
lwz 9,0(9)
rotlwi 10,9,16
lwz 9,36(31)
stw 10,0(9)
lwz 9,44(31)
addi 9,9,-1
stw 9,44(31)
lwz 9,36(31)
addi 9,9,4
stw 9,36(31)
lwz 9,40(31)
addi 9,9,4
stw 9,40(31)
this patch adds some (hopefully optimal) assembler code, bringing it in
line with the other cases in the switch:
diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index 1def2a3..580fa33 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -138,7 +138,16 @@ void RADEONCopySwap(uint8_t *dst, uint8_t *src,
unsigned int size, int swap)
unsigned int nwords = size >> 2;
for (; nwords > 0; --nwords, ++d, ++s)
- *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#ifdef __powerpc__
+ __asm__ volatile ("rlwinm %0,%1,%2,%3,%4\n\t"
+ "rlwimi %0,%1,%5,%6,%7\n\t"
+ : "=&r" (*d)
+ : "r" (*s),"i" (16),
"i" (16),"i" (31) ,"i" (16), "i" (0),"i" (15)
+ :);
+
+#else
+ *d = ((*s & 0xffff) << 16) | ((*s >> 16) & 0xffff);
+#endif
return;
}
case RADEON_HOST_DATA_SWAP_32BIT:
_______________________________________________
xorg-driver-ati mailing list
[email protected]
https://lists.x.org/mailman/listinfo/xorg-driver-ati