Hello!

> Now, the recently announced Nokia N800 is different from the 770 in
> various ways that are interesting for Cairo performance. I've got my
> eye on the ARMv6 SIMD instructions and the PowerVR MBX accelerator.

  Yeah, me too! The combined power of these two could make it possible
to optimize a lot of nice free software out there for the N800.
However, while the former is fully documented and the documentation is
available to the general public, it doesn't have a lot to offer: the
ARMv6 SIMD instructions only operate on 32-bit words, so I find it
unlikely that they can be used to optimize double-precision FP
emulation, in contrast to Intel Wireless MMX, which provides a big
bunch of 64-bit SIMD instructions. OTOH, these few SIMD instructions
can still be used to optimize a lot of code, but would it be a good
idea for cairo if you need to convert the operand values to ints and
the result(s) back to float?
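
  To make that concrete, here is the kind of integer-only inner loop
these instructions are good at: a minimal sketch (my own example, not
anything from cairo itself) of a saturating per-channel add of two
packed ARGB32 pixels using the UQADD8 instruction.

#include <stdint.h>

/* Saturating per-byte add of two packed ARGB32 pixels: UQADD8 adds
 * each of the four 8-bit lanes and clamps it to 255 independently,
 * all in a single instruction. */
static inline uint32_t pixel_add_saturate(uint32_t a, uint32_t b)
{
    uint32_t d;

    __asm__ ("uqadd8 %0, %1, %2"
	    : "=r" (d)
	    : "r" (a), "r" (b));

    return d;
}

A saturating-ADD compositing pass over ARGB32 data reduces to one such
instruction per pixel, with no float/int conversion anywhere.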

 I had already been thinking about utilizing ARMv6 before the N800 was
released to the public. My proposed plan of attack for the community
(and also the Nokia employees) is simply the following:

1. Patch GCC to provide ARMv6 intrinsics (one man-month at most).
2. Patch liboil [1] to utilize these intrinsics when compiled for an
ARMv6 target (one to three man-months).
3. Make all the software utilize liboil wherever appropriate, or the
ARMv6 intrinsics directly if needed.

  The third step would ensure that you are optimizing your software
for all the platforms for which liboil provides optimizations. OTOH,
one can skip step 1 and write the liboil implementations in assembly.
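
  To make step 2 concrete, here is a minimal sketch of what such a
liboil implementation could look like. I am assuming liboil's add_s16
function class here (check the class prototype before using this), and
OIL_IMPL_FLAG_ARMV6 is hypothetical: adding such a flag would itself
be part of the liboil patch.

#include <stdint.h>
#include <liboil/liboilfunction.h>

/* Hypothetical ARMv6 implementation of liboil's add_s16 class: add
 * two arrays of signed 16-bit samples, two lanes at a time, with
 * SADD16. Assumes the arrays are 32-bit aligned. */
static void add_s16_armv6 (int16_t *d, int16_t *s1, int16_t *s2, int n)
{
    while (n >= 2) {
	uint32_t a = *(uint32_t *) s1;
	uint32_t b = *(uint32_t *) s2;
	uint32_t r;

	__asm__ ("sadd16 %0, %1, %2"
		: "=r" (r)
		: "r" (a), "r" (b)
		: "cc");

	*(uint32_t *) d = r;
	d += 2; s1 += 2; s2 += 2; n -= 2;
    }
    if (n)	/* odd trailing element */
	*d = *s1 + *s2;
}

/* OIL_IMPL_FLAG_ARMV6 does not exist yet; see step 2 above. */
OIL_DEFINE_IMPL_FULL (add_s16_armv6, add_s16, OIL_IMPL_FLAG_ARMV6);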

  I have already made a little progress on this; the result is two
header files which provide inline functions abstracting the assembly
instructions. A friend of mine was supposed to convert them to GCC
intrinsics and patch GCC, but we never got around to finishing that. I
am attaching the headers anyway, so anyone who likes can use them as a
starting point.
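
  As a quick sanity check of the kind of thing they enable, a trivial
test of the saturating 16-bit add could look like this (assuming the
first header is saved as armv6_arithmetic.h; that naming is mine):

#include <stdio.h>
#include "armv6_arithmetic.h"

int main (void)
{
    /* Two halfwords packed per word: 0x7fff0001 is (32767, 1). Adding
     * (1, 1) with qadd16 saturates the upper halfword at 0x7fff
     * instead of wrapping around to -32768. */
    unsigned long r = qadd16 (0x7fff0001UL, 0x00010001UL);

    printf ("0x%08lx\n", r);	/* prints 0x7fff0002 */

    return 0;
}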

 Using the PowerVR MBX accelerator is a completely different story.
Although it has a lot to offer, I failed to find any documentation on
it; there are tons of documents on how to use the OpenGL ES
implementation on top of the MBX, but nothing on the MBX itself. If
you come across any documentation on it, please let me know.

[1] http://liboil.freedesktop.org/

--
Regards,

Zeeshan Ali
Design Engineer, SW
Open Source Software Operations
Nokia Multimedia
#ifndef __ARMV6_ARITHMETIC__
#define __ARMV6_ARITHMETIC__
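
/* Inline wrappers around the ARMv6 SIMD arithmetic instructions. Each
 * function operates on four 8-bit or two 16-bit lanes packed into a
 * single 32-bit word. */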

/********************** 8-bit SIMD operations *************************/


/* Signed 8-bit SIMD add */
static __inline unsigned long sadd8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__(
	    "sadd8 %0, %1, %2\n"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed 8-bit SIMD subtraction */
static __inline unsigned long ssub8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__(
	    "ssub8 %0, %1, %2\n"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned 8-bit SIMD addition */
static __inline unsigned long uadd8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__(
	    "uadd8 %0, %1, %2\n"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned 8-bit SIMD subtraction */
static __inline unsigned long usub8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("usub8 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed saturating 8-bit SIMD addition */
static __inline unsigned long qadd8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("qadd8 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed saturating 8-bit SIMD subtraction */
static __inline unsigned long qsub8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("qsub8 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned saturating 8-bit SIMD addition */
static __inline unsigned long uqadd8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uqadd8 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned saturating 8-bit SIMD subtraction */
static __inline unsigned long uqsub8(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uqsub8 %0, %1, %2"
	    : "=r" (d)
	    : "r" (m), "r" (n)
	    : "cc");

    return d;
}

/********************** 16-bit SIMD operations *************************/

/* Signed 16-bit SIMD add */
static __inline unsigned long sadd16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("sadd16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed 16-bit SIMD subtraction */
static __inline unsigned long ssub16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("ssub16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned 16-bit SIMD addition */
static __inline unsigned long uadd16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uadd16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned 16-bit SIMD subtraction */
static __inline unsigned long usub16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("usub16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed saturating 16-bit SIMD addition */
static __inline unsigned long qadd16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("qadd16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed saturating 16-bit SIMD subtraction */
static __inline unsigned long qsub16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("qsub16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned saturating 16-bit SIMD addition */
static __inline unsigned long uqadd16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uqadd16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned saturating 16-bit SIMD subtraction */
static __inline unsigned long uqsub16(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uqsub16 %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/********************** 16-bit SIMD operations with swap ********************/

/* Signed upper add, lower subtract, with a swap of halfwords in m */
static __inline unsigned long saddsubx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("saddsubx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned upper add, lower subtract, with a swap of halfwords in m */
static __inline unsigned long uaddsubx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uaddsubx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed saturating upper add, lower subtract, with a swap of halfwords
 * in m */
static __inline unsigned long qaddsubx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("qaddsubx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned saturating upper add, lower subtract, with a swap of halfwords
 * in m */
static __inline unsigned long uqaddsubx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uqaddsubx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed upper subtract, lower add, with a swap of halfwords in m */
static __inline unsigned long ssubaddx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("ssubaddx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned upper subtract, lower add, with a swap of halfwords in m */
static __inline unsigned long usubaddx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("usubaddx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Signed saturating upper subtract, lower add, with a swap of halfwords
 * in m */
static __inline unsigned long qsubaddx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("qsubaddx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

/* Unsigned saturating upper subtract, lower add, with a swap of halfwords
 * in m */
static __inline unsigned long uqsubaddx(unsigned long n, unsigned long m)
{
    unsigned long d;

    __asm__ __volatile__("uqsubaddx %0, %1, %2"
	    : "=r" (d)
	    : "r" (n), "r" (m)
	    : "cc");

    return d;
}

#endif /* __ARMV6_ARITHMETIC__ */
#ifndef __ARMV6_NONARITHMETIC__
#define __ARMV6_NONARITHMETIC__
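
/* Non-arithmetic ARMv6 helpers: CPSR access and data-endianness
 * switching. */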

/* To get the contents of the CPSR */
static __inline unsigned long get_cpsr(void)
{
    unsigned long d;

    __asm__ __volatile__(
	    "mrs %0, cpsr\n"
	    : "=r" (d));

    return d;
}

/* Temporarily switch the CPU's data endianness with the ARMv6 SETEND
 * instruction, e.g. around accesses to big-endian data. The "memory"
 * clobber keeps the compiler from moving loads and stores across the
 * switch. */
#define SWITCH_TO_BE() \
    __asm__ __volatile__("setend	BE" \
	    : \
	    : \
	    : "cc", "memory")

#define SWITCH_TO_LE() \
    __asm__ __volatile__("setend	LE" \
	    : \
	    : \
	    : "cc", "memory")

#endif /* __ARMV6_NONARITHMETIC__ */
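
  For completeness, the intended use of the SETEND macros from the
second header is something like the following (the filename
armv6_nonarithmetic.h is my own naming for the attachment):

#include "armv6_nonarithmetic.h"

/* Read one 32-bit word stored in big-endian byte order. The load
 * between the two SETENDs is performed big-endian; the "memory"
 * clobber in the macros keeps the compiler from moving it outside. */
static __inline unsigned long load_be32(const unsigned long *p)
{
    unsigned long v;

    SWITCH_TO_BE();
    v = *p;
    SWITCH_TO_LE();

    return v;
}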