On Tue, 14 Jul 2020 20:20:54 PDT (-0700), anshuman.khand...@arm.com wrote:


> On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
>> This allows the pgtable tests to be built.
>>
>> Signed-off-by: Emil Renner Berthing <ker...@esmil.dk>
>> ---
>>
>> The tests seem to succeed both in QEMU and on the HiFive Unleashed,
>> both with and without the recent additions in
>> https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khand...@arm.com/
>
> That's great, thanks for testing.

Actually, looking at this I'm not sure it helps us any.  This changes the
behavior of two functions.  Pulling out the relevant sections, I see:

unsigned int __sw_hweight32(unsigned int w)
{
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
       w -= (w >> 1) & 0x55555555;
       w =  (w & 0x33333333) + ((w >> 2) & 0x33333333);
       w =  (w + (w >> 4)) & 0x0f0f0f0f;
       return (w * 0x01010101) >> 24;
#else
       unsigned int res = w - ((w >> 1) & 0x55555555);
       res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
       res = (res + (res >> 4)) & 0x0F0F0F0F;
       res = res + (res >> 8);
       return (res + (res >> 16)) & 0x000000FF;
#endif
}
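
As an aside, for anyone following along: after the first three steps each
byte of w holds the popcount of the corresponding input byte, and the
multiply by 0x01010101 just sums those four byte counts into the most
significant byte.  Here's a standalone sketch of mine checking that against
GCC's builtin (illustration only, not kernel code):

#include <assert.h>
#include <stdint.h>

/* Same steps as the fast-multiplier branch above: after the third step
 * each byte of w holds that byte's popcount, and the multiply by
 * 0x01010101 sums the four byte counts into the top byte. */
static unsigned int hweight32_mul(uint32_t w)
{
        w -= (w >> 1) & 0x55555555;
        w =  (w & 0x33333333) + ((w >> 2) & 0x33333333);
        w =  (w + (w >> 4)) & 0x0f0f0f0f;
        return (w * 0x01010101) >> 24;
}

int main(void)
{
        for (uint64_t i = 0; i <= 0xffffffff; i += 0x8765)
                assert(hweight32_mul(i) == (unsigned int)__builtin_popcount(i));
        return 0;
}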

and

unsigned long memchr_inv(unsigned long value64)
{
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
       value64 *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
       value64 *= 0x01010101;
       value64 |= value64 << 32;
#else
       value64 |= value64 << 8;
       value64 |= value64 << 16;
       value64 |= value64 << 32;
#endif
        return value64;
}
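
Both halves of that #if are doing the same thing: broadcasting the low byte
of value64 into all eight bytes, so for a byte value v you get v repeated
across the word.  A quick equivalence check, again just my own illustration
rather than kernel code:

#include <assert.h>
#include <stdint.h>

/* Multiply-based broadcast: one mul against a 64-bit constant. */
static uint64_t bcast_mul(uint64_t v)
{
        return v * 0x0101010101010101ULL;
}

/* Shift/or-based broadcast: three shifts and three ors, no multiplier. */
static uint64_t bcast_shift(uint64_t v)
{
        v |= v << 8;
        v |= v << 16;
        v |= v << 32;
        return v;
}

int main(void)
{
        /* The two agree for byte values, which is all memchr_inv needs. */
        for (uint64_t v = 0; v < 0x100; v++)
                assert(bcast_mul(v) == bcast_shift(v));
        return 0;
}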

GCC optimizes the multiplication out of the first one.  Without
ARCH_HAS_FAST_MULTIPLIER it generates:

__sw_hweight32:
        li      a4,1431654400
        srliw   a5,a0,1
        addi    a4,a4,1365
        and     a5,a5,a4
        subw    a0,a0,a5
        li      a5,858992640
        srliw   a4,a0,2
        addi    a5,a5,819
        and     a0,a5,a0
        and     a5,a5,a4
        addw    a5,a0,a5
        srliw   a0,a5,4
        addw    a0,a0,a5
        li      a5,252645376
        addi    a5,a5,-241
        and     a5,a5,a0
        srliw   a0,a5,8
        addw    a5,a0,a5
        srliw   a0,a5,16
        addw    a0,a0,a5
        andi    a0,a0,0xff
        ret

vs with ARCH_HAS_FAST_MULTIPLIER, where the multiply by 0x01010101 has been
strength-reduced to shifts and adds:

__sw_hweight32:
        li      a5,1431654400
        srliw   a4,a0,1
        addi    a5,a5,1365
        and     a5,a5,a4
        subw    a0,a0,a5
        li      a5,858992640
        srliw   a4,a0,2
        addi    a5,a5,819
        and     a0,a5,a0
        and     a5,a5,a4
        addw    a5,a0,a5
        srliw   a0,a5,4
        addw    a5,a0,a5
        li      a0,252645376
        addi    a0,a0,-241
        and     a5,a0,a5
        slliw   a0,a5,8
        addw    a0,a0,a5
        slliw   a5,a0,16
        addw    a0,a0,a5
        srliw   a0,a0,24
        ret
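
The strength reduction GCC is doing there is just the identity below; a
small standalone check of mine (not kernel code) showing the equivalence
it's using:

#include <assert.h>
#include <stdint.h>

/* w * 0x01010101 is w broadcast-summed by shifts, which is what GCC
 * emits above instead of a mul. */
static uint32_t mul_by_01010101(uint32_t w)
{
        return w + (w << 8) + (w << 16) + (w << 24);
}

int main(void)
{
        for (uint64_t w = 0; w <= 0xffffffff; w += 0xffff)
                assert(mul_by_01010101(w) == (uint32_t)(w * 0x01010101u));
        return 0;
}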

so that doesn't matter -- both configurations end up at a similar pile of
shifts and adds.  The second one is really a wash:

memchr_inv:
        ld      a5,.LC0
        mul     a0,a0,a5
        ret
.rodata
.LC0:
        .dword  72340172838076673  # 0x0101010101010101

vs

memchr_inv:
        slli    a5,a0,8
        or      a5,a5,a0
        slli    a0,a5,16
        or      a0,a0,a5
        slli    a5,a0,32
        or      a0,a5,a0
        ret

It's unlikely that the load ends up relaxed, so it's going to be two
instructions.  That means we have 4 cycles to forward the load and do the
multiply before we're slower than the six-instruction shift/or sequence,
and that's assuming a cache hit.  IIRC the multiplier on the existing
hardware isn't that fast -- GCC lists imul as 10 cycles, but I remember it
being more like 5, so that might just be architecture-inaccurate tuning in
the generic pipeline model.  This is out of the inner loop, so it's probably
not all that important anyway.  The result isn't used for a while, so on a
bigger machine it's probably worth picking the smaller code path, but it
seems like a very small thing to optimize for either way.

I'm actually a bit surprised about this.  Do you have benchmarks that indicate
ARCH_HAS_FAST_MULTIPLIER is actually faster?  Otherwise I guess I'm going to
reject this, as it's really more
ARCH_HAS_FAST_MULTIPLIER_AND_FAST_LARGE_CONSTANTS than just
ARCH_HAS_FAST_MULTIPLIER.
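
For reference, something along these lines run on the Unleashed is the sort
of thing I'd find convincing either way -- a rough userspace sketch of mine,
where the iteration count is arbitrary, the indirect call is there to keep
GCC from folding the loop away, and you'd want to sanity-check the generated
assembly against what the kernel actually emits:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t bcast_mul(uint64_t v)
{
        return v * 0x0101010101010101ULL;
}

static uint64_t bcast_shift(uint64_t v)
{
        v |= v << 8;
        v |= v << 16;
        v |= v << 32;
        return v;
}

/* Time one variant; the indirect call and volatile sink keep the
 * compiler from constant-folding or hoisting the work. */
static double bench(uint64_t (*fn)(uint64_t), uint64_t iters)
{
        struct timespec a, b;
        volatile uint64_t sink = 0;

        clock_gettime(CLOCK_MONOTONIC, &a);
        for (uint64_t i = 0; i < iters; i++)
                sink += fn(i & 0xff);
        clock_gettime(CLOCK_MONOTONIC, &b);
        (void)sink;
        return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
        const uint64_t iters = 100000000ULL;  /* arbitrary */

        printf("mul:   %f s\n", bench(bcast_mul, iters));
        printf("shift: %f s\n", bench(bcast_shift, iters));
        return 0;
}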
