Denys Vlasenko wrote:
On Mon, Aug 7, 2017 at 10:58 PM, Johannes Schindelin
<johannes.schinde...@gmx.de> wrote:
+++ b/archival/libarchive/decompress_unxz.c
@@ -37,6 +37,11 @@ static uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc)
   || !defined(put_unaligned_be32)
  # error get_unaligned_le32 accessors are not defined
  #endif
+static ALWAYS_INLINE uint32_t get_le32_fast(const void *p)
+{
+  return *(uint32_t *)p;
+}
+#define get_le32 get_le32_fast
It misses little-endian conversion.
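
For illustration, the missing conversion would make the fast path look roughly like the sketch below (my sketch only, not from the patch; ALWAYS_INLINE is BusyBox's macro, and __BYTE_ORDER__ / __builtin_bswap32() are GCC extensions assumed to be available):

/* Sketch only: keep the raw load of the proposed patch, but add the
 * byte swap that is needed on big-endian hosts. This still assumes
 * that unaligned 32-bit loads are legal on the target. */
static ALWAYS_INLINE uint32_t get_le32_fast(const void *p)
{
  uint32_t v = *(const uint32_t *)p;
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  v = __builtin_bswap32(v);
#endif
  return v;
}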

It should also be noted that these days GCC is quite clever with assignments and byte-swapping. I tried this program with gcc 7.1.1:

#include <stdint.h>
#include <string.h>

int32_t
move_from_unaligned32 (void *p)
{
  int32_t v;
  memcpy (&v, p, sizeof (v));
  return v;
}

uint32_t
get_unaligned_le32(const uint8_t *buf)
{
  return (uint32_t)buf[0]
    | ((uint32_t)buf[1] << 8)
    | ((uint32_t)buf[2] << 16)
    | ((uint32_t)buf[3] << 24);
}

uint32_t
get_unaligned_be32(const uint8_t *buf)
{
  return ((uint32_t)buf[0] << 24)
    | ((uint32_t)buf[1] << 16)
    | ((uint32_t)buf[2] << 8)
    | (uint32_t)buf[3];
}

The assembly generated for these functions with -O2 is:

AMD/Intel 64-Bit

move_from_unaligned32:
        movl    (%rdi), %eax
        ret
get_unaligned_le32:
        movl    (%rdi), %eax
        ret
get_unaligned_be32:
        movl    (%rdi), %eax
        bswap   %eax
        ret

AMD/Intel 32-Bit

move_from_unaligned32:
        movl    4(%esp), %eax
        movl    (%eax), %eax
        ret
get_unaligned_le32:
        movl    4(%esp), %eax
        movl    (%eax), %eax
        ret
get_unaligned_be32:
        movl    4(%esp), %eax
        movl    (%eax), %eax
        bswap   %eax
        ret

As you can see, the 4-byte memcpy is replaced with a direct memory access, the byte loads and shifts of the LE case are likewise replaced with a direct memory access, and the byte loads and shifts of the BE case are replaced with a direct memory access followed by bswap. This also works with inline functions, automatic inlining, and value propagation. So tricking the compiler here with raw pointer casts may actually result in less optimization.
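
For instance, a small inline helper like the following (an illustrative sketch with made-up names, not part of the test above) is the kind of code this applies to; after inlining, GCC sees through the memcpy just like in the standalone functions:

#include <stdint.h>
#include <string.h>

/* Raw 32-bit load, same pattern as move_from_unaligned32 above. */
static inline uint32_t load_u32 (const void *p)
{
  uint32_t v;
  memcpy (&v, p, sizeof (v));  /* collapses to a single movl on x86 */
  return v;
}

/* Hypothetical caller: after inlining, reading two fields of a header
   is just two memory accesses and an xor, with no calls and no byte
   shuffling. */
uint32_t
header_mix (const uint8_t *hdr)
{
  return load_u32 (hdr + 4) ^ load_u32 (hdr + 8);
}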

It is still useful to provide separate accessors for values that are known to be aligned, for those architectures where unaligned access does not work and GCC therefore has to load the individual bytes. But on architectures where unaligned access is possible, GCC already performs this optimization by itself, so it should not be necessary to distinguish between architectures for the unaligned case.
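
Concretely, something along these lines would suffice (again only a sketch, with hypothetical names; le32toh() comes from the glibc/BSD <endian.h> and is assumed to be available):

#include <endian.h>  /* assumed: provides le32toh() */
#include <stdint.h>

/* For pointers known to be 4-byte aligned: a plain load plus the
   byte-order conversion; le32toh() is a no-op on little-endian hosts,
   so on x86 this is the same single movl as above. */
static inline uint32_t
get_aligned_le32 (const uint32_t *p)
{
  return le32toh (*p);
}

/* The unaligned case can use the shift-based get_unaligned_le32()
   shown above on every architecture: where unaligned loads are legal,
   GCC turns it into one load anyway; where they are not, the byte
   accesses are required in any case. */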