On Tue, Nov 12, 2024 at 07:56, Bertrand Drouvot <bertranddrouvot...@gmail.com> wrote:

> Hi,
>
> On Tue, Nov 12, 2024 at 03:56:13PM +0900, Michael Paquier wrote:
> > On Tue, Nov 12, 2024 at 06:09:04AM +0000, Bertrand Drouvot wrote:
> > > I think that the 64b len check done in v11 is mandatory for safety reasons.
> > >
> > > The loop above reads 64 bytes at once, so it would read beyond the memory
> > > area bounds if len < 64: that could cause a crash or read invalid data.
> >
> > Sorry, I was not following your argument.  You're right that we need
> > something else here.  However..
> >
> > +     /*
> > +      * For len < 64, compare byte per byte to ensure we'll not read beyond the
> > +      * memory area.
> > +      */
> > +     if (len < sizeof(size_t) * 8)
> > +     {
> > +             while (p < end)
> > +             {
> > +                     if (*p++ != 0)
> > +                             return false;
> > +             }
> > +             return true;
> > +     }
> > +
> > +     /* Compare bytes until the pointer "p" is aligned */
> > +     while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
> > +     {
> > +             if (p == end)
> > +                     return true;
> > +
> > +             if (*p++ != 0)
> > +                     return false;
> > +     }
> > +
> >
> > Still, this is not optimal, based on what's been discussed upthread.
> > The byte-per-byte check is more expensive than the size_t check,
>
> I think that depends on the memory area size. If the size is small enough,
> the byte-per-byte check can be good enough.
>
> For example, with the allzeros_small.c attached:
>
It seems to me that it is enough to guard the SIMD loop so that it runs only
when the size is larger than sizeof(size_t) * 8:

    if (len > sizeof(size_t) * 8)
    {
        for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
        {
            if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
                (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
                (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
                (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
                return false;
        }
    }

See v1_allzeros_small.c attached.
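
As a quick sanity check of the boundary behaviour (a minimal sketch, not part
of the attached benchmark: the buffer size, the test lengths and the trailing
non-zero byte are arbitrary choices, and it assumes it is compiled together
with pg_memory_is_all_zeros_v11() from v1_allzeros_small.c, with its main()
removed):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Provided by v1_allzeros_small.c */
bool pg_memory_is_all_zeros_v11(const void *ptr, size_t len);

int main(void)
{
	unsigned char buf[256];

	/* Exercise every length around the 8 * sizeof(size_t) threshold. */
	for (size_t len = 0; len <= 2 * 8 * sizeof(size_t); len++)
	{
		memset(buf, 0, sizeof(buf));
		assert(pg_memory_is_all_zeros_v11(buf, len));

		if (len > 0)
		{
			/* A single non-zero byte at the very end must be detected. */
			buf[len - 1] = 1;
			assert(!pg_memory_is_all_zeros_v11(buf, len));
		}
	}
	return 0;
}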

>
> == with BLCKSZ 32
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c
> -o allzeros_small ; ./allzeros_small
> byte per byte: done in 22528 nanoseconds
> size_t: done in 6949 nanoseconds (3.24191 times faster than byte per byte)
> SIMD v10: done in 7562 nanoseconds (2.97911 times faster than byte per byte)
> SIMD v11: done in 22096 nanoseconds (1.01955 times faster than byte per byte)
>
gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ;
./v1_allzeros_small
byte per byte: done in 97345 nanoseconds
size_t: done in 20305 nanoseconds (4.79414 times faster than byte per byte)
SIMD v10: done in 25813 nanoseconds (3.77116 times faster than byte per byte)
SIMD v11: done in 24580 nanoseconds (3.96033 times faster than byte per byte)


>
> == with BLCKSZ 63
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c
> -o allzeros_small ; ./allzeros_small
> byte per byte: done in 29246 nanoseconds
> size_t: done in 10555 nanoseconds (2.77082 times faster than byte per byte)
> SIMD v10: done in 11220 nanoseconds (2.6066 times faster than byte per byte)
> SIMD v11: done in 29126 nanoseconds (1.00412 times faster than byte per byte)
>
 gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ;
./v1_allzeros_small
byte per byte: done in 57763 nanoseconds
size_t: done in 19760 nanoseconds (2.92323 times faster than byte per byte)
SIMD v10: done in 24088 nanoseconds (2.398 times faster than byte per byte)
SIMD v11: done in 20151 nanoseconds (2.86651 times faster than byte per byte)


> Obviously v11 is about the same time as "byte per byte", but we can see that
> the size_t or v10 improvement is not that much for small sizes.
>
> While for larger sizes:
>
> == with BLCKSZ 256
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c
> -o allzeros_small ; ./allzeros_small
> byte per byte: done in 102703 nanoseconds
> size_t: done in 15381 nanoseconds (6.67726 times faster than byte per byte)
> SIMD v10: done in 7241 nanoseconds (14.1835 times faster than byte per byte)
> SIMD v11: done in 7899 nanoseconds (13.002 times faster than byte per byte)
>
gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ;
./v1_allzeros_small
byte per byte: done in 213276 nanoseconds
size_t: done in 45288 nanoseconds (4.70933 times faster than byte per byte)
SIMD v10: done in 15840 nanoseconds (13.4644 times faster than byte per byte)
SIMD v11: done in 15773 nanoseconds (13.5216 times faster than byte per byte)


> == with BLCKSZ 8192
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c
> -o allzeros_small ; ./allzeros_small
> byte per byte: done in 2993458 nanoseconds
> size_t: done in 436650 nanoseconds (6.85551 times faster than byte per byte)
> SIMD v10: done in 136413 nanoseconds (21.9441 times faster than byte per byte)
> SIMD v11: done in 155474 nanoseconds (19.2538 times faster than byte per byte)
>
gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ;
./v1_allzeros_small
byte per byte: done in 10358761 nanoseconds
size_t: done in 864673 nanoseconds (11.98 times faster than byte per byte)
SIMD v10: done in 342880 nanoseconds (30.211 times faster than byte per byte)
SIMD v11: done in 341332 nanoseconds (30.3481 times faster than byte per byte)

best regards,
Ranier Vilela
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 32
#define LOOPS 1000
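/*
 * BLCKSZ is the number of bytes passed to each checker and LOOPS the number
 * of timed iterations per variant; the different block sizes shown in the
 * e-mail were presumably produced by recompiling with other BLCKSZ values.
 */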

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
	const unsigned char *p = (const unsigned char *) ptr;
	const unsigned char *end = &p[len];

	while (p < end)
	{
		if (*p++ != 0)
			return false;
	}
	return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end since it is computed by rounding end down
     * to a size_t boundary, so there is no risk of reading beyond the
     * memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * Note: unlike v11 below, this version has no small-length guard, which
     * is the point discussed in the thread above.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end since it is computed by rounding end down
     * to a size_t boundary, so there is no risk of reading beyond the
     * memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}


bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * The enclosing len check guards this loop against small lengths, so
     * there is no risk of reading beyond the memory area.
     */
    if (len > sizeof(size_t) * 8)
    {
        for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
        {
            if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
                (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
                (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
                (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
                return false;
        }
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end since it is computed by rounding end down
     * to a size_t boundary, so there is no risk of reading beyond the
     * memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;
	nanosec += (t1->tv_nsec - t2->tv_nsec);

	return nanosec;
}

int main()
{
	size_t pagebytes[BLCKSZ] = {0};
	volatile bool result;
	struct timespec start,end;
	int64_t byte_time, size_t_time;
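	/*
	 * Each variant below scans the same zero-filled buffer LOOPS times; the
	 * result is stored in a volatile variable, presumably to keep the
	 * compiler from optimizing the calls away.
	 */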

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	byte_time = get_clock_diff(&end, &start);
	printf("byte per byte: done in %ld nanoseconds\n", byte_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = allzeros_size_t(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("size_t: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v10: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	size_t_time = get_clock_diff(&end, &start);
	printf("SIMD v11: done in %ld nanoseconds (%g times faster than byte per byte)\n", size_t_time, (double) byte_time / size_t_time);

	return 0;
}
