Hi,

On Wed, Dec 11, 2024 at 03:03:34PM +0900, Michael Paquier wrote:
> On Tue, Dec 10, 2024 at 02:18:33PM +0000, Bertrand Drouvot wrote:
> > While searching for memcmp() calls in "*stat*.c" files (due to [1]), it 
> > appeared
> > that we could $SUBJECT. Please find attached a patch doing so.
> 
> -            SockAddr    zero_clientaddr;
> -            memset(&zero_clientaddr, 0, sizeof(zero_clientaddr));
> -            if (memcmp(&(beentry->st_clientaddr), &zero_clientaddr,
> -                       sizeof(zero_clientaddr)) == 0)
> +            if (pg_memory_is_all_zeros(&(beentry->st_clientaddr),
> +                                       sizeof(beentry->st_clientaddr)))
> 
> It also means that you're removing these variables used only for the
> all-zero comparisons.  Makes sense to me.

Yeap and also given its size (136 bytes), it looks like that it could provide
some performance benefits too (see check_SockAddr_is_zeroes.c attached):

$ gcc --version
gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

$ gcc -O2 check_SockAddr_is_zeroes.c -o check_SockAddr_is_zeroes; 
./check_SockAddr_is_zeroes
memcmp: done in 21460 nanoseconds
pg_memory_is_all_zeros: done in 1307 nanoseconds (16.4193 times faster than 
memcmp)

$ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -O2 check_SockAddr_is_zeroes.c -o 
check_SockAddr_is_zeroes; ./check_SockAddr_is_zeroes
memcmp: done in 21012 nanoseconds
pg_memory_is_all_zeros: done in 4039 nanoseconds (5.20228 times faster than 
memcmp)

That's not a hot path but still interesting to see.

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <sys/socket.h> 

#define LOOPS 1000

typedef struct
{
    struct sockaddr_storage addr;
    socklen_t   salen;
} SockAddr;

static inline bool
pg_memory_is_all_zeros(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    if (len < sizeof(size_t))
    {
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* "len" in the [sizeof(size_t), sizeof(size_t) * 8 - 1] range */
    if (len < sizeof(size_t) * 8)
    {
        /* Compare bytes until the pointer "p" is aligned */
        while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
        {
            if (p == end)
                return true;
            if (*p++ != 0)
                return false;
        }

        /*
         * Compare remaining size_t-aligned chunks.
         *
         * There is no risk to read beyond the memory area, as "aligned_end"
         * cannot be higher than "end".
         */
        for (; p < aligned_end; p += sizeof(size_t))
        {
            if (*(size_t *) p != 0)
                return false;
        }

        /* Compare remaining bytes until the end */
        while (p < end)
        {
            if (*p++ != 0)
                return false;
        }
        return true;
    }

    /* "len" in the [sizeof(size_t) * 8, inf) range */

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.  This
     * seems to be enough to coax a few compilers into using SIMD
     * instructions.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * There is no risk to read beyond the memory area, as "aligned_end"
     * cannot be higher than "end".
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }

    return true;
}

#define NANOSEC_PER_SEC 1000000000

// Returns difference in nanoseconds
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
	int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;
	nanosec += (t1->tv_nsec - t2->tv_nsec);

	return nanosec;
}

int main()
{
	volatile bool result;
	struct timespec start,end;
	int64_t memcmp_time, func_time;


	SockAddr zero_clientaddr;
	SockAddr Fakeclientaddr;
	memset(&zero_clientaddr, 0, sizeof(zero_clientaddr));
	memset(&Fakeclientaddr, 0, sizeof(Fakeclientaddr));

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = (memcmp(&Fakeclientaddr, &zero_clientaddr, sizeof(Fakeclientaddr)) == 0);
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	memcmp_time = get_clock_diff(&end, &start);
	printf("memcmp: done in %ld nanoseconds\n", memcmp_time);

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);

	for (int i = 0; i < LOOPS; i++)
	{
		result = pg_memory_is_all_zeros(&Fakeclientaddr, sizeof(Fakeclientaddr));
	}

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
	func_time = get_clock_diff(&end, &start);
	printf("pg_memory_is_all_zeros: done in %ld nanoseconds (%g times faster than memcmp)\n", func_time, (double) memcmp_time / func_time);

	return 0;
}

Reply via email to