Hi, On Wed, Dec 11, 2024 at 03:03:34PM +0900, Michael Paquier wrote: > On Tue, Dec 10, 2024 at 02:18:33PM +0000, Bertrand Drouvot wrote: > > While searching for memcmp() calls in "*stat*.c" files (due to [1]), it > > appeared > > that we could $SUBJECT. Please find attached a patch doing so. > > - SockAddr zero_clientaddr; > - memset(&zero_clientaddr, 0, sizeof(zero_clientaddr)); > - if (memcmp(&(beentry->st_clientaddr), &zero_clientaddr, > - sizeof(zero_clientaddr)) == 0) > + if (pg_memory_is_all_zeros(&(beentry->st_clientaddr), > + sizeof(beentry->st_clientaddr))) > > It also means that you're removing these variables used only for the > all-zero comparisons. Makes sense to me.
Yeap and also given its size (136 bytes), it looks like that it could provide some performance benefits too (see check_SockAddr_is_zeroes.c attached): $ gcc --version gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 Copyright (C) 2021 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. $ gcc -O2 check_SockAddr_is_zeroes.c -o check_SockAddr_is_zeroes; ./check_SockAddr_is_zeroes memcmp: done in 21460 nanoseconds pg_memory_is_all_zeros: done in 1307 nanoseconds (16.4193 times faster than memcmp) $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -O2 check_SockAddr_is_zeroes.c -o check_SockAddr_is_zeroes; ./check_SockAddr_is_zeroes memcmp: done in 21012 nanoseconds pg_memory_is_all_zeros: done in 4039 nanoseconds (5.20228 times faster than memcmp) That's not a hot path but still interesting to see. Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
#include <stdbool.h> #include <string.h> #include <stdio.h> #include <stdint.h> #include <time.h> #include <sys/socket.h> #define LOOPS 1000 typedef struct { struct sockaddr_storage addr; socklen_t salen; } SockAddr; static inline bool pg_memory_is_all_zeros(const void *ptr, size_t len) { const unsigned char *p = (const unsigned char *) ptr; const unsigned char *end = &p[len]; const unsigned char *aligned_end = (const unsigned char *) ((uintptr_t) end & (~(sizeof(size_t) - 1))); if (len < sizeof(size_t)) { while (p < end) { if (*p++ != 0) return false; } return true; } /* "len" in the [sizeof(size_t), sizeof(size_t) * 8 - 1] range */ if (len < sizeof(size_t) * 8) { /* Compare bytes until the pointer "p" is aligned */ while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0) { if (p == end) return true; if (*p++ != 0) return false; } /* * Compare remaining size_t-aligned chunks. * * There is no risk to read beyond the memory area, as "aligned_end" * cannot be higher than "end". */ for (; p < aligned_end; p += sizeof(size_t)) { if (*(size_t *) p != 0) return false; } /* Compare remaining bytes until the end */ while (p < end) { if (*p++ != 0) return false; } return true; } /* "len" in the [sizeof(size_t) * 8, inf) range */ /* Compare bytes until the pointer "p" is aligned */ while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0) { if (p == end) return true; if (*p++ != 0) return false; } /* * Compare 8 * sizeof(size_t) chunks at once. * * For performance reasons, we manually unroll this loop and purposefully * use bitwise-ORs to combine each comparison. This prevents boolean * short-circuiting and lets the compiler know that it's safe to access * all 8 elements regardless of the result of the other comparisons. This * seems to be enough to coax a few compilers into using SIMD * instructions. */ for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8) { if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) | (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) | (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) | (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0)) return false; } /* * Compare remaining size_t-aligned chunks. * * There is no risk to read beyond the memory area, as "aligned_end" * cannot be higher than "end". */ for (; p < aligned_end; p += sizeof(size_t)) { if (*(size_t *) p != 0) return false; } /* Compare remaining bytes until the end */ while (p < end) { if (*p++ != 0) return false; } return true; } #define NANOSEC_PER_SEC 1000000000 // Returns difference in nanoseconds int64_t get_clock_diff(struct timespec *t1, struct timespec *t2) { int64_t nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC; nanosec += (t1->tv_nsec - t2->tv_nsec); return nanosec; } int main() { volatile bool result; struct timespec start,end; int64_t memcmp_time, func_time; SockAddr zero_clientaddr; SockAddr Fakeclientaddr; memset(&zero_clientaddr, 0, sizeof(zero_clientaddr)); memset(&Fakeclientaddr, 0, sizeof(Fakeclientaddr)); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); for (int i = 0; i < LOOPS; i++) { result = (memcmp(&Fakeclientaddr, &zero_clientaddr, sizeof(Fakeclientaddr)) == 0); } clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); memcmp_time = get_clock_diff(&end, &start); printf("memcmp: done in %ld nanoseconds\n", memcmp_time); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); for (int i = 0; i < LOOPS; i++) { result = pg_memory_is_all_zeros(&Fakeclientaddr, sizeof(Fakeclientaddr)); } clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); func_time = get_clock_diff(&end, &start); printf("pg_memory_is_all_zeros: done in %ld nanoseconds (%g times faster than memcmp)\n", func_time, (double) memcmp_time / func_time); return 0; }