On Tue, Nov 12, 2024 at 07:56, Bertrand Drouvot <bertranddrouvot...@gmail.com> wrote:
> Hi,
>
> On Tue, Nov 12, 2024 at 03:56:13PM +0900, Michael Paquier wrote:
> > On Tue, Nov 12, 2024 at 06:09:04AM +0000, Bertrand Drouvot wrote:
> > > I think that the 64b len check done in v11 is mandatory for safety
> > > reasons.
> > >
> > > The loop above reads 64 bytes at once, so it would read beyond the
> > > memory area bounds if len < 64: that could cause a crash or read
> > > invalid data.
> >
> > Sorry, I was not following your argument. You're right that we need
> > something else here. However..
> >
> > +    /*
> > +     * For len < 64, compare byte per byte to ensure we'll not read
> > +     * beyond the memory area.
> > +     */
> > +    if (len < sizeof(size_t) * 8)
> > +    {
> > +        while (p < end)
> > +        {
> > +            if (*p++ != 0)
> > +                return false;
> > +        }
> > +        return true;
> > +    }
> > +
> > +    /* Compare bytes until the pointer "p" is aligned */
> > +    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
> > +    {
> > +        if (p == end)
> > +            return true;
> > +
> > +        if (*p++ != 0)
> > +            return false;
> > +    }
> > +
> >
> > Still, this is not optimal, based on what's been discussed upthread.
> > The byte-per-byte check is more expensive than the size_t check,
>
> I think that depends on the memory area size. If the size is small enough,
> then the byte-per-byte check can be good enough.
>
> For example, with the allzeros_small.c attached:

It seems to me that it is enough to guard the SIMD loop against the small
sizes:

    if (len > sizeof(size_t) * 8)
    {
        for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
        {
            if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
                (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
                (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
                (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
                return false;
        }
    }

See v1_allzeros_small.c attached.

> == with BLCKSZ 32
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o allzeros_small ; ./allzeros_small
> byte per byte: done in 22528 nanoseconds
> size_t: done in 6949 nanoseconds (3.24191 times faster than byte per byte)
> SIMD v10: done in 7562 nanoseconds (2.97911 times faster than byte per byte)
> SIMD v11: done in 22096 nanoseconds (1.01955 times faster than byte per byte)

gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ; ./v1_allzeros_small
byte per byte: done in 97345 nanoseconds
size_t: done in 20305 nanoseconds (4.79414 times faster than byte per byte)
SIMD v10: done in 25813 nanoseconds (3.77116 times faster than byte per byte)
SIMD v11: done in 24580 nanoseconds (3.96033 times faster than byte per byte)

> == with BLCKSZ 63
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o allzeros_small ; ./allzeros_small
> byte per byte: done in 29246 nanoseconds
> size_t: done in 10555 nanoseconds (2.77082 times faster than byte per byte)
> SIMD v10: done in 11220 nanoseconds (2.6066 times faster than byte per byte)
> SIMD v11: done in 29126 nanoseconds (1.00412 times faster than byte per byte)

gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ; ./v1_allzeros_small
byte per byte: done in 57763 nanoseconds
size_t: done in 19760 nanoseconds (2.92323 times faster than byte per byte)
SIMD v10: done in 24088 nanoseconds (2.398 times faster than byte per byte)
SIMD v11: done in 20151 nanoseconds (2.86651 times faster than byte per byte)

> Obviously v11 is about the same time as "byte per byte", but we can see
> that the size_t or v10 improvement is not that much for small sizes.
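To make the quoted safety argument concrete: with the unguarded v10 loop and
len < 64, the loop bound itself is already computed out of bounds. A minimal
standalone sketch of mine (not from any patch version), assuming
sizeof(size_t) == 8; it does the arithmetic on uintptr_t and only prints the
bound, without dereferencing anything:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        size_t      len = 32;   /* smaller than 8 * sizeof(size_t) */
        unsigned char *ptr = malloc(len);
        uintptr_t   end = (uintptr_t) ptr + len;
        uintptr_t   aligned_end = end & ~(sizeof(size_t) - 1);

        /*
         * This is the bound the unguarded v10 loop compares against.  For
         * len = 32 it lands 24 bytes *before* the buffer: computing it as a
         * pointer is undefined behavior, and if "p < bound" ever came out
         * true (e.g. after an address-space wrap-around), the loop body
         * would read 64 bytes from a 32-byte allocation.  The
         * "len > sizeof(size_t) * 8" guard keeps the subtraction in bounds.
         */
        uintptr_t   bound = aligned_end - sizeof(size_t) * 7;

        if (ptr == NULL)
            return 1;
        printf("buffer starts at %p, loop bound is %p\n",
               (void *) ptr, (void *) bound);
        free(ptr);
        return 0;
    }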
> While for larger sizes:
>
> == with BLCKSZ 256
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o allzeros_small ; ./allzeros_small
> byte per byte: done in 102703 nanoseconds
> size_t: done in 15381 nanoseconds (6.67726 times faster than byte per byte)
> SIMD v10: done in 7241 nanoseconds (14.1835 times faster than byte per byte)
> SIMD v11: done in 7899 nanoseconds (13.002 times faster than byte per byte)

gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ; ./v1_allzeros_small
byte per byte: done in 213276 nanoseconds
size_t: done in 45288 nanoseconds (4.70933 times faster than byte per byte)
SIMD v10: done in 15840 nanoseconds (13.4644 times faster than byte per byte)
SIMD v11: done in 15773 nanoseconds (13.5216 times faster than byte per byte)

> == with BLCKSZ 8192
>
> $ /usr/local/gcc-14.1.0/bin/gcc-14.1.0 -march=native -O2 allzeros_small.c -o allzeros_small ; ./allzeros_small
> byte per byte: done in 2993458 nanoseconds
> size_t: done in 436650 nanoseconds (6.85551 times faster than byte per byte)
> SIMD v10: done in 136413 nanoseconds (21.9441 times faster than byte per byte)
> SIMD v11: done in 155474 nanoseconds (19.2538 times faster than byte per byte)

gcc -march=native -O2 v1_allzeros_small.c -o v1_allzeros_small ; ./v1_allzeros_small
byte per byte: done in 10358761 nanoseconds
size_t: done in 864673 nanoseconds (11.98 times faster than byte per byte)
SIMD v10: done in 342880 nanoseconds (30.211 times faster than byte per byte)
SIMD v11: done in 341332 nanoseconds (30.3481 times faster than byte per byte)

best regards,
Ranier Vilela
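P.S. Summarizing the four v1_allzeros_small.c runs above (speedup relative to
byte per byte, from the numbers already shown):

BLCKSZ    size_t   SIMD v10   SIMD v11
    32     4.79x      3.77x      3.96x
    63     2.92x      2.40x      2.87x
   256     4.71x     13.46x     13.52x
  8192    11.98x     30.21x     30.35x

With the guard, v11 now tracks v10 at every size instead of falling back to
byte-per-byte speed below 64 bytes.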
Attachment: v1_allzeros_small.c

#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <immintrin.h>

#define BLCKSZ 32
#define LOOPS 1000

static inline bool
allzeros_byte_per_byte(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];

    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end, so there is no risk of reading beyond the
     * memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

bool
pg_memory_is_all_zeros_v10(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.
     * This seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * Note: this version has no guard for len < 64, which is the issue
     * discussed in the thread above.
     */
    for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
    {
        if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
            (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
            (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
            (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
            return false;
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end, so there is no risk of reading beyond the
     * memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

bool
pg_memory_is_all_zeros_v11(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    const unsigned char *end = &p[len];
    const unsigned char *aligned_end = (const unsigned char *)
        ((uintptr_t) end & (~(sizeof(size_t) - 1)));

    /* Compare bytes until the pointer "p" is aligned */
    while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
    {
        if (p == end)
            return true;

        if (*p++ != 0)
            return false;
    }

    /*
     * Compare 8 * sizeof(size_t) chunks at once.
     *
     * For performance reasons, we manually unroll this loop and purposefully
     * use bitwise-ORs to combine each comparison.  This prevents boolean
     * short-circuiting and lets the compiler know that it's safe to access
     * all 8 elements regardless of the result of the other comparisons.
     * This seems to be enough to coax a few compilers into using SIMD
     * instructions.
     *
     * The len check ensures the loop bound cannot go below "ptr" for small
     * sizes, so there is no risk of reading beyond the memory area.
     */
    if (len > sizeof(size_t) * 8)
    {
        for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
        {
            if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
                (((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
                (((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
                (((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
                return false;
        }
    }

    /*
     * Compare remaining size_t-aligned chunks.
     *
     * aligned_end can't be > end, so there is no risk of reading beyond the
     * memory area.
     */
    for (; p < aligned_end; p += sizeof(size_t))
    {
        if (*(size_t *) p != 0)
            return false;
    }

    /* Compare remaining bytes until the end */
    while (p < end)
    {
        if (*p++ != 0)
            return false;
    }
    return true;
}

#define NANOSEC_PER_SEC 1000000000

/* Returns the difference between t1 and t2 in nanoseconds */
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
    int64_t     nanosec = (t1->tv_sec - t2->tv_sec) * NANOSEC_PER_SEC;

    nanosec += (t1->tv_nsec - t2->tv_nsec);

    return nanosec;
}

int
main()
{
    /*
     * Note: size_t elements, so the buffer is size_t-aligned (and larger
     * than BLCKSZ bytes; only the first BLCKSZ bytes are checked).
     */
    size_t      pagebytes[BLCKSZ] = {0};
    volatile bool result;
    struct timespec start,
                end;
    int64_t     byte_time,
                size_t_time;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_byte_per_byte(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    byte_time = get_clock_diff(&end, &start);
    printf("byte per byte: done in %ld nanoseconds\n", byte_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_size_t(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("size_t: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros_v10(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("SIMD v10: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for (int i = 0; i < LOOPS; i++)
    {
        result = pg_memory_is_all_zeros_v11(pagebytes, BLCKSZ);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
    size_t_time = get_clock_diff(&end, &start);
    printf("SIMD v11: done in %ld nanoseconds (%g times faster than byte per byte)\n",
           size_t_time, (double) byte_time / size_t_time);

    return 0;
}
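One caveat with the harness: it only times all-zero buffers, so it would not
notice a variant that always returned true. A small sanity check along these
lines could be pasted at the top of main(); this is a hypothetical addition
of mine, not part of the attachment:

    /*
     * Hypothetical sanity check (not in the attachment): a single non-zero
     * byte at the first, middle and last offsets must be caught by every
     * variant before we bother timing them.
     */
    {
        static size_t page[BLCKSZ]; /* zeroed, size_t-aligned like pagebytes */
        unsigned char *buf = (unsigned char *) page;
        const size_t offsets[] = {0, BLCKSZ / 2, BLCKSZ - 1};

        for (size_t i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
        {
            buf[offsets[i]] = 1;
            if (allzeros_byte_per_byte(buf, BLCKSZ) ||
                allzeros_size_t(buf, BLCKSZ) ||
                pg_memory_is_all_zeros_v10(buf, BLCKSZ) ||
                pg_memory_is_all_zeros_v11(buf, BLCKSZ))
            {
                fprintf(stderr, "non-zero byte at offset %zu missed\n",
                        offsets[i]);
                return 1;
            }
            buf[offsets[i]] = 0;
        }
    }

With the default BLCKSZ of 32 this also exercises the small-size path of v11
that the guard is there to protect.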