On Fri, 1 Nov 2024 at 19:27, Michael Paquier <[email protected]> wrote:
> Under gcc -O2 or -O3, the single-byte check and the 8-byte check show
> no measurable difference. Please see the attached (allzeros.txt) if you
> want to check for yourself. With 1M iterations, both take around 3ms
> in total on my laptop (not the fastest thing around).
>
> Under -O0, though, the difference is noticeable:
> - 1-byte check: 3.52s for 1M iterations, averaging one check at
> 3.52µs.
> - 8-byte check: 0.46s for 1M iterations, averaging one check at
> 0.46µs.
>
> Even so, I doubt that this is going to be noticeable in practice;
> still, the difference exists.
The reason you're not seeing the slowdown with -O2 and -O3 is that
your compiler decided there was nothing to do, and so didn't emit the
code you were trying to benchmark. Try looking at allzeros.s after
running "gcc allzeros.c -S -O2".
I've attached an updated version for you to try. I used a volatile
bool and assigned the function result to it to prevent the compiler
from optimising out the test.
$ gcc allzeros.c -O2 -o allzeros
$ ./allzeros
char: done in 1607800 nanoseconds
size_t: done in 208800 nanoseconds (7.70019 times faster)
$ gcc allzeros.c -O3 -o allzeros
$ ./allzeros
char: done in 1584500 nanoseconds
size_t: done in 225700 nanoseconds (7.02038 times faster)
David
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#define BLCKSZ 8192
#define LOOPS 1000
/*
 * Byte-at-a-time zero check.
 *
 * Returns true when every one of the first len bytes at ptr is zero
 * (trivially true for len == 0), and false as soon as a non-zero byte
 * is found.
 */
static inline bool
allzeros_char(const void *ptr, size_t len)
{
    const char *cursor = (const char *) ptr;
    size_t      remaining = len;

    while (remaining > 0)
    {
        if (*cursor != 0)
            return false;
        cursor++;
        remaining--;
    }

    return true;
}
/*
 * Word-at-a-time zero check.
 *
 * Returns true when every one of the first len bytes at ptr is zero
 * (trivially true for len == 0), false otherwise.
 *
 * The buffer is scanned in three phases so that any ptr/len combination
 * is handled correctly:
 *   1. leading bytes, one at a time, until the cursor is aligned for
 *      size_t access;
 *   2. whole size_t words;
 *   3. trailing bytes that do not fill a whole word.
 * For a word-aligned buffer whose length is a multiple of sizeof(size_t),
 * only phase 2 does any work.
 */
static inline bool
allzeros_size_t(const void *ptr, size_t len)
{
    const unsigned char *p = (const unsigned char *) ptr;
    size_t      remaining = len;

    /* Phase 1: advance byte-wise until p is suitably aligned for size_t. */
    while (remaining > 0 && ((uintptr_t) p % sizeof(size_t)) != 0)
    {
        if (*p != 0)
            return false;
        p++;
        remaining--;
    }

    /* Phase 2: compare one full word per iteration. */
    while (remaining >= sizeof(size_t))
    {
        if (*(const size_t *) p != 0)
            return false;
        p += sizeof(size_t);
        remaining -= sizeof(size_t);
    }

    /* Phase 3: bytes left over after the last whole word. */
    while (remaining > 0)
    {
        if (*p != 0)
            return false;
        p++;
        remaining--;
    }

    return true;
}
#define NANOSEC_PER_SEC 1000000000

/*
 * Returns t1 - t2 in nanoseconds (negative when t1 is earlier than t2).
 *
 * Both field differences are widened to int64_t before the seconds part
 * is scaled, so the multiplication is carried out in 64 bits even where
 * time_t or long is only 32 bits wide.
 */
int64_t
get_clock_diff(struct timespec *t1, struct timespec *t2)
{
    int64_t     sec_diff = (int64_t) t1->tv_sec - (int64_t) t2->tv_sec;
    int64_t     nsec_diff = (int64_t) t1->tv_nsec - (int64_t) t2->tv_nsec;

    return sec_diff * NANOSEC_PER_SEC + nsec_diff;
}
/*
 * Benchmark driver.
 *
 * Times LOOPS calls of allzeros_char() and then LOOPS calls of
 * allzeros_size_t() over the same zero-filled BLCKSZ-byte buffer, using
 * the process CPU-time clock, and prints each elapsed time plus the
 * ratio between them.  Returns 0 on success, 1 if the clock cannot be
 * read.
 */
int
main(void)
{
    /* Exactly BLCKSZ bytes; declared as size_t so it is word-aligned. */
    size_t      pagebytes[BLCKSZ / sizeof(size_t)] = {0};

    /*
     * Each call's result is stored into a volatile object so the compiler
     * cannot discard the calls being measured.
     */
    volatile bool result;
    struct timespec start;
    struct timespec end;
    int64_t     char_time;
    int64_t     size_t_time;

    /* Byte-wise variant. */
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start) != 0)
    {
        perror("clock_gettime");
        return 1;
    }
    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_char(pagebytes, BLCKSZ);
    }
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end) != 0)
    {
        perror("clock_gettime");
        return 1;
    }
    char_time = get_clock_diff(&end, &start);

    /* int64_t is cast to long long so the argument always matches %lld. */
    printf("char: done in %lld nanoseconds\n", (long long) char_time);

    /* Word-wise variant. */
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start) != 0)
    {
        perror("clock_gettime");
        return 1;
    }
    for (int i = 0; i < LOOPS; i++)
    {
        result = allzeros_size_t(pagebytes, BLCKSZ);
    }
    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end) != 0)
    {
        perror("clock_gettime");
        return 1;
    }
    size_t_time = get_clock_diff(&end, &start);

    printf("size_t: done in %lld nanoseconds (%g times faster)\n",
           (long long) size_t_time,
           (double) char_time / (double) size_t_time);

    return 0;
}