On Fri, 3 Apr 2026 at 15:08, David Rowley <[email protected]> wrote:
> IMO, if we can make bitmapset.c work with INT_MAX members and get a
> performance increase, then we should do it.
Re-thinking this after a week's holiday, it seems fine to use an unsigned 32-bit int rather than a 64-bit int to fix this bug. I'd previously been uncertain whether C gives any guarantee about what (unsigned int) -1 evaluates to, but [1], at 6.3.1.3, says:

"Otherwise, if the new type is unsigned, the value is converted by repeatedly adding or subtracting one more than the maximum value that can be represented in the new type until the value is in the range of the new type."

So, it seems that even on a ones' complement machine, -1 converted to unsigned int will be UINT_MAX. When we add 1 to UINT_MAX, we're guaranteed to get 0, because unsigned arithmetic cannot overflow: the result is reduced modulo one more than the maximum value of the type.

That leads me to the attached v2 patch. Compiler Explorer link showing the assembly at [2].

Testing the performance, it's better than master, so I got rid of the size_t wordnum stuff. We're post-freeze now, so fiddling with other optimisations seems a bit off the table, as there appears to be no performance regression to compensate for now, per:

drowley@amd3990x:~$ gcc test_bms_next3.c -O2 -o test_bms_next3 && ./test_bms_next3
Benchmarking 100000000 bms_next_member iterations...
master: 1.18330 seconds
Patched: 1.05493 seconds

Benchmarking 100000000 bms_prev_member iterations...
master: 2.94522 seconds
Patched: 1.86130 seconds

drowley@amd3990x:~$ clang test_bms_next3.c -O2 -o test_bms_next3 && ./test_bms_next3
Benchmarking 100000000 bms_next_member iterations...
master: 1.07860 seconds
Patched: 1.07896 seconds

Benchmarking 100000000 bms_prev_member iterations...
master: 2.76550 seconds
Patched: 2.12369 seconds

David

[1] https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1548.pdf
[2] https://godbolt.org/z/xW96rxd3P
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <limits.h>
//#define NULL ((void *) 0)
/* Short aliases for the fixed-width integer types from <stdint.h>. */
typedef uint32_t uint32;
typedef int32_t int32;
typedef uint64_t uint64;
typedef int64_t int64;
/*
 * Width in bits of one bitmapword.  This must agree with the bitmapword
 * typedef that follows (64 bits here).
 */
#define BITS_PER_BITMAPWORD 64
typedef uint64 bitmapword; /* must be an unsigned type */
typedef int64 signedbitmapword; /* must be the matching signed type */
/* Index of the word that holds bit number x. */
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
/*
 * Position of bit number x within its word.  This is plain C '%', so a
 * negative signed x gives a result <= 0, which is not usable as a shift
 * count; an unsigned x always gives 0..BITS_PER_BITMAPWORD-1.
 */
#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
/*
 * Branch-prediction hints.  With GCC-compatible compilers they expand to
 * __builtin_expect(); elsewhere they merely normalise x to 0 or 1, so the
 * macros can still be used inside conditions.
 */
#ifdef __GNUC__
#define likely(x) __builtin_expect((x) != 0, 1)
#define unlikely(x) __builtin_expect((x) != 0, 0)
#else
#define likely(x) ((x) != 0)
#define unlikely(x) ((x) != 0)
#endif
/*
 * A set of non-negative integers stored as an array of bitmapwords.
 * Member x is represented by bit BITNUM(x) of words[WORDNUM(x)], which is
 * how the bms_*_member functions in this file turn a word index and bit
 * position back into a member number.
 */
typedef struct Bitmapset
{
int nwords; /* number of words in array */
bitmapword words[]; /* really [nwords] */
} Bitmapset;
/*
 * bmw_rightmost_one_pos
 *      Return the 0-based position of the least significant 1-bit in "word".
 *
 * "word" must not be zero: __builtin_ctzll() is undefined for a zero
 * argument, and the portable fallback would report 63, which is not a set
 * bit either.  Callers are expected to test for zero first.
 *
 * The builtin is only used when the compiler advertises GCC compatibility,
 * matching how the likely()/unlikely() macros in this file are guarded.
 */
static inline int
bmw_rightmost_one_pos(uint64_t word)
{
#ifdef __GNUC__
    return __builtin_ctzll(word);
#else
    int         pos = 0;

    /* Bounded at 63 so that a (disallowed) zero word cannot loop forever. */
    while (pos < 63 && ((word >> pos) & 1) == 0)
        pos++;
    return pos;
#endif
}
/*
 * bmw_leftmost_one_pos
 *      Return the 0-based position of the most significant 1-bit in "word".
 *
 * "word" must not be zero: __builtin_clzll() is undefined for a zero
 * argument, and the portable fallback would report 0, which is not a set
 * bit either.  Callers are expected to test for zero first.
 *
 * The builtin is only used when the compiler advertises GCC compatibility,
 * matching how the likely()/unlikely() macros in this file are guarded.
 */
static inline int
bmw_leftmost_one_pos(uint64_t word)
{
#ifdef __GNUC__
    /* clz counts leading zeros from bit 63 downwards. */
    return 63 - __builtin_clzll(word);
#else
    int         pos = 0;

    /* Shift right until nothing is left; the number of shifts is the answer. */
    while ((word >>= 1) != 0)
        pos++;
    return pos;
#endif
}
/*
 * bms_next_member - baseline implementation, reported as "master" by main()
 *
 * Returns the smallest member of "a" that is greater than "prevbit", or -2
 * if "a" is NULL or there is no such member.  Passing -1 starts the search
 * at bit 0.
 *
 * Known limitations, deliberately left in place because this copy is the
 * reference the patched variant is timed against:
 *  - prevbit is incremented as a signed int, so prevbit == INT_MAX is a
 *    signed overflow (undefined behavior);
 *  - prevbit < -1 leaves prevbit negative after the increment, which makes
 *    BITNUM() negative and the mask shift count invalid.
 */
int
bms_next_member(const Bitmapset *a, int prevbit)
{
int nwords;
bitmapword mask;
if (a == NULL)
return -2;
nwords = a->nwords;
/* from here on "prevbit" is the first bit position to consider */
prevbit++;
/* in the first word, keep only the bits at or above that position */
mask = (~(bitmapword) 0) << BITNUM(prevbit);
for (int wordnum = WORDNUM(prevbit); wordnum < nwords; wordnum++)
{
bitmapword w = a->words[wordnum];
/* ignore bits before prevbit */
w &= mask;
if (w != 0)
{
int result;
/* member number = first bit of this word + offset of lowest set bit */
result = wordnum * BITS_PER_BITMAPWORD;
result += bmw_rightmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
/*
 * bms_next_member_patched - proposed replacement for bms_next_member
 *
 * Returns the smallest member of "a" that is greater than "prevbit", or -2
 * if "a" is NULL or there is no such member.  Passing -1 starts the search
 * at bit 0.
 *
 * The only difference from bms_next_member is that the bit position is
 * carried in an unsigned int.  Converting -1 gives UINT_MAX, which wraps to
 * 0 when incremented, and converting INT_MAX gives a value that can be
 * incremented without overflow, so the increment is well defined for every
 * prevbit.
 */
int
bms_next_member_patched(const Bitmapset *a, int prevbit)
{
unsigned int currbit = prevbit;
int nwords;
bitmapword mask;
if (a == NULL)
return -2;
nwords = a->nwords;
/* use an unsigned int to avoid the risk that int overflows */
currbit++;
/* in the first word, keep only the bits at or above currbit */
mask = (~(bitmapword) 0) << BITNUM(currbit);
for (int wordnum = WORDNUM(currbit); wordnum < nwords; wordnum++)
{
bitmapword w = a->words[wordnum];
/* ignore bits before currbit */
w &= mask;
if (w != 0)
{
int result;
/* member number = first bit of this word + offset of lowest set bit */
result = wordnum * BITS_PER_BITMAPWORD;
result += bmw_rightmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
/*
 * bms_prev_member - baseline implementation, reported as "master" by main()
 *
 * Returns the largest member of "a" that is less than "prevbit", or -2 if
 * "a" is NULL, prevbit is 0, or there is no such member.  Passing -1 starts
 * the search at the highest bit the set can hold.
 *
 * Known limitations, deliberately left in place because this copy is the
 * reference the patched variant is timed against:
 *  - the -1 case computes a->nwords * BITS_PER_BITMAPWORD in signed int
 *    arithmetic, which overflows once that product exceeds INT_MAX;
 *  - only -1 is treated specially; other negative values are decremented
 *    and used as-is;
 *  - nothing checks prevbit against the size of the set, so the caller must
 *    keep prevbit <= a->nwords * BITS_PER_BITMAPWORD or words[] is read out
 *    of bounds.
 */
int
bms_prev_member(const Bitmapset *a, int prevbit)
{
int ushiftbits;
bitmapword mask;
/*
* If set is NULL or if there are no more bits to the right then we've
* nothing to do.
*/
if (a == NULL || prevbit == 0)
return -2;
/* transform -1 to the highest possible bit we could have set */
if (prevbit == -1)
prevbit = a->nwords * BITS_PER_BITMAPWORD - 1;
else
prevbit--;
/* number of high bits of the first word that lie above prevbit */
ushiftbits = BITS_PER_BITMAPWORD - (BITNUM(prevbit) + 1);
mask = (~(bitmapword) 0) >> ushiftbits;
for (int wordnum = WORDNUM(prevbit); wordnum >= 0; wordnum--)
{
bitmapword w = a->words[wordnum];
/* mask out bits left of prevbit */
w &= mask;
if (w != 0)
{
int result;
/* member number = first bit of this word + offset of highest set bit */
result = wordnum * BITS_PER_BITMAPWORD;
result += bmw_leftmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
/*
 * bms_prev_member_patched - proposed replacement for bms_prev_member
 *
 * Returns the largest member of "a" that is less than "prevbit", or -2 if
 * "a" is NULL, prevbit is 0, or there is no such member.  Any negative
 * prevbit (callers pass -1) starts the search at the highest bit the set
 * can hold.
 *
 * Differences from bms_prev_member: the starting position is computed and
 * carried in an unsigned int, so nwords * BITS_PER_BITMAPWORD cannot
 * overflow a signed int, and every negative prevbit takes the "start from
 * the top" path rather than only -1.
 *
 * As in the baseline, nothing checks prevbit against the size of the set,
 * so the caller must keep prevbit <= a->nwords * BITS_PER_BITMAPWORD or
 * words[] is read out of bounds.
 */
int
bms_prev_member_patched(const Bitmapset *a, int prevbit)
{
unsigned int currbit;
int ushiftbits;
bitmapword mask;
/*
* If set is NULL or if there are no more bits to the right then we've
* nothing to do.
*/
if (a == NULL || prevbit == 0)
return -2;
/*
* Transform a negative prevbit (normally -1) to the highest possible bit
* we could have set.  We do this in unsigned math to avoid the risk of
* overflowing a signed int.
*/
if (prevbit < 0)
currbit = (unsigned int) a->nwords * BITS_PER_BITMAPWORD - 1;
else
currbit = prevbit - 1;
/* number of high bits of the first word that lie above currbit */
ushiftbits = BITS_PER_BITMAPWORD - (BITNUM(currbit) + 1);
mask = (~(bitmapword) 0) >> ushiftbits;
for (int wordnum = WORDNUM(currbit); wordnum >= 0; wordnum--)
{
bitmapword w = a->words[wordnum];
/* mask out bits left of currbit */
w &= mask;
if (w != 0)
{
int result;
/* member number = first bit of this word + offset of highest set bit */
result = wordnum * BITS_PER_BITMAPWORD;
result += bmw_leftmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
/*
 * get_time
 *      Return the current reading of the process CPU-time clock, in seconds.
 *
 * The clock is CLOCK_PROCESS_CPUTIME_ID, so the difference between two
 * readings is CPU time charged to this process rather than wall-clock time.
 *
 * If the clock cannot be read, a message is printed to stderr and the
 * program exits with EXIT_FAILURE: a benchmark that prints timings derived
 * from an unset timespec would be worse than one that stops.
 */
double
get_time(void)
{
    struct timespec ts;

    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) != 0)
    {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }

    /* tv_nsec holds the sub-second part in nanoseconds */
    return (double) ts.tv_sec + (double) ts.tv_nsec * 1e-9;
}
/*
 * The set scanned by every benchmark loop; allocated and freed in main().
 * NOTE(review): presumably kept at file scope so the compiler cannot treat
 * the set's contents as known when optimising the timed loops -- confirm.
 */
Bitmapset *bms;
/*
 * Benchmark driver.
 *
 * Builds a one-word set, then times four loops in turn: the master and
 * patched bms_next_member, followed by the master and patched
 * bms_prev_member.  Each loop walks every member of the set "iterations"
 * times.  The number of members visited is accumulated in "count" and
 * printed at the end, so the calls have an observable result.
 *
 * Returns 0 on success, or EXIT_FAILURE if the set cannot be allocated.
 */
int
main(void)
{
    /* One word only, so every scan starts and finishes in the same word. */
    int         words_to_alloc = 1;
    int         iterations = 100000000;
    int64       count = 0;
    double      start;
    double      end;

    /* calloc zero-fills the header and all of words[] */
    bms = calloc(1, sizeof(Bitmapset) +
                 (size_t) words_to_alloc * sizeof(bitmapword));
    if (bms == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    bms->nwords = words_to_alloc;

    /* 0xaf4 puts members 2, 4, 5, 6, 7, 9 and 11 into the last word */
    bms->words[words_to_alloc - 1] |= 0xaf4;

    printf("Benchmarking %d bms_next_member iterations...\n", iterations);

    /* master */
    start = get_time();
    for (int i = 0; i < iterations; i++)
    {
        int         j = -1;

        while ((j = bms_next_member(bms, j)) >= 0)
            count++;
    }
    end = get_time();
    printf("master: %.5f seconds\n", end - start);

    /* patched */
    start = get_time();
    for (int i = 0; i < iterations; i++)
    {
        int         j = -1;

        while ((j = bms_next_member_patched(bms, j)) >= 0)
            count++;
    }
    end = get_time();
    printf("Patched: %.5f seconds\n", end - start);

    printf("\nBenchmarking %d bms_prev_member iterations...\n", iterations);

    /* master */
    start = get_time();
    for (int i = 0; i < iterations; i++)
    {
        int         j = -1;

        while ((j = bms_prev_member(bms, j)) >= 0)
            count++;
    }
    end = get_time();
    printf("master: %.5f seconds\n", end - start);

    /* patched */
    start = get_time();
    for (int i = 0; i < iterations; i++)
    {
        int         j = -1;

        while ((j = bms_prev_member_patched(bms, j)) >= 0)
            count++;
    }
    end = get_time();
    printf("Patched: %.5f seconds\n", end - start);

    /* int64 is not "long" on every platform, so print it as long long */
    printf("%lld\n", (long long) count);

    free(bms);
    bms = NULL;
    return 0;
}
v2-0001-Fix-unlikely-overflow-bug-in-bms_next_member.patch
Description: Binary data
