https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102054
--- Comment #2 from Kewen Lin <linkw at gcc dot gnu.org> ---
Yet another reduced test case from 526.blender_r.
#include <math.h>
/* Sampler record used by QMC_initPixel(). */
typedef struct QMCSampler {
  struct QMCSampler *next; /* pointer to another sampler (presumably list link) */
  struct QMCSampler *prev; /* pointer to another sampler (presumably list link) */
  int type;                /* selects the initialisation path in QMC_initPixel() */
  int tot;                 /* number of 2D samples QMC_initPixel() writes to samp2d */
  int used;                /* not referenced by the code in this test case */
  double *samp2d;          /* flat array of pairs: [2*i] and [2*i + 1] per sample i */
  double offs[1][2];       /* pair of offsets, first index is the thread number */
} QMCSampler;
/* External function: declared here, defined outside this test case.
 * Takes a thread index and returns a float.
 * NOTE(review): presumably a per-thread random number, judging by the name. */
float BLI_thread_frand(int thread);
/*
 * Advance a two-component sequence state by one step.
 *
 * ht_invprimes: two per-component factors (read only).
 * ht_nums:      two-component running state, updated in place.
 * v:            receives the updated state, each value rounded through float.
 *
 * NOTE(review): by its name this steps a Halton sequence; the caller in this
 * file passes 1/2 and 1/3 as the factors.
 */
static void halton_sample(double *ht_invprimes, double *ht_nums, double *v) {
  unsigned int dim;

  for (dim = 0; dim < 2; dim++) {
    const double inv = ht_invprimes[dim];
    /* Distance of the current value from 1.0, less a small epsilon. */
    const double limit = fabs((1.0 - ht_nums[dim]) - 1e-10);
    double step;

    if (inv >= limit) {
      /* Multiply by the factor until the power drops below the limit,
       * remembering the previous power. */
      double prev_pow;
      double cur_pow = inv;

      do {
        prev_pow = cur_pow;
        cur_pow *= inv;
      } while (cur_pow >= limit);
      step = (prev_pow + cur_pow) - 1.0;
    } else {
      step = inv;
    }

    ht_nums[dim] += step;
    /* Output is deliberately narrowed to float precision. */
    v[dim] = (float)ht_nums[dim];
  }
}
/*
 * Initialise the sampler for a pixel.
 *
 * qsa:    sampler to initialise.
 * thread: thread index, passed to BLI_thread_frand() and used to index
 *         qsa->offs.
 *
 * When qsa->type is 2, only the two offsets for this thread are set, each to
 * half of a BLI_thread_frand() value. Otherwise qsa->tot sample pairs are
 * generated with halton_sample() and stored in qsa->samp2d.
 */
void QMC_initPixel(QMCSampler *qsa, int thread) {
  double inv_primes[2];
  double state[2];
  double sample[2];
  int n;

  if (qsa->type == 2) {
    qsa->offs[thread][0] = 0.5f * BLI_thread_frand(thread);
    qsa->offs[thread][1] = 0.5f * BLI_thread_frand(thread);
    return;
  }

  /* Starting state comes from two successive BLI_thread_frand() calls. */
  state[0] = BLI_thread_frand(thread);
  state[1] = BLI_thread_frand(thread);
  inv_primes[0] = 0.5;
  inv_primes[1] = 1.0 / 3.0;

  for (n = 0; n < qsa->tot; n++) {
    double *dst;

    halton_sample(inv_primes, state, sample);
    dst = &qsa->samp2d[2 * n];
    dst[0] = sample[0];
    dst[1] = sample[1];
  }
}
Without loop vectorization, unrestricted PRE makes the loop a candidate for
cunroll (complete unrolling), and the loop was completely unrolled. The
affected percentage is also small, about 0.7%.