Greetings,
After noticing the "FIXME" in line 205 of dht/dht.c, suggesting a
potential optimisation, I looked further into it.
I have changed the size of the array Jjj (to size^2, instead of
size*(size+1)/2 and re-arranged its entries, such
that it can be read sequentially when 'gsl_dht_apply' is called. This
has the effect of reducing the number of
memory reads and thus makes 'gsl_dht_apply' run more efficiently. I've
ran some tests with 'size = 4000' and
5000 and have seen a speed-up of 2-3 times compared to the original GSL
implementation.
I've also ran 'make check' and no errors have been reported.
In order to perform the aforementioned re-arrangement, when
'gsl_dht_init' is called a temporary array is
allocated and populated with the elements that would originally be in
Jjj, then the code loops over the
temporary array and populates Jjj.
I am attaching to this email the patch containing the differences. Feel
free to reach out to me if there are any
questions.
Cheers,
Felipe Attanasio
diff --git a/dht/dht.c b/dht/dht.c
index 97b60224..0ddabaea 100644
--- a/dht/dht.c
+++ b/dht/dht.c
@@ -55,7 +55,7 @@ gsl_dht_alloc (size_t size)
GSL_ERROR_VAL("could not allocate memory for j", GSL_ENOMEM, 0);
}
- t->Jjj = (double *)malloc(size*(size+1)/2 * sizeof(double));
+ t->Jjj = (double *)malloc(size*size*sizeof(double));
if(t->Jjj == 0) {
free(t->j);
@@ -106,7 +106,6 @@ gsl_dht_new (size_t size, double nu, double xmax)
return 0;
status = gsl_dht_init(dht, nu, xmax);
-
if (status)
return 0;
@@ -122,10 +121,11 @@ gsl_dht_init(gsl_dht * t, double nu, double xmax)
GSL_ERROR ("nu is negative", GSL_EDOM);
}
else {
- size_t n, m;
+ size_t n, m, i;
int stat_bz = GSL_SUCCESS;
int stat_J = 0;
double jN;
+ double *tmp;
if(nu != t->nu) {
/* Recalculate Bessel zeros if necessary. */
@@ -147,14 +147,43 @@ gsl_dht_init(gsl_dht * t, double nu, double xmax)
/* J_nu(j[n] j[m] / j[N]) = Jjj[n(n-1)/2 + m - 1], 1 <= n,m <= size
*/
+
+ /* allocate temporary array */
+ tmp = (double *)malloc(t->size*(t->size+1)/2 * sizeof(double));
+ if(tmp == 0) {
+ gsl_dht_free(t);
+ GSL_ERROR_VAL("could not allocate memory for tmp", GSL_ENOMEM, 0);
+ }
+
+ /* compute necessary values of J, storing on temporary array */
for(n=1; n<t->size+1; n++) {
for(m=1; m<=n; m++) {
double arg = t->j[n] * t->j[m] / jN;
gsl_sf_result J;
stat_J += gsl_sf_bessel_Jnu_e(nu, arg, &J);
- t->Jjj[n*(n-1)/2 + m - 1] = J.val;
+ tmp[n*(n-1)/2 + m - 1] = J.val;
+ }
+ }
+
+ /* copy data into Jjj such that it can be read sequentially by gsl_dht_apply */
+ for(m=0; m<t->size; m++) {
+ for(i=0; i<t->size; i++) {
+ size_t m_local, n_local;
+ int coord;
+ if(i < m) {
+ m_local = i;
+ n_local = m;
+ }
+ else {
+ m_local = m;
+ n_local = i;
+ }
+ coord = n_local*(n_local+1)/2 + m_local;
+ t->Jjj[m * t->size + i] = tmp[coord];
}
}
+ /* free temporary array */
+ free(tmp);
if(stat_J != 0) {
GSL_ERROR("error computing bessel function", GSL_EFAILED);
@@ -191,32 +220,15 @@ void gsl_dht_free(gsl_dht * t)
int
gsl_dht_apply(const gsl_dht * t, double * f_in, double * f_out)
{
- const double jN = t->j[t->size + 1];
- const double r = t->xmax / jN;
- size_t m;
- size_t i;
+ double const jN = t->j[t->size + 1];
+ double const r = t->xmax / jN;
+ size_t m, i, l;
+ l = 0;
for(m=0; m<t->size; m++) {
double sum = 0.0;
- double Y;
- for(i=0; i<t->size; i++) {
- /* Need to find max and min so that we
- * address the symmetric Jjj matrix properly.
- * FIXME: we can presumably optimize this
- * by just running over the elements of Jjj
- * in a deterministic manner.
- */
- size_t m_local;
- size_t n_local;
- if(i < m) {
- m_local = i;
- n_local = m;
- }
- else {
- m_local = m;
- n_local = i;
- }
- Y = t->Jjj[n_local*(n_local+1)/2 + m_local] / t->J2[i+1];
+ for(i=0; i<t->size; i++, l++) {
+ double const Y = t->Jjj[l] / t->J2[i+1];
sum += Y * f_in[i];
}
f_out[m] = sum * 2.0 * r*r;