Greetings,

After noticing the "FIXME" in line 205 of dht/dht.c, suggesting a potential optimisation, I looked further into it. I have changed the size of the array Jjj (to size^2, instead of size*(size+1)/2 and re-arranged its entries, such that it can be read sequentially when 'gsl_dht_apply' is called. This has the effect of reducing the number of memory reads and thus makes 'gsl_dht_apply' run more efficiently. I've ran some tests with 'size = 4000' and 5000 and have seen a speed-up of 2-3 times compared to the original GSL implementation.
I've also ran 'make check' and no errors have been reported.

In order to perform the aforementioned re-arrangement, when 'gsl_dht_init' is called a temporary array is allocated and populated with the elements that would originally be in Jjj, then the code loops over the
temporary array and populates Jjj.

I am attaching to this email the patch containing the differences. Feel free to reach out to me if there are any
questions.

Cheers,

Felipe Attanasio
diff --git a/dht/dht.c b/dht/dht.c
index 97b60224..0ddabaea 100644
--- a/dht/dht.c
+++ b/dht/dht.c
@@ -55,7 +55,7 @@ gsl_dht_alloc (size_t size)
     GSL_ERROR_VAL("could not allocate memory for j", GSL_ENOMEM, 0);
   }
 
-  t->Jjj = (double *)malloc(size*(size+1)/2 * sizeof(double));
+  t->Jjj = (double *)malloc(size*size*sizeof(double));
 
   if(t->Jjj == 0) {
     free(t->j);
@@ -106,7 +106,6 @@ gsl_dht_new (size_t size, double nu, double xmax)
     return 0;
 
   status = gsl_dht_init(dht, nu, xmax);
-  
   if (status)
     return 0;
 
@@ -122,10 +121,11 @@ gsl_dht_init(gsl_dht * t, double nu, double xmax)
     GSL_ERROR ("nu is negative", GSL_EDOM);
   }
   else {
-    size_t n, m;
+    size_t n, m, i;
     int stat_bz = GSL_SUCCESS;
     int stat_J  = 0;
     double jN;
+    double *tmp;
 
     if(nu != t->nu) {
       /* Recalculate Bessel zeros if necessary. */
@@ -147,14 +147,43 @@ gsl_dht_init(gsl_dht * t, double nu, double xmax)
 
     /* J_nu(j[n] j[m] / j[N]) = Jjj[n(n-1)/2 + m - 1], 1 <= n,m <= size
      */
+
+    /* allocate temporary array */
+    tmp = (double *)malloc(t->size*(t->size+1)/2 * sizeof(double));
+    if(tmp == 0) {
+        gsl_dht_free(t);
+        GSL_ERROR_VAL("could not allocate memory for tmp", GSL_ENOMEM, 0);
+    }
+
+    /* compute necessary values of J, storing on temporary array */
     for(n=1; n<t->size+1; n++) {
       for(m=1; m<=n; m++) {
         double arg = t->j[n] * t->j[m] / jN;
         gsl_sf_result J;
         stat_J += gsl_sf_bessel_Jnu_e(nu, arg, &J);
-        t->Jjj[n*(n-1)/2 + m - 1] = J.val;
+        tmp[n*(n-1)/2 + m - 1] = J.val;
+      }
+    }
+
+    /* copy data into Jjj such that it can be read sequentially by gsl_dht_apply */
+    for(m=0; m<t->size; m++) {
+      for(i=0; i<t->size; i++) {
+        size_t m_local, n_local;
+        int coord;
+        if(i < m) {
+          m_local = i;
+          n_local = m;
+        }
+        else {
+          m_local = m;
+          n_local = i;
+        }
+        coord = n_local*(n_local+1)/2 + m_local;
+        t->Jjj[m * t->size + i] = tmp[coord];
       }
     }
+    /* free temporary array */
+    free(tmp);
 
     if(stat_J != 0) {
       GSL_ERROR("error computing bessel function", GSL_EFAILED);
@@ -191,32 +220,15 @@ void gsl_dht_free(gsl_dht * t)
 int
 gsl_dht_apply(const gsl_dht * t, double * f_in, double * f_out)
 {
-  const double jN = t->j[t->size + 1];
-  const double r  = t->xmax / jN;
-  size_t m;
-  size_t i;
+  double const jN = t->j[t->size + 1];
+  double const r  = t->xmax / jN;
+  size_t m, i, l;
 
+  l = 0;
   for(m=0; m<t->size; m++) {
     double sum = 0.0;
-    double Y;
-    for(i=0; i<t->size; i++) {
-      /* Need to find max and min so that we
-       * address the symmetric Jjj matrix properly.
-       * FIXME: we can presumably optimize this
-       * by just running over the elements of Jjj
-       * in a deterministic manner.
-       */
-      size_t m_local; 
-      size_t n_local;
-      if(i < m) {
-        m_local = i;
-        n_local = m;
-      }
-      else {
-        m_local = m;
-        n_local = i;
-      }
-      Y = t->Jjj[n_local*(n_local+1)/2 + m_local] / t->J2[i+1];
+    for(i=0; i<t->size; i++, l++) {
+      double const Y = t->Jjj[l] / t->J2[i+1];
       sum += Y * f_in[i];
     }
     f_out[m] = sum * 2.0 * r*r;

Reply via email to