Greetings,

It appears gsl_ran_multinomial was coded not to crap out due to incidental
negatives in the probability dist. due to floating point artifacts, but
the handling of these terms was inconsistent and could lead to silently
absurd results.

I didn't at first intend to go this far.

I'm coding an algorithmic simulation in C++ which makes heavy use of
multinomials and I discovered the other day that the Boost library has
only a minimal binomial implementation (Bernoulli trials), and that lead
me to explore the much improved algorithm in GSL.

Perhaps my problem is unusual, but many vertices in my graph have large K
and small N.  For efficiency, I was hoping to see GSL multinomial
implement early-out once all the elements are placed, but it does not.

The multinomial algorithm is quite simple, and on my way to a C++
implementation that suits my own needs, I decided to code up some fixes
and a proof-of-concept in C, which the GSL project could potentially adopt
as it sees fit.

I have supplying a working implementation with two bugs fixes, early-out
working properly, and a small set of unit tests to catch any glaring
errors.   I compile it under g++ with the command line:

g++ -l gsl -l gslcblas -O3 -Wall gsl_multinomial_patch.c

You are welcome to incorporate my changes into the GSL project under the
GPL in any way you see fit.  However, I consider my contributions to this
C version derivative of the C++ version I have not yet finished, which
will be released under a BSD license, or potentially a Boost license if it
goes that far.

What follows is output from my test harness for both my own version, and
the GSL version.  I think I was linking to 1.8, but reading sources for
1.9.

Output from my version is prefixed with ++, the GSL version with --.   The
existing GSL anomolies can be observed in the tests entitled "bad
addition" and "eschew negatives".

My version returns an index as part of the early-out optimization.  This
is the length of the non-zero prefix of the returned value.  My test
harness is not instrumented to count calls to gsl_ran_binomial() or
determine execution time, but I am quite certain my routine does call
gsl_ran_binomial() significantly less than the GSL routine for the test
case demonstrated.

My version is also more careful to establish strong and formal
post-conditions, which my test harness double checks.

The source code which produced this listing is attached.  My
gsl_rand_multinomial revision is written in a pure C idiom (I believe),
but the rest slides into a C-like style with unwitting C++ influences.

If you don't wish to add a return value to the function signature, I
suggest a future addition of gsl_multinomial_sparse() and hoisting sum
p[k] to an initialization routine to further alleviate the O(K)
performance term.

Feel free to contact me with any questions.

Allan Stokes


===Test harness output===

Begin <empty dist: K=(), N=40>
 ++  0:
 --   :
End <empty dist>
Begin <null dist: K=(0.000000, 0.000000, 0.000000), N=40>
 ++  0: 0 0 0
 --   : 0 0 0
End <null dist>
Begin <bad addition: K=(0.500000, 0.500000, -0.250000), N=100>
 ++  2: 47 53 0
 ++  2: 56 44 0
 ++  2: 43 57 0
 ++  2: 54 46 0
 ++  2: 46 54 0
 ++  2: 51 49 0
 ++  2: 46 54 0
 ++  2: 46 54 0
 ++  2: 48 52 0
 ++  2: 55 45 0
 --   : 69 31 0
 --   : 73 27 0
 --   : 64 36 0
 --   : 68 32 0
 --   : 67 33 0
 --   : 61 39 0
 --   : 68 32 0
 --   : 66 34 0
 --   : 61 39 0
 --   : 63 37 0
End <bad addition>
Begin <eschew negatives: K=(0.010000, -100.000000, 0.010000), N=100>
 ++  3: 55 0 45
 ++  3: 53 0 47
 ++  3: 51 0 49
 ++  3: 57 0 43
 ++  3: 52 0 48
 ++  3: 51 0 49
 ++  3: 59 0 41
 ++  3: 43 0 57
 ++  3: 48 0 52
 ++  3: 45 0 55
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
 --   : 0 0 100
End <eschew negatives>
Begin <early out: K=(0.800000, 0.160000, 0.040000), N=10>
 ++  3: 8 1 1
 ++  3: 6 1 3
 ++  2: 9 1 0
 ++  3: 8 1 1
 ++  1: 10 0 0
 ++  3: 7 2 1
 ++  3: 9 0 1
 ++  2: 9 1 0
 ++  2: 7 3 0
 ++  2: 9 1 0
 ++  3: 8 1 1
 ++  3: 6 3 1
 ++  3: 8 1 1
 ++  3: 7 1 2
 ++  3: 9 0 1
 ++  2: 9 1 0
 ++  3: 6 2 2
 ++  2: 8 2 0
 ++  2: 9 1 0
 ++  2: 9 1 0
 ++  2: 9 1 0
 ++  2: 8 2 0
 ++  2: 8 2 0
 ++  2: 7 3 0
 ++  2: 9 1 0
 ++  3: 6 3 1
 ++  2: 9 1 0
 ++  2: 9 1 0
 ++  2: 9 1 0
 ++  2: 8 2 0
 ++  2: 9 1 0
 ++  3: 7 1 2
 ++  3: 8 1 1
 ++  3: 8 1 1
 ++  3: 8 1 1
 ++  2: 6 4 0
 ++  3: 9 0 1
 ++  3: 8 1 1
 ++  1: 10 0 0
 ++  3: 8 1 1
 --   : 8 1 1
 --   : 8 2 0
 --   : 8 2 0
 --   : 9 1 0
 --   : 8 2 0
 --   : 10 0 0
 --   : 6 3 1
 --   : 8 1 1
 --   : 9 1 0
 --   : 9 1 0
 --   : 8 1 1
 --   : 10 0 0
 --   : 8 2 0
 --   : 8 2 0
 --   : 8 2 0
 --   : 8 2 0
 --   : 8 2 0
 --   : 8 2 0
 --   : 10 0 0
 --   : 9 0 1
 --   : 8 2 0
 --   : 8 2 0
 --   : 8 1 1
 --   : 8 2 0
 --   : 10 0 0
 --   : 7 3 0
 --   : 7 0 3
 --   : 8 1 1
 --   : 6 4 0
 --   : 7 3 0
 --   : 10 0 0
 --   : 7 2 1
 --   : 10 0 0
 --   : 9 1 0
 --   : 8 2 0
 --   : 7 3 0
 --   : 9 1 0
 --   : 7 3 0
 --   : 9 0 1
 --   : 6 2 2
End <early out>


#include <stdio.h>
#include <assert.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>

/* Encourage users to pre-sort p[] monotonic descending for large K or small N.  
 * Improves early-out behaviour with fewer calls to gsl_ran_binomial().  
 *
 * Returns 'last': least index to all-zero tail sequence.   
 * last==0 && N>0 implies degeneracy 
 * if norm==0.0, degenerate by request (note: K==0 => norm==0.0) 
 * elsewise, freelance degeneracy. 
 */
size_t /* last */
gsl_ran_multinomial_early_out (const gsl_rng * r, const size_t K,
                     const unsigned int N, const double p[], unsigned int n[])
{
  size_t last;  /* return value */ 
  size_t k;
  double norm = 0.0;
  double sum_p = 0.0;
  unsigned int tail_sum_n;

  /* p[k] may contain non-negative weights that do not sum to 1.0.
   * Even a probability distribution will not exactly sum to 1.0
   * due to rounding errors.
   *   
   * Where users sort monotonic descending as encouraged for efficiency 
   * for large K, summation accuracy would improve by reverse summation  
   * from least to greatest; however, early-out requires traversal 
   * from greatest to least, and forward summation ensures sum_p converges 
   * on norm, perhaps more important, even if norm ends up less 
   * accurate summing in the foward direction. 
   * 
   * If norm sums to 0 (iff each p[k] <= 0.0), 
   * then not possible to place N elements. 
   */
  for (k = 0; k < K; ++k) {
    if (p[k] > 0.0) norm += p[k];
  }
  tail_sum_n = N; 

  /* iterate until all elements placed: ie. tail_sum_n exhausted 
   * for well-conditioned inputs tail_sum_n > 0 => k < K 
   */ 
  for (k = 0; tail_sum_n > 0 && k < K; ++k)
    {
      /* tempting to reflect sum_p into tail_sum_p and thus eliminate 
       * subtraction from 'norm' term, however, repeated subtraction 
       * from tail_sum_p could yield negative epsilon artifacts 
       */  
      if (p[k] > 0.0) {
        n[k] = gsl_ran_binomial (r, p[k] / (norm - sum_p), tail_sum_n);
        sum_p += p[k];
      }
      else {
        n[k] = 0; 
      }
      tail_sum_n -= n[k];
    }

  /* if tail_sum > 0 we have a bogus partial result 
   * norm == 0.0 => tail_sum_n == N on loop exit 
   * other cases might arise from bugs or floating point inaccuracy     
   */ 
  last = (tail_sum_n > 0) ? 0 : k; 

  /* establish that last is *least* index to all-zero tail sequence
   * from above last != 0 => k > 0, but let's not code brittle invariants  
   */
  assert (last == 0 || ((k > 0) && n[k-1] > 0)); 

  /* tail clear early-out or bogus-result 
   * in the case of early-out, this is unnecessary work if each use  
   * takes full acount of the return value, which we neither demand nor expect  
   */ 
  for (k = last; k < K; ++k) n[k] = 0;  

  /* 1) logical post-condition: last > 0 => sum n[0..last) == N 
   * 2) defensive post-condition: last==0 => each n[0..K) == 0 
   * 3) historical post-condition: each n[last..K) == 0
   * unless the API is refactored to pre-sum norm, no point relaxing (3) 
   * since we are O(K) already, even for N << K  
   * notation: [) expresses semi-open interval 
   */
  return last; 
}

/* WARNING: potential C++isms ahead; might require g++ */ 

void violation (const char * testname, const char * msg) {
  printf ("%20s: VIOLATION *** %s ***\n", testname, msg);
}

size_t /* last */
multinomial_post_check (const char * testname, const gsl_rng * r, const size_t K,
                     const unsigned int N, const double p[], unsigned int n[]) 
{
  size_t last = gsl_ran_multinomial_early_out (r, K, N, p, n); 
  size_t sum = 0; 

  for (size_t i = 0; i < last; ++i) 
    sum += n[i]; 
  if (last > 0 && sum != N) violation (testname, "partial placement"); 
  if (last > 0 && n[last-1] == 0) violation (testname, "non-maximal all-zero tail"); 
  { /* ISO scope error: loop-predicate amnesia */ 
    size_t i; 
    for (i = last; i < K && n[i] == 0; ++i)
      ; 
    if (i < K) violation (testname, "all-zero tail not all-zero\n"); 
  }
  return last; 
}

// don't have %z for size_t from C99 in g++ and not using cout 
#define FORMAT_Z "u"  
#define UNSIZE_T(x) ((unsigned int)(x))

void
multinomial_test (const char * testname, const size_t reps, 
                  const gsl_rng * r, const size_t K,
                     const unsigned int N, const double p[], unsigned int n[]) 
{
  printf ("Begin <%s: K=(", testname);
  for (size_t i = 0; i < K; ++i) printf ("%s%f", i?", ":"", p[i]);
  printf ("), N=%u>\n", N);
  for (size_t i = 0; i < reps; ++i) {
    size_t last = multinomial_post_check (testname, r, K, N, p, n);
    printf (" ++ %2" FORMAT_Z ": ", UNSIZE_T(last)); 
    for (size_t k = 0; k < K; ++k) printf ("%d ", n[k]); 
    printf ("\n");
  }

  for (size_t i = 0; i < reps; ++i) {
    gsl_ran_multinomial (r, K, N, p, n);
    printf (" --   : "); 
    for (size_t k = 0; k < K; ++k) printf ("%d ", n[k]); 
    printf ("\n");
    
  }
  printf ("End <%s>\n", testname); 
}

int
main (int argc, char* argv[])
{
    const gsl_rng_type * T;
    gsl_rng * r;
     
    gsl_rng_env_setup();
    T = gsl_rng_default;
    r = gsl_rng_alloc (T);
    
    const size_t K = 3; 
    unsigned int s[K]; 
    size_t reps = 0; 
    unsigned int N = 0; 

    double p_null[K] = { 0.0, 0.0, 0.0 }; 
    N = 40;
    reps = 1; 
    multinomial_test ("empty dist", reps, r, 0, N, p_null, s);
    multinomial_test ("null dist", reps, r, K, N, p_null, s);

    double p_sloppy[K] = { 0.5, 0.5, -0.25 };
    N = 100; 
    reps = 10; 
    multinomial_test ("bad addition", reps, r, K, N, p_sloppy, s);

    double p_small[K] = { 0.01, -100.0, 0.01 };
    N = 100;
    reps = 10; 
    multinomial_test ("eschew negatives", reps, r, K, N, p_small, s);

    double p_fast[K] = { 0.8, 0.16, 0.04 };
    N = 10;
    reps = 40; 
    multinomial_test ("early out", reps, r, K, N, p_fast, s);

    gsl_rng_free (r);
    return 0;
}
_______________________________________________
Bug-gsl mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/bug-gsl

Reply via email to