/* randist/discrete.c * * Copyright (C) 1996, 1997, 1998, 1999, 2000, 2007, 2009 James Theiler, Brian Gough * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this library; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* Random Discrete Events Given K discrete events with different probabilities P[k] produce a value k consistent with its probability. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* * Based on: Alastair J Walker, An efficient method for generating * discrete random variables with general distributions, ACM Trans * Math Soft 3, 253-256 (1977). See also: D. E. Knuth, The Art of * Computer Programming, Volume 2 (Seminumerical algorithms), 3rd * edition, Addison-Wesley (1997), p120. * Walker's algorithm does some preprocessing, and provides two * arrays: floating point F[k] and integer A[k]. A value k is chosen * from 0..K-1 with equal likelihood, and then a uniform random number * u is compared to F[k]. If it is less than F[k], then k is * returned. Otherwise, A[k] is returned. * Walker's original paper describes an O(K^2) algorithm for setting * up the F and A arrays. I found this disturbing since I wanted to * use very large values of K. I'm sure I'm not the first to realize * this, but in fact the preprocessing can be done in O(K) steps. * A figure of merit for the preprocessing is the average value for * the F[k]'s (that is, SUM_k F[k]/K); this corresponds to the * probability that k is returned, instead of A[k], thereby saving a * redirection. Walker's O(K^2) preprocessing will generally improve * that figure of merit, compared to my cheaper O(K) method; from some * experiments with a perl script, I get values of around 0.6 for my * method and just under 0.75 for Walker's. Knuth has pointed out * that finding _the_ optimum lookup tables, which maximize the * average F[k], is a combinatorially difficult problem. But any * valid preprocessing will still provide O(1) time for the call to * gsl_ran_discrete(). I find that if I artificially set F[k]=1 -- * ie, better than optimum! -- I get a speedup of maybe 20%, so that's * the maximum I could expect from the most expensive preprocessing. * Folding in the difference of 0.6 vs 0.75, I'd estimate that the * speedup would be less than 10%. * I've not implemented it here, but one compromise is to sort the * probabilities once, and then work from the two ends inward. This * requires O(K log K), still lots cheaper than O(K^2), and from my * experiments with the perl script, the figure of merit is within * about 0.01 for K up to 1000, and no sign of diverging (in fact, * they seemed to be converging, but it's hard to say with just a * handful of runs). * The O(K) algorithm goes through all the p_k's and decides if they * are "smalls" or "bigs" according to whether they are less than or * greater than the mean value 1/K. The indices to the smalls and the * bigs are put in separate stacks, and then we work through the * stacks together. For each small, we pair it up with the next big * in the stack (Walker always wanted to pair up the smallest small * with the biggest big). The small "borrows" from the big just * enough to bring the small up to mean. This reduces the size of the * big, so the (smaller) big is compared again to the mean, and if it * is smaller, it gets "popped" from the big stack and "pushed" to the * small stack. Otherwise, it stays put. Since every time we pop a * small, we are able to deal with it right then and there, and we * never have to pop more than K smalls, then the algorithm is O(K). * This implementation sets up two separate stacks, and allocates K * elements between them. Since neither stack ever grows, we do an * extra O(K) pass through the data to determine how many smalls and * bigs there are to begin with and allocate appropriately. In all * there are 2*K*sizeof(double) transient bytes of memory that are * used than returned, and K*(sizeof(int)+sizeof(double)) bytes used * in the lookup table. * Walker spoke of using two random numbers (an integer 0..K-1, and a * floating point u in [0,1]), but Knuth points out that one can just * use the integer and fractional parts of K*u where u is in [0,1]. * In fact, Knuth further notes that taking F'[k]=(k+F[k])/K, one can * directly compare u to F'[k] without having to explicitly set * u=K*u-int(K*u). * Usage: * Starting with an array of probabilities P, initialize and do * preprocessing with a call to: * gsl_rng *r; * gsl_ran_discrete_t *f; * f = gsl_ran_discrete_preproc(K,P); * Then, whenever a random index 0..K-1 is desired, use * k = gsl_ran_discrete(r,f); * Note that several different randevent struct's can be * simultaneously active. * Aside: A very clever alternative approach is described in * Abramowitz and Stegun, p 950, citing: Marsaglia, Random variables * and computers, Proc Third Prague Conference in Probability Theory, * 1962. A more accesible reference is: G. Marsaglia, Generating * discrete random numbers in a computer, Comm ACM 6, 37-38 (1963). * If anybody is interested, I (jt) have also coded up this version as * part of another software package. However, I've done some * comparisons, and the Walker method is both faster and more stingy * with memory. So, in the end I decided not to include it with the * GSL package. * Written 26 Jan 1999, James Theiler, jt@lanl.gov * Adapted to GSL, 30 Jan 1999, jt */ #include #include /* used for NULL, also fprintf(stderr,...) */ #include /* used for malloc's */ #include #include #include #include #define DEBUG 0 #define KNUTH_CONVENTION 1 /* Saves a few steps of arithmetic * in the call to gsl_ran_discrete() */ /*** Begin Stack (this code is used just in this file) ***/ /* Stack code converted to use unsigned indices (i.e. s->i == 0 now means an empty stack, instead of -1), for consistency and to give a bigger allowable range. BJG */ typedef struct { size_t N; /* max number of elts on stack */ size_t *v; /* array of values on the stack */ size_t i; /* index of top of stack */ } gsl_stack_t; static gsl_stack_t * new_stack(size_t N) { gsl_stack_t *s; s = (gsl_stack_t *)malloc(sizeof(gsl_stack_t)); s->N = N; s->i = 0; /* indicates stack is empty */ s->v = (size_t *)malloc(sizeof(size_t)*N); return s; } static int push_stack(gsl_stack_t *s, size_t v) { if ((s->i) >= (s->N)) { return -1; /* stack overflow (shouldn't happen) */ } (s->v)[s->i] = v; s->i += 1; return 0; } static size_t pop_stack(gsl_stack_t *s) { if ((s->i) == 0) { GSL_ERROR ("internal error - stack exhausted", GSL_ESANITY); } s->i -= 1; return ((s->v)[s->i]); } static inline size_t size_stack(const gsl_stack_t *s) { return s->i; } static void free_stack(gsl_stack_t *s) { free((char *)(s->v)); free((char *)s); } /*** End Stack ***/ /*** Begin Walker's Algorithm ***/ gsl_ran_discrete_t * gsl_ran_discrete_preproc(size_t Kevents, const double *ProbArray) { size_t k,b,s; gsl_ran_discrete_t *g; size_t nBigs, nSmalls; gsl_stack_t *Bigs; gsl_stack_t *Smalls; double *E; double pTotal = 0.0, mean, d; if (Kevents < 1) { /* Could probably treat Kevents=1 as a special case */ GSL_ERROR_VAL ("number of events must be a positive integer", GSL_EINVAL, 0); } /* Make sure elements of ProbArray[] are positive. * Won't enforce that sum is unity; instead will just normalize */ for (k=0; kK = Kevents; g->F = (double *)malloc(sizeof(double)*Kevents); g->A = (size_t *)malloc(sizeof(size_t)*Kevents); E = (double *)malloc(sizeof(double)*Kevents); if (E==NULL) { GSL_ERROR_VAL ("Cannot allocate memory for randevent", GSL_ENOMEM, 0); } for (k=0; kA[k] to indicate small or large */ size_t * const which = g->A; for (k=0; k 0) { s = pop_stack(Smalls); if (size_stack(Bigs) == 0) { (g->A)[s]=s; (g->F)[s]=1.0; continue; } b = pop_stack(Bigs); (g->A)[s]=b; (g->F)[s]=Kevents*E[s]; #if DEBUG fprintf(stderr,"s=%2d, A=%2d, F=%.4f\n",s,(g->A)[s],(g->F)[s]); #endif d = mean - E[s]; E[s] += d; /* now E[s] == mean */ E[b] -= d; if (E[b] < mean) { push_stack(Smalls,b); /* no longer big, join ranks of the small */ } else if (E[b] > mean) { push_stack(Bigs,b); /* still big, put it back where you found it */ } else { /* E[b]==mean implies it is finished too */ (g->A)[b]=b; (g->F)[b]=1.0; } } while (size_stack(Bigs) > 0) { b = pop_stack(Bigs); (g->A)[b]=b; (g->F)[b]=1.0; } /* Stacks have been emptied, and A and F have been filled */ if ( size_stack(Smalls) != 0) { GSL_ERROR_VAL ("Smalls stack has not been emptied", GSL_ESANITY, 0 ); } #if 0 /* if 1, then artificially set all F[k]'s to unity. This will * give wrong answers, but you'll get them faster. But, not * that much faster (I get maybe 20%); that's an upper bound * on what the optimal preprocessing would give. */ for (k=0; kF)[k] = 1.0; } #endif #if KNUTH_CONVENTION /* For convenience, set F'[k]=(k+F[k])/K */ /* This saves some arithmetic in gsl_ran_discrete(); I find that * it doesn't actually make much difference. */ for (k=0; kF)[k] += k; (g->F)[k] /= Kevents; } #endif free_stack(Bigs); free_stack(Smalls); free((char *)E); return g; } size_t gsl_ran_discrete(const gsl_rng *r, const gsl_ran_discrete_t *g) { size_t c=0; double u,f; u = gsl_rng_uniform(r); #if KNUTH_CONVENTION c = (u*(g->K)); #else u *= g->K; c = u; u -= c; #endif f = (g->F)[c]; /* fprintf(stderr,"c,f,u: %d %.4f %f\n",c,f,u); */ if (f == 1.0) return c; if (u < f) { return c; } else { return (g->A)[c]; } } void gsl_ran_discrete_free(gsl_ran_discrete_t *g) { RETURN_IF_NULL (g); free((char *)(g->A)); free((char *)(g->F)); free((char *)g); } double gsl_ran_discrete_pdf(size_t k, const gsl_ran_discrete_t *g) { size_t i,K; double f,p=0; K= g->K; if (k>K) return 0; for (i=0; iF)[i]; #if KNUTH_CONVENTION f = K*f-i; #endif if (i==k) { p += f; } else if (k == (g->A)[i]) { p += 1.0 - f; } } return p/K; }