Re: [HACKERS] gaussian distribution pgbench -- part 1/2

Fabien COELHO Wed, 16 Jul 2014 21:11:31 -0700


pgbench with gaussian & exponential, part 1 of 2.


This patch is a subset of the previous patch which only adds the two
new \setrandom gaussian and exponantial variants, but not the
adapted pgbench test cases, as suggested by Fujii Masao.
There is no new code nor code changes.

The corresponding documentation has been yet again extended wrt
to the initial patch, so that what is achieved is hopefully unambiguous
(there are two mathematical formula, tasty!), in answer to Andres Freund
comments, and partly to Robert Haas comments as well.

This patch also provides several sql/pgbench scripts and a README, so
that the feature can be tested. I do not know whether these scripts
should make it to postgresql. I would say yes, otherwise there is no way
to test...

part 2 which provide adapted pgbench test cases will come later.

--
Fabien.

diff --git a/contrib/pgbench/README b/contrib/pgbench/README
new file mode 100644
index 0000000..4b8fd59
--- /dev/null
+++ b/contrib/pgbench/README
@@ -0,0 +1,5 @@
+# gaussian and exponential tests
+# with XXX as "expo" or "gauss"
+psql test < test-init.sql
+./pgbench -f test-XXX-run.sql -t 1000000 -P 1 test
+psql test < test-XXX-check.sql
diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 4aa8a50..a80c0a5 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -41,6 +41,7 @@
 #include <math.h>
 #include <signal.h>
 #include <sys/time.h>
+#include <assert.h>
 #ifdef HAVE_SYS_SELECT_H
 #include <sys/select.h>
 #endif
@@ -98,6 +99,8 @@ static int	pthread_join(pthread_t th, void **thread_return);
 #define LOG_STEP_SECONDS	5	/* seconds between log messages */
 #define DEFAULT_NXACTS	10		/* default nxacts */
 
+#define MIN_GAUSSIAN_THRESHOLD		2.0	/* minimum threshold for gauss */
+
 int			nxacts = 0;			/* number of transactions per client */
 int			duration = 0;		/* duration in seconds */
 
@@ -471,6 +474,76 @@ getrand(TState *thread, int64 min, int64 max)
 	return min + (int64) ((max - min + 1) * pg_erand48(thread->random_state));
 }
 
+/*
+ * random number generator: exponential distribution from min to max inclusive.
+ * the threshold is so that the density of probability for the last cut-off max
+ * value is exp(-exp_threshold).
+ */
+static int64
+getExponentialrand(TState *thread, int64 min, int64 max, double exp_threshold)
+{
+	double cut, uniform, rand;
+	assert(exp_threshold > 0.0);
+	cut = exp(-exp_threshold);
+	/* erand in [0, 1), uniform in (0, 1] */
+	uniform = 1.0 - pg_erand48(thread->random_state);
+	/*
+	 * inner expresion in (cut, 1] (if exp_threshold > 0),
+	 * rand in [0, 1)
+	 */
+	assert((1.0 - cut) != 0.0);
+	rand = - log(cut + (1.0 - cut) * uniform) / exp_threshold;
+	/* return int64 random number within between min and max */
+	return min + (int64)((max - min + 1) * rand);
+}
+
+/* random number generator: gaussian distribution from min to max inclusive */
+static int64
+getGaussianrand(TState *thread, int64 min, int64 max, double stdev_threshold)
+{
+	double		stdev;
+	double		rand;
+
+	/*
+	 * Get user specified random number from this loop, with
+	 * -stdev_threshold < stdev <= stdev_threshold
+	 *
+	 * This loop is executed until the number is in the expected range.
+	 *
+	 * As the minimum threshold is 2.0, the probability of looping is low:
+	 * sqrt(-2 ln(r)) <= 2 => r >= e^{-2} ~ 0.135, then when taking the average
+	 * sinus multiplier as 2/pi, we have a 8.6% looping probability in the
+	 * worst case. For a 5.0 threshold value, the looping probability
+	 * is about e^{-5} * 2 / pi ~ 0.43%.
+	 */
+	do
+	{
+		/*
+		 * pg_erand48 generates [0,1), but for the basic version of the
+		 * Box-Muller transform the two uniformly distributed random numbers
+		 * are expected in (0, 1] (see http://en.wikipedia.org/wiki/Box_muller)
+		 */
+		double rand1 = 1.0 - pg_erand48(thread->random_state);
+		double rand2 = 1.0 - pg_erand48(thread->random_state);
+
+		/* Box-Muller basic form transform */
+		double var_sqrt = sqrt(-2.0 * log(rand1));
+		stdev = var_sqrt * sin(2.0 * M_PI * rand2);
+
+		/*
+ 		 * we may try with cos, but there may be a bias induced if the previous
+		 * value fails the test? To be on the safe side, let us try over.
+		 */
+	}
+	while (stdev < -stdev_threshold || stdev >= stdev_threshold);
+
+	/* stdev is in [-threshold, threshold), normalization to [0,1) */
+	rand = (stdev + stdev_threshold) / (stdev_threshold * 2.0);
+
+	/* return int64 random number within between min and max */
+	return min + (int64)((max - min + 1) * rand);
+}
+
 /* call PQexec() and exit() on failure */
 static void
 executeStatement(PGconn *con, const char *sql)
@@ -1319,6 +1392,7 @@ top:
 			char	   *var;
 			int64		min,
 						max;
+			double		threshold = 0;
 			char		res[64];
 
 			if (*argv[2] == ':')
@@ -1364,11 +1438,11 @@ top:
 			}
 
 			/*
-			 * getrand() needs to be able to subtract max from min and add one
-			 * to the result without overflowing.  Since we know max > min, we
-			 * can detect overflow just by checking for a negative result. But
-			 * we must check both that the subtraction doesn't overflow, and
-			 * that adding one to the result doesn't overflow either.
+			 * Generate random number functions need to be able to subtract
+			 * max from min and add one to the result without overflowing.
+			 * Since we know max > min, we can detect overflow just by checking
+			 * for a negative result. But we must check both that the subtraction
+			 * doesn't overflow, and that adding one to the result doesn't overflow either.
 			 */
 			if (max - min < 0 || (max - min) + 1 < 0)
 			{
@@ -1377,10 +1451,63 @@ top:
 				return true;
 			}
 
+			if (argc == 4) /* uniform */
+			{
 #ifdef DEBUG
-			printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
+				printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
 #endif
-			snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+				snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+			}
+			else if ((pg_strcasecmp(argv[4], "gaussian") == 0) ||
+				 (pg_strcasecmp(argv[4], "exponential") == 0))
+			{
+				if (*argv[5] == ':')
+				{
+					if ((var = getVariable(st, argv[5] + 1)) == NULL)
+					{
+						fprintf(stderr, "%s: invalid threshold number %s\n", argv[0], argv[5]);
+						st->ecnt++;
+						return true;
+					}
+					threshold = strtod(var, NULL);
+				}
+				else
+					threshold = strtod(argv[5], NULL);
+
+				if (pg_strcasecmp(argv[4], "gaussian") == 0)
+				{
+					if (threshold < MIN_GAUSSIAN_THRESHOLD)
+					{
+						fprintf(stderr, "%s: gaussian threshold must be more than %f\n,", argv[5], MIN_GAUSSIAN_THRESHOLD);
+						st->ecnt++;
+						return true;
+					}
+#ifdef DEBUG
+					printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getGaussianrand(thread, min, max, threshold));
+#endif
+					snprintf(res, sizeof(res), INT64_FORMAT, getGaussianrand(thread, min, max, threshold));
+				}
+				else if (pg_strcasecmp(argv[4], "exponential") == 0)
+				{
+					if (threshold <= 0.0)
+					{
+						fprintf(stderr, "%s: exponential threshold must be strictly positive\n,", argv[5]);
+						st->ecnt++;
+						return true;
+					}
+#ifdef DEBUG
+					printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getExponentialrand(thread, min, max, threshold));
+#endif
+					snprintf(res, sizeof(res), INT64_FORMAT, getExponentialrand(thread, min, max, threshold));
+				}
+			}
+			else /* uniform with extra arguments */
+			{
+#ifdef DEBUG
+				printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
+#endif
+				snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+			}
 
 			if (!putVariable(st, argv[0], argv[1], res))
 			{
@@ -1920,9 +2047,34 @@ process_commands(char *buf)
 				exit(1);
 			}
 
-			for (j = 4; j < my_commands->argc; j++)
-				fprintf(stderr, "%s: extra argument \"%s\" ignored\n",
-						my_commands->argv[0], my_commands->argv[j]);
+			if (my_commands->argc == 4 ) /* uniform */
+			{
+				/* nothing to do */
+			}
+			else if ((pg_strcasecmp(my_commands->argv[4], "gaussian") == 0) ||
+				 (pg_strcasecmp(my_commands->argv[4], "exponential") == 0))
+			{
+				if (my_commands->argc < 6)
+				{
+					fprintf(stderr, "%s(%s): missing argument\n", my_commands->argv[0], my_commands->argv[4]);
+					exit(1);
+				}
+
+				for (j = 6; j < my_commands->argc; j++)
+					fprintf(stderr, "%s(%s): extra argument \"%s\" ignored\n",
+							my_commands->argv[0], my_commands->argv[4], my_commands->argv[j]);
+			}
+			else /* uniform with extra argument */
+			{
+				int arg_pos = 4;
+
+				if (pg_strcasecmp(my_commands->argv[4], "uniform") == 0)
+					arg_pos++;
+
+				for (j = arg_pos; j < my_commands->argc; j++)
+					fprintf(stderr, "%s(uniform): extra argument \"%s\" ignored\n",
+							my_commands->argv[0], my_commands->argv[j]);
+			}
 		}
 		else if (pg_strcasecmp(my_commands->argv[0], "set") == 0)
 		{
diff --git a/contrib/pgbench/test-expo-check.sql b/contrib/pgbench/test-expo-check.sql
new file mode 100644
index 0000000..fbf35fd
--- /dev/null
+++ b/contrib/pgbench/test-expo-check.sql
@@ -0,0 +1,14 @@
+-- val, min, max, threshold
+CREATE OR REPLACE FUNCTION
+expoProba(INTEGER, INTEGER, INTEGER, DOUBLE PRECISION)
+RETURNS DOUBLE PRECISION IMMUTABLE STRICT AS $$
+  SELECT (exp(-$4*($1-$2)/($3-$2+1)) - exp(-$4*($1-$2+1)/($3-$2+1))) /
+         (1.0 - exp(-$4));
+$$ LANGUAGE SQL;
+
+SELECT SUM(cnt) FROM pgbench_dist;
+
+SELECT id, 1.0*cnt/SUM(cnt) OVER(), expoProba(id, 0, 99, 10.0)
+FROM pgbench_dist
+ORDER BY id;
+
diff --git a/contrib/pgbench/test-expo-run.sql b/contrib/pgbench/test-expo-run.sql
new file mode 100644
index 0000000..1d476bc
--- /dev/null
+++ b/contrib/pgbench/test-expo-run.sql
@@ -0,0 +1,2 @@
+\setrandom id 0 99 exponential 10.0
+UPDATE pgbench_dist SET cnt=cnt+1 WHERE id = :id;
diff --git a/contrib/pgbench/test-gauss-check.sql b/contrib/pgbench/test-gauss-check.sql
new file mode 100644
index 0000000..f46c21d
--- /dev/null
+++ b/contrib/pgbench/test-gauss-check.sql
@@ -0,0 +1,57 @@
+-- approximation with maximal error of 1.2 10E-07, as told from
+-- https://en.wikipedia.org/wiki/Error_function#Numerical_approximation
+CREATE OR REPLACE FUNCTION erf(x DOUBLE PRECISION)
+RETURNS DOUBLE PRECISION IMMUTABLE STRICT AS $$
+DECLARE
+  t DOUBLE PRECISION := 1.0 / ( 1.0 + 0.5 * ABS(x));
+  tau DOUBLE PRECISION;
+BEGIN
+  IF ABS(x) >= 6.0 THEN
+    -- avoid underflow error
+    tau := 0.0;
+  ELSE
+    -- use approximation
+    tau := t * exp(-x*x - 1.26551223
+         + t * (1.00002368
+         + t * (0.37409196
+         + t * (0.09678418
+         + t * (-0.18628806
+         + t * (0.27886807
+         + t * (-1.13520398
+         + t * (1.48851587
+         + t * (-0.82215223
+         + t *  0.17087277)))))))));
+  END IF;
+  IF x >= 0 THEN
+    RETURN 1.0 - tau;
+  ELSE
+    RETURN tau - 1.0;
+  END IF;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION phi(DOUBLE PRECISION)
+RETURNS DOUBLE PRECISION IMMUTABLE STRICT AS $$
+  SELECT 0.5 * ( 1.0 + erf( $1 / SQRT(2.0) ) );
+$$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION
+gaussianProba(i INTEGER, mini INTEGER, maxi INTEGER, threshold DOUBLE PRECISION)
+RETURNS DOUBLE PRECISION IMMUTABLE STRICT AS $$
+DECLARE
+  extent DOUBLE PRECISION;
+  mu DOUBLE PRECISION;
+BEGIN
+  extent := maxi - mini + 1.0;
+  mu := 0.5 * (maxi + mini);
+  RETURN (phi(2.0 * threshold * (i - mini - mu + 0.5) / extent) -
+          phi(2.0 * threshold * (i - mini - mu - 0.5) / extent))
+         -- truncated gaussian
+	 / ( 2.0 * phi(threshold) - 1.0 );
+END;
+$$ LANGUAGE plpgsql;
+
+SELECT SUM(cnt) FROM pgbench_dist;
+SELECT id, 1.0*cnt/SUM(cnt) OVER(), gaussianProba(id, 0, 99, 2.0)
+FROM pgbench_dist
+ORDER BY id;
diff --git a/contrib/pgbench/test-gauss-run.sql b/contrib/pgbench/test-gauss-run.sql
new file mode 100644
index 0000000..984a3b4
--- /dev/null
+++ b/contrib/pgbench/test-gauss-run.sql
@@ -0,0 +1,2 @@
+\setrandom id 0 99 gaussian 2.0
+UPDATE pgbench_dist SET cnt=cnt+1 WHERE id = :id;
diff --git a/contrib/pgbench/test-init.sql b/contrib/pgbench/test-init.sql
new file mode 100644
index 0000000..84f7cc9
--- /dev/null
+++ b/contrib/pgbench/test-init.sql
@@ -0,0 +1,4 @@
+DROP TABLE IF EXISTS pgbench_dist;
+CREATE UNLOGGED TABLE pgbench_dist(id SERIAL PRIMARY KEY, cnt INTEGER NOT NULL DEFAULT 0);
+INSERT INTO pgbench_dist(id, cnt) 
+  SELECT i, 0 FROM generate_series(0, 99) AS i;
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index f264c24..babf88a 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -748,8 +748,8 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
 
    <varlistentry>
     <term>
-     <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</></literal>
-    </term>
+     <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</> [ uniform | [ { gaussian | exponential } <replaceable>threshold</> ] ]</literal>
+     </term>
 
     <listitem>
      <para>
@@ -761,9 +761,58 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </para>
 
      <para>
+      The default random distribution is uniform, that is all values in the
+      range are drawn with equal probability. The gaussian and exponential
+      options allow to change this default. The mandatory
+      <replaceable>threshold</> double value controls the actual distribution
+      with gaussian or exponential.
+     </para>
+
+     <para>
+      With the gaussian option, the interval is mapped onto a normal
+      distribution truncated at <literal>-threshold</> on the left and
+      <literal>+threshold</> on the right.
+      The larger the <replaceable>threshold</>, the more frequently values
+      close to the middle of the interval are drawn, and the less frequently
+      values close to the <replaceable>min</> and <replaceable>max</> bounds.
+      In other worlds, the larger the <replaceable>threshold</>,
+      the narrower the access range around the middle.
+      The smaller the threshold, the smoother the access pattern distribution.
+      To be precise, if <literal>phi(x)</> is the cumulative distribution
+      function of the standard normal distribution, with mean <literal>mu</>
+      defined as <literal>(max + min) / 2.0</>, then value <replaceable>i</>
+      between <replaceable>min</> and <replaceable>max</> inclusive is drawn
+      with probability about:
+      <literal>
+        (phi(2.0 * threshold * (i - min - mu + 0.5) / (max - min + 1)) -
+         phi(2.0 * threshold * (i - min - mu - 0.5) / (max - min + 1))) /
+         (2.0 * phi(threshold) - 1.0)
+      </>
+      The minimum threshold is 2.0 for performance of the Box-Muller transform.
+     </para>
+
+     <para>
+      With the exponential option, the <replaceable>threshold</> parameter
+      controls the distribution by truncating an exponential distribution at
+      a specific value, and then projecting onto integers between the bounds.
+      To be precise, value <replaceable>i</> between <replaceable>min</> and
+      <replaceable>max</> inclusive is drawn with probability:
+      <literal>(exp(-threshold*(i-min)/(max+1-min)) -
+       exp(-threshold*(i+1-min)/(max+1-min))) / (1.0 - exp(-threshold))</>.
+      Intuitively, the larger the threshold, the more frequently values close to
+      <replaceable>min</> are accessed, and the less frequently values close to
+      <replaceable>max</> are accessed.
+      A crude approximation of the distribution is that the most frequent 1%
+      values are drawn <replaceable>threshold</>% of the time.
+      The closer to 0.0 the threshold, the flatter (more uniform) the access
+      distribution.
+      The threshold value must be strictly positive with the exponential option.
+     </para>
+
+     <para>
       Example:
 <programlisting>
-\setrandom aid 1 :naccounts
+\setrandom aid 1 :naccounts gaussian 5.0
 </programlisting></para>
     </listitem>
    </varlistentry>

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] gaussian distribution pgbench -- part 1/2

Reply via email to