On 03/13/2014 04:00 PM, Fujii Masao wrote:
On Thu, Mar 13, 2014 at 10:51 PM, Heikki Linnakangas
<hlinnakan...@vmware.com> wrote:
IMHO we should just implement the \setrandom changes, and not add any of
these options to modify the standard test workload. If someone wants to run
TPC-B workload with gaussian or exponential distribution, they can implement
it as a custom script. The docs include the script for the standard TPC-B
workload; just copy-paste that and modify the \setrandom lines.

Yeah, I'm OK with this.

So I took a look at the \setrandom parts of this patch to see if that's ready for commit, without any of the changes to modify the standard TPC-B workload. Attached is a patch with just those parts; everyone please focus on this.

A couple of comments:

* There should be an explicit "\setrandom ... uniform" option too, even though you get that implicitly if you don't specify the distribution.

* What exactly does the "threshold" mean? The docs informally explain that "the larger the threshold, the more frequent values close to the middle of the interval are drawn", but that's pretty vague.

* Do min and max really make sense for gaussian and exponential distributions? For gaussian, I would expect mean and standard deviation as the parameters, not min/max/threshold.

* How about setting the variable as a float instead of integer? Would seem more natural to me. At least as an option.

- Heikki
diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 7c1e59e..a7713af 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -98,6 +98,9 @@ static int	pthread_join(pthread_t th, void **thread_return);
 #define LOG_STEP_SECONDS	5	/* seconds between log messages */
 #define DEFAULT_NXACTS	10		/* default nxacts */
 
+#define MIN_GAUSSIAN_THRESHOLD		2.0	/* minimum threshold for gauss */
+#define MIN_EXPONENTIAL_THRESHOLD	2.0	/* minimum threshold for exp */
+
 int			nxacts = 0;			/* number of transactions per client */
 int			duration = 0;		/* duration in seconds */
 
@@ -469,6 +472,79 @@ getrand(TState *thread, int64 min, int64 max)
 	return min + (int64) ((max - min + 1) * pg_erand48(thread->random_state));
 }
 
+/* Truncated exponential random integer in [min, max]; values near min are likeliest */
+static int64
+getExponentialrand(TState *thread, int64 min, int64 max, double exp_threshold)
+{
+	double		rand;
+
+	/*
+	 * Rejection sampling: draw from the unit-rate exponential distribution
+	 * and retry until the draw falls below exp_threshold. The retry
+	 * probability is e^{-exp_threshold}: about 13.5% for a 2.0 threshold
+	 * (e^{-2} ~ 0.135) and about 0.7% for 5.0 (e^{-5} ~ 0.007).
+	 */
+	do
+	{
+		/* as pg_erand48 is in [0, 1), uniform is in (0, 1] and log() is finite */
+		double uniform = 1.0 - pg_erand48(thread->random_state);
+		/* rand is non-negative, in [0, LARGE) */
+		rand = - log(uniform);
+	} while (rand >= exp_threshold);
+
+	/* rand is in [0, exp_threshold); normalize it to [0,1) */
+	rand /= exp_threshold;
+
+	/* scale to an int64 between min and max inclusive */
+	return min + (int64)((max - min + 1) * rand);
+}
+
+/* Truncated gaussian random integer in [min, max]; values near the middle are likeliest */
+static int64
+getGaussianrand(TState *thread, int64 min, int64 max, double stdev_threshold)
+{
+	double		stdev;
+	double		rand;
+
+	/*
+	 * Rejection sampling: draw a standard normal deviate and retry until
+	 * -stdev_threshold <= stdev < stdev_threshold.
+	 *
+	 * The probability of looping is low.
+	 *
+	 * A retry needs sqrt(-2 ln(r)) >= threshold, i.e. r <= e^{-threshold^2/2}
+	 * (e^{-2} ~ 0.135 at a threshold of 2.0), so that is an upper bound on the
+	 * looping probability; the sine factor lowers it further. For a normal
+	 * deviate the exact figure is erfc(threshold/sqrt(2)): about 4.6% at
+	 * 2.0 and below 0.0001% at 5.0.
+	 */
+	do
+	{
+		/*
+		 * pg_erand48 generates [0,1), but for the basic version of the
+		 * Box-Muller transform the two uniformly distributed random numbers
+		 * are expected in (0, 1] (see http://en.wikipedia.org/wiki/Box_muller)
+		 */
+		double rand1 = 1.0 - pg_erand48(thread->random_state);
+		double rand2 = 1.0 - pg_erand48(thread->random_state);
+
+		/* Box-Muller basic form; NOTE(review): M_PI is POSIX, not ISO C */
+		double var_sqrt = sqrt(-2.0 * log(rand1));
+		stdev = var_sqrt * sin(2.0 * M_PI * rand2);
+
+		/* The cos variant of this draw could be tried when the test fails, but
+		 * that might induce a bias; to be safe, a fresh pair is drawn instead.
+		 */
+	}
+	while (stdev < -stdev_threshold || stdev >= stdev_threshold);
+
+	/* stdev is in [-threshold, threshold); normalize it to [0,1) */
+	rand = (stdev + stdev_threshold) / (stdev_threshold * 2.0);
+
+	/* scale to an int64 between min and max inclusive */
+	return min + (int64)((max - min + 1) * rand);
+}
+
 /* call PQexec() and exit() on failure */
 static void
 executeStatement(PGconn *con, const char *sql)
@@ -1312,6 +1388,7 @@ top:
 			char	   *var;
 			int64		min,
 						max;
+			double		threshold = 0;
 			char		res[64];
 
 			if (*argv[2] == ':')
@@ -1357,7 +1434,7 @@ top:
 			}
 
 			/*
-			 * getrand() needs to be able to subtract max from min and add one
+			 * Random number generation functions need to be able to subtract max from min and add one
 			 * to the result without overflowing.  Since we know max > min, we
 			 * can detect overflow just by checking for a negative result. But
 			 * we must check both that the subtraction doesn't overflow, and
@@ -1370,10 +1447,63 @@ top:
 				return true;
 			}
 
+			if (argc == 4) /* uniform */
+			{
 #ifdef DEBUG
-			printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
+				printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
 #endif
-			snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+				snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+			}
+			else if ((pg_strcasecmp(argv[4], "gaussian") == 0) ||
+				 (pg_strcasecmp(argv[4], "exponential") == 0))
+			{
+				if (*argv[5] == ':')
+				{
+					if ((var = getVariable(st, argv[5] + 1)) == NULL)
+					{
+						fprintf(stderr, "%s: invalid threshold number %s\n", argv[0], argv[5]);
+						st->ecnt++;
+						return true;
+					}
+					threshold = strtod(var, NULL);
+				}
+				else
+					threshold = strtod(argv[5], NULL);
+
+				if (pg_strcasecmp(argv[4], "gaussian") == 0)
+				{
+					if (threshold < MIN_GAUSSIAN_THRESHOLD)
+					{
+						fprintf(stderr, "%s: gaussian threshold must be more than %f\n,", argv[5], MIN_GAUSSIAN_THRESHOLD);
+						st->ecnt++;
+						return true;
+					}
+#ifdef DEBUG
+					printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getGaussianrand(thread, min, max, threshold));
+#endif
+					snprintf(res, sizeof(res), INT64_FORMAT, getGaussianrand(thread, min, max, threshold));
+				}
+				else if (pg_strcasecmp(argv[4], "exponential") == 0)
+				{
+					if (threshold < MIN_EXPONENTIAL_THRESHOLD)
+					{
+						fprintf(stderr, "%s: exponential threshold must be more than %f\n,", argv[5], MIN_EXPONENTIAL_THRESHOLD);
+						st->ecnt++;
+						return true;
+					}
+#ifdef DEBUG
+					printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getExponentialrand(thread, min, max, threshold));
+#endif
+					snprintf(res, sizeof(res), INT64_FORMAT, getExponentialrand(thread, min, max, threshold));
+				}
+			}
+			else /* uniform with extra arguments */
+			{
+#ifdef DEBUG
+				printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
+#endif
+				snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+			}
 
 			if (!putVariable(st, argv[0], argv[1], res))
 			{
@@ -1903,9 +2033,29 @@ process_commands(char *buf)
 				exit(1);
 			}
 
-			for (j = 4; j < my_commands->argc; j++)
-				fprintf(stderr, "%s: extra argument \"%s\" ignored\n",
-						my_commands->argv[0], my_commands->argv[j]);
+			if (my_commands->argc == 4) /* uniform */
+			{
+				/* nothing to do */
+			}
+			else if ((pg_strcasecmp(my_commands->argv[4], "gaussian") == 0) ||
+				 (pg_strcasecmp(my_commands->argv[4], "exponential") == 0))
+			{
+				if (my_commands->argc < 6)
+				{
+					fprintf(stderr, "%s(%s): missing argument\n", my_commands->argv[0], my_commands->argv[4]);
+					exit(1);
+				}
+
+				for (j = 6; j < my_commands->argc; j++)
+					fprintf(stderr, "%s(%s): extra argument \"%s\" ignored\n",
+							my_commands->argv[0], my_commands->argv[4], my_commands->argv[j]);
+			}
+			else /* uniform with extra argument */
+			{
+				for (j = 4; j < my_commands->argc; j++)
+					fprintf(stderr, "%s(uniform): extra argument \"%s\" ignored\n",
+							my_commands->argv[0], my_commands->argv[j]);
+			}
 		}
 		else if (pg_strcasecmp(my_commands->argv[0], "set") == 0)
 		{
@@ -2319,7 +2469,7 @@ main(int argc, char **argv)
 		{"unlogged-tables", no_argument, &unlogged_tables, 1},
 		{"sampling-rate", required_argument, NULL, 4},
 		{"aggregate-interval", required_argument, NULL, 5},
-		{"rate", required_argument, NULL, 'R'},
+		{"gaussian", required_argument, NULL, 6},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -2599,7 +2749,7 @@ main(int argc, char **argv)
 #endif
 				break;
 			default:
-				fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),	progname);
 				exit(1);
 				break;
 		}
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 4367563..48e3641 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -748,8 +748,8 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
 
    <varlistentry>
     <term>
-     <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</></literal>
-    </term>
+     <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</> [ { gaussian | exponential } <replaceable>threshold</> ]</literal>
+     </term>
 
     <listitem>
      <para>
@@ -761,9 +761,31 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
      </para>
 
      <para>
+      If <literal>gaussian</> or <literal>exponential</> is given together
+      with a numeric <replaceable>threshold</>, the variable is set to a random
+      integer between the <replaceable>min</> and <replaceable>max</> bounds
+      inclusive, drawn from that distribution; the threshold controls the
+      distribution pattern. Without these options, the integer is drawn
+      uniformly between the <replaceable>min</> and <replaceable>max</> bounds inclusive.
+     </para>
+
+     <para>
+      With the gaussian option, the larger the threshold, the more frequently
+      values close to the middle of the interval are drawn. The threshold is the
+      number of standard deviations at which <replaceable>min</> and <replaceable>max</> lie.
+      The minimum threshold is 2.0, for performance.
+     </para>
+
+     <para>
+      With the exponential option, the threshold controls the distribution
+      pattern: the larger the threshold, the more frequently values
+      close to <replaceable>min</> are drawn. The minimum threshold is 2.0, for performance.
+     </para>
+
+     <para>
       Example:
 <programlisting>
-\setrandom aid 1 :naccounts
+\setrandom aid 1 :naccounts gaussian 5
 </programlisting></para>
     </listitem>
    </varlistentry>
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to