Hi,
attached is a patch that adds support for random sampling of the transaction
log in pgbench, when it's executed with the "-l" flag. For example:
$ pgbench -l -T 120 -R 1 db
and then only 1% of transactions will be written into the log file. If you
omit the -R option, all the transactions are written (i.e. it's backward
compatible).
Recently I've been using pgbench on hardware that can handle a lot of
transactions (various flavors of SSDs, lots of RAM, ...), and in such
cases the log files may get pretty big (even several gigabytes for a
single 1-hour run). A reasonable random sample (along with the stats
provided by pgbench itself) is often more than enough in such cases.
kind regards
Tomas
diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c
index 00cab73..e849b2b 100644
--- a/contrib/pgbench/pgbench.c
+++ b/contrib/pgbench/pgbench.c
@@ -145,6 +145,9 @@ char *index_tablespace = NULL;
#define naccounts 100000
bool use_log; /* log transaction latencies to
a file */
+bool use_log_sampling; /* sample the log randomly */
+int nsample_rate = 100; /* default log sampling rate */
+
bool is_connect; /* establish connection for
each transaction */
bool is_latencies; /* report per-command latencies */
int main_pid; /* main process id used
in log filename */
@@ -364,6 +367,7 @@ usage(void)
" -f FILENAME read transaction script from FILENAME\n"
" -j NUM number of threads (default: 1)\n"
" -l write transaction times to log file\n"
+ " -R NUM log sampling rate in pct (default: 100)\n"
" -M simple|extended|prepared\n"
" protocol for submitting queries to server
(default: simple)\n"
" -n do not run VACUUM before tests\n"
@@ -877,21 +881,25 @@ top:
instr_time diff;
double usec;
- INSTR_TIME_SET_CURRENT(now);
- diff = now;
- INSTR_TIME_SUBTRACT(diff, st->txn_begin);
- usec = (double) INSTR_TIME_GET_MICROSEC(diff);
+ /* either no sampling or is within the sample */
+ if ((! use_log_sampling) || (rand() % 100 <
nsample_rate)) {
+
+ INSTR_TIME_SET_CURRENT(now);
+ diff = now;
+ INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+ usec = (double) INSTR_TIME_GET_MICROSEC(diff);
#ifndef WIN32
- /* This is more than we really ought to know about
instr_time */
- fprintf(logfile, "%d %d %.0f %d %ld %ld\n",
- st->id, st->cnt, usec, st->use_file,
- (long) now.tv_sec, (long) now.tv_usec);
+ /* This is more than we really ought to know
about instr_time */
+ fprintf(logfile, "%d %d %.0f %d %ld %ld\n",
+ st->id, st->cnt, usec,
st->use_file,
+ (long) now.tv_sec, (long)
now.tv_usec);
#else
- /* On Windows, instr_time doesn't provide a timestamp
anyway */
- fprintf(logfile, "%d %d %.0f %d 0 0\n",
- st->id, st->cnt, usec, st->use_file);
+ /* On Windows, instr_time doesn't provide a
timestamp anyway */
+ fprintf(logfile, "%d %d %.0f %d 0 0\n",
+ st->id, st->cnt, usec,
st->use_file);
#endif
+ }
}
if (commands[st->state]->type == SQL_COMMAND)
@@ -1962,7 +1970,7 @@ main(int argc, char **argv)
state = (CState *) xmalloc(sizeof(CState));
memset(state, 0, sizeof(CState));
- while ((c = getopt_long(argc, argv,
"ih:nvp:dSNc:j:Crs:t:T:U:lf:D:F:M:", long_options, &optindex)) != -1)
+ while ((c = getopt_long(argc, argv,
"ih:nvp:dSNc:j:Crs:t:T:U:lf:R:D:F:M:", long_options, &optindex)) != -1)
{
switch (c)
{
@@ -2070,6 +2078,15 @@ main(int argc, char **argv)
case 'l':
use_log = true;
break;
+ case 'R':
+ use_log_sampling = true;
+ nsample_rate = atoi(optarg);
+ if (nsample_rate <= 0 || nsample_rate > 100)
+ {
+ fprintf(stderr, "invalid sampling rate:
%d\n", nsample_rate);
+ exit(1);
+ }
+ break;
case 'f':
ttype = 3;
filename = optarg;
@@ -2158,6 +2175,12 @@ main(int argc, char **argv)
exit(1);
}
+ /* -R may be used only with -l */
+ if (use_log_sampling && (! use_log)) {
+ fprintf(stderr, "log sampling rate is allowed only when logging
transactions\n");
+ exit(1);
+ }
+
/*
* is_latencies only works with multiple threads in thread-based
* implementations, not fork-based ones, because it supposes that the
diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml
index 437fcea..962e446 100644
--- a/doc/src/sgml/pgbench.sgml
+++ b/doc/src/sgml/pgbench.sgml
@@ -317,6 +317,22 @@ pgbench <optional> <replaceable>options</> </optional>
<replaceable>dbname</>
</varlistentry>
<varlistentry>
+ <term><option>-R</option> <replaceable>rate</></term>
+ <listitem>
+ <para>
+ Sampling rate, in percent, used when writing data into the log. 100
means all
+ transactions will be logged, 1 means only 1% of the transactions will
be logged.
+ Default is 100 (all transactions).
+ </para>
+ <para>
+ Be careful when processing the log file - e.g. when computing tps
values, you
+ need to multiply the numbers accordingly (e.g. with 1% sample you'll
get 1/100
+ of the actual tps).
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><option>-M</option> <replaceable>querymode</></term>
<listitem>
<para>
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers