On 12/6/10 6:13 PM, Tom Lane wrote:
> Josh Berkus <[email protected]> writes:
>> OK, patch coming then. Right now test_fsync aborts when O_DIRECT fails.
>> What should I have it do instead?
>
> Report that it fails, and keep testing the other methods.
Patch attached. It includes a fair amount of comment cleanup, since the
existing comments did not meet our current project standards. With the
patch, test_fsync tests each of the 6 methods we support separately.
Some questions, though:
(1) Why are we doing the open_sync different-size write test? AFAIK,
it doesn't correspond to anything PostgreSQL actually does.
(2) In this patch, I'm stepping down the number of loops which
fsync_writethrough does by 90%. The reason is that on the platforms
where I tested writethrough (desktop machines), doing 10,000 loops took
15-20 *minutes*, which seems hard on the user. This would be easy to
revert if you think it's a bad idea.
Auto-sizing the number of loops based on the first fsync test might be
a good idea, but that seems like going a bit too far.
(3) Should the multi-descriptor test be using writethrough on platforms
which support it?
--
-- Josh Berkus
PostgreSQL Experts Inc.
http://www.pgexperts.com
diff --git a/src/tools/fsync/Makefile b/src/tools/fsync/Makefile
index 252c087..2ddbbe9 100644
*** a/src/tools/fsync/Makefile
--- b/src/tools/fsync/Makefile
***************
*** 4,10 ****
#
# Copyright (c) 2003-2010, PostgreSQL Global Development Group
#
! # src/tools/fsync/Makefile
#
#-------------------------------------------------------------------------
--- 4,10 ----
#
# Copyright (c) 2003-2010, PostgreSQL Global Development Group
#
! # $PostgreSQL: pgsql/src/tools/fsync/Makefile,v 1.9 2010/07/05 18:54:38 tgl Exp $
#
#-------------------------------------------------------------------------
*************** override CPPFLAGS := -I$(libpq_srcdir) $
*** 16,24 ****
OBJS= test_fsync.o
! all: test_fsync
! test_fsync: test_fsync.o | submake-libpq submake-libpgport
$(CC) $(CFLAGS) test_fsync.o $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X)
clean distclean maintainer-clean:
--- 16,24 ----
OBJS= test_fsync.o
! all: submake-libpq submake-libpgport test_fsync
! test_fsync: test_fsync.o $(libpq_builddir)/libpq.a
$(CC) $(CFLAGS) test_fsync.o $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X)
clean distclean maintainer-clean:
diff --git a/src/tools/fsync/README b/src/tools/fsync/README
index 6d9acd3..5b45581 100644
*** a/src/tools/fsync/README
--- b/src/tools/fsync/README
***************
*** 1,4 ****
! src/tools/fsync/README
fsync
=====
--- 1,4 ----
! $PostgreSQL: pgsql/src/tools/fsync/README,v 1.5 2009/11/28 15:04:54 momjian Exp $
fsync
=====
*************** fsync
*** 6,11 ****
This program tests fsync. The tests are described as part of the program output.
Usage: test_fsync [-f filename] [loops]
! Loops defaults to 5000. The default output file is /var/tmp/test_fsync.out.
! Consider that /tmp or /var/tmp might be memory-based file systems.
--- 6,25 ----
This program tests fsync. The tests are described as part of the program output.
Usage: test_fsync [-f filename] [loops]
+
+ test_fsync is intended to give you a reasonable idea of what the fastest
+ fsync_method is on your specific system, as well as supplying diagnostic
+ information in the event of an identified I/O problem. However, differences
+ shown by test_fsync may not make any difference in real database throughput,
+ especially since many database servers are not speed-limited by their
+ transaction logs.
! Filename defaults to test_fsync.out in the current directory. test_fsync
! should be run on the same filesystem where your transaction log currently
! resides.
!
! Loops defaults to 10000, except for the writethrough tests, which run 1/10 as
! many loops so that the user does not have to wait forever. You should lower
! loops if you have a slow system and the tests are taking more than 5 minutes
! each. You should raise loops if your system is faster than 5000/second, in
! order to get useful statistics.
diff --git a/src/tools/fsync/test_fsync.c b/src/tools/fsync/test_fsync.c
index 28c2119..5980b70 100644
*** a/src/tools/fsync/test_fsync.c
--- b/src/tools/fsync/test_fsync.c
***************
*** 3,9 ****
*
*
* test_fsync.c
! * test various fsync() methods
*/
#include "postgres.h"
--- 3,9 ----
*
*
* test_fsync.c
! * tests all supported fsync() methods
*/
#include "postgres.h"
***************
*** 22,55 ****
#include <unistd.h>
#include <string.h>
!
! #ifdef WIN32
#define FSYNC_FILENAME "./test_fsync.out"
- #else
- /* /tmp might be a memory file system */
- #define FSYNC_FILENAME "/var/tmp/test_fsync.out"
- #endif
#define WRITE_SIZE (8 * 1024) /* 8k */
#define LABEL_FORMAT "\t%-30s"
int loops = 10000;
void die(char *str);
void print_elapse(struct timeval start_t, struct timeval stop_t);
int
main(int argc, char *argv[])
{
struct timeval start_t;
struct timeval stop_t;
! int tmpfile,
! i;
char *full_buf = (char *) malloc(XLOG_SEG_SIZE),
*buf;
char *filename = FSYNC_FILENAME;
if (argc > 2 && strcmp(argv[1], "-f") == 0)
{
filename = argv[2];
--- 22,58 ----
#include <unistd.h>
#include <string.h>
! /*
! * put the temp files in the local directory
! * unless the user specifies otherwise
! */
#define FSYNC_FILENAME "./test_fsync.out"
#define WRITE_SIZE (8 * 1024) /* 8k */
#define LABEL_FORMAT "\t%-30s"
int loops = 10000;
+ int writethrough_loops = 1000;
void die(char *str);
void print_elapse(struct timeval start_t, struct timeval stop_t);
+ void print_elapse_writethrough(struct timeval start_t, struct timeval stop_t);
int
main(int argc, char *argv[])
{
struct timeval start_t;
struct timeval stop_t;
! int tmpfile;
! int i;
char *full_buf = (char *) malloc(XLOG_SEG_SIZE),
*buf;
char *filename = FSYNC_FILENAME;
+ /*
+ * arguments: loops and filename (optional)
+ */
if (argc > 2 && strcmp(argv[1], "-f") == 0)
{
filename = argv[2];
*************** main(int argc, char *argv[])
*** 57,73 ****
argc -= 2;
}
! if (argc > 1)
loops = atoi(argv[1]);
for (i = 0; i < XLOG_SEG_SIZE; i++)
full_buf[i] = random();
if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
die("Cannot open output file.");
if (write(tmpfile, full_buf, XLOG_SEG_SIZE) != XLOG_SEG_SIZE)
die("write failed");
! /* fsync now so later fsync's don't have to do it */
if (fsync(tmpfile) != 0)
die("fsync failed");
close(tmpfile);
--- 60,88 ----
argc -= 2;
}
! /*
! * set writethrough_loops to be 1/10 of loops
! * since writethroughs are very slow
! */
! if (argc > 1)
! {
loops = atoi(argv[1]);
+ writethrough_loops = loops / 10;
+ }
for (i = 0; i < XLOG_SEG_SIZE; i++)
full_buf[i] = random();
+ /*
+ * test if we can open the target file
+ */
if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
die("Cannot open output file.");
if (write(tmpfile, full_buf, XLOG_SEG_SIZE) != XLOG_SEG_SIZE)
die("write failed");
! /*
! * fsync now so that dirty buffers don't skew later tests
! */
if (fsync(tmpfile) != 0)
die("fsync failed");
close(tmpfile);
*************** main(int argc, char *argv[])
*** 77,83 ****
printf("Loops = %d\n\n", loops);
/*
! * Simple write
*/
printf("Simple write:\n");
printf(LABEL_FORMAT, "8k write");
--- 92,98 ----
printf("Loops = %d\n\n", loops);
/*
! * Test a simple write without fsync
*/
printf("Simple write:\n");
printf(LABEL_FORMAT, "8k write");
*************** main(int argc, char *argv[])
*** 95,104 ****
print_elapse(start_t, stop_t);
/*
! * Compare file sync methods with one 8k write
*/
printf("\nCompare file sync methods using one write:\n");
#ifdef OPEN_DATASYNC_FLAG
printf(LABEL_FORMAT, "open_datasync 8k write");
fflush(stdout);
--- 110,122 ----
print_elapse(start_t, stop_t);
/*
! * Test all fsync methods using single 8k writes
*/
printf("\nCompare file sync methods using one write:\n");
+ /*
+ * Test open_datasync if available
+ */
#ifdef OPEN_DATASYNC_FLAG
printf(LABEL_FORMAT, "open_datasync 8k write");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 115,124 ****
--- 133,174 ----
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
+
+ /*
+ * If O_DIRECT is enabled, test that with open_datasync
+ */
+ if ( PG_O_DIRECT != 0 )
+ {
+ printf(LABEL_FORMAT, "open_datasync 8k directIO write");
+ fflush(stdout);
+ if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1)
+ printf("\t(unavailable: o_direct on this filesystem)\n");
+ else
+ {
+ gettimeofday(&start_t, NULL);
+ for (i = 0; i < loops; i++)
+ {
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (lseek(tmpfile, 0, SEEK_SET) == -1)
+ die("seek failed");
+ }
+ gettimeofday(&stop_t, NULL);
+ close(tmpfile);
+ print_elapse(start_t, stop_t);
+ }
+ }
+ else
+ {
+ printf("\t(unavailable: o_direct)\n");
+ }
#else
printf("\t(unavailable: open_datasync)\n");
#endif
+ /*
+ * Test open_sync if available
+ */
#ifdef OPEN_SYNC_FLAG
printf(LABEL_FORMAT, "open_sync 8k write");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 135,144 ****
--- 185,226 ----
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
+
+ /*
+ * If O_DIRECT is enabled, test that with open_sync
+ */
+ if ( PG_O_DIRECT != 0 )
+ {
+ printf(LABEL_FORMAT, "open_sync 8k directIO write");
+ fflush(stdout);
+ if ((tmpfile = open(filename, O_RDWR | O_SYNC | PG_O_DIRECT, 0)) == -1)
+ printf("\t(unavailable: o_direct on this filesystem)\n");
+ else
+ {
+ gettimeofday(&start_t, NULL);
+ for (i = 0; i < loops; i++)
+ {
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (lseek(tmpfile, 0, SEEK_SET) == -1)
+ die("seek failed");
+ }
+ gettimeofday(&stop_t, NULL);
+ close(tmpfile);
+ print_elapse(start_t, stop_t);
+ }
+ }
+ else
+ {
+ printf("\t(unavailable: o_direct)\n");
+ }
#else
printf("\t(unavailable: open_sync)\n");
#endif
+ /*
+ * Test fdatasync if available
+ */
#ifdef HAVE_FDATASYNC
printf(LABEL_FORMAT, "8k write, fdatasync");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 160,165 ****
--- 242,250 ----
printf("\t(unavailable: fdatasync)\n");
#endif
+ /*
+ * Test fsync
+ */
printf(LABEL_FORMAT, "8k write, fsync");
fflush(stdout);
if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
*************** main(int argc, char *argv[])
*** 177,188 ****
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
/*
! * Compare file sync methods with two 8k write
*/
printf("\nCompare file sync methods using two writes:\n");
#ifdef OPEN_DATASYNC_FLAG
printf(LABEL_FORMAT, "2 open_datasync 8k writes");
fflush(stdout);
--- 262,304 ----
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
+
+ /*
+ * If fsync_writethrough is available, test as well
+ * This uses 1/10 the number of loops because it tends
+ * to take forever otherwise.
+ */
+ #ifdef HAVE_FSYNC_WRITETHROUGH
+ printf(LABEL_FORMAT, "8k write, fsync_writethrough");
+ fflush(stdout);
+ if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
+ die("Cannot open output file.");
+ gettimeofday(&start_t, NULL);
+ for (i = 0; i < writethrough_loops; i++)
+ {
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (fcntl(tmpfile, F_FULLFSYNC ) != 0)
+ die("fsync failed");
+ if (lseek(tmpfile, 0, SEEK_SET) == -1)
+ die("seek failed");
+ }
+ gettimeofday(&stop_t, NULL);
+ close(tmpfile);
+ print_elapse_writethrough(start_t, stop_t);
+ #else
+ printf("\t(unavailable: fsync_writethrough)\n");
+ #endif
/*
! * Compare some of the file sync methods with
! * two 8k writes to see if timing is different
*/
printf("\nCompare file sync methods using two writes:\n");
+ /*
+ * Test open_datasync with and without o_direct
+ */
#ifdef OPEN_DATASYNC_FLAG
printf(LABEL_FORMAT, "2 open_datasync 8k writes");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 201,210 ****
--- 317,354 ----
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
+
+ if ( PG_O_DIRECT != 0 )
+ {
+ printf(LABEL_FORMAT, "2 open_datasync directIO 8k writes");
+ fflush(stdout);
+ if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1)
+ die("Cannot open output file.");
+ gettimeofday(&start_t, NULL);
+ for (i = 0; i < loops; i++)
+ {
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (lseek(tmpfile, 0, SEEK_SET) == -1)
+ die("seek failed");
+ }
+ gettimeofday(&stop_t, NULL);
+ close(tmpfile);
+ print_elapse(start_t, stop_t);
+ }
+ else
+ {
+ printf("\t(unavailable: o_direct)\n");
+ }
#else
printf("\t(unavailable: open_datasync)\n");
#endif
+ /*
+ * Test open_sync with and without o_direct
+ */
#ifdef OPEN_SYNC_FLAG
printf(LABEL_FORMAT, "2 open_sync 8k writes");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 223,230 ****
--- 367,404 ----
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
+
+ if ( PG_O_DIRECT != 0 )
+ {
+ printf(LABEL_FORMAT, "2 open_sync directIO 8k writes");
+ fflush(stdout);
+ if ((tmpfile = open(filename, O_RDWR | O_SYNC | PG_O_DIRECT, 0)) == -1)
+ die("Cannot open output file.");
+ gettimeofday(&start_t, NULL);
+ for (i = 0; i < loops; i++)
+ {
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (lseek(tmpfile, 0, SEEK_SET) == -1)
+ die("seek failed");
+ }
+ gettimeofday(&stop_t, NULL);
+ close(tmpfile);
+ print_elapse(start_t, stop_t);
+ }
+ else
+ {
+ printf("\t(unavailable: o_direct)\n");
+ }
+ #else
+ printf("\t(unavailable: open_sync)\n");
#endif
+ /*
+ * Test fdatasync
+ */
#ifdef HAVE_FDATASYNC
printf(LABEL_FORMAT, "8k write, 8k write, fdatasync");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 248,253 ****
--- 422,430 ----
printf("\t(unavailable: fdatasync)\n");
#endif
+ /*
+ * Test basic fsync
+ */
printf(LABEL_FORMAT, "8k write, 8k write, fsync");
fflush(stdout);
if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
*************** main(int argc, char *argv[])
*** 267,278 ****
--- 444,488 ----
gettimeofday(&stop_t, NULL);
close(tmpfile);
print_elapse(start_t, stop_t);
+
+ /*
+ * Test fsync_writethrough if available
+ * Again, using 1/10 as many loops
+ */
+ #ifdef HAVE_FSYNC_WRITETHROUGH
+ printf(LABEL_FORMAT, "8k write, 8k write, fsync_writethrough");
+ fflush(stdout);
+ if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
+ die("Cannot open output file.");
+ gettimeofday(&start_t, NULL);
+ for (i = 0; i < writethrough_loops; i++)
+ {
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
+ die("write failed");
+ if (fcntl(tmpfile, F_FULLFSYNC) != 0)
+ die("fsync failed");
+ if (lseek(tmpfile, 0, SEEK_SET) == -1)
+ die("seek failed");
+ }
+ gettimeofday(&stop_t, NULL);
+ close(tmpfile);
+ print_elapse_writethrough(start_t, stop_t);
+ #else
+ printf("\t(unavailable: fsync_writethrough)\n");
+ #endif
/*
* Compare 1 to 2 writes
*/
printf("\nCompare open_sync with different sizes:\n");
+ /*
+ * Test open_sync with different size files
+ * It's unclear why this is in test_fsync, since it's
+ * not anything PostgreSQL does
+ */
#ifdef OPEN_SYNC_FLAG
printf(LABEL_FORMAT, "open_sync 16k write");
fflush(stdout);
*************** main(int argc, char *argv[])
*** 312,323 ****
#endif
/*
! * Fsync another file descriptor?
*/
printf("\nTest if fsync on non-write file descriptor is honored:\n");
printf("(If the times are similar, fsync() can sync data written\n");
printf("on a different descriptor.)\n");
printf(LABEL_FORMAT, "8k write, fsync, close");
fflush(stdout);
gettimeofday(&start_t, NULL);
--- 522,541 ----
#endif
/*
! * Test whether fsync can sync data written on a different
! * descriptor for the same file. This checks the efficiency
! * of multi-process fsyncs against the same file.
! * Possibly this should be done with writethrough on platforms
! * which support it.
*/
printf("\nTest if fsync on non-write file descriptor is honored:\n");
printf("(If the times are similar, fsync() can sync data written\n");
printf("on a different descriptor.)\n");
+ /*
+ * first write, fsync and close, which is the
+ * normal behavior without multiple descriptors
+ */
printf(LABEL_FORMAT, "8k write, fsync, close");
fflush(stdout);
gettimeofday(&start_t, NULL);
*************** main(int argc, char *argv[])
*** 330,343 ****
if (fsync(tmpfile) != 0)
die("fsync failed");
close(tmpfile);
if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
die("Cannot open output file.");
- /* do nothing but the open/close the tests are consistent. */
close(tmpfile);
}
gettimeofday(&stop_t, NULL);
print_elapse(start_t, stop_t);
printf(LABEL_FORMAT, "8k write, close, fsync");
fflush(stdout);
gettimeofday(&start_t, NULL);
--- 548,569 ----
if (fsync(tmpfile) != 0)
die("fsync failed");
close(tmpfile);
+ /*
+ * open and close the file again to be consistent
+ * with the following test
+ */
if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
die("Cannot open output file.");
close(tmpfile);
}
gettimeofday(&stop_t, NULL);
print_elapse(start_t, stop_t);
+ /*
+ * Now open, write, close, open again and fsync
+ * This simulates processes fsyncing each other's
+ * writes.
+ */
printf(LABEL_FORMAT, "8k write, close, fsync");
fflush(stdout);
gettimeofday(&start_t, NULL);
*************** main(int argc, char *argv[])
*** 358,381 ****
gettimeofday(&stop_t, NULL);
print_elapse(start_t, stop_t);
! /* cleanup */
free(full_buf);
unlink(filename);
return 0;
}
void
print_elapse(struct timeval start_t, struct timeval stop_t)
{
double total_time = (stop_t.tv_sec - start_t.tv_sec) +
- /* usec subtraction might be negative, e.g. 5.4 - 4.8 */
(stop_t.tv_usec - start_t.tv_usec) * 0.000001;
double per_second = loops / total_time;
printf("%9.3f/second\n", per_second);
}
void
die(char *str)
{
--- 584,624 ----
gettimeofday(&stop_t, NULL);
print_elapse(start_t, stop_t);
! /*
! * cleanup
! */
free(full_buf);
unlink(filename);
return 0;
}
+ /*
+ * print out the writes per second for most tests
+ */
void
print_elapse(struct timeval start_t, struct timeval stop_t)
{
double total_time = (stop_t.tv_sec - start_t.tv_sec) +
(stop_t.tv_usec - start_t.tv_usec) * 0.000001;
double per_second = loops / total_time;
printf("%9.3f/second\n", per_second);
}
+ /*
+ * print out the writes per second for writethrough tests
+ */
+ void
+ print_elapse_writethrough(struct timeval start_t, struct timeval stop_t)
+ {
+ double total_time = (stop_t.tv_sec - start_t.tv_sec) +
+ (stop_t.tv_usec - start_t.tv_usec) * 0.000001;
+ double per_second = writethrough_loops / total_time;
+
+ printf("%9.3f/second\n", per_second);
+ }
+
void
die(char *str)
{
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers