[ccache] patch: workaround for NFS issues

2007-03-24 Thread John Coiner

Here is a second attempt at an NFS workaround.

This patch applies to ccache-2.4. Setting the CCACHE_NFS_WORKAROUND
environment variable enables the workaround. The workaround requires the
lockfile program to be on the $PATH.

Here's what I think is going on when NFS corruption occurs:

  * Process X writes object A into the cache.
  * Process Y starts reading object A.
  * Process Z writes a new copy of object A into the cache, renaming 
over the original A, while Y is still reading.

A local filesystem will keep the original object A around until Y closes 
the file. But on NFS, which is a stateless protocol, the server 
immediately forgets that the original A ever existed. When process Y 
comes back requesting the next chunk of the original A, it is gone.

This patch prevents overwriting an object that's already in the cache.
It seems to work, but YMMV.

- John



diff -u ccache-2.4/ccache.c ccache-2.4-fix/ccache.c
--- ccache-2.4/ccache.c Mon Sep 13 06:38:30 2004
+++ ccache-2.4-fix/ccache.c Sat Mar 24 15:43:45 2007
@@ -158,6 +158,30 @@
struct stat st1, st2;
int status;

+   char *workaround = getenv(CCACHE_NFS_WORKAROUND);
+   char *lockfile, *lock_cmd;
+   struct stat st;
+
+   if( workaround )
+   {
+   /* acquire lock -- permission to create the cache entry. */
+   x_asprintf( lockfile, %s.lock, hashname );
+   x_asprintf( lock_cmd, lockfile -l 600 %s, lockfile );
+   if( 0 != system( lock_cmd ) )
+   {
+   cc_log( failed to get lockfile %s\n, lockfile );
+   failed();
+   }
+
+   /* got lock -- is the file there now? */
+   if( stat( hashname, st ) == 0 )
+   {
+   /* another ccache process built it. Just return. */
+   unlink( lockfile );
+   return;
+   }
+   }
+
x_asprintf(tmp_stdout, %s/tmp.stdout.%s, temp_dir, tmp_string());
x_asprintf(tmp_stderr, %s/tmp.stderr.%s, temp_dir, tmp_string());
x_asprintf(tmp_hashname, %s/tmp.hash.%s.o, temp_dir, tmp_string());
@@ -187,6 +211,8 @@
unlink(tmp_stdout);
unlink(tmp_stderr);
unlink(tmp_hashname);
+   if( workaround )
+   unlink(lockfile);
failed();
}
unlink(tmp_stdout);
@@ -196,6 +222,9 @@
cc_log(compile of %s gave status = %d\n, output_file, status);
stats_update(STATS_STATUS);

+   if( workaround )
+   unlink( lockfile );
+
fd = open(tmp_stderr, O_RDONLY | O_BINARY);
if (fd != -1) {
if (strcmp(output_file, /dev/null) == 0 ||
@@ -236,11 +265,20 @@
rename(tmp_stderr, path_stderr) != 0) {
cc_log(failed to rename tmp files - %s\n, strerror(errno));
stats_update(STATS_ERROR);
+   if( workaround )
+   unlink( lockfile );
failed();
}

cc_log(Placed %s into cache\n, output_file);
stats_tocache(file_size(st1) + file_size(st2));
+
+   if( workaround )
+   {
+   unlink( lockfile );
+   free( lockfile );
+   free( lock_cmd );
+   }

free(tmp_hashname);
free(tmp_stderr);



[ccache] patch: workaround for NFS issues

2007-03-24 Thread John Coiner

Hi all,

Here's a patch, for ccache 2.4, which works around the NFS bug. From the 
ccache man page:

BUGS. When the cache is stored on an NFS filesystem, the filesystem 
must be exported with the no_subtree_check option to make renames 
between directories reliable.

I have seen corrupt object files when doing parallel builds. At my 
company, I am a mere user, so I asked someone in the IT department about 
enabling no_subtree_check. He says our fileserver doesn't support it.

This patch provides another option, when no_subtree_check isn't available.

How it works: when CCACHE_NFS_WORKAROUND is set, ccache won't rename
files across directories. Instead, it copies the file into its destination
directory under a temporary name, and then renames it within that
directory. The extra copy is slower than a plain rename, but it's still
more efficient than not using ccache. The cost of the extra copy is only
incurred on a cache miss.

To test this, I set up a testbench which starts 12 parallel builds on 
different machines simultaneously. Each build compiles the same .c file 
into a .o, and then links the .o into an executable. They all use the 
same ccache. Without the workaround, this produces a linker failure in 
about 50% of trials; with the workaround, I have not seen a failure in
15 trials. So it might actually work.

- John



--- ccache-2.4/ccache.c 2004-09-13 06:38:30.0 -0400
+++ ccache-2.4-fix/ccache.c 2007-03-24 10:40:46.0 -0500
@@ -157,6 +157,7 @@
 char *tmp_stdout, *tmp_stderr, *tmp_hashname;
 struct stat st1, st2;
 int status;
+   char *workaround = getenv(CCACHE_NFS_WORKAROUND);

 x_asprintf(tmp_stdout, %s/tmp.stdout.%s, temp_dir, 
tmp_string());
 x_asprintf(tmp_stderr, %s/tmp.stderr.%s, temp_dir, 
tmp_string());
@@ -232,7 +233,8 @@

 if (stat(tmp_stderr, st1) != 0 ||
 stat(tmp_hashname, st2) != 0 ||
-   rename(tmp_hashname, hashname) != 0 ||
+   (  workaround  ( copy_file(tmp_hashname, hashname) != 0 ) ) ||
+   ( !workaround  ( rename   (tmp_hashname, hashname) != 0 ) ) ||
 rename(tmp_stderr, path_stderr) != 0) {
 cc_log(failed to rename tmp files - %s\n, 
strerror(errno));
 stats_update(STATS_ERROR);
diff -u ccache-2.4/util.c ccache-2.4-fix/util.c
--- ccache-2.4/util.c   2004-09-13 06:38:30.0 -0400
+++ ccache-2.4-fix/util.c   2007-03-24 10:46:15.0 -0500
@@ -68,7 +68,16 @@
 char *tmp_name;
 mode_t mask;

-   x_asprintf(tmp_name, %s.XX, dest);
+   if( getenv(CCACHE_NFS_WORKAROUND) )
+   {
+   /* hostname and pid means this filename is unique. */
+   x_asprintf( tmp_name, %s.XX.%s.%d, dest,
+   getenv(HOSTNAME), getpid() );
+   }
+   else
+   {
+   x_asprintf( tmp_name, %s.XX, dest );
+   }

 fd1 = open(src, O_RDONLY|O_BINARY);
 if (fd1 == -1) {