Nathan,

I committed the attached patch, It fix all the issues I saw.

Can you try to use the SVN or GIT repository? they have other fixes.

SVN:

 1. svn checkout svn://svn.code.sf.net/p/amanda/code/amanda/branches/3_5
    amanda-3-5
 2. cd amanda-3-5
 3. ./autogen
 4. ./configure ....
 5. make

GIT:

 1. git init git-amanda-3.5
 2. cd git-amanda-3.5
 3. git remote add zmanda https://github.com/zmanda/amanda.git
 4. git fetch zmanda
 5. git branch --track 3_5 zmanda/3_5
 6. git checkout 3_5
 7. ./autogen
 8. ./configure ...
 9. make

Jean-Louis

On 21/11/17 08:13 PM, Nathan Stratton Treadway wrote:
> On Mon, Nov 20, 2017 at 15:09:25 -0500, Jean-Louis Martineau wrote:
> > perl automatically close it when it come out of scope.
> > The close should before the 'if (kill' line.
>
> Okay, I moved the close() line up to that spot as I applied the patch.
>
> I can confirm that with your patch applied and my followup patch
> applied, if I kick off amflush while amdump is running in the
> background, it (amflush) prints out
> Could not find any Amanda directories to flush.
> and exits without attempting to flush anything.... but if I try again
> after amdump finished, I'm able to complete the flush successfully.
>
>
> A few followup points that came up during my testing:
>
> * when amflush completes now, the pid file is left out there in the
> holding directory, thus preventing the directory from getting deleted:
>
> =====
> # ls -lR /amanda/TestBackup-holding/*
> /amanda/TestBackup-holding/20171120172656:
> total 4
> -rw------- 1 backup backup 5 Nov 21 18:50 pid
>
> /amanda/TestBackup-holding/20171121145009:
> total 4
> -rw------- 1 backup backup 5 Nov 21 18:50 pid
> =====
>
> Off hand I don't see code in amflush or Amflush.pm 
> <http://Amflush.pm> 
> trying to clean up
> the holding disk directories so I don't know if these "zombie" pid
> files are related to the third issue (i.e. holding_cleanup_dir()),
> but...
>
> * ...as a more general question I was wondering if it made sense
> for programs that take a lock on a holding directory to have a way to
> explicitly release that lock when they are done? (Probably this
> would be by deleting the pid file, thought it could instead be
> truncating it to zero length, or something.)
>
> The main reason I was wondering about this is that with the current
> behavior of leaving a particular pid in the pid file forever, it
> seems like there's a good chance of false positives when the lock is
> checked by later jobs (i.e. some other, unrelated process using the
> same pid when the lock-check happens). If there were some way for a
> process to release the lock, false positives could only happen after a
> process crashed.
>
> (I don't know how often these false positives would happen in
> practice, but I can imagine it would cause a headache if the pid
> locking a directory containing dumps needing to be flushed to vault
> storage ended up getting reused by some long-runing process...)
>
> * I noticed that holding.c:holding_cleanup_dir() doesn't include parent
> pid in the pid-file check. I haven't tracked back to figure out which
> applications call the holding_cleanup* family of functions so I don't
> know if that will cause actual problems, but I was wondering if that
> function should instead just call call_take_holding() instead, so the
> logic is all found in one place?
>
>
> Nathan
>
>
>
>
> ----------------------------------------------------------------------------
> Nathan Stratton Treadway - [email protected] - Mid-Atlantic region
> Ray Ontko & Co. - Software consulting services - http://www.ontko.com/ 
> <http://www.ontko.com/>
> GPG Key: http://www.ontko.com/~nathanst/gpg_key.txt 
> <http://www.ontko.com/~nathanst/gpg_key.txt> 
> ID: 1023D/ECFB6239
> Key fingerprint = 6AD8 485E 20B9 5C71 231C 0C32 15F3 ADCD ECFB 6239
This message is the property of CARBONITE, INC. and may contain confidential or 
privileged information.
If this message has been delivered to you by mistake, then do not copy or 
deliver this message to anyone.  Instead, destroy it and notify me by reply 
e-mail
diff --git a/installcheck/Amanda_Holding.pl b/installcheck/Amanda_Holding.pl
index e8f206c..974ba95 100644
--- a/installcheck/Amanda_Holding.pl
+++ b/installcheck/Amanda_Holding.pl
@@ -127,7 +127,7 @@ is_deeply([ sort(+Amanda::Holding::disks()) ],
     [ sort($holding1, $holding2) ],
     "all active holding disks, but not inactive (defined but not used) disks");
 
-is_deeply([ sort(+Amanda::Holding::files()) ],
+is_deeply([ sort(+Amanda::Holding::files(0)) ],
     [ sort(
 	"$holding1/20070303000000/videoserver._video_a",
 	"$holding1/20070306123456/videoserver._video_a",
@@ -163,17 +163,17 @@ is_deeply([ Amanda::Holding::get_all_datestamps() ],
 	  [ sort("20070303000000", "20070306123456") ],
 	  "get_all_datestamps");
 
-is_deeply([ sort(+Amanda::Holding::get_files_for_flush("023985")) ],
+is_deeply([ sort(+Amanda::Holding::get_files_for_flush(0, "023985")) ],
 	  [ sort() ],
 	  "get_files_for_flush with no matching datestamps returns no files");
-is_deeply([ Amanda::Holding::get_files_for_flush("20070306123456") ],
+is_deeply([ Amanda::Holding::get_files_for_flush(0, "20070306123456") ],
 	  [ sort(
 		"$holding2/20070306123456/audio._usr",
 		"$holding2/20070306123456/audio._var",
 		"$holding1/20070306123456/videoserver._video_a",
 	  )],
 	  "get_files_for_flush gets only files listed in disklist (no _video_b)");
-is_deeply([ Amanda::Holding::get_files_for_flush() ],
+is_deeply([ Amanda::Holding::get_files_for_flush(0) ],
 	  [ sort(
 		"$holding1/20070303000000/videoserver._video_a",
 		"$holding2/20070306123456/audio._usr",
diff --git a/installcheck/amrestore.pl b/installcheck/amrestore.pl
index 2aa84cc..03109b6 100644
--- a/installcheck/amrestore.pl
+++ b/installcheck/amrestore.pl
@@ -207,7 +207,7 @@ is($hdr_size, $orig_size + Amanda::Holding::DISK_BLOCK_BYTES,
 
 # find the holding files
 config_init($CONFIG_INIT_EXPLICIT_NAME, "TESTCONF");
-@filenames = sort(+Amanda::Holding::files());
+@filenames = sort(+Amanda::Holding::files(0));
 is(scalar @filenames, 2, "two holding files found") or die("holding is not what I thought");
 my $holding_filename = $filenames[0];
 
diff --git a/perl/Amanda/Amflush.pm b/perl/Amanda/Amflush.pm
index 8323733..52adfde 100644
--- a/perl/Amanda/Amflush.pm
+++ b/perl/Amanda/Amflush.pm
@@ -228,7 +228,7 @@ sub get_flush {
 	@ts = @{$params{'datestamps'}};
     }
     @ts = sort @ts;
-    my @hfiles = Amanda::Holding::get_files_for_flush(@ts);
+    my @hfiles = Amanda::Holding::get_files_for_flush(1, @ts);
 
     my $conf_cmdfile = config_dir_relative(getconf($CNF_CMDFILE));
     my $cmdfile = Amanda::Cmdfile->new($conf_cmdfile);
diff --git a/perl/Amanda/Holding.pm b/perl/Amanda/Holding.pm
index e51deac..ab89c43 100644
--- a/perl/Amanda/Holding.pm
+++ b/perl/Amanda/Holding.pm
@@ -24,7 +24,8 @@ use base qw( Exporter );
 use File::Spec;
 use File::stat;
 use IO::Dir;
-use POSIX qw( :fcntl_h );
+use POSIX qw( :fcntl_h :sys_wait_h );
+use Errno;
 use Math::BigInt;
 use strict;
 use warnings;
@@ -46,7 +47,7 @@ Amanda::Holding -- interface to the holding disks
 Get some statistics:
 
     my %size_per_host;
-    for my $hfile (Amanda::Holding::files()) {
+    for my $hfile (Amanda::Holding::files($take_pid_lock)) {
 	my $hdr = Amanda::Holding::get_header($hfile);
 	next unless $hdr;
 	$size_per_host{$hdr->{'name'}} += Amanda::Holding::file_size($hfile);
@@ -58,7 +59,7 @@ Schematic for something like C<amflush>:
 	print $ts, "\n";
     }
     my @to_dump = <>;
-    for my $hfile (Amanda::Holding::get_files_for_flush(@to_dump)) {
+    for my $hfile (Amanda::Holding::get_files_for_flush($take_pid_lock, @to_dump)) {
 	# flush $hfile
     }
 
@@ -161,7 +162,7 @@ The remaining two functions are utilities for amflush and related tools:
 
 returns a sorted list of all timestamps with dumps in any active holding disk.
 
-=item C<get_files_for_flush(@timestamps)>
+=item C<get_files_for_flush($take_pid_lock, @timestamps)>
 
 returns a sorted list of files matching any of the supplied timestamps.  Files
 for which no DLE exists in the disklist are ignored.  If no timestamps are
@@ -199,7 +200,7 @@ sub _is_datestr {
 }
 
 sub _walk {
-    my ($file_fn, $verbose, $take_pid) = @_;
+    my ($file_fn, $verbose, $take_pid, $datestamps) = @_;
 
     # walk disks, directories, and files with nested loops
     for my $disk (disks()) {
@@ -212,6 +213,9 @@ sub _walk {
 	while (defined(my $datestr = $diskh->read())) {
 	    next if $datestr eq '.' or $datestr eq '..';
 
+	    if (defined $datestamps && @{$datestamps} && !grep { $_ eq $datestr } @{$datestamps}) {
+		next;
+	    }
 	    my $dirfn = File::Spec->catfile($disk, $datestr);
 
 	    if (!_is_datestr($datestr)) {
@@ -229,17 +233,30 @@ sub _walk {
 		next;
 	    }
 
+	    my $already_own = 0;
 	    my $pidfn = File::Spec->catfile($dirfn, "pid");
 	    if (open(my $pidh,  $pidfn)) {
 		my $pid = <$pidh>;
 		close($pidh);
-		if (kill($pid, 0) == 0) {
-		    # pid is alive, skip this directory
-		    next;
+		if ($pid != $$ && $pid != getppid) {
+		    # remove if zoombie
+		    waitpid($pid, WNOHANG);
+		    if (kill(0, $pid) == 1) {
+			# pid is alive, skip this directory
+			next;
+		    }
+		    if ($! == Errno::EPERM) {
+			# the process exists  (but we don't have permission to kill it)
+			next;
+		    }
+		    unlink($pidfn);
+		} else {
+		    $already_own = 1;
 		}
 	    }
-	    if ($take_pid) {
-		open(my $pidh,  ">", $pidfn) || next;
+	    if ($take_pid && !$already_own) {
+		sysopen(my $pidh, $pidfn, O_CREAT | O_EXCL | O_WRONLY, 0644 ) || next;
+		#open(my $pidh,  ">", $pidfn) || next;
 		print $pidh "$$";
 		close($pidh);
 	    }
@@ -285,6 +302,7 @@ sub disks {
 
 sub files {
     my $verbose = shift;
+    my $take_pid_lock = shift;
     my @results;
 
     my $each_file_fn = sub {
@@ -507,6 +525,7 @@ sub file_size {
 }
 
 sub get_files_for_flush {
+    my $take_pid_lock = shift;
     my (@dateargs) = @_;
     my @results;
 
@@ -523,7 +542,7 @@ sub get_files_for_flush {
 	}
 	push @results, $filename;
     };
-    _walk($each_file_fn, 0, 1);
+    _walk($each_file_fn, 0, $take_pid_lock, \@dateargs);
 
     return sort @results;
 }
diff --git a/perl/Amanda/Report/human.pm b/perl/Amanda/Report/human.pm
index db345cb..12624c2 100644
--- a/perl/Amanda/Report/human.pm
+++ b/perl/Amanda/Report/human.pm
@@ -493,7 +493,7 @@ sub output_tapeinfo
 
     } else {
 
-        my @holding_list = Amanda::Holding::get_files_for_flush();
+        my @holding_list = Amanda::Holding::get_files_for_flush(0);
         my $h_size = 0;
         foreach my $holding_file (@holding_list) {
             $h_size += (0 + Amanda::Holding::file_size($holding_file, 1));
diff --git a/perl/Amanda/Report/json.pm b/perl/Amanda/Report/json.pm
index 44c7ff9..2c94fe9 100644
--- a/perl/Amanda/Report/json.pm
+++ b/perl/Amanda/Report/json.pm
@@ -407,7 +407,7 @@ sub output_tapeinfo
 
     } else {
 
-        my @holding_list = Amanda::Holding::get_files_for_flush();
+        my @holding_list = Amanda::Holding::get_files_for_flush(0);
         my $h_size = 0;
         foreach my $holding_file (@holding_list) {
             $h_size += (0 + Amanda::Holding::file_size($holding_file, 1));
diff --git a/perl/Amanda/Rest/Runs.pm b/perl/Amanda/Rest/Runs.pm
index 5675f57..bf831c0 100644
--- a/perl/Amanda/Rest/Runs.pm
+++ b/perl/Amanda/Rest/Runs.pm
@@ -606,10 +606,11 @@ sub amflush {
 			source_line     => __LINE__,
 			code         => $code,
 			severity     => $Amanda::Message::WARNING);
+	log_add($L_INFO, "pid-done $$");
 	return (-1, \@result_messages);
     }
 
-    # fork the amdump process and detach
+    # fork the amflush process and detach
     my $oldpid = $$;
     my $pid = POSIX::fork();
     if ($pid == 0) {
diff --git a/server-src/cmdline.c b/server-src/cmdline.c
index 2d4fd7a..85ee667 100644
--- a/server-src/cmdline.c
+++ b/server-src/cmdline.c
@@ -246,7 +246,7 @@ cmdline_match_holding(
     GSList *matching_files = NULL;
     dumpfile_t file;
 
-    holding_files = holding_get_files(NULL, 1);
+    holding_files = holding_get_files(NULL, 1, 0);
 
     for (hi = holding_files; hi != NULL; hi = hi->next) {
 	/* TODO add level */
diff --git a/server-src/find.c b/server-src/find.c
index 43eaa22..5699560 100644
--- a/server-src/find.c
+++ b/server-src/find.c
@@ -262,7 +262,7 @@ search_holding_disk(
     disk_t *dp;
     char   *orig_name;
 
-    holding_file_list = holding_get_files(NULL, 1);
+    holding_file_list = holding_get_files(NULL, 1, 0);
 
     if (string_chunk == NULL) {
 	string_chunk = g_string_chunk_new(32768);
diff --git a/server-src/holding.c b/server-src/holding.c
index 1629b44..98658db 100644
--- a/server-src/holding.c
+++ b/server-src/holding.c
@@ -64,7 +64,7 @@ static int is_emptyfile(char *fname);
 static int is_datestr(char *fname);
 
 gboolean take_holding_pid(char *diskdir, int pid);
-static gboolean can_take_holding(char *pid_file);
+static int can_take_holding(char *pid_file, int remove);
 /*
  * Static functions */
 
@@ -433,6 +433,7 @@ holding_walk(
 typedef struct {
     GSList *result;
     int fullpaths;
+    int take_pid_lock;
 } holding_get_datap_t;
 
 /* Functor for holding_get_*; Stop if pid fileexists and is still alive
@@ -440,17 +441,24 @@ typedef struct {
  */
 static int
 holding_dir_stop_if_pid_fn(
-    gpointer datap G_GNUC_UNUSED,
+    gpointer datap,
     char *hdisk G_GNUC_UNUSED,
     char *element G_GNUC_UNUSED,
     char *hdir,
     int is_cruft)
 {
+    holding_get_datap_t *data = (holding_get_datap_t *)datap;
+
     if (is_cruft) {
 	return 0;
     }
 
-    return take_holding_pid(hdir, getppid());
+    if (data->take_pid_lock) {
+	return take_holding_pid(hdir, getppid());
+    } else {
+	char *pid_file = g_strconcat(hdir, "/pid", NULL);
+	return can_take_holding(pid_file, 0);
+    }
 }
 
 
@@ -505,11 +513,13 @@ holding_get_disks(void)
 GSList *
 holding_get_files(
     char *hdir,
-    int fullpaths)
+    int fullpaths,
+    int take_pid_lock)
 {
     holding_get_datap_t data;
     data.result = NULL;
     data.fullpaths = fullpaths;
+    data.take_pid_lock = take_pid_lock;
 
     if (hdir) {
         holding_walk_dir(hdir, (gpointer)&data,
@@ -530,6 +540,7 @@ holding_get_file_chunks(char *hfile)
     holding_get_datap_t data;
     data.result = NULL;
     data.fullpaths = 1;
+    data.take_pid_lock = 0;
 
     holding_walk_file(hfile, (gpointer)&data,
 	holding_get_walk_fn);
@@ -549,7 +560,7 @@ holding_get_files_for_flush(
 
     /* loop over *all* files, checking each one's datestamp against the expressions
      * in dateargs */
-    file_list = holding_get_files(NULL, 1);
+    file_list = holding_get_files(NULL, 1, 1);
     for (file_elt = file_list; file_elt != NULL; file_elt = file_elt->next) {
         /* get info on that file */
 	if (!holding_file_get_dumpfile((char *)file_elt->data, &file))
@@ -597,7 +608,7 @@ holding_get_all_datestamps(void)
     GSList *datestamps = NULL;
 
     /* enumerate all files */
-    all_files = holding_get_files(NULL, 1);
+    all_files = holding_get_files(NULL, 1, 0);
     for (file = all_files; file != NULL; file = file->next) {
 	dumpfile_t dfile;
 	if (!holding_file_get_dumpfile((char *)file->data, &dfile))
@@ -748,7 +759,6 @@ holding_cleanup_dir(
 {
     holding_cleanup_datap_t *data = (holding_cleanup_datap_t *)datap;
     char *pid_file;
-    FILE *pid_FILE;
 
     if (is_cruft) {
 	if (data->verbose_output)
@@ -759,26 +769,8 @@ holding_cleanup_dir(
 
     /* Do not cleanup if not from us and their amdump is still running */
     pid_file = g_strconcat(fqpath, "/pid", NULL);
-    pid_FILE = fopen(pid_file, "r");
-    if (pid_FILE) {
-	char line[1000];
-	int  pid;
-	if (fgets(line, 1000, pid_FILE) != NULL) {
-	    pid = atoi(line);
-	    if (pid != getpid()) {
-		/* check if pid is alive */
-		if (kill(pid, 0) == 0) {
-		    if (data->verbose_output)
-			g_fprintf(data->verbose_output,
-			    _("..skipping running directory '%s'\n"), element);
-		    g_free(pid_file);
-		    fclose(pid_FILE);
-		    return 0;
-		}
-	    }
-	    unlink(pid_file);
-	}
-	fclose(pid_FILE);
+    if (!can_take_holding(pid_file, 1)) {
+	return 0;
     }
     g_free(pid_file);
 
@@ -1085,8 +1077,14 @@ mkholdingdir(
     return success;
 }
 
-static gboolean can_take_holding(
-    char *pid_file)
+/*
+ * return  0 - can't take
+ *         1 - can take
+ *         2 - already own
+ */
+static int can_take_holding(
+    char *pid_file,
+    int   remove)
 {
     FILE *pid_FILE;
     int result = 1;
@@ -1102,6 +1100,15 @@ static gboolean can_take_holding(
 		if (kill(pid, 0) != -1) {
 		    result = 0;
 		}
+		// remove pid file of dead process
+		unlink(pid_file);
+	    } else {
+		if (remove) {
+		    // remove my own pid file
+		    unlink(pid_file);
+		} else {
+		    result = 2;
+		}
 	    }
 	}
 	fclose(pid_FILE);
@@ -1116,19 +1123,22 @@ take_holding_pid(
     char *	diskdir,
     int		pid)
 {
-    int   result = 1;
     char *pid_file;
     FILE *pid_FILE;
+    int   result;
 
     pid_file = g_strconcat(diskdir, "/pid", NULL);
 
-    if (!can_take_holding(pid_file)) {
+    result = can_take_holding(pid_file, 0);
+    if (result == 0) {
 	g_free(pid_file);
 	return 0;
+    } else if (result == 2) {
+	return 1;
     }
 
     /* create a 'pid' file */
-    pid_FILE = fopen(pid_file, "w");
+    pid_FILE = fopen(pid_file, "wx");
     if (!pid_FILE) {
 	log_add(L_WARNING, _("WARNING: Can't create '%s': %s"),
 		pid_file, strerror(errno));
diff --git a/server-src/holding.h b/server-src/holding.h
index 7adfe5f..0de8c37 100644
--- a/server-src/holding.h
+++ b/server-src/holding.h
@@ -72,7 +72,8 @@ holding_get_disks(void);
  */
 GSList *
 holding_get_files(char *hdir,
-                  int fullpaths);
+                  int fullpaths,
+		  int take_pid_lock);
 
 /* Get a list of holding files chunks in the given holding
  * file.  Always returns full paths.

Reply via email to