Each process should take a lock on the holding directory (a pid file).
The attached patch fixes it.
Please try it.
On 17/11/17 12:38 AM, Nathan Stratton Treadway wrote:
> (Amanda v3.5)
>
> I noticed that Amanda 3.5 no longer aborts amflush if amdump is
> currently running (as older versions of Amanda do).
>
> So out of curiosity I kicked off "amflush TestBackup" while amdump was
> busy dumping to the holding disk... and I discovered that amflush
> actually tries to go ahead and flush the ".tmp" files that it finds
> in the holding directory:
>
> ===== From /var/log/amanda/server/TestBackup/amflush.20171116200510.debug:
> Thu Nov 16 20:05:17.590062176 2017: pid 26860: thd-0x2f07e00: amflush:
> flushing /amanda/TestBackup-holding/20171116200002/client1._.0.tmp
> Thu Nov 16 20:05:17.590096226 2017: pid 26860: thd-0x2f07e00: amflush:
> flushing /amanda/TestBackup-holding/20171116200002/client2._.1.tmp
> =====
>
> In this case the taper failed (and thus the amflush didn't actually do
> anything with the .tmp files)....:
>
> ===== From /var/log/amanda/TestBackup/amdump.1:
> driver: send-cmd time 14.132 to taper0: FILE-WRITE worker0-0 00-00002
> /amanda/TestBackup-holding/20171116200002/client1._.0.tmp client1 / 0
> 20171116200002 "" "" "" "" "" "" "" "" 0
> writing taper command 'FILE-WRITE worker0-0 00-00002
> /amanda/TestBackup-holding/20171116200002/client1._.0.tmp client1 / 0
> 20171116200002 "" "" "" "" "" "" "" "" 0
> ' failed: Broken pipe
> =====
> (There is a line break in the log file just before "' failed".)
>
>
> ===== From /var/log/amanda/server/TestBackup/taper.20171116200517.debug:
> Thu Nov 16 20:05:18.502419601 2017: pid 26862: thd-0x4232000: taper:
> Building type SPLIT_FILE header of 32768-32768 bytes with
> name='client1' disk='/' dumplevel=0 and blocksize=32768
> Thu Nov 16 20:05:22.427709157 2017: pid 26862: thd-0x4232050: taper:
> no next_filename
> Thu Nov 16 20:05:22.427743969 2017: pid 26862: thd-0x4232050: taper:
> sending XMSG_CRC message
> Thu Nov 16 20:05:22.427748905 2017: pid 26862: thd-0x4232050: taper:
> xfer-source-holding CRC: 2e4f7128 size: 249856000
> Thu Nov 16 20:05:22.427757739 2017: pid 26862: thd-0x4232050: taper:
> xfer_queue_message: MSG: <XMsg@0x7f46b8001bf0 type=XMSG_CRC
> elt=<XferSourceHolding@0x4230000> version=0>
> Thu Nov 16 20:05:22.427767783 2017: pid 26862: thd-0x4232050: taper:
> xfer-source-holding sending XMSG_DONE message
> Thu Nov 16 20:05:22.427773216 2017: pid 26862: thd-0x4232050: taper:
> xfer_queue_message: MSG: <XMsg@0x7f46b8001f00 type=XMSG_DONE
> elt=<XferSourceHolding@0x4230000> version=0>
> [ *** file ends abruptly here ***]
> =====
>
>
> .... but whether or not that indicates a bug in the taper, it seems like
> amflush should not ever try to flush .tmp files from the holding disk...
> (right?)
>
>
>
> Finally, after this testing I notice that the command_file still has
> FLUSH commands for those .tmp files (even though neither the files nor
> the containing holding directory now exist). I've run both "amdump" and
> "amflush" since then, and tried "amcleanup" as well. Is there any
> (good) way to clean up these orphan commands?
No, you must manually remove them. Do it when no other amanda processes
are running.
Jean-Louis
>
> ===== From /etc/amanda/TestBackup/command_file:
> ID 1633
> 1603 FLUSH TestBackup
> /amanda/TestBackup-holding/20171116200002/client1._.0.tmp client1 /
> 20171116200002 0 TestBackup WORKING:17072 TODO
> 1604 FLUSH TestBackup
> /amanda/TestBackup-holding/20171116200002/client2._.1.tmp client2 /
> 20171116200002 0 TestBackup WORKING:17072 TODO
> =====
>
> =====
> # ls -l /amanda/TestBackup-holding/
> total 0
> =====
>
>
> Nathan
>
> ----------------------------------------------------------------------------
> Nathan Stratton Treadway - [email protected] - Mid-Atlantic region
> Ray Ontko & Co. - Software consulting services - http://www.ontko.com/
> <http://www.ontko.com/>
> GPG Key: http://www.ontko.com/~nathanst/gpg_key.txt
> <http://www.ontko.com/~nathanst/gpg_key.txt>
> ID: 1023D/ECFB6239
> Key fingerprint = 6AD8 485E 20B9 5C71 231C 0C32 15F3 ADCD ECFB 6239
This message is the property of CARBONITE, INC. and may contain confidential or
privileged information.
If this message has been delivered to you by mistake, then do not copy or
deliver this message to anyone. Instead, destroy it and notify me by reply
e-mail
diff --git a/perl/Amanda/Holding.pm b/perl/Amanda/Holding.pm
index 88f1fd8..5b550c9 100644
--- a/perl/Amanda/Holding.pm
+++ b/perl/Amanda/Holding.pm
@@ -199,7 +199,7 @@ sub _is_datestr {
}
sub _walk {
- my ($file_fn, $verbose) = @_;
+ my ($file_fn, $verbose, $take_pid) = @_;
# walk disks, directories, and files with nested loops
for my $disk (disks()) {
@@ -229,6 +229,21 @@ sub _walk {
next;
}
+ my $pidfn = File::Spec->catfile($dirfn, "pid");
+ if (open(my $pidh, $pidfn)) {
+ my $pid = <$pidh>;
+ if (kill($pid, 0) == 0) {
+ # pid is alive, skip this directory
+ next;
+ }
+ close($pidh);
+ }
+ if ($take_pid) {
+ open(my $pidh, ">", $pidfn) || next;
+ print $pidh "$$";
+ close($pidh);
+ }
+
while (defined(my $dirent = $dirh->read)) {
next if $dirent eq '.' or $dirent eq '..' or $dirent eq 'pid';
@@ -243,7 +258,9 @@ sub _walk {
$file_fn->($filename, $hdr);
}
+ $dirh->close();
}
+ $diskh->close();
}
}
@@ -276,7 +293,7 @@ sub files {
push @results, $filename;
};
- _walk($each_file_fn, $verbose);
+ _walk($each_file_fn, $verbose, 1);
return @results;
}
@@ -289,7 +306,7 @@ sub all_files {
my ($filename, $header) = @_;
push @results, { filename => $filename, header => $header };
};
- _walk($each_file_fn, $verbose);
+ _walk($each_file_fn, $verbose,1 );
return @results;
}
@@ -506,7 +523,7 @@ sub get_files_for_flush {
}
push @results, $filename;
};
- _walk($each_file_fn, 0);
+ _walk($each_file_fn, 0, 1);
return sort @results;
}
@@ -520,7 +537,7 @@ sub get_all_datestamps {
$datestamps{$header->{'datestamp'}} = 1;
};
- _walk($each_file_fn, 0);
+ _walk($each_file_fn, 0, 0);
return sort keys %datestamps;
}
diff --git a/server-src/holding.c b/server-src/holding.c
index dcd8031..1629b44 100644
--- a/server-src/holding.c
+++ b/server-src/holding.c
@@ -63,6 +63,8 @@ static int is_emptyfile(char *fname);
*/
static int is_datestr(char *fname);
+gboolean take_holding_pid(char *diskdir, int pid);
+static gboolean can_take_holding(char *pid_file);
/*
* Static functions */
@@ -354,18 +356,20 @@ holding_walk_disk(
is_cruft = 1; /* unexpected */
}
- if (per_dir_fn)
- proceed = per_dir_fn(datap,
- hdisk,
- workdir->d_name,
- hdir,
+ if (per_dir_fn) {
+ proceed = per_dir_fn(datap,
+ hdisk,
+ workdir->d_name,
+ hdir,
is_cruft);
- if (!is_cruft && proceed && stop_at != STOP_AT_DIR)
+ }
+ if (!is_cruft && proceed && stop_at != STOP_AT_DIR) {
holding_walk_dir(hdir,
datap,
stop_at,
per_file_fn,
per_chunk_fn);
+ }
}
closedir(dir);
@@ -431,6 +435,25 @@ typedef struct {
int fullpaths;
} holding_get_datap_t;
+/* Functor for holding_get_*; skip a holding directory that is cruft or whose
+ * pid file names another live process, otherwise record our parent's pid in it.
+ */
+static int
+holding_dir_stop_if_pid_fn(
+ gpointer datap G_GNUC_UNUSED,
+ char *hdisk G_GNUC_UNUSED,
+ char *element G_GNUC_UNUSED,
+ char *hdir,
+ int is_cruft)
+{
+ if (is_cruft) {
+ return 0;
+ }
+
+ return take_holding_pid(hdir, getppid());
+}
+
+
/* Functor for holding_get_*; adds 'element' or 'fqpath' to
* the result.
*/
@@ -495,7 +518,7 @@ holding_get_files(
} else {
holding_walk((gpointer)&data,
STOP_AT_FILE,
- NULL, NULL, holding_get_walk_fn, NULL);
+ NULL, holding_dir_stop_if_pid_fn, holding_get_walk_fn, NULL);
}
return data.result;
@@ -1010,6 +1033,14 @@ mkholdingdir(
int success = 1;
char *pid_file;
FILE *pid_FILE;
+ struct stat statbuf;
+
+ pid_file = g_strconcat(diskdir, "/pid", NULL);
+ // shorcut if the pid_file already exists
+ if (stat(pid_file, &statbuf) == 0) {
+ g_free(pid_file);
+ return success;
+ }
if (mkpdir(diskdir, 0770, (uid_t)-1, (gid_t)-1) != 0 && errno != EEXIST) {
log_add(L_WARNING, _("WARNING: could not create parents of %s: %s"),
@@ -1040,7 +1071,6 @@ mkholdingdir(
}
/* create a 'pid' file */
- pid_file = g_strconcat(diskdir, "/pid", NULL);
pid_FILE = fopen(pid_file, "w");
if (!pid_FILE) {
log_add(L_WARNING, _("WARNING: Can't create '%s': %s"),
@@ -1054,3 +1084,61 @@ mkholdingdir(
return success;
}
+
+static gboolean can_take_holding(
+ char *pid_file)
+{
+ FILE *pid_FILE;
+ int result = 1;
+
+ pid_FILE = fopen(pid_file, "r");
+ if (pid_FILE) {
+ char line[1000];
+ int pid;
+ if (fgets(line, 1000, pid_FILE) != NULL) {
+ pid = atoi(line);
+ if (pid != getpid() && pid != getppid()) {
+ /* check if pid is alive */
+ if (kill(pid, 0) != -1) {
+ result = 0;
+ }
+ }
+ }
+ fclose(pid_FILE);
+ }
+
+ return result;
+}
+
+gboolean take_holding_pid(char *diskdir, int pid);
+gboolean
+take_holding_pid(
+ char * diskdir,
+ int pid)
+{
+ int result = 1;
+ char *pid_file;
+ FILE *pid_FILE;
+
+ pid_file = g_strconcat(diskdir, "/pid", NULL);
+
+ if (!can_take_holding(pid_file)) {
+ g_free(pid_file);
+ return 0;
+ }
+
+ /* create a 'pid' file */
+ pid_FILE = fopen(pid_file, "w");
+ if (!pid_FILE) {
+ log_add(L_WARNING, _("WARNING: Can't create '%s': %s"),
+ pid_file, strerror(errno));
+ result = 0;
+ } else {
+ fprintf(pid_FILE, "%d", pid);
+ fclose(pid_FILE);
+ }
+ g_free(pid_file);
+
+ return result;
+}
+