#!/usr/bin/perl
#============================================================= -*-perl-*-
#
# BackupPC_fixLinks.pl: Identify and correct duplicate pool entries
#                       and missing links to pool
#
# DESCRIPTION
#   See below for detailed description of what it does and how it works
#   
# AUTHOR
#   Jeff Kosowsky
#
# COPYRIGHT
#   Copyright (C) 2008, 2009, 2010  Jeff Kosowsky
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#========================================================================
#
# Version 0.3.1, released December 2010
#
#========================================================================

use strict;
use warnings;
use File::Path;
use File::Find;
#use File::Compare;
use Getopt::Std;
use Fcntl;  #Required for RW I/O masks
use Digest::MD5;  #Used below to compute pool file digests

use lib "/usr/share/BackupPC/lib";
use BackupPC::FileZIO;
use BackupPC::Lib;
use BackupPC::jLib;
use BackupPC::Attrib qw(:all);

no  utf8;

die("BackupPC::Lib->new failed\n") if ( !(my $bpc = BackupPC::Lib->new) );
%Conf   = $bpc->Conf(); #Global variable defined in jLib.pm (do not use 'my')

my %opts;
if ( !getopts("i:l:fb:Vdsqvch", \%opts) || @ARGV > 0 || $opts{h} ||
	 ($opts{i} && $opts{l})) {
    print STDERR <<EOF;
usage: $0 [options]

  First, find duplicate entries in the pool.
  Then, search through backup tree to find links to dups. Also, look for
  (non-zero) files that are not linked to the pool (only 1 link).
  Optionally, relink dups and unlinked files (does not affect the pool)
  Optionally, run BackupPC_nightly to clean up the pool.

  Note: you may want to run BackupPC_nightly also before running this to make
  sure there are no holes in the pool (although this shouldn't happen...)

  Options:

    -i <inode file>  Read pool dups from file and proceed with 2nd pc tree pass
    -l <link file>   Read pool dups & bad pc links from file and proceed
                     with final repair pass
                     NOTE: -i and -l options are mutually exclusive. 
    -s               Skip first pass of generating (or tabulating if
                     -i or -l options are set) cpool dups
    -f               Fix links
    -c               Clean up pool - schedule BackupPC_nightly to run 
                     (requires server running)
    -b <path>        Search backups from <path> (relative to TopDir/pc)
    -V               Verify links of all files in pc path (WARNING: slow!)
    -d               Dry-run
    -q               Quiet - only print summaries & results
    -v               Verbose - print details on each relink
    -h               Print this usage message

EOF
exit(1);
}
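# Illustrative invocations (a sketch only; "myhost", the backup number and the
# output file name are hypothetical -- adjust to your installation):
#   BackupPC_fixLinks.pl > dups_links.txt        # scan everything, saving the report
#   BackupPC_fixLinks.pl -b myhost/123 -f        # find and fix links in one backup only
#   BackupPC_fixLinks.pl -l dups_links.txt -f -c # fix using a saved report, then
#                                                # schedule BackupPC_nightly to clean the pool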
my $file = ($opts{i} ? $opts{i} : $opts{l});
my $verifypc=$opts{V};
my $notquiet =!$opts{q};
my $verbose=$opts{v};
$dryrun = $opts{d}; #global variable in jLib.pm
my $fixlinks = $opts{f};
my $runnightly = $opts{c};
#$dryrun =1; #JJK - for testing force to always dry run
my $DRYRUN = ($dryrun == 0 ? "" : " DRY-RUN");
########################

my $md5 = Digest::MD5->new;


my $MaxLinks = $Conf{HardLinkMax};
my $cmprsslvl;
#Note we get rid of any extra lurking double slashes and any trailing slash for directories
(my $TopDir = $bpc->TopDir()) =~ s|//*|/|g; $TopDir =~ s|/$||;
(my $pooldir = $bpc->{PoolDir}) =~ s|//*|/|g; $pooldir =~ s|/$||;  
(my $cpooldir = $bpc->{CPoolDir}) =~ s|//*|/|g; $cpooldir =~ s|/$||;  
chdir($TopDir); #Do this because 'find' will later try to return to the working
                #directory, which may not be accessible if you are su backuppc

my $pc = "${TopDir}/pc";
my @backups;
if ($opts{b}) {
	(my $backups = "$pc/$opts{b}") =~ s|//*|/|g; $backups =~ s|/$||;
	die "ERROR: '$backups' directory doesn't exist\n" unless -d $backups;
	@backups = ($backups =~ m|^($pc/[^/]+)/?$| ? glob("$1/[0-9]*") : ($backups));
	# If path stops at host, then glob for all backup numbers.
}
else { # Look at all backups - begin 2 levels down i.e. in: TopDir/pc/<host>/<nn>
	@backups = glob("$pc/*/[0-9]*");
}

my %md5sumhash;  #Hash used to store previously seen full file md5sums for NewFiles
my (%inodHOA);
# First find and create hash of arrays of duplicated pool entries:
#  %inodHOA = (
#          <duplicated inode> => [ <name of equivalent parent> , <name of duplicate>, <pool/cpool>, <checksum>, <num links>, <size>],
#          ...
#          <duplicated inode> => [ <name of equivalent parent> , <name of duplicate>, <pool/cpool>, <checksum>, <num links>, <size>],
#          <duplicated inode> => [ <name of equivalent parent> , <name of duplicate>, <pool/cpool>, <checksum>, <num links>, <size>],
#       );
# where checksum = [=-#x@]<first byte of dup><first byte of parent>
#   = if files match
#   - if only decompressed versions match
#   # if only decompressed versions match (and flipped)
#   x if newlink/badlink
#   @ if same inode

my @MatchA;
# @MatchA = (<matchname>, <inoM>, <md5sum>, <dupmd5|matchtype>, <pool>, <comparflg><matchbyte><md5sumbyte>, <nlink1M>, <sizeM>)
# where:
#
#  matchname = File name and partial path (beginning after 'pc') to
#              the match in the pc tree. Note when we print it to a
#              file we enclose it in double-quotes "<matchname>"
#
#  inoM      = Inode of the match
#
#  md5sum    = Name of pool entry that has the same (uncompressed)
#              contents as matchname. The name equals the md5sum of
#              the (uncompressed) file plus potentially an _NNN suffix
#              if the data matches something other than the stem
#              md5sum in the pool (or equals all zeros if sum is not
#              calculable for some reason - shouldn't happen).
#              This is the target that we want to link matchname to
#
#  dupmd5    = Name of duplicate pool entry (which is again the md5sum
#              of the contents plus potentially an _NNN suffix). We
#              don't actually need to modify this file. We just unlink
#              all the backup files that share its inode and then let
#              BackupPC nightly delete it when it has no more other
#              links.
#
# matchtype = One of the following
#                NewLink  = if match has only one (hard) link but matches
#                           an existing pool element
#                NewFile  = if match has only one (hard) link but doesn't
#                           match an existing pool element
#                MD5Error = if for some reason couldn't calculate MD5sum
#                           (this shouldn't happen)
#
#  pool     =    pool/cpool
#
#  comparflg = Flag showing how the match and the target compare
#                 @ if this is a duplicate pool element with the SAME inode
#                   as its parent (i.e. as 'md5sum') -- shouldn't happen
#                 = if 'matchname' has the same contents as 'md5sum'
#                 - if 'matchname' inflates (i.e. uncompresses) to the same
#                   contents as 'md5sum' (this typically happens when 'md5sum'
#                   has a checksum seed and 'matchname' doesn't)
#                 # if 'dupmd5' inflates (i.e. uncompresses) to the same
#                   contents as 'md5sum' but this time 'dupmd5' has the
#                   checksum seed (and the parent, which now has a lower
#                   suffix, doesn't). For pool dups, this is the reverse case
#                   of '-'. Not applicable for NewLinks and NewFiles.
#                 x if MD5Error or if this is the first NewFile with these
#                   contents (and corresponding md5sum)
#                 y if NewFile but a previous NewFile already has these
#                   contents (and corresponding md5sum)
#
# matchbyte  = First byte of the matched file (or dup pool element)
# md5sumbyte = First byte of the corresponding (parent) pool entry that we
#              will be linking to
#                = d6 or d7 if file is compressed and checksum seed present
#                = 78 if file is compressed and checksum seed NOT present
#                = 00 for the not-yet-existent match for a NewFile
#   nlink1M    = Number of links to the match MINUS 1
#   sizeM      = Size of the match in bytes
#
#   Note for matches corresponding to duplicate pool elements, by design:
#   MatchA = (<matchname>, $inoM, @{$inodHOA{$inoM}})
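#   Illustrative printed line for a NewLink (hypothetical host, mangled file
#   name and digest; matchname is enclosed in double quotes as noted above):
#   "myhost/123/f%2fetc/fpasswd" 345678 5d41402abc4b2a76b9719d911017c592 NewLink cpool -78d6 1 1024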

my ($totdups, $collisions, $totlinks, $totsize) = (0, 0, 0, 0);
my ($totmatches, $totmd5errs, $totunlinked, $totnewfiles, $totnewlinks, $totfixed, $totbroken)
	= (0, 0, 0, 0, 0, 0, 0);

# Find or read-in list of duplicate pool entries
if (!$opts{s}) {  # Read in or find duplicate pool entries
	if ($opts{i} || $opts{l}) { #Read in and tabulate previously generated list of inodes from input file (note link entries will be ignored if they exist)
		read_inodHOA($file);
		print_inodHOA() if $notquiet;
	}
	else{ # Find inodes by recursing through the pool
		find(\&pool_dups, $pooldir, $cpooldir); 
	}
	print "Found $totdups dups (and $collisions true collisions) with $totlinks total links and $totsize size\n";
}

# Find backup files with broken/missing links or with links to duplicate pool entries
if ($opts{l}) { # Read in previously generated list of inodes & optionally start fixing links & duplicate pool entries if -f flag set
	read_LinkFile($file);
}
else { #Find bad links in pc path and optionally fix together with duplicate pool nodes if -f flag set
	foreach my $backup (@backups) {
		$backup =~ m#^($pc/[^/]*/[^/]*)#;
		$cmprsslvl = get_bakinfo($1, "compress"); #Note this is set at the level of the backup number
		$cmprsslvl = $Conf{CompressLevel} unless defined($cmprsslvl);
		print "Finding links in $backup\n";
		find(\&find_BadOrMissingLinks, $backup);
	}
}
$totunlinked = $totnewlinks + $totnewfiles;
print "Found $totmatches matching files and $totunlinked unlinked files ($totnewfiles NewFiles, $totnewlinks NewLinks, $totmd5errs MD5Errors)\n";
print "Fixed $totfixed out of $totbroken links\n" if $fixlinks;
run_nightly() if (!$dryrun && $runnightly);
print "DONE\n";
exit;

#####################################################################################################
sub pool_dups {
	my ($devD, $inoD, $modeD, $nlinkD, $uidD, $gidD, $rdevD, $sizeD, $therestD);
	my ($devP, $inoP, $modeP, $nlinkP, $uidP, $gidP, $rdevP, $sizeP, $therestP);
	my $comparflg;

	unless (-r) {  # First check for read error on found element
		warnerr "Can't read : $File::Find::name\n";
		return;
	}
	# Then get root/suffix and check if it is a potential duplicate
	return unless -f && m|(.*)_(.*)|; # file doesn't end with _<num>
	my $root=$1;
	my $suffix=$2;
	my $dup=$_;
	$File::Find::dir =~ m|(c?pool)/[/[:xdigit:]]+$|;
	my $thepool = $1;

	# Then get file information
	unless (($devD, $inoD, $modeD, $nlinkD, $uidD, $gidD, $rdevD, $sizeD, $therestD) 
			= stat($dup)) {
		warnerr "Can't stat: $File::Find::name\n";
		return;
	}
	my $prevsuffix = ($suffix == 0 ? '' : '_' . ($suffix -1));
	warnerr "Hole in pool chain at $root$prevsuffix" unless -f "$root$prevsuffix";

	# Then check to see if any of its "parents" are duplicates

	my $parent = $root;
	for (my $i=-1; $i <  $suffix; $i++, $parent="$root\_$i" ) { 
        #Start at base of chain and move up (note start with -1 for root)
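        #e.g. (illustrative digest) for dup '5d41..._2' this checks '5d41...',
        #then '5d41..._0' and '5d41..._1' in turn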
		unless( -f $parent ) {
			warnerr "Parent not a file or unreadable: $File::Find::dir/$parent\n";
			next;
		}
		($devP, $inoP, $modeP, $nlinkP, $uidP, $gidP, $rdevP, $sizeP, $therestP) = stat($parent);
		if ($inoP == $inoD) { #same inodes
			$comparflg='@';
		}
		elsif (($nlinkP + $nlinkD) >= $MaxLinks) {
			next; # Too many links even if files the same
		}
		elsif ( ($comparflg = compare_files($parent,$dup, ($thepool eq "cpool" ? 1 :0))) > 0 ) { #Found match
			$comparflg = ($comparflg == 1 ? '=' : '-');
		}
		else { next; } # Parent is not a copy
		my $fbyteD = firstbyte("$File::Find::dir/$dup");
		my $fbyteP = firstbyte("$File::Find::dir/$parent");
		if(($fbyteD eq 'd6' || $fbyteD eq 'd7') && 
		   !($fbyteP eq 'd6' || $fbyteP eq 'd7'))
		  #NOTE: compressed file without checksums starts with 0x78
		  #      compressed file with checksums starts with 0xd6 or 0xd7
		{  #swap $dup & $parent if only $dup has rsync seed
			my $temp = $dup; $dup = $parent; $parent = $temp;
			$temp = $fbyteD; $fbyteD = $fbyteP;	$fbyteP = $temp;
			$nlinkD = $nlinkP; $sizeD = $sizeP;
			$comparflg='#';
		}
		$inodHOA{$inoD} = [$parent, $dup, $thepool, $comparflg.$fbyteD.$fbyteP, --$nlinkD, $sizeD];
		print "$inoD @{ $inodHOA{$inoD} }\n" if $notquiet;
#		print "$inoD $parent $dup $thepool $comparflg, $nlinkD $sizeD\n";
		$totdups++;
		$totlinks += $nlinkD;
		$totsize += $sizeD;
		return;  #Earliest duplicate checksum (i.e. parent) in the chain found so stop going down chain
	}
	# No matching copies found in the chain
	print "$inoD $dup COLLISION $thepool X $nlinkD $sizeD\n" if $notquiet;
	$collisions++;
}

sub print_inodHOA {
	for my $inode (keys %inodHOA) {
		print "$inode @{ $inodHOA{$inode} }\n";
#		print "$inodHOA{$inode}[0] $inodHOA{$inode}[1] etc...\n";
	}
}

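# Reads a file of pool-dup lines in the format pool_dups prints above:
#   <inode> <parent md5sum> <dup md5sum> <pool|cpool> <flag+bytes> <nlinks> <size>
# e.g. (hypothetical values):
#   123456 5d41402abc4b2a76b9719d911017c592 5d41402abc4b2a76b9719d911017c592_0 cpool =d6d6 4 18231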
sub read_inodHOA {
	my $file=$_[0];
	$totdups = $collisions = $totlinks = $totsize = 0;
	die "Error: file not readable: $file\n" unless -f $file && -r $file;
	open(IN,$file) || die "Can't open $file for reading";
	while(<IN>) {
		m|^(\d+)\s+([[:xdigit:]]+(_\d+)?)\s+([[:xdigit:]]+(_\d+)?)\s+(c?pool)\s+([-=#@][[:xdigit:]]+)\s+(\d+)\s+(\d+)| || next;
		$inodHOA{$1} = [$2, $4, $6, $7, $8, $9]; 
		$totdups++;
		$totsize += $9;
		$totlinks += $8;
	}
}
		
sub find_BadOrMissingLinks {
	my $fixed ='';
	unless (-r) {  # First check for read error on found element
		warnerr "Can't read : $File::Find::name\n";
		return;
	}
	return unless -f; #Not a file
	return unless m|^f| || m|^attrib$|; # Skip files that are neither mangled ('f' prefix) nor attrib files
	my $matchtype= BadOrMissingLinks($File::Find::name);
	return if $matchtype < 0;
	if($fixlinks && $matchtype > 0) {
		$totbroken++;
		if(fix_links($matchtype) > 0) { #Go fix link...
			$totfixed++;
			$fixed=" FIXED$DRYRUN";
		}
		else {$fixed=" BROKEN$DRYRUN";}
	}
	if ($notquiet) {
		my $name = shift(@MatchA);
		print "\"" . $name . "\" " . join(" ", @MatchA) . "$fixed\n";
	}
}

# Return -1 if no problem detected with link
# Return -2 if can't stat file (shouldn't happen)
# Return 0 if MD5Error - shouldn't happen
# Return 1 if links to pool dup in %inodHOA
# Return 2 if no links to pool but matching pool entry found (NewLink)
# Return 3 if no links to pool and no matching pool entry found (NewFile-x)
# Return 4 if no links to pool and no matching pool entry, but contents match a
#          previously seen NewFile (NewFile-y)
sub BadOrMissingLinks {
	my $matchpath = $_[0];
	(my $matchname = $matchpath) =~ s|^$pc/*||; # Delete leading path directories (up to machine)

	my $rettype;
	my $matchtype;
	my ($devM, $inoM, $modeM, $nlinkM, $uidM, $gidM, $rdevM, $sizeM, $therestM);

	unless (($devM, $inoM, $modeM, $nlinkM, $uidM, $gidM, $rdevM, $sizeM, $therestM)
			= stat($_)) {
		warnerr "Can't stat: $matchpath\n";
		return -2; #This really shouldn't happen!
	}
	if (exists $inodHOA{$inoM}) { #File links to dup pool element in our list
		@MatchA = ($matchname, $inoM, @{$inodHOA{$inoM}});
#		print "\"$matchname\" $inoM @{ $inodHOA{$inoM} }\n";
		$totmatches++;
		return 1;  #type=1
	}
	elsif($sizeM == 0 || ($nlinkM > 1 && !$verifypc)){
		return -1; #Zero-length file, or already multiply-linked (presumed linked to pool)
	}
	else {
		my $matchbyte = firstbyte($matchpath);
		my $comparflg = 'x';  # Default if no link to pool
		my $matchtype = "NewFile"; # Default if no link to pool
		my $md5sumbyte = '00'; # Default if no link to pool
		my $thepool = ($cmprsslvl > 0 ? "cpool" : "pool");
		my $thepooldir = ($cmprsslvl > 0 ? $cpooldir : $pooldir);
		my $md5sum = zFile2MD5($bpc, $md5, $matchpath, 0, $cmprsslvl);
		if ($md5sum eq "-1") { #Can't create MD5sum
			$md5sum = "00000000000000000000000000000000";
			$matchtype = "MD5Error";
			$totmd5errs++;
			$rettype=0;
			goto match_return;
		}
		my $md5sumpathbase = $bpc->MD52Path($md5sum, 0, $thepooldir);
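		#MD52Path fans the pool out three directory levels deep using the first
		#hex digits of the digest, e.g. (illustrative) .../cpool/5/d/4/5d41402a...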
		my $i;
		if($verifypc) {
			for ($i=-1, my $md5sumpath = $md5sumpathbase; 
				 -f $md5sumpath; $md5sumpath = $md5sumpathbase . '_' . ++$i) {
				#Start at the root, looking for inode match in the pool...
				return -1 if($inoM ==  (stat($md5sumpath))[1]);
			}
			#Otherwise, pc file not found in pool
		}
		# Now we know we have a pc file that doesn't link to the pool...
		for ($i=-1, my $md5sumpath = $md5sumpathbase; 
			 -f $md5sumpath; $md5sumpath = $md5sumpathbase . '_' . ++$i) {
            #Again start at the root, try to find file content match in pool...
			if ((my $cmpresult = compare_files ($matchpath, $md5sumpath, $cmprsslvl)) > 0) { #Exact file match found

				my $inod =(stat($md5sumpath))[1]; #inode
				if (exists $inodHOA{$inod}) { #Oops target set to be relinked
					$md5sum = $inodHOA{$inod}[0]; # Set to parent
					$md5sumpath =$bpc->MD52Path($md5sum, 0, $thepooldir);
					$cmpresult = compare_files($matchpath,$md5sumpath, $cmprsslvl);
				$|++; warn "Note: NewLink target is also a duplicate pool entry - relinking to its parent instead\n";
				}
				else {
					($md5sum .= '_' . $i) if $i >= 0;

				}
				$comparflg = ($cmpresult == 1 ? '=' : '-');
				$md5sumbyte = firstbyte($md5sumpath);
				$matchtype = "NewLink";
				$totnewlinks++;
				$rettype=2; #NewLink
				goto match_return;
			} #Otherwise, continue up the chain looking for a pool match...
		}
		$totnewfiles++; #Otherwise must be a NewFile since not found in pool
		my $fullmd5sum = zFile2FullMD5($bpc, $md5, $matchpath, $cmprsslvl);
		($md5sum .= '_' . $i) if $i >= 0;  # Name of first empty pool slot
		if ($md5sumhash{$fullmd5sum}) {   #Already seen before!
			$comparflg = 'y';
			$md5sum = $md5sumhash{$fullmd5sum};
			$rettype=4; #NewFile-y
		}
		else {
			$md5sumhash{$fullmd5sum} = $md5sum;
			$rettype=3; #NewFile-x
		}

	  match_return:
		@MatchA = ($matchname, $inoM, $md5sum, $matchtype, $thepool, ${comparflg}.${matchbyte}.${md5sumbyte}, $nlinkM, $sizeM);
#		print "\"$matchname\" $inoM $md5sum $matchtype $thepool ${comparflg}${matchbyte}${md5sumbyte} $nlinkM $sizeM\n";
		return $rettype;
	}
}

#Read in link file for matching pool md5sums (dups), NewFiles, and NewLinks; don't read in MD5Error entries or other errors
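#Expected line formats match what find_BadOrMissingLinks prints above, e.g.
#(hypothetical host/file names; digests abbreviated here as <md5sum>/<dupmd5>):
#  "myhost/123/f%2fetc/fpasswd" 345678 <md5sum> <dupmd5> cpool =d6d6 3 1024
#  "myhost/123/f%2fetc/fhosts" 345679 <md5sum> NewLink cpool -78d6 1 512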
sub read_LinkFile {
	my $file=$_[0];
	my $matchtype;
	my $fixed='';
	die "Error: file not readable: $file\n" unless -f $file && -r $file;
	open(IN,$file) || die "Can't open $file for reading";
	while(<IN>) {
		$matchtype = read_match($_);
		++$totmatches if $matchtype==1;
		++$totnewlinks if $matchtype==2 || $matchtype==4;
		++$totnewfiles if $matchtype==3;
		if($fixlinks && $matchtype > 0) {
			$totbroken++;
			if (fix_links($matchtype) > 0) {
				$totfixed++;
				$fixed=" FIXED$DRYRUN";
			}
			else {$fixed=" BROKEN$DRYRUN";}
		}
		my $name = shift(@MatchA);
		print "\"" . $name . "\" " . join(" ", @MatchA) . "$fixed\n" 
			if $matchtype >= 0 && $notquiet;
	}
}

sub read_match {
	my $ret=-1;
	if (m|^"(.*)"\s+(\d+)\s+([[:xdigit:]]+(_\d+)?)\s+([[:xdigit:]]+(_\d+)?)\s+(c?pool)\s+([-=#@][[:xdigit:]]+)\s+(\d+)\s+(\d+)|) {
		$ret=1; #Dup match:  Link to dup node in pool
	}
	elsif (m|^"(.*)"\s+(\d+)\s+([[:xdigit:]]+(_\d+)?)\s+((NewLink))\s+(c?pool)\s+([-=][[:xdigit:]]+)\s+(\d+)\s+(\d+)|) {
		$ret=2; #NewLink: File without links but has matching pool entry (Note parentheses added to keep numbering the same)
	}
	elsif (m|^"(.*)"\s+(\d+)\s+([[:xdigit:]]+(_\d+)?)\s+((NewFile))\s+(c?pool)\s+(x[[:xdigit:]]+)\s+(\d+)\s+(\d+)|) {
		$ret=3; #NewFile-x: File without links and without existing matching pool entry and without a previous NewFile
		        #with the same content (Note parentheses added to keep numbering the same)
	}
	elsif (m|^"(.*)"\s+(\d+)\s+([[:xdigit:]]+(_\d+)?)\s+((NewFile))\s+(c?pool)\s+(y[[:xdigit:]]+)\s+(\d+)\s+(\d+)|) {
		$ret=4; #NewFile-y: File without links and without existing  matching pool entry but a previous NewFile with the same
		        #content will previously have created the new pool entry (Note parentheses added to keep numbering the same)
	}
	else {return -1;}
	@MatchA = ( $1, $2, $3, $5, $7, $8, $9, $10);
	return $ret;
}

sub fix_links {
	my ($type) = @_;
	my ($matchname, $inoM, $md5sum, $matchtype, $thepool, $checksumbytes, $nlinkM, $sizeM) = @MatchA;
	$checksumbytes =~ m|^(.)(..)(..)$|;
	my $cmprflag = $1;
	my $matchbyte = $2;
	my $md5sumbyte = $3;
	my $md5sumpath = $bpc->MD52Path($md5sum, 0, ($thepool eq "cpool" ? $cpooldir : $pooldir));
	my $matchpath = "$pc/$matchname";
	my $compress = ($thepool eq "cpool" ? 1 : 0);

	#First, perform extra checks (should be unnecessary, but I'm paranoid)
	unless (-r $matchpath) {
		warnerr "\"$matchpath\" - Can't read file\n";
		return -1;
	}
	my ($devMM, $inoMM, $modeMM, $nlinkMM, $uidMM, $gidMM, $rdevMM, $sizeMM, $therestMM) = stat($matchpath);
	if ($inoM != $inoMM || $sizeM != $sizeMM) {
		warnerr "\"$matchpath\" - Something changed... Inode or size doesn't match previous\n";
		return -1;
	}

	$type = 3 if $dryrun && $type == 4;  # For dry-run, NewFile-y behaves like NewFile-x since the link is not created
	if (($type == 1 && $matchtype =~ m|^[[:xdigit:]]+(_\d+)?$|)  || #Duplicate pool entry
		($type == 2 && $matchtype =~ m|^NewLink$|) ||  #New Link
		($type == 4 && $matchtype =~ m|^NewFile$|)) {  #New File with previously created link (by previous NewFile)
		# Unlink $matchname and relink to $md5sum

		unless ( -r $md5sumpath) {
			warnerr "\"$matchname\" - Can't read new link target: \"$md5sum\"\n";
			return -1;
		} 

		my ($devP, $inoP, $modeP, $nlinkP, $uidP, $gidP, $rdevP, $sizeP, $therestP) = stat($md5sumpath);
		if (($nlinkP + 1) >= $MaxLinks) { 
			$|++; warn "Warn: \"$matchname\" - Linking would exceed HardLinkMax for  \"$md5sum\"\n";
			return -1; #Note: this still leaves everything OK, since the file is still linked to the pool;
			           #it just means we can't free up an extra pool entry (which should rarely happen
			           #anyway since we have already checked this earlier)
		}
		if(compare_files($matchpath, $md5sumpath, $compress) <= 0) {
			warnerr "\"$matchname\" - contents don't match \"$md5sum\"\n";
			return -1;
		}

		if(!junlink($matchpath)){
			warnerr "\"$matchname\" - unlink failed\n";
			return -1;	
		}
	    if(!jlink($md5sumpath, $matchpath)){
			warnerr "\"$matchname\" - link from \"$md5sum\" failed\n";
			return -1;
			}
		print "\"$matchname\" successfully (re)linked from $matchtype [$inoM] to $md5sum [$inoP]" if $verbose;
		return 1;
	}
	elsif ($type == 3 && $matchtype =~ m|^NewFile$|) {  #New File
		# Make new link in pool directory, adding additional subdirectories as needed
		if ( -r $md5sumpath) {  # Check to see if something else took the planned target
			warnerr "\"$matchname\" - target already exists: \"$md5sum\"\n";
			return -1;
		} 
		$md5sum =~ m|^([[:xdigit:]]+)|; # Strip off the suffix
		unless (zFile2MD5($bpc, $md5, $matchpath, 0, $compress) eq $1) {
			warnerr "\"$matchname\" - md5sum doesn't match \"$md5sum\"\n";
			return -1;
		}
		$md5sumpath =~ m|(.*)/|;  # Find the containing directory
		print "\"$matchname\" - Making new pool directory $1\n" if ($verbose && ! -d $1);
		jmkpath($1, 0, 0777) if (!-d $1);
	    if (!jlink($matchpath, $md5sumpath)){ # Note reverse order of link from types 1&2
			warnerr "\"$matchname\" - link to \"$md5sum\" failed\n";
			return -1;
		}
		print "\"$matchname\" successfully linked to new file $md5sum [$inoM]" if $verbose;
		return 1;
	}
	else {
		warnerr "Invalid type ($type) doesn't match $matchtype\n";
		return -1;
	}
}

sub compare_files
{
	my ($file1, $file2, $compress)=@_;
	return 1 if !jcompare($file1, $file2);  #Matches as-is
	return 2 if $compress && !zcompare($file1, $file2, $compress);  #Matches post-inflation
	return 0; # Not a match or error
}

#Simple wrappers to protect when just doing dry runs
sub jlink
{
	return 1 if $dryrun;
	link $_[0], $_[1];
}

sub junlink
{
	return 1 if $dryrun;
	unlink @_;
}

sub jmkpath
{
	return 1 if $dryrun;
	mkpath $_[0], $_[1], $_[2];
}
