Dave Smith wrote:
I have a co-worker who has hundreds of duplicate large files strewn
about his Linux file system, and I want to help him consolidate them.
What I have in mind is a program that will find all files with a
certain extension, md5sum them all, and then create a master directory
with the files, and replace all the duplicates with symbolic links.
Does anyone know of such a utility?
I wrote my own little solution in Qt/C++, called qduper. It's not a GUI
app, just runs in the console, but uses things like QDir, QString,
QFileInfo, and QProcess to ease the job. I wrote it against Qt 4.3.2,
but it should work with any Qt 4 version. It most likely will not
compile against Qt 3 (any version). It only runs on *nix, not Windows.
It might run on Mac, though it's untested.
The program recursively finds all duplicates in a list of directories,
and moves one of the duplicate files from each set to a master
directory, symbolically linking to it from all the other duplicates and
its previous location. Oh, and you can specify a file suffix so it only
operates on files that end in a certain extension (or leave it blank).
If you've got a lot of copied files lying around, this might help you
clean them up.
If anyone finds it useful, let me know. I sure did.
--Dave
P.S. This certainly isn't a shining example of Qt's awesomeness, but it
works okay'ish, and the code isn't awful. :)
#include <cstdio>

#include <QDirIterator>
#include <QStringList>
#include <QByteArray>
#include <QFileInfo>
#include <QtGlobal>
#include <QIODevice>
#include <QProcess>
#include <QRegExp>
#include <QString>
#include <QtDebug>
#include <QFile>
#include <QList>
#include <QMap>
#include <QSet>
#include <QDir>
int main( int argc, char **argv )
{
if( argc < 4 )
{
printf( "qduper finds duplicate files in a directory (and
sub-directories), moves one of each duplicate set to a master directory, and
then sym-links all the other duplicates to it. It preserves all files names.\n"
);
printf( "\n" );
printf( "Usage: %s <file suffix> <master directory> <directory to
search> [<directory to search> ... ]\n", argv[0] );
return 1;
}
QString fileSuffix( argv[1] );
QString masterDirectory( argv[2] );
QSet<QString> directorySet;
for( int i=3; i<argc; i++ )
directorySet << argv[i];
if( ! QDir(masterDirectory).exists() )
{
qWarning() << "Error: The specified master directory" <<
masterDirectory << "does not exist.";
return 1;
}
masterDirectory = QFileInfo(masterDirectory).canonicalFilePath();
qDebug() << "";
qDebug() << "File suffix: " << fileSuffix;
qDebug() << "Master directory:" << masterDirectory;
qDebug() << "Directory list: " << QStringList(directorySet.toList()).join(
", " );
qDebug() << "";
QRegExp md5regex( "^([a-z0-9]+)\\s*", Qt::CaseInsensitive );
QMap<QString,QSet<QString> > md5map;
foreach( QString directory, directorySet )
{
QDirIterator dirIterator( directory, QDirIterator::Subdirectories );
while( dirIterator.hasNext() )
{
QString fileName = dirIterator.next();
QFileInfo fileInfo( fileName );
fileName = fileInfo.canonicalFilePath();
if( fileInfo.isFile() && fileName.endsWith( fileSuffix ) )
{
if( fileInfo.dir() == QDir(masterDirectory) )
{
qDebug() << "Skipping" << fileName << "beacuse it is inside
the master directory.";
continue;
}
else if( fileInfo.isSymLink() )
{
qDebug() << "Skipping" << fileName << "beacuse it is a
symbolic link.";
continue;
}
QProcess process;
process.start( "md5sum", QStringList() << fileName,
QIODevice::ReadOnly );
process.waitForFinished( -1 );
if( process.exitCode() == 0 && process.exitStatus() ==
QProcess::NormalExit )
{
QString md5output = process.readAllStandardOutput();
if( md5regex.indexIn( md5output ) != -1 )
{
QString md5 = md5regex.cap(1);
md5map[ md5 ].insert( fileName );
}
}
else
{
qWarning() << "Warning:" <<
process.readAllStandardError().trimmed();
}
}
}
}
int duplicateCount = 0;
foreach( QString md5, md5map.keys() )
{
QStringList duplicateFileNames = md5map[md5].toList();
if( duplicateFileNames.count() > 1 )
{
duplicateCount += duplicateFileNames.count();
QString masterFile = duplicateFileNames[0];
qDebug() << "I will move" << masterFile << "to directory" <<
masterDirectory;
qDebug() << " And sym-link the following files to it:";
for( int i=1; i<duplicateFileNames.count(); i++ )
qDebug() << " " << duplicateFileNames[i];
}
}
if( duplicateCount == 0 )
{
qDebug() << "No duplicates found with file suffix" << fileSuffix;
return 0;
}
printf( "\nDo you want to do this?\n yes or no: " );
fflush( stdout );
char answer[16];
scanf( "%s", answer );
if( ! QString(answer).toLower().contains( "y" ) )
{
qDebug() << "\nNot doing anything.\n";
return 0;
}
foreach( QString md5, md5map.keys() )
{
QStringList duplicateFileNames = md5map[md5].toList();
if( duplicateFileNames.count() > 1 )
{
QString masterFile = duplicateFileNames[0];
QString masterFileNewName = masterDirectory + "/" +
QFileInfo(masterFile).fileName();
if( QProcess::execute( "mv", QStringList() << "-v" << masterFile <<
masterFileNewName ) == 0 )
{
Q_ASSERT( ! QFile::exists(masterFile) );
if( QProcess::execute( "ln", QStringList() << "-s" <<
masterFileNewName << masterFile ) == 0 )
{
for( int i=1; i<duplicateFileNames.count(); i++ )
{
QString fileName = duplicateFileNames[i];
if( QProcess::execute( "rm", QStringList() << "-v" <<
fileName ) == 0 )
{
if( QProcess::execute( "ln", QStringList() << "-s"
<< masterFile << fileName ) != 0 )
{
qWarning() << "Error: Could not create
sym-link from" << fileName << "to" << masterFile;
}
}
else
{
qWarning() << "Error: Could not remove file" <<
fileName << "to make it a sym-link.";
}
}
}
else
{
qWarning() << "Error: Could not sym-link master file" <<
masterFile << "to the master directory" << masterFileNewName;
}
}
else
{
qWarning() << "Error: Could not move file" << masterFile << "to
the master directory" << masterDirectory;
}
}
}
return 0;
}
/*
PLUG: http://plug.org, #utah on irc.freenode.net
Unsubscribe: http://plug.org/mailman/options/plug
Don't fear the penguin.
*/