Dave Smith wrote:
I have a co-worker who has hundreds of duplicate large files strewn about his Linux file system, and I want to help him consolidate them. What I have in mind is a program that will find all files with a certain extension, md5sum them all, and then create a master directory with the files, and replace all the duplicates with symbolic links. Does anyone know of such a utility?

I wrote my own little solution in Qt/C++, called qduper. It's not a GUI app, just runs in the console, but uses things like QDir, QString, QFileInfo, and QProcess to ease the job. I wrote it against Qt 4.3.2, but it should work with any Qt 4 version. It most likely will not compile against Qt 3 (any version). It only runs on *nix, not Windows. It might run on Mac, though it's untested.

The program recursively finds all duplicates in a list of directories, and moves one of the duplicate files from each set to a master directory, symbolically linking to it from all the other duplicates and its previous location. Oh, and you can specify a file suffix so it only operates on files that end in a certain extension (or leave it blank). If you've got a lot of copied files lying around, this might help you clean them up.

If anyone finds it useful, let me know. I sure did.

--Dave

P.S. This certainly isn't a shining example of Qt's awesomeness, but it works okay'ish, and the code isn't awful. :)
#include <cstdio>

#include <QByteArray>
#include <QDir>
#include <QDirIterator>
#include <QFile>
#include <QFileInfo>
#include <QIODevice>
#include <QList>
#include <QMap>
#include <QProcess>
#include <QRegExp>
#include <QSet>
#include <QString>
#include <QStringList>
#include <QtDebug>
#include <QtGlobal>

int main( int argc, char **argv )
{
    if( argc < 4 )
    {
        printf( "qduper finds duplicate files in a directory (and 
sub-directories), moves one of each duplicate set to a master directory, and 
then sym-links all the other duplicates to it. It preserves all files names.\n" 
);
        printf( "\n" );
        printf( "Usage: %s <file suffix> <master directory> <directory to 
search> [<directory to search> ... ]\n", argv[0] );
        return 1;
    }

    QString fileSuffix( argv[1] );
    QString masterDirectory( argv[2] );
    QSet<QString> directorySet;
    for( int i=3; i<argc; i++ )
        directorySet << argv[i];

    if( ! QDir(masterDirectory).exists() )
    {
        qWarning() << "Error: The specified master directory" << 
masterDirectory << "does not exist.";
        return 1;
    }

    masterDirectory = QFileInfo(masterDirectory).canonicalFilePath();

    qDebug() << "";
    qDebug() << "File suffix:     " << fileSuffix;
    qDebug() << "Master directory:" << masterDirectory;
    qDebug() << "Directory list:  " << QStringList(directorySet.toList()).join( 
", " );
    qDebug() << "";

    QRegExp md5regex( "^([a-z0-9]+)\\s*", Qt::CaseInsensitive );
    QMap<QString,QSet<QString> > md5map;
    foreach( QString directory, directorySet )
    {
        QDirIterator dirIterator( directory, QDirIterator::Subdirectories );
        while( dirIterator.hasNext() )
        {
            QString fileName = dirIterator.next();
            QFileInfo fileInfo( fileName );
            fileName = fileInfo.canonicalFilePath();
            if( fileInfo.isFile() && fileName.endsWith( fileSuffix ) )
            {
                if( fileInfo.dir() == QDir(masterDirectory) )
                {
                    qDebug() << "Skipping" << fileName << "beacuse it is inside 
the master directory.";
                    continue;
                }
                else if( fileInfo.isSymLink() )
                {
                    qDebug() << "Skipping" << fileName << "beacuse it is a 
symbolic link.";
                    continue;
                }

                QProcess process;
                process.start( "md5sum", QStringList() << fileName, 
QIODevice::ReadOnly );
                process.waitForFinished( -1 );

                if( process.exitCode() == 0 && process.exitStatus() == 
QProcess::NormalExit )
                {
                    QString md5output = process.readAllStandardOutput();
                    if( md5regex.indexIn( md5output ) != -1 )
                    {
                        QString md5 = md5regex.cap(1);
                        md5map[ md5 ].insert( fileName );
                    }
                }
                else
                {
                    qWarning() << "Warning:" << 
process.readAllStandardError().trimmed();
                }
            }
        }
    }

    int duplicateCount = 0;
    foreach( QString md5, md5map.keys() )
    {
        QStringList duplicateFileNames = md5map[md5].toList();
        if( duplicateFileNames.count() > 1 )
        {
            duplicateCount += duplicateFileNames.count();
            QString masterFile = duplicateFileNames[0];
            qDebug() << "I will move" << masterFile << "to directory" << 
masterDirectory;
            qDebug() << "   And sym-link the following files to it:";
            for( int i=1; i<duplicateFileNames.count(); i++ )
                qDebug() << "      " << duplicateFileNames[i];
        }
    }

    if( duplicateCount == 0 )
    {
        qDebug() << "No duplicates found with file suffix" << fileSuffix;
        return 0;
    }

    printf( "\nDo you want to do this?\n  yes or no: " );
    fflush( stdout );
    char answer[16];
    scanf( "%s", answer );
    if( ! QString(answer).toLower().contains( "y" ) )
    {
        qDebug() << "\nNot doing anything.\n";
        return 0;
    }

    foreach( QString md5, md5map.keys() )
    {
        QStringList duplicateFileNames = md5map[md5].toList();

        if( duplicateFileNames.count() > 1 )
        {
            QString masterFile = duplicateFileNames[0];
            QString masterFileNewName = masterDirectory + "/" + 
QFileInfo(masterFile).fileName();
            if( QProcess::execute( "mv", QStringList() << "-v" << masterFile << 
masterFileNewName ) == 0 )
            {
                 Q_ASSERT( ! QFile::exists(masterFile) );
                 if( QProcess::execute( "ln", QStringList() << "-s" << 
masterFileNewName << masterFile ) == 0 )
                 {
                     for( int i=1; i<duplicateFileNames.count(); i++ )
                     {
                         QString fileName = duplicateFileNames[i];
                         if( QProcess::execute( "rm", QStringList() << "-v" << 
fileName ) == 0 )
                         {
                             if( QProcess::execute( "ln", QStringList() << "-s" 
<< masterFile << fileName ) != 0 )
                             {
                                 qWarning() << "Error: Could not create 
sym-link from" << fileName << "to" << masterFile;
                             }
                         }
                         else
                         {
                             qWarning() << "Error: Could not remove file" << 
fileName << "to make it a sym-link.";
                         }
                     }
                 }
                 else
                 {
                     qWarning() << "Error: Could not sym-link master file" << 
masterFile << "to the master directory" << masterFileNewName;
                 }
            }
            else
            {
                qWarning() << "Error: Could not move file" << masterFile << "to 
the master directory" << masterDirectory;
            }
        }
    }

    return 0;
}
/*
PLUG: http://plug.org, #utah on irc.freenode.net
Unsubscribe: http://plug.org/mailman/options/plug
Don't fear the penguin.
*/

Reply via email to