Update of /cvsroot/nutch/nutch/src/java/net/nutch/util
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv4633/src/java/net/nutch/util

Modified Files:
        FileUtil.java 
Added Files:
        NutchFile.java NutchFileSystem.java 
        NutchGenericFileSystem.java NutchNFSFileSystem.java 
        NutchRemoteFileSystem.java ShareGroup.java ShareSet.java 
Log Message:

  Full commit for Nutch distributed WebDB.

  This is a lot of new code that implements the multi-machine
web database.  This means we should be able to update the db
with multiple CPUs and disks simultaneously.  (This has been
a major bottleneck for us so far.)

  This commit also contains files for the NutchFileSystem, which
is a rudimentary distributed file system.  The Distributed WebDB
is built on top of NutchFS.  There are two implementations of
NutchFS: one for machines mounting NFS (network file system), and
one for machines that need to use a remote SSL connection.  The
former is well-tested, but the latter is still a little sketchy.

  I've done what little testing I can do on my laptop.  I'm putting
code back so that other people can take a look, and so we can put
it on multiple machines.

  Note that I've put changes back to the files "DistributedWebDBWriter"
and "DistributedWebDBReader".  These are meant to replace "WebDBWriter" 
and "WebDBReader," but I didn't want to disturb the source base
until the distributed code is tested further.   



--- NEW FILE: NutchFile.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;

/************************************************************
 * A class that names a file in the "NutchFileSpace".  You can 
 * convert a NutchFile to a real file with the help of
 * an instance of NutchFileSystem.
 *
 * @author Mike Cafarella
 *************************************************************/
public class NutchFile {
    /** Suffix that turns a file's name into the name of its 'completed' flag file. */
    private static final String COMPLETED_SUFFIX = ".completed";

    String dbName;
    String shareGroupName;
    File name;
    NutchFileSystem nfs;

    /**
     * A NutchFile contains:
     *   dbName, which labels the cooperating NutchFileSystem it
     *           belongs to.
     *   shareGroupName,  which tells the NutchFileSystem which group should get
     *           access to this file.  If the value is null, then no remote
     *           group will get access.
     *   name, which gives the file a unique name.
     *
     * @param nfs            the filesystem this file is resolved through
     * @param dbName         name of the database the file lives in
     * @param shareGroupName name of the owning share group, or null
     * @param name           path of the file below its db/share-group prefix
     */
    public NutchFile(NutchFileSystem nfs, String dbName, String shareGroupName, File name) {
        this.nfs = nfs;
        this.dbName = dbName;
        this.shareGroupName = shareGroupName;
        this.name = name;
    }

    /**
     * Create a NutchFile from a previous one that is a directory.
     * The new file shares the directory's filesystem, db name and
     * share group; its name is 'name' resolved below the directory's name.
     *
     * @param dir  the NutchFile naming the containing directory
     * @param name the child's name relative to that directory
     */
    public NutchFile(NutchFile dir, String name) {
        this.nfs = dir.nfs;
        this.dbName = dir.getDBName();
        this.shareGroupName = dir.getShareGroupName();
        this.name = new File(dir.getName(), name);
    }

    /**
     * DB Name the NutchFile lives in.
     */
    public String getDBName() {
        return dbName;
    }

    /**
     * Get the name of the sharegroup this file belongs to.
     * May be null (see the constructor).
     */
    public String getShareGroupName() {
        return shareGroupName;
    }

    /**
     * Terminating filename for the NutchFile.
     */
    public File getName() {
        return name;
    }

    /**
     * Grab a handle to the NutchFileSystem
     */
    public NutchFileSystem getFS() {
        return nfs;
    }

    /**
     * Get the almost-fully-qualified name for this NutchFile:
     * dbName/shareGroupName/name.  It is "almost" qualified because
     * the caller still has to resolve it against a root directory.
     * When the share group name is null that path component is omitted.
     */
    public String getFilename() {
        return qualify(name.getPath());
    }

    /**
     * Get the almost-fully-qualified name for this NutchFile's
     * 'completed' flag file, i.e. getFilename() with ".completed"
     * appended.
     */
    public String getCompleteFlagName() {
        return qualify(name.getPath() + COMPLETED_SUFFIX);
    }

    /**
     * Prefix 'leaf' with the db name and, when there is one, the
     * share group name.  new File(parent, null) throws a
     * NullPointerException, so a null share group has to be skipped
     * explicitly rather than passed through.
     */
    private String qualify(String leaf) {
        File prefix = new File(dbName);
        if (shareGroupName != null) {
            prefix = new File(prefix, shareGroupName);
        }
        return new File(prefix, leaf).getPath();
    }

    /**
     * Same text as getFilename().
     */
    public String toString() {
        return getFilename();
    }
}

--- NEW FILE: NutchFileSystem.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;

/****************************************************************
 * NutchFileSystem is an interface for a fairly simple
 * distributed file system.  A Nutch installation might consist
 * of multiple machines, which should swap files transparently.
 * This interface allows other Nutch systems to find and place
 * files into the distributed Nutch-controlled file world.
 *
 * The standard job of NutchFileSystem is to take the location-
 * independent NutchFile objects, and resolve them using local
 * knowledge and local instances of ShareGroup.
 * 
 * @author Mike Cafarella
 *****************************************************************/
public interface NutchFileSystem {

    /**
     * Get a real File for a name that's not yet under NutchFS control.
     * This may improve performance later on when the
     * File is put() under NutchFS control.  It's also handy for
     * finding a file location where there is a lot of extra room.
     *
     * @return a local File the caller may create and fill in
     * @throws IOException if no working file can be allocated
     */
    public File getWorkingFile() throws IOException;

    /**
     * Associates a NutchFile with a given real-fs File.  The
     * real-world File will be moved to a proper location according
     * to its NutchFile representation.  It will be moved locally
     * or remotely, as appropriate.
     *
     * The given "real" File can no longer be assumed to exist at
     * the given location after the call to put().  In the future,
     * the File should only be obtained via its NutchFile identifier.
     *
     * This method returns nothing; in particular it does not hand
     * back whatever was previously stored under the NutchFile.
     *
     * @param nutchFile   the NutchFS name to store the data under
     * @param workingFile the real file (or directory) holding the data
     * @param overwrite   presumably whether existing content at the
     *                    target may be replaced -- confirm against the
     *                    implementation in use
     * @throws IOException if the data cannot be stored
     */
    public void put(NutchFile nutchFile, File workingFile, boolean overwrite) throws 
IOException;

    /**
     * Sometimes the NutchFileSystem user constructs a directory of many
     * subparts, often built slowly over time.  However, that highest-level
     * directory might not ever have been put(); instead, its subparts 
     * have been put(), one piece at a time.
     *
     * Eventually, though, all the subdirs will be in place, and the
     * entire directory structure will be complete.  That event is
     * signified by calling "completeDir".  This call will mark
     * the given directory as completed.
     *
     * @param nutchFile the directory to mark as completed
     * @throws IOException if the directory cannot be marked
     */
    public void completeDir(NutchFile nutchFile) throws IOException;

    /**
     * Obtains the indicated NutchFile, whether remote or local.
     * The function will block until the file is available.
     *
     * @param nutchFile the NutchFS name to resolve
     * @return the real File backing the NutchFile
     * @throws IOException if the file cannot be resolved
     */
    public File get(NutchFile nutchFile) throws IOException;

    /**
     * Same as above, but expires after the given number of ms, 
     * returning null.
     *
     * @param nutchFile the NutchFS name to resolve
     * @param timeout   maximum time to wait, in milliseconds
     * @return the real File, or null if it did not become available in time
     * @throws IOException if the file cannot be resolved
     */
    public File get(NutchFile nutchFile, long timeout) throws IOException;

    /**
     * Obtain a lock with the given NutchFile as the lock object
     *
     * @param lockFile  the NutchFile that serves as the lock
     * @param exclusive presumably true for an exclusive lock and false
     *                  for a shared one -- confirm against the implementation
     * @throws IOException if the lock cannot be obtained
     */
    public void lock(NutchFile lockFile, boolean exclusive) throws IOException;

    /**
     * Release the lock.  Must be in the lock() state.
     *
     * @param lockFile the NutchFile previously passed to lock()
     * @throws IOException if the lock cannot be released
     */
    public void release(NutchFile lockFile) throws IOException;

    /**
     * Delete the given NutchFile and everything below it.  This is
     * propagated to the different appropriate machines, the same
     * way a put() operation is.
     *
     * @param nutchFile the file or directory to remove
     * @throws IOException if the delete fails
     */
    public void delete(NutchFile nutchFile) throws IOException;

    /**
     * Rename the given NutchFile to something new.  Files cannot
     * be moved across share-spaces.  The change is propagated 
     * immediately to all participants in the share-space.  The
     * client is responsible for any necessary locking or process
     * synchronization.
     *
     * @param src the existing NutchFile
     * @param dst the new name for it
     * @throws IOException if the rename fails
     */
    public void renameTo(NutchFile src, NutchFile dst) throws IOException;

    /**
     * Close down the fs.
     *
     * @throws IOException if shutting down fails
     */
    public void close() throws IOException;
}

--- NEW FILE: NutchGenericFileSystem.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;
import java.nio.channels.*;

/****************************************************************
 * NutchGenericFileSystem implements the NutchFileSystem interface
 * and adds some generic utility methods for subclasses to use.
 *
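 * The standard task of any implementor of NutchFileSystem is to resolve
 * location-independent NutchFile objects to real files; this base class
 * supplies the parts of that job that do not depend on how files are
 * actually copied, deleted, renamed or locked.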
 *
 * @author Mike Cafarella
 ****************************************************************/
public abstract class NutchGenericFileSystem implements NutchFileSystem {
    /** Name of the lock file kept directly under dbRoot. */
    private static final String LOCK_FILE_NAME = "nutchfslock";

    /** Name prefix of the per-process temporary directories under dbRoot. */
    private static final String TMP_DIR_PREFIX = "localtmpdir";

    /** How long get() sleeps between checks for the 'completed' flag. */
    private static final long POLL_INTERVAL_MS = 1000L;

    File dbRoot, localTmp, flagFile;
    FileInputStream lockData;
    FileLock lock;
    ShareSet shareSet;
    // NOTE(review): this flag is stored but never read in this class;
    // put() always deletes the working file.  Confirm whether put()
    // was meant to honour it.
    boolean destructivePut;

    /**
     * Create a Nutch Filesystem at the indicated mounted
     * directory.
     *
     * @param dbRoot         root directory of the filesystem; created if missing
     * @param shareSet       the share groups to use, or null to build a
     *                       ShareSet from dbRoot
     * @param destructivePut whether files should be deleted after being copied
     * @throws IOException if a directory, lock or temp file cannot be set up
     */
    public NutchGenericFileSystem(File dbRoot, ShareSet shareSet, boolean destructivePut)
        throws IOException {
        if (shareSet == null) {
            this.shareSet = new ShareSet(dbRoot);
        } else {
            this.shareSet = shareSet;
        }

        //
        // 1.  Create/find main work area (which will receive files from
        //     other processes and may be shared).
        //
        this.dbRoot = dbRoot;
        if (! dbRoot.exists()) {
            dbRoot.mkdirs();
        }
        if (! dbRoot.isDirectory()) {
            throw new IOException("Directory " + dbRoot + " does not exist.");
        }

        //
        // 2.  Attempt to acquire an exclusive lock on the directory.
        //     If this succeeds, the process should then clear out the
        //     tmp storage area.  If this fails, just continue.
        //
        List tmpDirs = new ArrayList();
        File rootFiles[] = dbRoot.listFiles();
        // listFiles() returns null on an I/O error; the cleanup is only
        // best-effort, so treat that as "nothing to clean".
        if (rootFiles != null) {
            for (int i = 0; i < rootFiles.length; i++) {
                if (rootFiles[i].isDirectory()
                    && rootFiles[i].getName().startsWith(TMP_DIR_PREFIX)) {
                    tmpDirs.add(rootFiles[i]);
                }
            }
        }
        // If there are any tmpDirs for us to delete, try to do it.
        if (! tmpDirs.isEmpty()) {
            File exclusiveLockFile = new File(dbRoot, LOCK_FILE_NAME);
            exclusiveLockFile.createNewFile();
            FileOutputStream exclusiveLockData = new FileOutputStream(exclusiveLockFile);
            try {
                FileLock exclusiveLock = exclusiveLockData.getChannel().tryLock();

                // Once we have the lock, go and delete them
                if (exclusiveLock != null) {
                    try {
                        for (Iterator it = tmpDirs.iterator(); it.hasNext(); ) {
                            FileUtil.fullyDelete((File) it.next());
                        }
                    } finally {
                        exclusiveLock.release();
                    }
                }
            } finally {
                // Close the stream whether or not the lock was obtained;
                // otherwise the descriptor leaks when another process
                // already holds the lock.
                exclusiveLockData.close();
            }
        }

        //
        // 3.  Acquire a non-exclusive lock on the directory.  Block
        //     until this is acquired.  (The only thing preventing it
        //     would be another process in step 2.)
        //
        File lockFile = new File(dbRoot, LOCK_FILE_NAME);
        lockFile.createNewFile();
        this.lockData = new FileInputStream(lockFile);
        boolean locked = false;
        try {
            this.lock = lockData.getChannel().lock(0L, Long.MAX_VALUE, true);
            locked = true;
        } finally {
            if (! locked) {
                lockData.close();
            }
        }

        //
        // 4.  Create the tmp directory.  createTempFile() is used only
        //     to pick a unique name; the file is replaced by a directory.
        //
        this.localTmp = File.createTempFile(TMP_DIR_PREFIX, "", dbRoot);
        this.localTmp.delete();
        if (! localTmp.exists()) {
            localTmp.mkdirs();
        }
        if (! localTmp.isDirectory()) {
            throw new IOException("Directory " + localTmp + " does not exist.");
        }

        //
        // 5.  Create the src lock file.  This empty file is what gets
        //     copied into place as a 'completed' flag.
        //
        this.flagFile = File.createTempFile("flag", "tmp");

        //
        // 6.  Whether files should be deleted after being copied
        //
        this.destructivePut = destructivePut;
    }

    /**
     * Acquire a real File for a name that's not yet under NutchFS
     * control.  This may improve performance later on when the
     * File is put() under NutchFS control.  It's also handy for
     * finding a file location where there is a lot of extra room.
     *
     * The returned File names a fresh, unused path inside this
     * process's tmp directory; nothing exists at that path yet.
     */
    public File getWorkingFile() throws IOException {
        File f = File.createTempFile("tmp", "", localTmp);
        f.delete();
        return f;
    }

    /**
     * Wait for a NutchFile from somewhere in NutchSpace.  Translate
     * it to a regular old filesystem File.
     *
     * The file should already be in place.  So we wait until it is.
     */
    public File get(NutchFile nutchFile) throws IOException {
        return get(nutchFile, -1);
    }

    /**
     * Wait for a NutchFile for the specified amount of time.  Return null
     * if we don't get it before 'timeout' ms have elapsed.  A timeout
     * of zero or less means "wait forever".
     *
     * The file counts as available once its 'completed' flag file
     * exists under dbRoot.
     *
     * @throws InterruptedIOException if the waiting thread is interrupted;
     *         the thread's interrupt status is restored first
     */
    public File get(NutchFile nutchFile, long timeout) throws IOException {
        long startTime = System.currentTimeMillis();
        int numTries = 0;

        // The result is not needed here.  The lookup is kept from the
        // original code in case it validates the file's share group
        // -- TODO confirm and drop if it has no effect.
        shareSet.getShareGroup(nutchFile);

        File target = new File(dbRoot, nutchFile.getFilename());
        File completeFlag = new File(dbRoot, nutchFile.getCompleteFlagName());
        while (! completeFlag.exists()) {
            long pause = POLL_INTERVAL_MS;
            if (timeout > 0) {
                long remaining = timeout - (System.currentTimeMillis() - startTime);
                if (remaining <= 0) {
                    return null;
                }
                // Never sleep past the caller's deadline.
                pause = Math.min(pause, remaining);
            }

            try {
                Thread.sleep(pause);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
                throw new InterruptedIOException("Interrupted while waiting for "
                                                 + completeFlag);
            }

            numTries++;
            if (numTries > 10) {
                System.err.println("NutchGenericFileSystem waiting for file " +
                                   completeFlag);
            }
        }
        return target;
    }

    /**
     * Obtain a lock with the given NutchFile.  This might mean obtaining
     * locks across many different machines/filesystems.  That's fine,
     * as long as every machine always obtains the locks in a standard
     * ordering.
     *
     * An empty file is first put() under the NutchFile's name (without
     * overwriting) so that there is something to lock at every location.
     */
    public void lock(NutchFile nutchFile, boolean exclusive) throws IOException {
        File lockFile = getWorkingFile();
        lockFile.createNewFile();
        put(nutchFile, lockFile, false);

        ShareGroup sg = shareSet.getShareGroup(nutchFile);
        String locations[] = sg.getLocations();
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);
            lockFile(locMach, locStr, nutchFile.getFilename(), exclusive);
        }
    }

    /**
     * Release the lock for the given NutchFile at every location of
     * its share group.
     */
    public void release(NutchFile nutchFile) throws IOException {
        ShareGroup sg = shareSet.getShareGroup(nutchFile);
        String locations[] = sg.getLocations();
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);
            release(locMach, locStr, nutchFile.getFilename());
        }
    }

    /**
     * Add a single file or a directory of files to the filesystem.
     * If the source File is a directory, we want to reproduce
     * the entire directory structure, rooted at the given
     * NutchFile.
     *
     * The working file is deleted afterwards.
     */
    public void put(NutchFile nutchFile, File workingFile, boolean overwrite)
        throws IOException {
        if (workingFile.isDirectory()) {
            putDir(nutchFile, workingFile, overwrite);
        } else {
            putFile(nutchFile, workingFile, overwrite);
        }
        FileUtil.fullyDelete(workingFile);
    }

    /**
     * Add a directory and its contents to the filesystem.  The
     * directory's 'completed' flag is removed first and written again
     * only after every child has been put.
     */
    void putDir(NutchFile nutchDir, File workingDir, boolean overwrite)
        throws IOException {
        File workingFiles[] = workingDir.listFiles();
        if (workingFiles == null) {
            // listFiles() signals an I/O error with null.
            throw new IOException("Cannot list contents of " + workingDir);
        }

        ShareGroup sg = shareSet.getShareGroup(nutchDir);
        String locations[] = sg.getLocations();

        //
        // Remove target dir's completion flag
        //
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);
            deleteFile(locMach, locStr, nutchDir.getCompleteFlagName());
        }

        //
        // Put every contained item to the FS, under the same name
        // below the target directory.
        //
        for (int i = 0; i < workingFiles.length; i++) {
            NutchFile child = new NutchFile(nutchDir, workingFiles[i].getName());
            put(child, workingFiles[i], overwrite);
        }

        //
        // We've written dir's contents, so write out completion flag
        //
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);
            copyFile(flagFile, locMach, locStr, nutchDir.getCompleteFlagName(), true);
        }
    }

    /**
     * Add a single file to the filesystem, at every location of its
     * share group.
     */
    void putFile(NutchFile nutchFile, File workingFile, boolean overwrite)
        throws IOException {
        ShareGroup sg = shareSet.getShareGroup(nutchFile);
        String locations[] = sg.getLocations();
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);

            // Remove 'complete' flag
            deleteFile(locMach, locStr, nutchFile.getCompleteFlagName());

            // Write file, if necessary.
            copyFile(workingFile, locMach, locStr, nutchFile.getFilename(), overwrite);

            // Write 'complete' flag
            copyFile(flagFile, locMach, locStr, nutchFile.getCompleteFlagName(), true);
        }
    }

    /**
     * Complete the given directory by writing its 'completed' flag
     * at every location of its share group.
     */
    public void completeDir(NutchFile nutchFile) throws IOException {
        ShareGroup sg = shareSet.getShareGroup(nutchFile);
        String locations[] = sg.getLocations();
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);

            // Write 'complete' flag
            copyFile(flagFile, locMach, locStr, nutchFile.getCompleteFlagName(), true);
        }
    }

    /**
     * Take the file out of the NutchFileSystem: both the file itself
     * and its 'completed' flag, at every location of its share group.
     */
    public void delete(NutchFile nutchFile) throws IOException {
        ShareGroup sg = shareSet.getShareGroup(nutchFile);
        String locations[] = sg.getLocations();
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);

            deleteFile(locMach, locStr, nutchFile.getFilename());
            deleteFile(locMach, locStr, nutchFile.getCompleteFlagName());
        }
    }

    /**
     * Rename the thing.  Usually done at close.
     *
     * Blocks until the source is complete, then, for every location of
     * the source's share group, drops the source's 'completed' flag,
     * renames the content and writes the destination's flag.
     */
    public void renameTo(NutchFile src, NutchFile dst) throws IOException {
        // Make sure src file is complete
        File srcFile = get(src);

        ShareGroup sg = shareSet.getShareGroup(src);
        String locations[] = sg.getLocations();
        for (int i = 0; i < locations.length; i++) {
            String locMach = extractMachine(locations[i]);
            String locStr = extractPath(locations[i]);

            // Remove src complete flags
            deleteFile(locMach, locStr, src.getCompleteFlagName());

            // Rename contents
            renameFile(srcFile, locMach, locStr, dst.getFilename(), true);

            // Create target flags
            copyFile(flagFile, locMach, locStr, dst.getCompleteFlagName(), true);
        }
    }

    /**
     * Close down the Generic File System.  The shared lock on dbRoot
     * is released even if removing the temporary files fails.
     */
    public void close() throws IOException {
        try {
            // Get rid of the tmp directory
            FileUtil.fullyDelete(localTmp);

            // Get rid of tmp flag file
            FileUtil.fullyDelete(flagFile);
        } finally {
            try {
                this.lock.release();
            } finally {
                this.lockData.close();
            }
        }
    }

    /**
     * To be implemented by subclasses.  In each method locationMach /
     * locMach is the machine part of a share-group location (null for
     * a location without one) and locationStr / locStr is its path part.
     */
    protected abstract void copyFile(File srcFile, String locationMach,
        String locationStr, String nutchFileName, boolean overwrite) throws IOException;
    protected abstract void deleteFile(String locationMach, String locationStr,
        String nutchFileName) throws IOException;
    protected abstract void renameFile(File srcFile, String locationMach,
        String locationStr, String nutchFileName, boolean overwrite) throws IOException;
    protected abstract void lockFile(String locMach, String locStr, String filename,
        boolean exclusive) throws IOException;
    protected abstract void release(String locMach, String locStr, String filename)
        throws IOException;

    /**
     * Utility str-processing of location-string.
     * (format "machinename:path")
     *
     * @return the text before the first colon, or null if there is no colon
     */
    String extractMachine(String location) {
        int colDex = location.indexOf(':');
        return (colDex < 0) ? null : location.substring(0, colDex);
    }

    /**
     * Utility str-processing of location-string.
     * (format "machinename:path")
     *
     * @return the text after the first colon, or the whole string if
     *         there is no colon
     */
    String extractPath(String location) {
        int colDex = location.indexOf(':');
        return (colDex < 0) ? location : location.substring(colDex + 1);
    }
}

--- NEW FILE: NutchNFSFileSystem.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;
import java.nio.channels.*;

/****************************************************************
 * NutchNFSFileSystem implements NutchFileSystem over the Network File System.
 * We assume all participants are mounting the same drive.
 *
 * @author Mike Cafarella
 *****************************************************************/
public class NutchNFSFileSystem extends NutchGenericFileSystem {
    // Both maps are keyed by the locked File.  lockDataSet holds the
    // open RandomAccessFile, lockObjSet the FileLock taken on it.
    TreeMap lockDataSet = new TreeMap(), lockObjSet = new TreeMap();

    /**
     * Create the ShareSet automatically, and then go on to
     * the regular constructor.
     */
    public NutchNFSFileSystem(File dbRoot, boolean destructiveCopy) throws IOException {
        this(dbRoot, null, destructiveCopy);
    }

    /**
     * Create a Nutch Filesystem at the indicated mounted
     * directory.  We're given a ShareSet.
     *
     * @throws IOException if setup fails, or if any share-group location
     *         contains a colon (i.e. looks like "machinename:path")
     */
    public NutchNFSFileSystem(File dbRoot, ShareSet initShareSet, boolean destructiveCopy)
        throws IOException {
        super(dbRoot, initShareSet, destructiveCopy);

        // Make sure the shareGroups are in good working order
        for (Iterator it = shareSet.getShareGroups().values().iterator(); it.hasNext(); ) {
            ShareGroup sg = (ShareGroup) it.next();
            String locations[] = sg.getLocations();

            for (int i = 0; i < locations.length; i++) {
                if (locations[i].indexOf(":") >= 0) {
                    throw new IOException("Cannot process non-local locations");
                }
            }
        }
    }

    /**
     * Obtain a lock with the given info.
     *
     * The third argument of FileChannel.lock() is "shared", so it must
     * be the negation of 'exclusive'.  An exclusive lock also needs a
     * channel that is open for writing, hence the "rw" mode; a shared
     * lock only needs a readable channel.
     */
    public synchronized void lockFile(String locMach, String locStr, String filename,
                                      boolean exclusive) throws IOException {
        // NFSFileSystem ignores the locMach value

        File lockTarget = new File(locStr, filename);
        RandomAccessFile lockData = new RandomAccessFile(lockTarget, exclusive ? "rw" : "r");
        FileLock lockObj = null;
        try {
            lockObj = lockData.getChannel().lock(0L, Long.MAX_VALUE, ! exclusive);
        } finally {
            // Don't leak the descriptor if the lock could not be taken.
            if (lockObj == null) {
                lockData.close();
            }
        }
        lockDataSet.put(lockTarget, lockData);
        lockObjSet.put(lockTarget, lockObj);
    }

    /**
     * Release the lock for the given NutchFile
     *
     * @throws IOException if no lock taken by lockFile() is recorded for
     *         the file, or if releasing it fails
     */
    public synchronized void release(String locMach, String locStr, String filename)
        throws IOException {
        // NFSFileSystem ignores the locMach value
        File lockTarget = new File(locStr, filename);

        FileLock lockObj = (FileLock) lockObjSet.remove(lockTarget);
        RandomAccessFile lockData = (RandomAccessFile) lockDataSet.remove(lockTarget);

        if (lockObj == null || lockData == null) {
            if (lockData != null) {
                lockData.close();
            }
            throw new IOException("No lock is held on " + lockTarget);
        }

        try {
            lockObj.release();
        } finally {
            lockData.close();
        }
    }

    /**
     * Copy a file to the right place in the local dir, which assumes
     * NFS-connectivity.
     */
    protected void copyFile(File srcFile, String locMach, String locStr, String filename,
                            boolean overwrite) throws IOException {
        // NFSFileSystem has no locMachine component.
        File target = new File(locStr, filename);
        FileUtil.copyContents(srcFile, target, overwrite);
    }

    /**
     * Remove a file from its current location.  Assumes an NFS-universe.
     */
    protected void deleteFile(String locMach, String locStr, String filename)
        throws IOException {
        // NFSFileSystem has no machine component
        FileUtil.fullyDelete(new File(locStr, filename));
    }

    /**
     * Rename the existing file or dir to a new location.  The
     * 'overwrite' argument is not consulted; whether an existing
     * target is replaced is left to File.renameTo().
     *
     * @throws IOException if the rename fails while the source still exists
     */
    protected void renameFile(File srcFile, String locMach, String locStr, String filename,
                              boolean overwrite) throws IOException {
        // NFSFileSystem has no machine component
        File target = new File(locStr, filename);
        if (! srcFile.renameTo(target) && srcFile.exists()) {
            // A failed rename with the source already gone is tolerated:
            // renameTo() in the superclass passes the same source for
            // every location, so only the first call can move it.
            throw new IOException("Could not rename " + srcFile + " to " + target);
        }
    }
}

--- NEW FILE: NutchRemoteFileSystem.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;

/*****************************************************
 * NutchRemoteFileSystem implements the NutchFileSystem over
 * machines that can be linked via some set of command-line args.
 * (presumably 'scp').
 *
 * @author Mike Cafarella
 *****************************************************/
public class NutchRemoteFileSystem extends NutchGenericFileSystem {
    static String SRCPATH_SYMBOL = "%srcpath%";
    static String DSTPATH_SYMBOL = "%dstpath%";
    static String DSTMACH_SYMBOL = "%dstmach%";

    String cpTemplate = null, rmTemplate = null, mkdirTemplate = null;

    /**
     * Create the ShareSet automatically, then do regular constructor.
     */
    public NutchRemoteFileSystem(File dbRoot, String cpTemplate, String rmTemplate, 
String mkdirTemplate) throws IOException {
        this(dbRoot, new ShareSet(dbRoot), cpTemplate, rmTemplate, mkdirTemplate);
    }

    /**
     * The NutchRemoteFileSystem takes template-strings for 
     * its various needed commands, which may differ among installations.  
     * The class will fill in these templates with the necessary args,
     * and then invoke them via System.exec().
     *
     * We're given the ShareSet here.
     */
    public NutchRemoteFileSystem(File dbRoot, ShareSet shareSet, String cpTemplate, 
String rmTemplate, String mkdirTemplate) throws IOException {
        super(dbRoot, shareSet, true);
        this.cpTemplate = cpTemplate;
        this.rmTemplate = rmTemplate;
        this.mkdirTemplate = mkdirTemplate;
        
        // Make sure templates are found
        if (cpTemplate == null) {
            throw new IOException("No value found for cptemplate");
        }
        if (rmTemplate == null) {
            throw new IOException("No value found for rmtemplate");
        }
        if (mkdirTemplate == null) {
            throw new IOException("No value found for mkdirtemplate");
        }

        // Make sure the templates have everything they should
        if (cpTemplate.indexOf(SRCPATH_SYMBOL) < 0) {
            throw new IOException("The cptemplate string does not contain " + 
SRCPATH_SYMBOL);
        }
        if (cpTemplate.indexOf(DSTPATH_SYMBOL) < 0) {
            throw new IOException("The cptemplate string does not contain " + 
DSTPATH_SYMBOL);
        }
        if (rmTemplate.indexOf(DSTPATH_SYMBOL) < 0) {
            throw new IOException("The rmtemplate string does not contain " + 
DSTPATH_SYMBOL);
        }
        if (mkdirTemplate.indexOf(DSTPATH_SYMBOL) < 0) {
            throw new IOException("The mkdirtemplate string does not contain " + 
DSTPATH_SYMBOL);
        }
    }
    
    /**
     * Copy a file from one place to another.  Requires that
     * template-strings be set correctly.  
     */
    protected void copyFile(File srcFile, String locationMach, String locationStr, 
String nutchFileName, boolean overwrite) throws IOException {
        //
        // Use values to fill in the template strs.
        //
        String cpCommand = cpTemplate.replaceAll(SRCPATH_SYMBOL, srcFile.getPath());
        cpCommand = cpCommand.replaceAll(DSTMACH_SYMBOL, locationMach);
        cpCommand = cpCommand.replaceAll(DSTPATH_SYMBOL, new File(new 
File(locationStr), nutchFileName).getPath());

        String mkdirCommand = mkdirTemplate.replaceAll(DSTPATH_SYMBOL, new File(new 
File(locationStr), nutchFileName).getParentFile().getPath());

        //
        // Make sure the target directory exists
        //
        invoke(mkdirCommand);

        //
        // Finally, invoke the newly-built copy command.
        //
        invoke(cpCommand);
    }

    /**
     * Remove a file the given location.  Requires that template-
     * strings be set correctly.  
     */
    protected void deleteFile(String locationMach, String locationStr, String 
nutchFileName) throws IOException {
        //
        // Use values to fill in template strs
        //
        String rmCommand = rmTemplate.replaceAll(DSTMACH_SYMBOL, locationMach);
        rmCommand = rmCommand.replaceAll(DSTPATH_SYMBOL, new File(new 
File(locationStr), nutchFileName).getPath());

        //
        // Finally, invoke newly-built command
        //
        invoke(rmCommand);
    }

    /**
     * Currently unimplemented
     */
    protected void lockFile(String locMach, String locStr, String filename, boolean 
exclusive) throws IOException {
    }

    /**
     */
    protected void release(String locMach, String locStr, String filename) throws 
IOException {
    }

    /**
     */
    protected void renameFile(File srcFile, String locMach, String locStr, String 
filename, boolean overwrite) throws IOException {
    }

    /**
     * Take care of the details of invoking an external process.
     * We always assume traditional error-code interpretation 
     * (0 for success, non-zero for failure).
     */
    void invoke(String command) throws IOException {
        Process p = Runtime.getRuntime().exec(command);
        int returnCode = 0;
        try {
            returnCode = p.waitFor();
        } catch (InterruptedException ie) {
            returnCode = -1;
        }

        if (returnCode != 0) {
            throw new IOException("Runtime.exec() failed with code " + returnCode + " 
while running " + command);
        }
    }
}

--- NEW FILE: ShareGroup.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;

/****************************************************************
 * A ShareGroup combines the name of a group with where the Nutch
 * filesystem can find members of that group.  Used by NutchFileSystem
 * to help resolve NutchFile objects.
 *
 * @author Mike Cafarella
 *****************************************************************/
public class ShareGroup {
    String name;
    String locations[];

    /**
     * Make a named ShareGroup, to be found at the given location.
     * locationDesc is a semicolon-separated list of the form
     * "machinename:dbroot;machinename2:dbroot2...".  The leading
     * "machinename:" part is optional, in which case the location
     * is a locally-(probably NFS)-mounted disk.
     *
     * Whitespace around each entry is dropped and blank entries are
     * skipped, so "a:/x; b:/y;" yields exactly two locations.
     *
     * @param name          the group name
     * @param locationDescs the semicolon-separated location list
     * @throws IllegalArgumentException if locationDescs is null
     */
    public ShareGroup(String name, String locationDescs) {
        if (locationDescs == null) {
            throw new IllegalArgumentException(
                "No location description given for share group '" + name + "'");
        }
        this.name = name;

        ArrayList found = new ArrayList();
        StringTokenizer toks = new StringTokenizer(locationDescs, ";");
        while (toks.hasMoreTokens()) {
            String location = toks.nextToken().trim();
            if (location.length() > 0) {
                found.add(location);
            }
        }

        this.locations = (String[]) found.toArray(new String[found.size()]);
    }

    /**
     * Create a ShareGroup as above, but assume the location description
     * can be found via NutchConf, under the key
     * "nutchfs.sharegroup." followed by the group name.
     *
     * @param name the group name
     * @throws IllegalArgumentException if NutchConf has no value for that key
     */
    public ShareGroup(String name) {
        this(name, configuredLocations(name));
    }

    /**
     * Look up the location description for a group in NutchConf, failing
     * with a message that names the missing key instead of letting a null
     * travel into the tokenizer.
     */
    private static String configuredLocations(String name) {
        String key = "nutchfs.sharegroup." + name;
        String descs = NutchConf.get(key);
        if (descs == null) {
            throw new IllegalArgumentException(
                "Share group '" + name + "' has no locations configured (missing "
                + key + ")");
        }
        return descs;
    }

    /**
     * ShareGroup name.
     */
    public String getName() {
        return name;
    }

    /**
     * Locations for the ShareGroup (machinename:path).
     * Returns the group's own array, not a copy.
     */
    public String[] getLocations() {
        return locations;
    }
}

--- NEW FILE: ShareSet.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;

/****************************************************************
 * A ShareSet is a library of ShareGroup objects.  It defines
 * every other machine in the current NutchFileSystem's universe.
 *
 * Groups are kept in a map keyed by group name.  The name "*" is
 * the default group, used for any NutchFile whose share group name
 * has no entry of its own.
 *
 * @author Mike Cafarella
 *****************************************************************/
public class ShareSet {
    TreeMap shareGroups = new TreeMap();

    /**
     * Build a ShareSet out of a Vector of ShareGroup objects.
     * A default ("*") group rooted at dbRoot is added unless the
     * list already supplies one.
     *
     * @param dbRoot    location used for the default group
     * @param shareList Vector whose elements are ShareGroup objects
     */
    public ShareSet(File dbRoot, Vector shareList) {
        for (Enumeration e = shareList.elements(); e.hasMoreElements(); ) {
            ShareGroup sg = (ShareGroup) e.nextElement();
            shareGroups.put(sg.getName(), sg);
        }
        buildDefault(dbRoot);
    }

    /**
     * Default constructor.  Loads configuration from NutchConf.
     *
     * The key "nutchfs.sharegroups.names" holds a comma-separated list
     * of group names; each name is turned into a ShareGroup through the
     * single-argument ShareGroup constructor.  Whitespace around a name
     * is dropped and blank names are skipped, so "a, b" registers the
     * groups "a" and "b".  If the key is absent only the default group
     * is created.
     *
     * @param dbRoot location used for the default group
     */
    public ShareSet(File dbRoot) {
        String groupList = NutchConf.get("nutchfs.sharegroups.names");
        if (groupList != null) {
            StringTokenizer toks = new StringTokenizer(groupList, ",");
            while (toks.hasMoreTokens()) {
                String shareName = toks.nextToken().trim();
                if (shareName.length() > 0) {
                    shareGroups.put(shareName, new ShareGroup(shareName));
                }
            }
        }
        buildDefault(dbRoot);
    }

    /**
     * Add a default ShareGroup if necessary.
     * The default group is named "*" and uses dbRoot's path as its
     * location description; an existing "*" entry is left alone.
     */
    void buildDefault(File dbRoot) {
        if (shareGroups.get("*") == null) {
            ShareGroup defaultSG = new ShareGroup("*", dbRoot.getPath());
            shareGroups.put(defaultSG.getName(), defaultSG);
        }
    }

    /**
     * Find the relevant ShareGroup object: the group registered under
     * the NutchFile's share group name, or the default ("*") group
     * when there is no such entry.
     */
    ShareGroup getShareGroup(NutchFile nutchFile) {
        // Check if there is a registered ShareGroup that matches this NutchFile
        ShareGroup sg = (ShareGroup) shareGroups.get(nutchFile.getShareGroupName());

        // If not, find the default ShareGroup
        if (sg == null) {
            sg = (ShareGroup) shareGroups.get("*");
        }
        return sg;
    }

    /**
     * Return entire TreeMap of ShareGroups (the live map, not a copy).
     */
    TreeMap getShareGroups() {
        return shareGroups;
    }
}



Index: FileUtil.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/util/FileUtil.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** FileUtil.java       23 May 2003 03:13:36 -0000      1.4
--- FileUtil.java       30 Jan 2004 22:11:44 -0000      1.5
***************
*** 10,39 ****
   */
  public class FileUtil {
- 
      /**
       * Delete a directory and all its contents.  If
!      * we throw an exception, the directory may be
!      * partially-deleted.
       */
!     public static void fullyDelete(File dir) throws IOException {
          File contents[] = dir.listFiles();
!         for (int i = 0; i < contents.length; i++) {
!             if (contents[i].isFile()) {
!                 if (! contents[i].delete()) {
!                     throw new IOException("Could not delete " + 
contents[i].getPath());
                  }
-             } else {
-                 fullyDelete(contents[i]);
              }
          }
!         if (! dir.delete()) {
!             throw new IOException("Could not delete " + dir.getPath());
!         }
      }
  
      /**
!      * Copy a file's contents to a new location
       */
!     public static void copyContents(File src, File dst) throws IOException {
          DataInputStream in = new DataInputStream(new FileInputStream(src));
          try {
--- 10,46 ----
   */
  public class FileUtil {
      /**
       * Delete a directory and all its contents.  If
!      * we return false, the directory may be partially-deleted.
       */
!     public static boolean fullyDelete(File dir) throws IOException {
          File contents[] = dir.listFiles();
!         if (contents != null) {
!             for (int i = 0; i < contents.length; i++) {
!                 if (contents[i].isFile()) {
!                     if (! contents[i].delete()) {
!                         throw new IOException("Could not delete " + 
contents[i].getPath());
!                     }
!                 } else {
!                     fullyDelete(contents[i]);
                  }
              }
          }
!         return dir.delete();
      }
  
      /**
!      * Copy a file's contents to a new location.
!      * Returns whether a target file was overwritten
       */
!     public static boolean copyContents(File src, File dst, boolean overwrite) throws 
IOException {
!         if (dst.exists() && !overwrite) {
!             return false;
!         }
! 
!         File dstParent = dst.getParentFile();
!         if (! dstParent.exists()) {
!             dstParent.mkdirs();
!         }
          DataInputStream in = new DataInputStream(new FileInputStream(src));
          try {
***************
*** 53,56 ****
--- 60,64 ----
              in.close();
          }
+         return true;
      }
  
***************
*** 76,80 ****
              // If the source is a file, then just copy the contents
              //
!             copyContents(src, dst);
          } else {
              //
--- 84,88 ----
              // If the source is a file, then just copy the contents
              //
!             copyContents(src, dst, true);
          } else {
              //



-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to