Update of /cvsroot/nutch/nutch/src/java/net/nutch/tools
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2281/src/java/net/nutch/tools

Modified Files:
        FetchListTool.java WebDBAdminTool.java 
Added Files:
        CrawlTool.java 
Log Message:
Added a new command, crawl, that constructs a database, injects a url
file and performs a few rounds of generate/fetch/updatedb.  This
simplifies use for intranet sites.  Changed some defaults to be
more intranet friendly.

Also fixed a bug where Fetcher.java didn't construct correct relative links
when a page was redirected.


Index: WebDBAdminTool.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/tools/WebDBAdminTool.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** WebDBAdminTool.java 6 May 2003 01:31:31 -0000       1.15
--- WebDBAdminTool.java 21 Apr 2004 22:52:54 -0000      1.16
***************
*** 156,160 ****
          if ("-create".equals(command)) {
              WebDBWriter.createWebDB(new File(dir));
!             System.out.println("Created webdb at " + dir);
              return;
          }
--- 156,160 ----
          if ("-create".equals(command)) {
              WebDBWriter.createWebDB(new File(dir));
!             LOG.info("Created webdb at " + dir);
              return;
          }

--- NEW FILE: CrawlTool.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.tools;

import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
import java.util.logging.*;

import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.fetcher.*;

/**
 * Command-line tool that runs a complete small crawl in one step.
 *
 * <p>It creates a new web database under a fresh
 * <code>crawl-&lt;timestamp&gt;</code> directory, injects the urls listed in
 * a root url file, and then performs <code>depth</code> rounds of
 * generate / fetch / update-database by invoking the <code>main</code>
 * methods of {@link WebDBAdminTool}, {@link WebDBInjector},
 * {@link FetchListTool}, {@link Fetcher} and {@link UpdateDatabaseTool}.
 */
public class CrawlTool {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.tools.CrawlTool");

  /** Command-line synopsis, printed when no root url file is given. */
  private static final String USAGE =
    "Usage: CrawlTool <root_url_file> [-threads i] [-depth j] [-delay k]";

  /**
   * Returns the current time formatted as <code>yyyyMMddHHmmss</code>; used
   * to give every crawl its own output directory.
   */
  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
  }

  /**
   * Parses the integer value that follows the option at <code>args[i]</code>.
   *
   * @param args the full command line
   * @param i    index of the option name; its value is expected at i+1
   * @return the parsed value
   * @throws IllegalArgumentException if the option is the last argument and
   *         so has no value
   * @throws NumberFormatException if the value is not a valid integer
   */
  private static int intOption(String[] args, int i) {
    if (i + 1 >= args.length) {
      throw new IllegalArgumentException
        ("Missing value for option " + args[i] + ". " + USAGE);
    }
    try {
      return Integer.parseInt(args[i+1]);
    } catch (NumberFormatException e) {
      // same exception type as before, but say which option was at fault
      throw new NumberFormatException
        ("Invalid integer value for option " + args[i] + ": " + args[i+1]);
    }
  }

  /**
   * Returns the path of the entry in <code>segments</code> whose name sorts
   * last.  NOTE(review): this presumes segment names sort chronologically,
   * so that the last one is the segment that was just generated -- confirm
   * against FetchListTool.
   *
   * @param segments the directory holding all segments of this crawl
   * @throws IOException if the directory cannot be listed or is empty
   */
  private static String latestSegment(String segments) throws IOException {
    // File.list() returns null when the path is not a readable directory
    String[] allSegments = new File(segments).list();
    if (allSegments == null || allSegments.length == 0) {
      throw new IOException("No segments found in " + segments);
    }
    Arrays.sort(allSegments);
    return segments + "/" + allSegments[allSegments.length-1];
  }

  /**
   * Runs a crawl: creates a database, injects the root urls and performs
   * <code>depth</code> rounds of generate/fetch/updatedb.
   *
   * <p>Arguments: <code>&lt;root_url_file&gt; [-threads i] [-depth j]
   * [-delay k]</code>.  The thread count and server delay default to the
   * <code>fetcher.threads.fetch</code> and <code>fetcher.server.delay</code>
   * configuration values (falling back to 10 and 1); the depth defaults
   * to 5.  Unrecognized arguments are ignored with a warning.
   *
   * @throws IllegalArgumentException if an option is missing its value
   * @throws IOException if no segment can be found after generation
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      System.out.println(USAGE);
      return;
    }

    String rootUrlFile = args[0];

    int threads = NutchConf.getInt("fetcher.threads.fetch", 10);
    int serverDelay = NutchConf.getInt("fetcher.server.delay", 1);
    int depth = 5;

    String root = "crawl-" + getDate();
    String db = root + "/db";
    String segments = root + "/segments";

    for (int i = 1; i < args.length; i++) {
      if ("-threads".equals(args[i])) {
        threads = intOption(args, i);
        i++;
      } else if ("-depth".equals(args[i])) {
        depth = intOption(args, i);
        i++;
      } else if ("-delay".equals(args[i])) {
        serverDelay = intOption(args, i);
        i++;
      } else {
        LOG.warning("Ignoring unrecognized argument: " + args[i]);
      }
    }

    LOG.info("CrawlTool started");
    LOG.info("rootUrlFile = " + rootUrlFile);
    LOG.info("threads = " + threads);
    LOG.info("depth = " + depth);
    LOG.info("serverDelay = " + serverDelay);

    // initialize the web database
    WebDBAdminTool.main(new String[] { db, "-create" } );

    // inject the root urls into the database
    WebDBInjector.main(new String[] { db, "-urlfile", rootUrlFile } );

    for (int i = 0; i < depth; i++) {

      // generate a new segment
      FetchListTool.main(new String[] { db, segments } );

      // get the name of the last segment
      String segment = latestSegment(segments);

      // fetch the segment
      Fetcher.main(new String[] { "-threads", ""+threads,
                                  "-delay", ""+serverDelay,
                                  segment } );

      // update the database
      UpdateDatabaseTool.main(new String[] { db, segment } );
    }

    // TODO: still need to refetch once more to get all of the inlinks

    // TODO: and need to index, dedup & merge

    LOG.info("CrawlTool finished");
  }
}

Index: FetchListTool.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/tools/FetchListTool.java,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** FetchListTool.java  14 Oct 2003 06:41:31 -0000      1.21
--- FetchListTool.java  21 Apr 2004 22:52:44 -0000      1.22
***************
*** 640,654 ****
          // Finally, start things up.
          //
!         System.out.println("FetchListTool started at " + new 
Date(System.currentTimeMillis()));
          if (topN != Long.MAX_VALUE) {
!             System.out.println("topN:" + topN);
          }
          if (cutoffScore >= 0) {
!             System.out.println("cutoffscore:" + cutoffScore);
          }
          if (numFetchers > 1) {
!             System.out.println("seed:" + seed);
          }
-         System.out.println();
  
          FetchListTool flt = new FetchListTool(dbDir, refetchOnly, anchorOptimize, 
cutoffScore, seed);
--- 640,653 ----
          // Finally, start things up.
          //
!         LOG.info("FetchListTool started");
          if (topN != Long.MAX_VALUE) {
!             LOG.info("topN:" + topN);
          }
          if (cutoffScore >= 0) {
!             LOG.info("cutoffscore:" + cutoffScore);
          }
          if (numFetchers > 1) {
!             LOG.info("seed:" + seed);
          }
  
          FetchListTool flt = new FetchListTool(dbDir, refetchOnly, anchorOptimize, 
cutoffScore, seed);
***************
*** 658,662 ****
              flt.emitFetchList(segmentDir, topN, curTime);
          }
!         System.out.println("FetchListTool completed at " + new 
Date(System.currentTimeMillis()));
      }
  }
--- 657,661 ----
              flt.emitFetchList(segmentDir, topN, curTime);
          }
!         LOG.info("FetchListTool completed");
      }
  }



-------------------------------------------------------
This SF.Net email is sponsored by: IBM Linux Tutorials
Free Linux tutorial presented by Daniel Robbins, President and CEO of
GenToo technologies. Learn everything from fundamentals to system
administration. http://ads.osdn.com/?ad_id=1470&alloc_id=3638&op=click
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to