Update of /cvsroot/nutch/nutch/src/java/net/nutch/tools
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2281/src/java/net/nutch/tools
Modified Files:
FetchListTool.java WebDBAdminTool.java
Added Files:
CrawlTool.java
Log Message:
Added a new command, crawl, that constructs a database, injects a url
file and performs a few rounds of generate/fetch/updatedb. This
simplifies use for intranet sites. Changed some defaults to be
more intranet friendly.
Also fixed a bug where Fetcher.java didn't construct correct relative links
when a page was redirected.
Index: WebDBAdminTool.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/tools/WebDBAdminTool.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** WebDBAdminTool.java 6 May 2003 01:31:31 -0000 1.15
--- WebDBAdminTool.java 21 Apr 2004 22:52:54 -0000 1.16
***************
*** 156,160 ****
if ("-create".equals(command)) {
WebDBWriter.createWebDB(new File(dir));
! System.out.println("Created webdb at " + dir);
return;
}
--- 156,160 ----
if ("-create".equals(command)) {
WebDBWriter.createWebDB(new File(dir));
! LOG.info("Created webdb at " + dir);
return;
}
--- NEW FILE: CrawlTool.java ---
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.tools;
import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
import java.util.logging.*;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.fetcher.*;
/**
 * Command-line tool that runs a complete small crawl in a single step.
 *
 * <p>It creates a new web database under a freshly named
 * <code>crawl-&lt;timestamp&gt;</code> directory, injects the URLs listed in
 * a root URL file, and then performs <code>depth</code> rounds of
 * generate / fetch / update-database by delegating to the <code>main</code>
 * methods of the individual tools.
 */
public class CrawlTool {

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.tools.CrawlTool");

  /** Usage message shown when the arguments cannot be used. */
  private static final String USAGE =
    "Usage: CrawlTool <root_url_file> [-threads i] [-depth j] [-delay k]";

  /**
   * Returns the current time formatted as <code>yyyyMMddHHmmss</code>; used
   * to give each crawl its own output directory name.
   */
  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
  }

  /**
   * Parses the integer value that follows the option at
   * <code>flagIndex</code>.
   *
   * @param args the full command-line argument array
   * @param flagIndex index of the option name within <code>args</code>
   * @return the parsed value of <code>args[flagIndex + 1]</code>
   * @throws IllegalArgumentException if the option is the last argument and
   *         therefore has no value
   * @throws NumberFormatException if the value is not a valid integer
   */
  private static int intOptionValue(String[] args, int flagIndex) {
    if (flagIndex + 1 >= args.length) {
      // Report the actual problem instead of failing with a bare
      // ArrayIndexOutOfBoundsException.
      throw new IllegalArgumentException
        ("Missing value for option " + args[flagIndex] + ". " + USAGE);
    }
    return Integer.parseInt(args[flagIndex + 1]);
  }

  /**
   * Returns the path of the entry in <code>segments</code> whose name sorts
   * last.
   *
   * @param segments path of the directory that holds the segments
   * @return <code>segments</code>, a slash, and the last entry name
   * @throws IOException if the directory cannot be listed or is empty
   */
  private static String latestSegment(String segments) throws IOException {
    // File.list() returns null when the path is not a directory or an I/O
    // error occurs, so it must be checked before sorting.
    String[] allSegments = new File(segments).list();
    if (allSegments == null || allSegments.length == 0) {
      throw new IOException("No segments found in " + segments);
    }
    // NOTE(review): assumes segment names sort chronologically, so that the
    // last name is the segment just generated -- confirm against
    // FetchListTool.
    Arrays.sort(allSegments);
    return segments + "/" + allSegments[allSegments.length - 1];
  }

  /**
   * Creates a web database, injects the root URLs and runs
   * <code>depth</code> rounds of generate / fetch / update-database.
   *
   * <p>Arguments: <code>&lt;root_url_file&gt; [-threads i] [-depth j]
   * [-delay k]</code>.  When omitted, <code>-threads</code> and
   * <code>-delay</code> default to the <code>fetcher.threads.fetch</code>
   * and <code>fetcher.server.delay</code> configuration values, and
   * <code>-depth</code> defaults to 5.  With no arguments the usage message
   * is printed and the method returns.
   *
   * @param args the command-line arguments
   * @throws IllegalArgumentException if an option is missing its value or
   *         the value is not an integer
   * @throws IOException if no segment can be found after generation
   * @throws Exception if any of the delegated tools fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.out.println(USAGE);
      return;
    }

    String rootUrlFile = args[0];

    int threads = NutchConf.getInt("fetcher.threads.fetch", 10);
    int serverDelay = NutchConf.getInt("fetcher.server.delay", 1);
    int depth = 5;

    // All output of this run lives under one timestamped directory.
    String root = "crawl-" + getDate();
    String db = root + "/db";
    String segments = root + "/segments";

    for (int i = 1; i < args.length; i++) {
      if ("-threads".equals(args[i])) {
        threads = intOptionValue(args, i);
        i++;                                      // skip the consumed value
      } else if ("-depth".equals(args[i])) {
        depth = intOptionValue(args, i);
        i++;
      } else if ("-delay".equals(args[i])) {
        serverDelay = intOptionValue(args, i);
        i++;
      } else {
        // Still ignored, as before, but no longer silently: a mistyped
        // option would otherwise leave the default in effect unnoticed.
        LOG.warning("Ignoring unrecognized argument: " + args[i]);
      }
    }

    LOG.info("CrawlTool started");
    LOG.info("rootUrlFile = " + rootUrlFile);
    LOG.info("threads = " + threads);
    LOG.info("depth = " + depth);
    LOG.info("serverDelay = " + serverDelay);

    // initialize the web database
    WebDBAdminTool.main(new String[] { db, "-create" } );

    // inject the root urls into the database
    WebDBInjector.main(new String[] { db, "-urlfile", rootUrlFile } );

    for (int round = 0; round < depth; round++) {
      // generate a new segment
      FetchListTool.main(new String[] { db, segments } );

      // get the name of the last segment
      String segment = latestSegment(segments);

      // fetch the segment
      Fetcher.main(new String[] { "-threads", "" + threads,
                                  "-delay", "" + serverDelay,
                                  segment } );

      // update the database
      UpdateDatabaseTool.main(new String[] { db, segment } );
    }

    // TODO: still need to refetch once more to get all of the inlinks,
    // and need to index, dedup & merge.

    LOG.info("CrawlTool finished");
  }
}
Index: FetchListTool.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/tools/FetchListTool.java,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** FetchListTool.java 14 Oct 2003 06:41:31 -0000 1.21
--- FetchListTool.java 21 Apr 2004 22:52:44 -0000 1.22
***************
*** 640,654 ****
// Finally, start things up.
//
! System.out.println("FetchListTool started at " + new
Date(System.currentTimeMillis()));
if (topN != Long.MAX_VALUE) {
! System.out.println("topN:" + topN);
}
if (cutoffScore >= 0) {
! System.out.println("cutoffscore:" + cutoffScore);
}
if (numFetchers > 1) {
! System.out.println("seed:" + seed);
}
- System.out.println();
FetchListTool flt = new FetchListTool(dbDir, refetchOnly, anchorOptimize,
cutoffScore, seed);
--- 640,653 ----
// Finally, start things up.
//
! LOG.info("FetchListTool started");
if (topN != Long.MAX_VALUE) {
! LOG.info("topN:" + topN);
}
if (cutoffScore >= 0) {
! LOG.info("cutoffscore:" + cutoffScore);
}
if (numFetchers > 1) {
! LOG.info("seed:" + seed);
}
FetchListTool flt = new FetchListTool(dbDir, refetchOnly, anchorOptimize,
cutoffScore, seed);
***************
*** 658,662 ****
flt.emitFetchList(segmentDir, topN, curTime);
}
! System.out.println("FetchListTool completed at " + new
Date(System.currentTimeMillis()));
}
}
--- 657,661 ----
flt.emitFetchList(segmentDir, topN, curTime);
}
! LOG.info("FetchListTool completed");
}
}
-------------------------------------------------------
This SF.Net email is sponsored by: IBM Linux Tutorials
Free Linux tutorial presented by Daniel Robbins, President and CEO of
Gentoo Technologies. Learn everything from fundamentals to system
administration. http://ads.osdn.com/?ad_id=1470&alloc_id=3638&op=click
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs