Author: cutting
Date: Tue May 31 11:30:44 2005
New Revision: 179255
URL: http://svn.apache.org/viewcvs?rev=179255&view=rev
Log:
Fixed command lines & directory structure.
Modified:
incubator/nutch/branches/mapred/bin/nutch
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java
Modified: incubator/nutch/branches/mapred/bin/nutch
URL:
http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/bin/nutch?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
--- incubator/nutch/branches/mapred/bin/nutch (original)
+++ incubator/nutch/branches/mapred/bin/nutch Tue May 31 11:30:44 2005
@@ -32,7 +32,6 @@
echo " admin database administration, including creation"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch"
- echo " fetchlist print the fetchlist of a segment"
echo " fetch fetch a segment's pages"
echo " parse parse a segment's pages"
echo " index run the indexer on a segment's fetcher output"
@@ -128,17 +127,13 @@
# figure out which class to run
if [ "$COMMAND" = "crawl" ] ; then
- CLASS=org.apache.nutch.tools.CrawlTool
-elif [ "$COMMAND" = "admin" ] ; then
- CLASS=org.apache.nutch.tools.WebDBAdminTool
+ CLASS=org.apache.nutch.crawl.Crawl
elif [ "$COMMAND" = "inject" ] ; then
- CLASS=org.apache.nutch.db.WebDBInjector
+ CLASS=org.apache.nutch.crawl.Injector
elif [ "$COMMAND" = "generate" ] ; then
- CLASS=org.apache.nutch.tools.FetchListTool
-elif [ "$COMMAND" = "fetchlist" ] ; then
- CLASS=org.apache.nutch.pagedb.FetchListEntry
+ CLASS=org.apache.nutch.crawl.Generator
elif [ "$COMMAND" = "fetch" ] ; then
- CLASS=org.apache.nutch.fetcher.Fetcher
+ CLASS=org.apache.nutch.crawl.Fetcher
elif [ "$COMMAND" = "parse" ] ; then
CLASS=org.apache.nutch.tools.ParseSegment
elif [ "$COMMAND" = "index" ] ; then
Modified:
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
---
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
(original)
+++
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Tue May 31 11:30:44 2005
@@ -166,21 +166,21 @@
}
- public void fetch(File inputDir, File outputDir, int threads)
+ public void fetch(File segment, int threads)
throws IOException {
JobConf job = new JobConf(getConf());
job.setInt("fetcher.threads.fetch", threads);
- job.setInputDir(inputDir);
+ job.setInputDir(new File(segment, "fetchlist"));
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(UTF8.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapRunnerClass(Fetcher.class);
- job.setOutputDir(outputDir);
+ job.setOutputDir(segment);
job.setOutputFormat(FetcherOutputFormat.class);
job.setOutputKeyClass(UTF8.class);
job.setOutputValueClass(FetcherOutput.class);
@@ -191,15 +191,14 @@
/** Run the fetcher. */
public static void main(String[] args) throws Exception {
- String usage = "Usage: Fetcher <inDir> <outDir> [-threads n]";
+ String usage = "Usage: Fetcher <segment> [-threads n]";
- if (args.length < 2) {
+ if (args.length < 1) {
System.err.println(usage);
System.exit(-1);
}
- File inDir = new File(args[0]);
- File outDir = new File(args[1]);
+ File segment = new File(args[0]);
NutchConf conf = NutchConf.get();
@@ -213,7 +212,7 @@
Fetcher fetcher = new Fetcher(conf); // make a Fetcher
- fetcher.fetch(inDir, outDir, threads); // run the Fetcher
+ fetcher.fetch(segment, threads); // run the Fetcher
}
}
Modified:
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
---
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java
(original)
+++
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java
Tue May 31 11:30:44 2005
@@ -38,15 +38,16 @@
public RecordWriter getRecordWriter(NutchFileSystem fs, JobConf job,
String name) throws IOException {
- File dir = new File(job.getOutputDir(), name);
+ File crawl =
+ new File(new File(job.getOutputDir(), CrawlDatum.DIR_NAME), name);
+ File content =
+ new File(new File(job.getOutputDir(), Content.DIR_NAME), name);
final MapFile.Writer crawlOut =
- new MapFile.Writer(fs, new File(dir, CrawlDatum.DIR_NAME).toString(),
- UTF8.class, CrawlDatum.class);
+ new MapFile.Writer(fs, crawl.toString(), UTF8.class, CrawlDatum.class);
final MapFile.Writer contentOut =
- new MapFile.Writer(fs, new File(dir, Content.DIR_NAME).toString(),
- UTF8.class, Content.class);
+ new MapFile.Writer(fs, content.toString(), UTF8.class, Content.class);
return new RecordWriter() {
Modified:
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
---
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
(original)
+++
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
Tue May 31 11:30:44 2005
@@ -100,14 +100,17 @@
this.dbDir = dbDir;
}
- /** Generate fetchlists. */
- public void generate(File dir, int numLists, long topN, long curTime)
+ /** Generate fetchlists in a segment. */
+ public void generate(File segments, int numLists, long topN, long curTime)
throws IOException {
File tempDir =
new File("generate-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ File segment = new File(segments, getDate());
+ File output = new File(segment, "fetchlist");
+
// map to inverted subset due for fetch, sort by link count
JobConf job = new JobConf(getConf());
@@ -143,7 +146,7 @@
job.setPartitionerClass(PartitionUrlByHost.class);
job.setNumReduceTasks(numLists);
- job.setOutputDir(dir);
+ job.setOutputDir(output);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(UTF8.class);
job.setOutputValueClass(CrawlDatum.class);
@@ -151,6 +154,11 @@
JobClient.runJob(job);
new JobClient(getConf()).getFs().delete(tempDir);
+ }
+
+ private static String getDate() {
+ return new SimpleDateFormat("yyyyMMddHHmmss").format
+ (new Date(System.currentTimeMillis()));
}
/**
Modified:
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
---
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java
(original)
+++
incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java
Tue May 31 11:30:44 2005
@@ -66,7 +66,7 @@
super(conf);
}
- public void inject(File urlDir, File crawlDb) throws IOException {
+ public void inject(File crawlDb, File urlDir) throws IOException {
File tempDir =
new File("inject-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -93,8 +93,6 @@
mergeJob.setInputKeyClass(UTF8.class);
mergeJob.setInputValueClass(CrawlDatum.class);
- mergeJob.setInt("partition.url.by.host.seed", new Random().nextInt());
- mergeJob.setPartitionerClass(PartitionUrlByHost.class);
mergeJob.setReducerClass(CrawlDBReducer.class);
mergeJob.setOutputDir(newCrawlDb);
@@ -115,6 +113,12 @@
public static void main(String[] args) throws Exception {
Injector injector = new Injector(NutchConf.get());
+
+ if (args.length < 2) {
+ System.err.println("Usage: Injector <crawldb> <url_dir>");
+ return;
+ }
+
injector.inject(new File(args[0]), new File(args[1]));
}
-------------------------------------------------------
This SF.Net email is sponsored by Yahoo.
Introducing Yahoo! Search Developer Network - Create apps using Yahoo!
Search APIs. Find out how you can build Yahoo! directly into your own
Applications - visit http://developer.yahoo.net/?fr=offad-ysdn-ostg-q22005
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs