Author: cutting
Date: Tue May 31 11:30:44 2005
New Revision: 179255

URL: http://svn.apache.org/viewcvs?rev=179255&view=rev
Log:
Fixed command lines & directory structure.

Modified:
    incubator/nutch/branches/mapred/bin/nutch
    incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
    incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java
    incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
    incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java

Modified: incubator/nutch/branches/mapred/bin/nutch
URL: http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/bin/nutch?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
--- incubator/nutch/branches/mapred/bin/nutch (original)
+++ incubator/nutch/branches/mapred/bin/nutch Tue May 31 11:30:44 2005
@@ -32,7 +32,6 @@
   echo "  admin             database administration, including creation"
   echo "  inject            inject new urls into the database"
   echo "  generate          generate new segments to fetch"
-  echo "  fetchlist         print the fetchlist of a segment"
   echo "  fetch             fetch a segment's pages"
   echo "  parse             parse a segment's pages"
   echo "  index             run the indexer on a segment's fetcher output"
@@ -128,17 +127,13 @@
 
 # figure out which class to run
 if [ "$COMMAND" = "crawl" ] ; then
-  CLASS=org.apache.nutch.tools.CrawlTool
-elif [ "$COMMAND" = "admin" ] ; then
-  CLASS=org.apache.nutch.tools.WebDBAdminTool
+  CLASS=org.apache.nutch.crawl.Crawl
 elif [ "$COMMAND" = "inject" ] ; then
-  CLASS=org.apache.nutch.db.WebDBInjector
+  CLASS=org.apache.nutch.crawl.Injector
 elif [ "$COMMAND" = "generate" ] ; then
-  CLASS=org.apache.nutch.tools.FetchListTool
-elif [ "$COMMAND" = "fetchlist" ] ; then
-  CLASS=org.apache.nutch.pagedb.FetchListEntry
+  CLASS=org.apache.nutch.crawl.Generator
 elif [ "$COMMAND" = "fetch" ] ; then
-  CLASS=org.apache.nutch.fetcher.Fetcher
+  CLASS=org.apache.nutch.crawl.Fetcher
 elif [ "$COMMAND" = "parse" ] ; then
   CLASS=org.apache.nutch.tools.ParseSegment
 elif [ "$COMMAND" = "index" ] ; then

Modified: incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
--- incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Tue May 31 11:30:44 2005
@@ -166,21 +166,21 @@
     
   }
 
-  public void fetch(File inputDir, File outputDir, int threads)
+  public void fetch(File segment, int threads)
     throws IOException {
 
     JobConf job = new JobConf(getConf());
 
     job.setInt("fetcher.threads.fetch", threads);
 
-    job.setInputDir(inputDir);
+    job.setInputDir(new File(segment, "fetchlist"));
     job.setInputFormat(SequenceFileInputFormat.class);
     job.setInputKeyClass(UTF8.class);
     job.setInputValueClass(CrawlDatum.class);
 
     job.setMapRunnerClass(Fetcher.class);
 
-    job.setOutputDir(outputDir);
+    job.setOutputDir(segment);
     job.setOutputFormat(FetcherOutputFormat.class);
     job.setOutputKeyClass(UTF8.class);
     job.setOutputValueClass(FetcherOutput.class);
@@ -191,15 +191,14 @@
   /** Run the fetcher. */
   public static void main(String[] args) throws Exception {
 
-    String usage = "Usage: Fetcher <inDir> <outDir> [-threads n]";
+    String usage = "Usage: Fetcher <segment> [-threads n]";
 
-    if (args.length < 2) {
+    if (args.length < 1) {
       System.err.println(usage);
       System.exit(-1);
     }
       
-    File inDir = new File(args[0]);
-    File outDir = new File(args[1]);
+    File segment = new File(args[0]);
 
     NutchConf conf = NutchConf.get();
 
@@ -213,7 +212,7 @@
 
     Fetcher fetcher = new Fetcher(conf);          // make a Fetcher
     
-    fetcher.fetch(inDir, outDir, threads);        // run the Fetcher
+    fetcher.fetch(segment, threads);              // run the Fetcher
 
   }
 }

Modified: incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
--- incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java (original)
+++ incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/FetcherOutputFormat.java Tue May 31 11:30:44 2005
@@ -38,15 +38,16 @@
   public RecordWriter getRecordWriter(NutchFileSystem fs, JobConf job,
                                       String name) throws IOException {
 
-    File dir = new File(job.getOutputDir(), name);
+    File crawl =
+      new File(new File(job.getOutputDir(), CrawlDatum.DIR_NAME), name);
+    File content =
+      new File(new File(job.getOutputDir(), Content.DIR_NAME), name);
 
     final MapFile.Writer crawlOut =
-      new MapFile.Writer(fs, new File(dir, CrawlDatum.DIR_NAME).toString(),
-                         UTF8.class, CrawlDatum.class);
+      new MapFile.Writer(fs, crawl.toString(), UTF8.class, CrawlDatum.class);
     
     final MapFile.Writer contentOut =
-      new MapFile.Writer(fs, new File(dir, Content.DIR_NAME).toString(),
-                         UTF8.class, Content.class);
+      new MapFile.Writer(fs, content.toString(), UTF8.class, Content.class);
 
     return new RecordWriter() {
 

Modified: incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
--- incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Tue May 31 11:30:44 2005
@@ -100,14 +100,17 @@
     this.dbDir = dbDir;
   }
 
-  /** Generate fetchlists. */
-  public void generate(File dir, int numLists, long topN, long curTime)
+  /** Generate fetchlists in a segment. */
+  public void generate(File segments, int numLists, long topN, long curTime)
     throws IOException {
 
     File tempDir =
       new File("generate-temp-"+
                Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
+    File segment = new File(segments, getDate());
+    File output = new File(segment, "fetchlist");
+
     // map to inverted subset due for fetch, sort by link count
     JobConf job = new JobConf(getConf());
     
@@ -143,7 +146,7 @@
     job.setPartitionerClass(PartitionUrlByHost.class);
     job.setNumReduceTasks(numLists);
 
-    job.setOutputDir(dir);
+    job.setOutputDir(output);
     job.setOutputFormat(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(UTF8.class);
     job.setOutputValueClass(CrawlDatum.class);
@@ -151,6 +154,11 @@
     JobClient.runJob(job);
 
     new JobClient(getConf()).getFs().delete(tempDir);
+  }
+
+  private static String getDate() {
+    return new SimpleDateFormat("yyyyMMddHHmmss").format
+      (new Date(System.currentTimeMillis()));
   }
 
   /**

Modified: incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java?rev=179255&r1=179254&r2=179255&view=diff
==============================================================================
--- incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ incubator/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java Tue May 31 11:30:44 2005
@@ -66,7 +66,7 @@
     super(conf);
   }
 
-  public void inject(File urlDir, File crawlDb) throws IOException {
+  public void inject(File crawlDb, File urlDir) throws IOException {
     File tempDir =
       new File("inject-temp-"+
                Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -93,8 +93,6 @@
     mergeJob.setInputKeyClass(UTF8.class);
     mergeJob.setInputValueClass(CrawlDatum.class);
 
-    mergeJob.setInt("partition.url.by.host.seed", new Random().nextInt());
-    mergeJob.setPartitionerClass(PartitionUrlByHost.class);
     mergeJob.setReducerClass(CrawlDBReducer.class);
 
     mergeJob.setOutputDir(newCrawlDb);
@@ -115,6 +113,12 @@
 
   public static void main(String[] args) throws Exception {
     Injector injector = new Injector(NutchConf.get());
+    
+    if (args.length < 2) {
+      System.err.println("Usage: Injector <crawldb> <url_dir>");
+      return;
+    }
+    
     injector.inject(new File(args[0]), new File(args[1]));
   }
 




-------------------------------------------------------
This SF.Net email is sponsored by Yahoo.
Introducing Yahoo! Search Developer Network - Create apps using Yahoo!
Search APIs Find out how you can build Yahoo! directly into your own
Applications - visit http://developer.yahoo.net/?fr=offad-ysdn-ostg-q22005
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to