Author: siren
Date: Mon Mar  2 12:28:22 2009
New Revision: 749289

URL: http://svn.apache.org/viewvc?rev=749289&view=rev
Log:
NUTCH-669 - Consolidate code for Fetcher and Fetcher2

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
      - copied, changed from r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Removed:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/bin/nutch
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=749289&r1=749288&r2=749289&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar  2 12:28:22 2009
@@ -372,6 +372,8 @@
      
 139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren)
 
+140. NUTCH-669 - Consolidate code for Fetcher and Fetcher2 (siren)
+
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?rev=749289&r1=749288&r2=749289&view=diff
==============================================================================
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Mon Mar  2 12:28:22 2009
@@ -41,7 +41,6 @@
   echo "  generate          generate new segments to fetch from crawl db"
   echo "  freegen           generate new segments to fetch from text files"
   echo "  fetch             fetch a segment's pages"
-  echo "  fetch2            fetch a segment's pages using Fetcher2 implementation"
   echo "  parse             parse a segment's pages"
   echo "  readseg           read / dump segment data"
   echo "  mergesegs         merge several segments, with optional filtering and slicing"

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=749289&r1=749288&r2=749289&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar  2 12:28:22 2009
@@ -24,7 +24,6 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-import org.apache.nutch.fetcher.Fetcher;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
@@ -36,6 +35,8 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
+import org.apache.nutch.fetcher.Fetcher;
+
 public class Crawl {
   public static final Log LOG = LogFactory.getLog(Crawl.class);
 
@@ -118,7 +119,7 @@
         LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
         break;
       }
-      fetcher.fetch(segment, threads);  // fetch it
+      fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf));  // fetch it
       if (!Fetcher.isParsing(job)) {
         parseSegment.parse(segment);    // parse it, if needed
       }

Copied: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (from r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java)
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?p2=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&p1=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java&r1=747319&r2=749289&rev=749289&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar  2 12:28:22 2009
@@ -1,9 +1,10 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -13,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.fetcher;
 
 import java.io.IOException;
@@ -83,10 +83,16 @@
  * 
  * @author Andrzej Bialecki
  */
-public class Fetcher2 extends Configured implements
+public class Fetcher extends Configured implements
     MapRunnable<Text, CrawlDatum, Text, NutchWritable> { 
 
-  public static final Log LOG = LogFactory.getLog(Fetcher2.class);
+  public static final int PERM_REFRESH_TIME = 5;
+
+  public static final String CONTENT_REDIR = "content";
+
+  public static final String PROTOCOL_REDIR = "protocol";
+
+  public static final Log LOG = LogFactory.getLog(Fetcher.class);
   
   public static class InputFormat extends SequenceFileInputFormat<Text, CrawlDatum> {
     /** Don't split inputs, to keep things polite. */
@@ -837,9 +843,9 @@
     
   }
 
-  public Fetcher2() { super(null); }
+  public Fetcher() { super(null); }
 
-  public Fetcher2(Configuration conf) { super(conf); }
+  public Fetcher(Configuration conf) { super(conf); }
 
   private void updateStatus(int bytesInPage) throws IOException {
     pages.incrementAndGet();
@@ -953,7 +959,7 @@
     FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
     job.setInputFormat(InputFormat.class);
 
-    job.setMapRunnerClass(Fetcher2.class);
+    job.setMapRunnerClass(Fetcher.class);
 
     FileOutputFormat.setOutputPath(job, segment);
     job.setOutputFormat(FetcherOutputFormat.class);
@@ -992,7 +998,7 @@
     if (!parsing) {
       conf.setBoolean("fetcher.parse", parsing);
     }
-    Fetcher2 fetcher = new Fetcher2(conf);          // make a Fetcher
+    Fetcher fetcher = new Fetcher(conf);          // make a Fetcher
     
     fetcher.fetch(segment, threads, parsing);              // run the Fetcher
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=749289&r1=749288&r2=749289&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Mon Mar  2 12:28:22 2009
@@ -97,9 +97,8 @@
 
     long time=System.currentTimeMillis();
     //fetch
-    conf.setBoolean("fetcher.parse", true);
     Fetcher fetcher=new Fetcher(conf);
-    fetcher.fetch(generatedSegment, 1);
+    fetcher.fetch(generatedSegment, 1, true);
 
     time=System.currentTimeMillis()-time;
     
@@ -175,7 +174,7 @@
 
     try {
       conf.setBoolean("fetcher.parse", true);
-      Fetcher2 fetcher = new Fetcher2(conf);
+      Fetcher fetcher = new Fetcher(conf);
       fetcher.fetch(null, 1, false);
     } catch (IllegalArgumentException iae) {
       String message = iae.getMessage();


Reply via email to