Author: siren
Date: Mon Mar 2 12:28:22 2009
New Revision: 749289

URL: http://svn.apache.org/viewvc?rev=749289&view=rev
Log:
NUTCH-669 - Consolidate code for Fetcher and Fetcher2
Added: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java - copied, changed from r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Removed: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=749289&r1=749288&r2=749289&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 2 12:28:22 2009 @@ -372,6 +372,8 @@ 139. NUTCH-700 - Neko1.9.11 goes into a loop (Julien Nioche, siren) +140. NUTCH-669 - Consolidate code for Fetcher and Fetcher2 (siren) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?rev=749289&r1=749288&r2=749289&view=diff ============================================================================== --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Mon Mar 2 12:28:22 2009 @@ -41,7 +41,6 @@ echo " generate generate new segments to fetch from crawl db" echo " freegen generate new segments to fetch from text files" echo " fetch fetch a segment's pages" - echo " fetch2 fetch a segment's pages using Fetcher2 implementation" echo " parse parse a segment's pages" echo " readseg read / dump segment data" echo " mergesegs merge several segments, with optional filtering and slicing" Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=749289&r1=749288&r2=749289&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar 2 12:28:22 2009 @@ -24,7 +24,6 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.nutch.fetcher.Fetcher; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; @@ -36,6 +35,8 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; +import org.apache.nutch.fetcher.Fetcher; + public class Crawl { public static final Log LOG = LogFactory.getLog(Crawl.class); @@ -118,7 +119,7 @@ LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); break; } - fetcher.fetch(segment, threads); // fetch it + fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it if (!Fetcher.isParsing(job)) { parseSegment.parse(segment); // parse it, if needed } Copied: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (from r747319, lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java) URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?p2=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&p1=lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java&r1=747319&r2=749289&rev=749289&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 2 12:28:22 2009 @@ -1,9 +1,10 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -13,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.fetcher; import java.io.IOException; @@ -83,10 +83,16 @@ * * @author Andrzej Bialecki */ -public class Fetcher2 extends Configured implements +public class Fetcher extends Configured implements MapRunnable<Text, CrawlDatum, Text, NutchWritable> { - public static final Log LOG = LogFactory.getLog(Fetcher2.class); + public static final int PERM_REFRESH_TIME = 5; + + public static final String CONTENT_REDIR = "content"; + + public static final String PROTOCOL_REDIR = "protocol"; + + public static final Log LOG = LogFactory.getLog(Fetcher.class); public static class InputFormat extends SequenceFileInputFormat<Text, CrawlDatum> { /** Don't split inputs, to keep things polite. 
*/ @@ -837,9 +843,9 @@ } - public Fetcher2() { super(null); } + public Fetcher() { super(null); } - public Fetcher2(Configuration conf) { super(conf); } + public Fetcher(Configuration conf) { super(conf); } private void updateStatus(int bytesInPage) throws IOException { pages.incrementAndGet(); @@ -953,7 +959,7 @@ FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(InputFormat.class); - job.setMapRunnerClass(Fetcher2.class); + job.setMapRunnerClass(Fetcher.class); FileOutputFormat.setOutputPath(job, segment); job.setOutputFormat(FetcherOutputFormat.class); @@ -992,7 +998,7 @@ if (!parsing) { conf.setBoolean("fetcher.parse", parsing); } - Fetcher2 fetcher = new Fetcher2(conf); // make a Fetcher + Fetcher fetcher = new Fetcher(conf); // make a Fetcher fetcher.fetch(segment, threads, parsing); // run the Fetcher Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=749289&r1=749288&r2=749289&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Mon Mar 2 12:28:22 2009 @@ -97,9 +97,8 @@ long time=System.currentTimeMillis(); //fetch - conf.setBoolean("fetcher.parse", true); Fetcher fetcher=new Fetcher(conf); - fetcher.fetch(generatedSegment, 1); + fetcher.fetch(generatedSegment, 1, true); time=System.currentTimeMillis()-time; @@ -175,7 +174,7 @@ try { conf.setBoolean("fetcher.parse", true); - Fetcher2 fetcher = new Fetcher2(conf); + Fetcher fetcher = new Fetcher(conf); fetcher.fetch(null, 1, false); } catch (IllegalArgumentException iae) { String message = iae.getMessage();