Author: cutting Date: Thu Feb 9 15:20:28 2006 New Revision: 376485 URL: http://svn.apache.org/viewcvs?rev=376485&view=rev Log: Fix for NUTCH-209. Nutch now supplies all code to remote MapReduce daemons through a job jar file. So Hadoop daemons no longer need to be restarted when Nutch code changes.
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Modified: lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/build.xml lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Thu Feb 9 15:20:28 2006 @@ -82,13 +82,13 @@ CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf} CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar -# for developers, add Nutch classes to CLASSPATH -if [ -d "$NUTCH_HOME/build/classes" ]; then - CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes -fi +# for developers, add plugins, job & test code to CLASSPATH if [ -d "$NUTCH_HOME/build/plugins" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build fi +for f in $NUTCH_HOME/build/nutch-*.job; do + CLASSPATH=${CLASSPATH}:$f; +done if [ -d "$NUTCH_HOME/build/test/classes" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes fi @@ -96,14 
+96,14 @@ # so that filenames w/ spaces are handled correctly in loops below IFS= -# for releases, add Nutch jar to CLASSPATH -for f in $NUTCH_HOME/nutch-*.jar; do +# for releases, add Nutch job to CLASSPATH +for f in $NUTCH_HOME/nutch-*.job; do CLASSPATH=${CLASSPATH}:$f; done # add plugins to classpath if [ -d "$NUTCH_HOME/plugins" ]; then - CLASSPATH=${CLASSPATH}:$NUTCH_HOME + CLASSPATH=${NUTCH_HOME}:${CLASSPATH} fi # add libs to CLASSPATH Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Thu Feb 9 15:20:28 2006 @@ -1,6 +1,6 @@ <?xml version="1.0"?> -<project name="Nutch" default="compile"> +<project name="Nutch" default="job"> <!-- Load all the default properties, and any the user wants --> <!-- to contribute (without having to type -D or edit this file --> @@ -100,7 +100,6 @@ <target name="dynamic" depends="generate-src, compile"> </target> - <!-- ================================================================== --> <!-- Make nutch.jar --> <!-- ================================================================== --> @@ -119,6 +118,21 @@ </target> <!-- ================================================================== --> + <!-- Make job jar --> + <!-- ================================================================== --> + <!-- --> + <!-- ================================================================== --> + <target name="job" depends="compile"> + <jar jarfile="${build.dir}/${final.name}.job"> + <zipfileset dir="${build.classes}"/> + <zipfileset dir="${conf.dir}" excludes="*.template"/> + <zipfileset dir="${lib.dir}" prefix="lib" + includes="**/*.jar" excludes="hadoop-*.jar"/> + <zipfileset dir="${build.plugins}" prefix="plugins"/> + </jar> + </target> + + <!-- 
================================================================== --> <!-- Make nutch.war --> <!-- ================================================================== --> <!-- --> @@ -385,7 +399,7 @@ <!-- ================================================================== --> <!-- --> <!-- ================================================================== --> - <target name="package" depends="jar, war, javadoc"> + <target name="package" depends="jar, job, war, javadoc"> <mkdir dir="${dist.dir}"/> <mkdir dir="${dist.dir}/lib"/> <mkdir dir="${dist.dir}/bin"/> @@ -402,7 +416,7 @@ </copy> <copy file="${build.dir}/${final.name}.jar" todir="${dist.dir}"/> - + <copy file="${build.dir}/${final.name}.job" todir="${dist.dir}"/> <copy file="${build.dir}/${final.name}.war" todir="${dist.dir}"/> <copy todir="${dist.dir}/bin"> Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== Binary files - no diff available. 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Feb 9 15:20:28 2006 @@ -31,6 +31,7 @@ import org.apache.nutch.indexer.IndexMerger; import org.apache.nutch.indexer.Indexer; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; public class Crawl { public static final Logger LOG = @@ -52,7 +53,7 @@ Configuration conf = NutchConfiguration.create(); conf.addDefaultResource("crawl-tool.xml"); - JobConf job = new JobConf(conf); + JobConf job = new NutchJob(conf); File rootUrlDir = null; File dir = new File("crawl-" + getDate()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Feb 9 15:20:28 2006 @@ -27,6 +27,7 @@ import org.apache.hadoop.mapred.*; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; /** This class takes a flat file of URLs and adds them to the of pages to be * crawled. Useful for bootstrapping the system. 
*/ @@ -61,7 +62,7 @@ new File(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - JobConf job = new JobConf(config); + JobConf job = new NutchJob(config); job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu Feb 9 15:20:28 2006 @@ -44,6 +44,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + /** * Read utility for the CrawlDB. * @@ -137,7 +139,7 @@ LOG.info("CrawlDb statistics start: " + crawlDb); File tmpFolder = new File(crawlDb, "stat_tmp" + System.currentTimeMillis()); - JobConf job = new JobConf(config); + JobConf job = new NutchJob(config); job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); @@ -224,7 +226,7 @@ LOG.info("CrawlDb db: " + crawlDb); File outFolder = new File(output); - JobConf job = new JobConf(config); + JobConf job = new NutchJob(config); job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Feb 9 15:20:28 2006 @@ -29,6 +29,7 @@ import org.apache.hadoop.mapred.lib.*; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; /** Generates a subset of a crawl db to fetch. */ public class Generator extends Configured { @@ -175,7 +176,7 @@ // map to inverted subset due for fetch, sort by link count LOG.info("Generator: Selecting most-linked urls due for fetch."); - JobConf job = new JobConf(getConf()); + JobConf job = new NutchJob(getConf()); if (numLists == -1) { // for politeness make numLists = job.getNumMapTasks(); // a partition per fetch task @@ -201,7 +202,7 @@ // invert again, paritition by host, sort by url hash LOG.info("Generator: Partitioning selected urls by host, for politeness."); - job = new JobConf(getConf()); + job = new NutchJob(getConf()); job.setInt("partition.url.by.host.seed", new Random().nextInt()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Feb 9 15:20:28 2006 @@ -28,6 +28,7 @@ import org.apache.nutch.net.*; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; /** This class takes a flat file of URLs and adds them to the of pages to be * crawled. Useful for bootstrapping the system. 
*/ @@ -101,7 +102,7 @@ // map text input file to a <url,CrawlDatum> file LOG.info("Injector: Converting injected urls to crawl db entries."); - JobConf sortJob = new JobConf(getConf()); + JobConf sortJob = new NutchJob(getConf()); sortJob.setInputDir(urlDir); sortJob.setMapperClass(InjectMapper.class); sortJob.setReducerClass(InjectReducer.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 9 15:20:28 2006 @@ -29,6 +29,7 @@ import org.apache.nutch.parse.*; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; /** Maintains an inverted link map, listing incoming links for each url. 
*/ public class LinkDb extends Configured implements Mapper, Reducer { @@ -155,7 +156,7 @@ new File(linkDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - JobConf job = new JobConf(config); + JobConf job = new NutchJob(config); job.setInputFormat(SequenceFileInputFormat.class); job.setInputKeyClass(UTF8.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu Feb 9 15:20:28 2006 @@ -27,6 +27,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; import java.util.logging.Logger; @@ -72,7 +73,7 @@ LOG.info("LinkDb db: " + linkdb); File outFolder = new File(output); - JobConf job = new JobConf(config); + JobConf job = new NutchJob(config); job.addInputDir(new File(linkdb, LinkDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb 9 15:20:28 2006 @@ -341,7 +341,7 @@ LOG.info("Fetcher: starting"); LOG.info("Fetcher: segment: " + segment); - JobConf job = new JobConf(getConf()); + JobConf job = new NutchJob(getConf()); job.setInt("fetcher.threads.fetch", threads); job.set(SEGMENT_NAME_KEY, 
segment.getName()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Feb 9 15:20:28 2006 @@ -27,6 +27,7 @@ import org.apache.hadoop.mapred.*; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; import org.apache.lucene.index.IndexReader; import org.apache.lucene.document.Document; @@ -296,7 +297,7 @@ new File("dedup-hash-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - JobConf job = new JobConf(getConf()); + JobConf job = new NutchJob(getConf()); for (int i = 0; i < indexDirs.length; i++) { LOG.info("Dedup: adding indexes in: " + indexDirs[i]); @@ -318,7 +319,7 @@ JobClient.runJob(job); - job = new JobConf(getConf()); + job = new NutchJob(getConf()); job.addInputDir(hashDir); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Feb 9 15:20:28 2006 @@ -31,6 +31,7 @@ import org.apache.nutch.analysis.*; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; @@ -240,7 +241,7 @@ LOG.info("Indexer: starting"); LOG.info("Indexer: linkdb: " + linkDb); - JobConf job = new 
JobConf(getConf()); + JobConf job = new NutchJob(getConf()); for (int i = 0; i < segments.length; i++) { LOG.info("Indexer: adding segment: " + segments[i]); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Feb 9 15:20:28 2006 @@ -85,7 +85,7 @@ LOG.info("Parse: starting"); LOG.info("Parse: segment: " + segment); - JobConf job = new JobConf(getConf()); + JobConf job = new NutchJob(getConf()); job.setInputDir(new File(segment, Content.DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Thu Feb 9 15:20:28 2006 @@ -50,6 +50,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; /** Dump the content of a segment. 
*/ public class SegmentReader extends Configured implements Reducer { @@ -149,7 +150,7 @@ public void reader(File segment) throws IOException { LOG.info("Reader: segment: " + segment); - JobConf job = new JobConf(getConf()); + JobConf job = new NutchJob(getConf()); job.addInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME)); job.addInputDir(new File(segment, CrawlDatum.FETCH_DIR_NAME)); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=376485&r1=376484&r2=376485&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Thu Feb 9 15:20:28 2006 @@ -22,6 +22,7 @@ /** Utility to create Hadoop {@link Configuration}s that include Nutch-specific * resources. */ public class NutchConfiguration { + private NutchConfiguration() {} // singleton // for back-compatibility, add old aliases for these Writable classes // this may be removed after the 0.8 release Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java?rev=376485&view=auto ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Thu Feb 9 15:20:28 2006 @@ -0,0 +1,30 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.JobConf; + +/** A {@link JobConf} for Nutch jobs. */ +public class NutchJob extends JobConf { + + public NutchJob(Configuration conf) { + super(conf, NutchJob.class); + } + +} +