Author: cutting
Date: Thu Feb 9 15:20:28 2006
New Revision: 376485
URL: http://svn.apache.org/viewcvs?rev=376485&view=rev
Log:
Fix for NUTCH-209. Nutch now supplies all code to remote MapReduce daemons
through a job jar file. So Hadoop daemons no longer need to be restarted when
Nutch code changes.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java
Modified:
lucene/nutch/trunk/bin/nutch
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
Modified: lucene/nutch/trunk/bin/nutch
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Thu Feb 9 15:20:28 2006
@@ -82,13 +82,13 @@
CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}
CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
-# for developers, add Nutch classes to CLASSPATH
-if [ -d "$NUTCH_HOME/build/classes" ]; then
- CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes
-fi
+# for developers, add plugins, job & test code to CLASSPATH
if [ -d "$NUTCH_HOME/build/plugins" ]; then
CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
fi
+for f in $NUTCH_HOME/build/nutch-*.job; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
if [ -d "$NUTCH_HOME/build/test/classes" ]; then
CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
fi
@@ -96,14 +96,14 @@
# so that filenames w/ spaces are handled correctly in loops below
IFS=
-# for releases, add Nutch jar to CLASSPATH
-for f in $NUTCH_HOME/nutch-*.jar; do
+# for releases, add Nutch job to CLASSPATH
+for f in $NUTCH_HOME/nutch-*.job; do
CLASSPATH=${CLASSPATH}:$f;
done
# add plugins to classpath
if [ -d "$NUTCH_HOME/plugins" ]; then
- CLASSPATH=${CLASSPATH}:$NUTCH_HOME
+ CLASSPATH=${NUTCH_HOME}:${CLASSPATH}
fi
# add libs to CLASSPATH
Modified: lucene/nutch/trunk/build.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Thu Feb 9 15:20:28 2006
@@ -1,6 +1,6 @@
<?xml version="1.0"?>
-<project name="Nutch" default="compile">
+<project name="Nutch" default="job">
<!-- Load all the default properties, and any the user wants -->
<!-- to contribute (without having to type -D or edit this file -->
@@ -100,7 +100,6 @@
<target name="dynamic" depends="generate-src, compile">
</target>
-
<!-- ================================================================== -->
<!-- Make nutch.jar -->
<!-- ================================================================== -->
@@ -119,6 +118,21 @@
</target>
<!-- ================================================================== -->
+ <!-- Make job jar -->
+ <!-- ================================================================== -->
+ <!-- -->
+ <!-- ================================================================== -->
+ <target name="job" depends="compile">
+ <jar jarfile="${build.dir}/${final.name}.job">
+ <zipfileset dir="${build.classes}"/>
+ <zipfileset dir="${conf.dir}" excludes="*.template"/>
+ <zipfileset dir="${lib.dir}" prefix="lib"
+ includes="**/*.jar" excludes="hadoop-*.jar"/>
+ <zipfileset dir="${build.plugins}" prefix="plugins"/>
+ </jar>
+ </target>
+
+ <!-- ================================================================== -->
<!-- Make nutch.war -->
<!-- ================================================================== -->
<!-- -->
@@ -385,7 +399,7 @@
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
- <target name="package" depends="jar, war, javadoc">
+ <target name="package" depends="jar, job, war, javadoc">
<mkdir dir="${dist.dir}"/>
<mkdir dir="${dist.dir}/lib"/>
<mkdir dir="${dist.dir}/bin"/>
@@ -402,7 +416,7 @@
</copy>
<copy file="${build.dir}/${final.name}.jar" todir="${dist.dir}"/>
-
+ <copy file="${build.dir}/${final.name}.job" todir="${dist.dir}"/>
<copy file="${build.dir}/${final.name}.war" todir="${dist.dir}"/>
<copy todir="${dist.dir}/bin">
Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Feb 9
15:20:28 2006
@@ -31,6 +31,7 @@
import org.apache.nutch.indexer.IndexMerger;
import org.apache.nutch.indexer.Indexer;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
public class Crawl {
public static final Logger LOG =
@@ -52,7 +53,7 @@
Configuration conf = NutchConfiguration.create();
conf.addDefaultResource("crawl-tool.xml");
- JobConf job = new JobConf(conf);
+ JobConf job = new NutchJob(conf);
File rootUrlDir = null;
File dir = new File("crawl-" + getDate());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Feb 9
15:20:28 2006
@@ -27,6 +27,7 @@
import org.apache.hadoop.mapred.*;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
@@ -61,7 +62,7 @@
new File(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- JobConf job = new JobConf(config);
+ JobConf job = new NutchJob(config);
job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu
Feb 9 15:20:28 2006
@@ -44,6 +44,8 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
/**
* Read utility for the CrawlDB.
*
@@ -137,7 +139,7 @@
LOG.info("CrawlDb statistics start: " + crawlDb);
File tmpFolder = new File(crawlDb, "stat_tmp" +
System.currentTimeMillis());
- JobConf job = new JobConf(config);
+ JobConf job = new NutchJob(config);
job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
@@ -224,7 +226,7 @@
LOG.info("CrawlDb db: " + crawlDb);
File outFolder = new File(output);
- JobConf job = new JobConf(config);
+ JobConf job = new NutchJob(config);
job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Feb
9 15:20:28 2006
@@ -29,6 +29,7 @@
import org.apache.hadoop.mapred.lib.*;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
/** Generates a subset of a crawl db to fetch. */
public class Generator extends Configured {
@@ -175,7 +176,7 @@
// map to inverted subset due for fetch, sort by link count
LOG.info("Generator: Selecting most-linked urls due for fetch.");
- JobConf job = new JobConf(getConf());
+ JobConf job = new NutchJob(getConf());
if (numLists == -1) { // for politeness make
numLists = job.getNumMapTasks(); // a partition per fetch task
@@ -201,7 +202,7 @@
// invert again, paritition by host, sort by url hash
LOG.info("Generator: Partitioning selected urls by host, for politeness.");
- job = new JobConf(getConf());
+ job = new NutchJob(getConf());
job.setInt("partition.url.by.host.seed", new Random().nextInt());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Feb 9
15:20:28 2006
@@ -28,6 +28,7 @@
import org.apache.nutch.net.*;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
@@ -101,7 +102,7 @@
// map text input file to a <url,CrawlDatum> file
LOG.info("Injector: Converting injected urls to crawl db entries.");
- JobConf sortJob = new JobConf(getConf());
+ JobConf sortJob = new NutchJob(getConf());
sortJob.setInputDir(urlDir);
sortJob.setMapperClass(InjectMapper.class);
sortJob.setReducerClass(InjectReducer.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 9
15:20:28 2006
@@ -29,6 +29,7 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
/** Maintains an inverted link map, listing incoming links for each url. */
public class LinkDb extends Configured implements Mapper, Reducer {
@@ -155,7 +156,7 @@
new File(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- JobConf job = new JobConf(config);
+ JobConf job = new NutchJob(config);
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(UTF8.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu
Feb 9 15:20:28 2006
@@ -27,6 +27,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
import java.util.logging.Logger;
@@ -72,7 +73,7 @@
LOG.info("LinkDb db: " + linkdb);
File outFolder = new File(output);
- JobConf job = new JobConf(config);
+ JobConf job = new NutchJob(config);
job.addInputDir(new File(linkdb, LinkDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb
9 15:20:28 2006
@@ -341,7 +341,7 @@
LOG.info("Fetcher: starting");
LOG.info("Fetcher: segment: " + segment);
- JobConf job = new JobConf(getConf());
+ JobConf job = new NutchJob(getConf());
job.setInt("fetcher.threads.fetch", threads);
job.set(SEGMENT_NAME_KEY, segment.getName());
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
Thu Feb 9 15:20:28 2006
@@ -27,6 +27,7 @@
import org.apache.hadoop.mapred.*;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;
@@ -296,7 +297,7 @@
new File("dedup-hash-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- JobConf job = new JobConf(getConf());
+ JobConf job = new NutchJob(getConf());
for (int i = 0; i < indexDirs.length; i++) {
LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
@@ -318,7 +319,7 @@
JobClient.runJob(job);
- job = new JobConf(getConf());
+ job = new NutchJob(getConf());
job.addInputDir(hashDir);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Feb
9 15:20:28 2006
@@ -31,6 +31,7 @@
import org.apache.nutch.analysis.*;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -240,7 +241,7 @@
LOG.info("Indexer: starting");
LOG.info("Indexer: linkdb: " + linkDb);
- JobConf job = new JobConf(getConf());
+ JobConf job = new NutchJob(getConf());
for (int i = 0; i < segments.length; i++) {
LOG.info("Indexer: adding segment: " + segments[i]);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu
Feb 9 15:20:28 2006
@@ -85,7 +85,7 @@
LOG.info("Parse: starting");
LOG.info("Parse: segment: " + segment);
- JobConf job = new JobConf(getConf());
+ JobConf job = new NutchJob(getConf());
job.setInputDir(new File(segment, Content.DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Thu
Feb 9 15:20:28 2006
@@ -50,6 +50,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
/** Dump the content of a segment. */
public class SegmentReader extends Configured implements Reducer {
@@ -149,7 +150,7 @@
public void reader(File segment) throws IOException {
LOG.info("Reader: segment: " + segment);
- JobConf job = new JobConf(getConf());
+ JobConf job = new NutchJob(getConf());
job.addInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
job.addInputDir(new File(segment, CrawlDatum.FETCH_DIR_NAME));
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=376485&r1=376484&r2=376485&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
Thu Feb 9 15:20:28 2006
@@ -22,6 +22,7 @@
/** Utility to create Hadoop {@link Configuration}s that include
Nutch-specific
* resources. */
public class NutchConfiguration {
+ private NutchConfiguration() {} // singleton
// for back-compatibility, add old aliases for these Writable classes
// this may be removed after the 0.8 release
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java?rev=376485&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Thu Feb 9
15:20:28 2006
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
+/** A {@link JobConf} for Nutch jobs. */
+public class NutchJob extends JobConf {
+
+ public NutchJob(Configuration conf) {
+ super(conf, NutchJob.class);
+ }
+
+}
+