Author: lewismc
Date: Sun Sep 20 12:50:51 2015
New Revision: 1704128

URL: http://svn.apache.org/viewvc?rev=1704128&view=rev
Log:
NUTCH-1946 Upgrade to Gora 0.6.1

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/ivy/ivy.xml
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
    nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
    nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
    nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
    nutch/branches/2.x/src/test/crawl-tests.xml
    nutch/branches/2.x/src/test/gora.properties
    nutch/branches/2.x/src/test/nutch-site.xml
    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
    nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
    nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
    nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
    nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sun Sep 20 12:50:51 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1946 Upgrade to Gora 0.6.1 (lewismc, hsaputra, Jeroen Vlek)
+
 * NUTCH-2094 Stopping and Restarting a crawl has issues in the Web UI (Prerna 
Satija via mattmann)
 
 * NUTCH-1679 UpdateDb using batchId, link may override crawled page (Tien 
Nguyen Manh, Koen Smets, Alfonso Nishikawa, Alexander Kingson via lewismc)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Sun Sep 20 12:50:51 2015
@@ -418,13 +418,13 @@
 
 <property>
   <name>db.fetch.schedule.adaptive.min_interval</name>
-  <value>60.0</value>
+  <value>60</value>
   <description>Minimum fetchInterval, in seconds.</description>
 </property>
 
 <property>
   <name>db.fetch.schedule.adaptive.max_interval</name>
-  <value>31536000.0</value>
+  <value>31536000</value>
   <description>Maximum fetchInterval, in seconds (365 days).
   NOTE: this is limited by db.fetch.interval.max. Pages with
   fetchInterval larger than db.fetch.interval.max
@@ -890,7 +890,7 @@
 
 <property>
   <name>plugin.folders</name>
-  <value>plugins</value>
+  <value>/usr/local/2scot/build/plugins</value>
   <description>Directories where nutch plugins are located.  Each
   element may be a relative or absolute path.  If absolute, it is used
   as is.  If relative, it is searched for on the classpath.</description>
@@ -1198,6 +1198,16 @@
   </description>
 </property>
 
+<property>
+  <name>io.serializations</name>
+  
<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+  <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+  <description>A list of serialization classes that can be used for
+  obtaining serializers and deserializers.</description>
+</property>
+
 <!-- solr index properties -->
 
 <property>
@@ -1323,6 +1333,12 @@
     
   org.apache.gora.memory.store.MemStore
     Gora class for storing data in a Memory based implementation for tests.
+
+  org.apache.gora.mongodb.store.MongoStore
+    Gora class for storing data in MongoDB.
+
+  org.apache.gora.solr.store.SolrStore
+    Gora class for storing data in Apache Solr.
   </description>
 </property>
 

Modified: nutch/branches/2.x/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Sun Sep 20 12:50:51 2015
@@ -46,13 +46,19 @@
     <dependency org="commons-codec" name="commons-codec" rev="1.3"
       conf="*->default" />
 
-    <dependency org="org.apache.hadoop" name="hadoop-core"
-      rev="1.2.0" conf="*->default">
+    <!-- Hadoop Dependencies -->
+    <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.5.2" 
conf="*->default">
+      <exclude org="hsqldb" name="hsqldb" />
       <exclude org="net.sf.kosmosfs" name="kfs" />
       <exclude org="net.java.dev.jets3t" name="jets3t" />
       <exclude org="org.eclipse.jdt" name="core" />
       <exclude org="org.mortbay.jetty" name="jsp-*" />
+      <exclude org="ant" name="ant" />
     </dependency>
+    <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.5.2" 
conf="*->default"/>
+    <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" 
rev="2.5.2" conf="*->default"/>
+    <dependency org="org.apache.hadoop" 
name="hadoop-mapreduce-client-jobclient" rev="2.5.2" conf="*->default"/>
+    <!-- End of Hadoop Dependencies -->
 
     <dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
     <dependency org="org.apache.tika" name="tika-core" rev="1.10" />
@@ -70,31 +76,21 @@
     <dependency org="com.google.guava" name="guava" rev="11.0.2" />
     <dependency org="com.google.code.crawler-commons" name="crawler-commons" 
rev="0.5" />
     
-       <dependency org="org.restlet.jse" name="org.restlet" rev="2.2.3" 
conf="*->default" />
-    <dependency org="org.restlet.jse" name="org.restlet.ext.jackson" 
rev="2.2.3" 
-      conf="*->default" />
-    <dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3" 
-      conf="*->default" />
-    <!--Configuration: test -->
+    <dependency org="org.restlet.jse" name="org.restlet" rev="2.2.3" 
conf="*->default" />
+    <dependency org="org.restlet.jse" name="org.restlet.ext.jackson" 
rev="2.2.3" conf="*->default" />
+    <dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3" 
conf="*->default" />
 
     <!--artifacts needed for testing -->
     <dependency org="junit" name="junit" rev="4.11" conf="*->default" />
 
-    <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" 
conf="test->default">
-      <exclude org="net.sf.kosmosfs" name="kfs" />
-      <exclude org="net.java.dev.jets3t" name="jets3t" />
-      <exclude org="org.eclipse.jdt" name="core" />
-      <exclude org="org.mortbay.jetty" name="jsp-*" />
-    </dependency>
-
     <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26" 
conf="test->default" />
     <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.26" 
conf="test->default" />
     <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.26" />
 
     <dependency org="org.hsqldb" name="hsqldb" rev="2.2.8" conf="*->default" />
     <dependency org="org.jdom" name="jdom" rev="1.1" conf="test->default"/>
-       <dependency org="org.mockito" name="mockito-all" rev="1.9.5" 
conf="test->default"/>
-       <dependency org="org.springframework" name="spring-test" 
rev="4.0.4.RELEASE" conf="test->default"/>
+    <dependency org="org.mockito" name="mockito-all" rev="1.9.5" 
conf="test->default"/>
+    <dependency org="org.springframework" name="spring-test" 
rev="4.0.4.RELEASE" conf="test->default"/>
        
 
     <!--================-->
@@ -103,11 +99,11 @@
     <!-- N.B. To use Gora SNAPSHOT's merely replace the 'ref' value with the 
SNAPSHOT version 
     and add changing="true" alongside the dependency declaration. An example 
has been
     provided for the gora-core dependency as below -->
-    <dependency org="org.apache.gora" name="gora-core" rev="0.5" 
conf="*->default"/>
+    <dependency org="org.apache.gora" name="gora-core" rev="0.6.1" 
conf="*->default"/>
     
     <!-- Uncomment this to use SQL as Gora backend. It should be noted that 
the 
     gora-sql 0.1.1-incubating artifact is NOT compatable with gora-core 0.3. 
Users should 
-    downgrade to gora-core 0.2.1 in order to use SQL as a backend. -->
+    downgrade to gora-core 0.2.1 in order to use SQL as a backend however this 
is not suggested. -->
     <!--
     <dependency org="org.apache.gora" name="gora-sql" rev="0.1.1-incubating" 
conf="*->default" />
     -->
@@ -117,29 +113,29 @@
     -->
     <!-- Uncomment this to use HBase as Gora backend. -->
     <!--     
-    <dependency org="org.apache.gora" name="gora-hbase" rev="0.5" 
conf="*->default" /> 
+    <dependency org="org.apache.gora" name="gora-hbase" rev="0.6.1" 
conf="*->default" /> 
     -->
     <!-- Uncomment this to use Accumulo as Gora backend. -->
     <!--
-    <dependency org="org.apache.gora" name="gora-accumulo" rev="0.5" 
conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-accumulo" rev="0.6.1" 
conf="*->default" />
     -->
     <!-- Uncomment this to use Cassandra as Gora backend. -->
     <!-- 
-    <dependency org="org.apache.gora" name="gora-cassandra" rev="0.5" 
conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-cassandra" rev="0.6.1" 
conf="*->default" />
     -->
     <!-- Uncomment this to use MongoDB as Gora backend. -->
     <!--
-    <dependency org="org.apache.gora" name="gora-mongodb" rev="0.5" 
conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-mongodb" rev="0.6.1" 
conf="*->default" />
     -->    
     <!-- Uncomment this to use Solr as Gora backend. -->
     <!--
-    <dependency org="org.apache.gora" name="gora-solr" rev="0.5" 
conf="*->default" />
+    <dependency org="org.apache.gora" name="gora-solr" rev="0.6.1" 
conf="*->default" />
     -->
     <!-- The gora-compiler is used within the 'ant generate-gora-src' target 
to compile
     the Gora .avsc files within ./src/gora 
     -->
-    <dependency org="org.apache.gora" name="gora-compiler-cli" rev="0.5" 
conf="*->default"/>
-    <dependency org="org.apache.gora" name="gora-compiler" rev="0.5" 
conf="*->default"/>
+    <dependency org="org.apache.gora" name="gora-compiler-cli" rev="0.6.1" 
conf="*->default"/>
+    <dependency org="org.apache.gora" name="gora-compiler" rev="0.6.1" 
conf="*->default"/>
 
     <!-- web app dependencies -->
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Sun 
Sep 20 12:50:51 2015
@@ -25,9 +25,7 @@ import java.util.Map.Entry;
 import org.apache.avro.util.Utf8;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.storage.Mark;
-import org.apache.nutch.util.NutchJob;
 import org.slf4j.Logger;
-import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.scoring.ScoreDatum;
 import org.apache.nutch.scoring.ScoringFilterException;
@@ -45,6 +43,7 @@ public class DbUpdateMapper extends
 
   private final List<ScoreDatum> scoreData = new ArrayList<ScoreDatum>();
 
+  @SuppressWarnings("unused")
   private Utf8 batchId;
 
   // reuse writables

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Sun 
Sep 20 12:50:51 2015
@@ -24,7 +24,6 @@ import java.util.Map;
 import org.apache.avro.util.Utf8;
 import org.apache.gora.filter.FilterOp;
 import org.apache.gora.filter.MapFieldValueFilter;
-import org.apache.gora.filter.SingleFieldValueFilter;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -92,7 +91,7 @@ public class DbUpdaterJob extends NutchT
     HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
     fields.addAll(scoringFilters.getFields());
 
-    currentJob = new NutchJob(getConf(), "update-table");
+    currentJob = NutchJob.getInstance(getConf(), "update-table");
     if (crawlId != null) {
       currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
     }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Sun 
Sep 20 12:50:51 2015
@@ -167,6 +167,12 @@ public class GeneratorJob extends NutchT
     String batchId = (String) args.get(Nutch.ARG_BATCH);
     if (batchId != null) {
       getConf().set(GeneratorJob.BATCH_ID, batchId);
+    } else {
+      // generate batchId
+      long curTime = System.currentTimeMillis();
+      int randomSeed = Math.abs(new Random().nextInt());
+      batchId = (curTime / 1000) + "-" + randomSeed;
+      getConf().set(BATCH_ID, batchId);
     }
 
     // map to inverted subset due for fetch, sort by score
@@ -209,7 +215,7 @@ public class GeneratorJob extends NutchT
     }
     numJobs = 1;
     currentJobNum = 0;
-    currentJob = new NutchJob(getConf(), "generate: " + 
getConf().get(BATCH_ID));
+    currentJob = NutchJob.getInstance(getConf(), "generate: " + 
getConf().get(BATCH_ID));
     Collection<WebPage.Field> fields = getFields(currentJob);
     StorageUtils.initMapperJob(currentJob, fields, SelectorEntry.class,
         WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class,

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java Sun 
Sep 20 12:50:51 2015
@@ -16,7 +16,6 @@
  
******************************************************************************/
 package org.apache.nutch.crawl;
 
-import org.apache.avro.util.Utf8;
 import org.apache.gora.mapreduce.GoraMapper;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -31,8 +30,6 @@ import org.apache.nutch.util.TableUtil;
 
 import java.io.IOException;
 import java.net.MalformedURLException;
-import java.nio.ByteBuffer;
-import java.util.HashMap;
 
 public class GeneratorMapper extends
     GoraMapper<String, WebPage, SelectorEntry, WebPage> {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java 
Sun Sep 20 12:50:51 2015
@@ -23,9 +23,6 @@ import java.util.Map;
 
 import org.apache.avro.util.Utf8;
 import org.apache.gora.mapreduce.GoraReducer;
-import org.apache.gora.query.Query;
-import org.apache.gora.query.Result;
-import org.apache.gora.store.DataStore;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
 import org.apache.nutch.fetcher.FetcherJob.FetcherMapper;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java Sun Sep 
20 12:50:51 2015
@@ -208,7 +208,7 @@ public class InjectorJob extends NutchTo
     }
     numJobs = 1;
     currentJobNum = 0;
-    currentJob = new NutchJob(getConf(), "inject " + input);
+    currentJob = NutchJob.getInstance(getConf(), "inject " + input);
     FileInputFormat.addInputPath(currentJob, input);
     currentJob.setMapperClass(UrlMapper.class);
     currentJob.setMapOutputKeyClass(String.class);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Sun 
Sep 20 12:50:51 2015
@@ -314,7 +314,7 @@ public class WebTableReader extends Nutc
     }
 
     Path outFolder = new Path(output);
-    Job job = new NutchJob(getConf(), "db_dump");
+    Job job = NutchJob.getInstance(getConf(), "db_dump");
     Configuration cfg = job.getConfiguration();
     cfg.set(WebTableRegexMapper.regexParamName, regex);
     cfg.setBoolean(WebTableRegexMapper.contentParamName, content);
@@ -339,6 +339,7 @@ public class WebTableReader extends Nutc
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(Text.class);
 
+    @SuppressWarnings("unused")
     boolean success = job.waitForCompletion(true);
 
     if (LOG.isInfoEnabled()) {
@@ -540,7 +541,7 @@ public class WebTableReader extends Nutc
         + "stat_tmp" + System.currentTimeMillis());
 
     numJobs = 1;
-    currentJob = new NutchJob(getConf(), "db_stats");
+    currentJob = NutchJob.getInstance(getConf(), "db_stats");
 
     currentJob.getConfiguration().setBoolean(
         "mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Sun 
Sep 20 12:50:51 2015
@@ -18,13 +18,10 @@ package org.apache.nutch.fetcher;
 
 import java.io.IOException;
 import java.text.SimpleDateFormat;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Random;
-import java.util.StringTokenizer;
-
 import org.apache.avro.util.Utf8;
 import org.apache.gora.filter.FilterOp;
 import org.apache.gora.filter.MapFieldValueFilter;
@@ -188,7 +185,7 @@ public class FetcherJob extends NutchToo
     LOG.info("FetcherJob : timelimit set for : "
         + getConf().getLong("fetcher.timelimit", -1));
     numJobs = 1;
-    currentJob = new NutchJob(getConf(), "fetch");
+    currentJob = NutchJob.getInstance(getConf(), "fetch");
 
     // for politeness, don't permit parallel execution of a single task
     currentJob.setReduceSpeculativeExecution(false);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java Sun 
Sep 20 12:50:51 2015
@@ -94,7 +94,7 @@ public class HostDbUpdateJob implements
       FIELDS.add(WebPage.Field.OUTLINKS);
     }
 
-    NutchJob job = new NutchJob(getConf(), "hostdb-update");
+    NutchJob job = NutchJob.getInstance(getConf(), "hostdb-update");
 
     // === Map ===
     DataStore<String, WebPage> pageStore = StorageUtils.createWebStore(

Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java Sun 
Sep 20 12:50:51 2015
@@ -140,7 +140,7 @@ public class HostInjectorJob implements
   public boolean inject(Path hostDir) throws Exception {
     LOG.info("HostInjectorJob: starting");
     LOG.info("HostInjectorJob: hostDir: " + hostDir);
-    Job job = new NutchJob(getConf(), "inject-hosts " + hostDir);
+    Job job = NutchJob.getInstance(getConf(), "inject-hosts " + hostDir);
     FileInputFormat.addInputPath(job, hostDir);
     job.setMapperClass(UrlMapper.class);
     job.setMapOutputKeyClass(String.class);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java Sun 
Sep 20 12:50:51 2015
@@ -99,7 +99,6 @@ public class CleaningJob extends NutchTo
   public static class CleanReducer extends
       Reducer<String, WebPage, NullWritable, NullWritable> {
     private int numDeletes = 0;
-    private static final int NUM_MAX_DELETE_REQUEST = 1000;
     private boolean commit;
     IndexWriters writers = null;
 
@@ -135,7 +134,7 @@ public class CleaningJob extends NutchTo
   @Override
   public Map<String, Object> run(Map<String, Object> args) throws Exception {
     getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
-    currentJob = new NutchJob(getConf(), "CleaningJob");
+    currentJob = NutchJob.getInstance(getConf(), "CleaningJob");
     currentJob.getConfiguration().setClass(
         "mapred.output.key.comparator.class", StringComparator.class,
         RawComparator.class);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java Sun 
Sep 20 12:50:51 2015
@@ -139,7 +139,7 @@ public class IndexingJob extends NutchTo
     Configuration conf = getConf();
     conf.set(GeneratorJob.BATCH_ID, batchId);
 
-    Job job = new NutchJob(conf, "Indexer");
+    Job job = NutchJob.getInstance(conf, "Indexer");
     // TODO: Figure out why this needs to be here
     job.getConfiguration().setClass("mapred.output.key.comparator.class",
         StringComparator.class, RawComparator.class);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
 Sun Sep 20 12:50:51 2015
@@ -42,6 +42,7 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -370,7 +371,7 @@ public class SolrDeleteDuplicates
 
     getConf().set(SolrConstants.SERVER_URL, solrUrl);
 
-    Job job = new Job(getConf(), "solrdedup");
+    Job job = NutchJob.getInstance(getConf(), "solrdedup");
 
     job.setInputFormatClass(SolrInputFormat.class);
     job.setOutputFormatClass(NullOutputFormat.class);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Sun Sep 20 12:50:51 2015
@@ -142,113 +142,4 @@ public class OutlinkExtractor {
     return retval;
   }
 
-  /**
-   * Extracts outlinks from a plain text. <br />
-   * This Method takes the Jakarta Regexp API.
-   * 
-   * @param plainText
-   * 
-   * @return Array of <code>Outlink</code> s within found in plainText
-   * @deprecated only for tests
-   */
-  @Deprecated
-  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
-
-    throw new UnsupportedOperationException(
-        "Implementation commented out. Please uncomment to use it.");
-
-    // final List outlinks = new ArrayList();
-    // String url;
-    // Outlink link;
-    //
-    // RE re = new RE(URL_PATTERN);
-    //
-    // int pos = 0;
-    //
-    // while (re.match(plainText, pos)) {
-    //
-    // url = re.getParen(0);
-    //
-    // if (LOG.isTraceEnabled()) {
-    // LOG.trace("Extracted url: " + url);
-    // }
-    //
-    // try {
-    //
-    // link = new Outlink(url, null);
-    // outlinks.add(link);
-    //
-    // } catch (MalformedURLException ex) {
-    // // if it is a malformed URL we just throw it away and continue with
-    // // extraction.
-    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    // }
-    //
-    // pos = re.getParenEnd(0);
-    // }
-    //
-    // final Outlink[] retval;
-    //
-    // if (pos > 0) {
-    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    // } else {
-    // retval = new Outlink[0];
-    // }
-    //
-    // return retval;
-
-  }
-
-  /**
-   * Extracts outlinks from a plain text. </p> This Method takes the JDK5 
Regexp
-   * API.
-   * 
-   * @param plainText
-   * 
-   * @return Array of <code>Outlink</code> s within found in plainText
-   * @deprecated only for tests
-   */
-  @Deprecated
-  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
-
-    throw new UnsupportedOperationException(
-        "Implementation commented out. Please uncomment to use it.");
-
-    // final List outlinks = new ArrayList();
-    // String url;
-    // Outlink link;
-    //
-    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
-    // final RE re = new RE(urlPattern);
-    //
-    // int pos = 0;
-    //
-    // while (re.match(plainText, pos)) {
-    //
-    // url = re.getParen(0);
-    //
-    // try {
-    //
-    // link = new Outlink(url, null);
-    // outlinks.add(link);
-    // } catch (MalformedURLException ex) {
-    // // if it is a malformed URL we just throw it away and continue with
-    // // extraction.
-    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    // }
-    //
-    // pos = re.getParenEnd(0);
-    // }
-    //
-    // final Outlink[] retval;
-    //
-    // if (pos > 0) {
-    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    // } else {
-    // retval = new Outlink[0];
-    // }
-    //
-    // return retval;
-  }
-
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java 
Sun Sep 20 12:50:51 2015
@@ -16,8 +16,6 @@
  
******************************************************************************/
 package org.apache.nutch.parse;
 
-import java.util.HashMap;
-
 public interface ParseStatusCodes {
   // Primary status codes:
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java 
Sun Sep 20 12:50:51 2015
@@ -16,7 +16,6 @@
  
******************************************************************************/
 package org.apache.nutch.parse;
 
-import org.apache.avro.generic.GenericArray;
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.storage.ParseStatus;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Sun Sep 
20 12:50:51 2015
@@ -33,7 +33,6 @@ import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.crawl.URLWebPage;
 import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.storage.Mark;
@@ -49,7 +48,6 @@ import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.nutch.util.ToolUtil;
 import org.apache.gora.filter.FilterOp;
-import org.apache.gora.filter.SingleFieldValueFilter;
 import org.apache.gora.mapreduce.GoraMapper;
 
 public class ParserJob extends NutchTool implements Tool {
@@ -250,7 +248,7 @@ public class ParserJob extends NutchTool
     } else {
       LOG.info("ParserJob: batchId:\t" + batchId);
     }
-    currentJob = new NutchJob(getConf(), "parse");
+    currentJob = NutchJob.getInstance(getConf(), "parse");
 
     Collection<WebPage.Field> fields = getFields(currentJob);
     MapFieldValueFilter<String, WebPage> batchIdFilter = 
getBatchIdFilter(batchId);

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java 
Sun Sep 20 12:50:51 2015
@@ -16,7 +16,6 @@
  
******************************************************************************/
 package org.apache.nutch.protocol;
 
-import org.apache.avro.generic.GenericArray;
 import org.apache.avro.util.Utf8;
 import org.apache.nutch.storage.ProtocolStatus;
 import org.apache.nutch.util.TableUtil;

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
Sun Sep 20 12:50:51 2015
@@ -22,7 +22,6 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.LineNumberReader;
 import java.net.URL;
-import java.util.ArrayList;
 import java.util.Hashtable;
 import java.util.StringTokenizer;
 
@@ -33,8 +32,6 @@ import org.slf4j.LoggerFactory;
 // Nutch imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.Text;
-
 import com.google.common.io.Files;
 
 import crawlercommons.robots.BaseRobotRules;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java Sun 
Sep 20 12:50:51 2015
@@ -91,7 +91,7 @@ public class StorageUtils {
   public static <K, V extends Persistent> Class<? extends DataStore<K, V>> 
getDataStoreClass(
       Configuration conf) throws ClassNotFoundException {
     return (Class<? extends DataStore<K, V>>) Class.forName(conf.get(
-        "storage.data.store.class", "org.apache.gora.sql.store.SqlStore"));
+        "storage.data.store.class", "org.apache.gora.memory.store.MemStore"));
   }
 
   public static <K, V> void initMapperJob(Job job,

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java Sun Sep 
20 12:50:51 2015
@@ -186,7 +186,7 @@ public class Benchmark extends Configure
     conf.setInt(GeneratorJob.GENERATOR_MAX_COUNT, maxPerHost);
     conf.set(GeneratorJob.GENERATOR_COUNT_MODE,
         GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
-    Job job = new NutchJob(conf);
+    Job job = NutchJob.getInstance(conf, "Benchmark");
     FileSystem fs = FileSystem.get(job.getConfiguration());
     Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-"
         + System.currentTimeMillis());

Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java Sun Sep 
20 12:50:51 2015
@@ -100,8 +100,7 @@ public class DmozParser {
    */
   private class RDFProcessor extends DefaultHandler {
     String curURL = null, curSection = null;
-    boolean titlePending = false, descPending = false,
-        insideAdultSection = false;
+    boolean titlePending = false, descPending = false;
     Pattern topicPattern = null;
     StringBuffer title = new StringBuffer(), desc = new StringBuffer();
     XMLReader reader;

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java 
Sun Sep 20 12:50:51 2015
@@ -147,7 +147,6 @@ public class FakeHandler extends Abstrac
       baseDomain = u.getHost();
       // chop off the TLD
       int pos = baseDomain.lastIndexOf('.');
-      String tld = baseDomain.substring(pos);
       baseDomain = baseDomain.substring(0, pos);
       String link;
       // external links

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java Sun Sep 
20 12:50:51 2015
@@ -43,7 +43,7 @@ public class HadoopFSUtil {
     return new PathFilter() {
       public boolean accept(final Path path) {
         try {
-          return fs.getFileStatus(path).isDir();
+          return fs.getFileStatus(path).isDirectory();
         } catch (IOException ioe) {
           return false;
         }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java Sun Sep 20 
12:50:51 2015
@@ -48,7 +48,7 @@ public class LockUtil {
     if (fs.exists(lockFile)) {
       if (!accept)
         throw new IOException("lock file " + lockFile + " already exists.");
-      if (fs.getFileStatus(lockFile).isDir())
+      if (fs.getFileStatus(lockFile).isDirectory())
         throw new IOException("lock file " + lockFile
             + " already exists and is a directory.");
       // do nothing - the file already exists.
@@ -76,7 +76,7 @@ public class LockUtil {
       throws IOException {
     if (!fs.exists(lockFile))
       return false;
-    if (fs.getFileStatus(lockFile).isDir())
+    if (fs.getFileStatus(lockFile).isDirectory())
       throw new IOException("lock file " + lockFile
           + " exists but is a directory!");
     return fs.delete(lockFile, false);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java Sun Sep 20 
12:50:51 2015
@@ -28,11 +28,26 @@ import java.io.IOException;
 /** A {@link Job} for Nutch jobs. */
 public class NutchJob extends Job {
 
+  /**
+   * 
+   * @param conf
+   * @throws IOException
+   * @deprecated use {@link NutchJob#getInstance(Configuration)}
+   */
+  @Deprecated
   public NutchJob(Configuration conf) throws IOException {
     super(conf);
-    setJarByClass(this.getClass());
+    super.setJarByClass(this.getClass());
   }
 
+  /**
+   * 
+   * @param conf
+   * @param jobName
+   * @throws IOException
+   * @deprecated use {@link NutchJob#getInstance(Configuration, String)}
+   */
+  @Deprecated
   public NutchJob(Configuration conf, String jobName) throws IOException {
     super(conf, jobName);
     // prefix jobName with crawlId if not empty
@@ -41,7 +56,57 @@ public class NutchJob extends Job {
       jobName = "[" + crawlId + "]" + jobName;
       setJobName(jobName);
     }
-    setJarByClass(this.getClass());
+    super.setJarByClass(this.getClass());
+  }
+  
+  /**
+   * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster} and a
+   * given {@link org.apache.hadoop.conf.Configuration}.
+   * 
+   * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so
+   * that any necessary internal modifications do not reflect on the incoming 
+   * parameter.
+   * 
+   * A Cluster will be created from the conf parameter only when it's needed.
+   * 
+   * This code heavily mimics that of Hadoop's.
+   * 
+   * @param conf the configuration
+   * @return the {@link NutchJob} , with no connection to a cluster yet.
+   * @throws IOException
+   */
+  public static NutchJob getInstance(Configuration conf) throws IOException {
+    // create with a null Cluster
+    NutchJobConf jobConf = new NutchJobConf(conf);
+    return new NutchJob(jobConf);
+  }
+
+      
+  /**
+   * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster}
+   * and a given jobName.
+   * A Cluster will be created from the conf parameter only when it's needed.
+   *
+   * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so
+   * that any necessary internal modifications do not reflect on the incoming 
+   * parameter.
+   * 
+   * @param conf the configuration
+   * @param jobName the name given to this NutchJob
+   * @return the {@link NutchJob} , with no connection to a cluster yet.
+   * @throws IOException
+   */
+  public static NutchJob getInstance(Configuration conf, String jobName)
+           throws IOException {
+    // create with a null Cluster
+    NutchJob result = getInstance(conf);
+    // prefix jobName with crawlId if not empty
+    String crawlId = conf.get("storage.crawl.id");
+    if (!StringUtils.isEmpty(crawlId)) {
+      jobName = "[" + crawlId + "]" + jobName;
+      result.setJobName(jobName);
+    }
+    return result;
   }
 
   @Override

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java Sun Sep 20 
12:50:51 2015
@@ -16,7 +16,6 @@
  
******************************************************************************/
 package org.apache.nutch.util;
 
-import org.apache.avro.util.Utf8;
 import org.apache.commons.lang.StringUtils;
 
 import java.net.MalformedURLException;

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java 
Sun Sep 20 12:50:51 2015
@@ -101,7 +101,7 @@ public class DomainStatistics extends Co
     long start = System.currentTimeMillis();
     LOG.info("DomainStatistics: starting at " + sdf.format(start));
 
-    Job job = new NutchJob(getConf(), "Domain statistics");
+    Job job = NutchJob.getInstance(getConf(), "Domain statistics");
     DataStore<String, WebPage> store = StorageUtils.createWebStore(
         job.getConfiguration(), String.class, WebPage.class);
 

Modified: nutch/branches/2.x/src/test/crawl-tests.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/crawl-tests.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/crawl-tests.xml (original)
+++ nutch/branches/2.x/src/test/crawl-tests.xml Sun Sep 20 12:50:51 2015
@@ -38,14 +38,24 @@
   <value>true</value>
 </property>
 
-<property>                                                                     
                                                                              
-  <name>http.robots.agents</name>                                              
                                                                              
-  <value>test-nutch,*</value>                                                  
                                                                              
-  <description>The agent strings we'll look for in robots.txt files,           
                                                                              
-  comma-separated, in decreasing order of precedence. You should               
                                                                              
-  put the value of http.agent.name as the first agent name, and keep the       
                                                                              
-  default * at the end of the list. E.g.: BlurflDev,Blurfl,*                   
                                                                              
-  </description>                                                               
                                                                              
+<property>                             
+  <name>http.robots.agents</name>                               
+  <value>test-nutch,*</value>                  
+  <description>The agent strings we'll look for in robots.txt files, 
+    comma-separated, in decreasing order of precedence. You should
+    put the value of http.agent.name as the first agent name, and keep the
+    default * at the end of the list. E.g.: BlurflDev,Blurfl,*                 
                                                                                
+  </description>                           
+</property>
+
+<property>
+  <name>io.serializations</name>
+  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+  <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+  <description>A list of serialization classes that can be used for
+  obtaining serializers and deserializers.</description>
 </property>
 
 </configuration>

Modified: nutch/branches/2.x/src/test/gora.properties
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/gora.properties?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/gora.properties (original)
+++ nutch/branches/2.x/src/test/gora.properties Sun Sep 20 12:50:51 2015
@@ -15,11 +15,8 @@
 
 
 ###############################
-# Default SqlStore properties #
+# Default MemStore properties #
 ###############################
 
-gora.sqlstore.jdbc.driver=org.hsqldb.jdbc.JDBCDriver
-gora.sqlstore.jdbc.url=jdbc:hsqldb:mem:test
-gora.sqlstore.jdbc.user=sa
-gora.sqlstore.jdbc.password=
+gora.datastore.default=org.apache.gora.memory.store.MemStore
 

Modified: nutch/branches/2.x/src/test/nutch-site.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/nutch-site.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/nutch-site.xml (original)
+++ nutch/branches/2.x/src/test/nutch-site.xml Sun Sep 20 12:50:51 2015
@@ -18,8 +18,8 @@
 
 <property>
   <name>storage.data.store.class</name>
-  <value>org.apache.gora.sql.store.SqlStore</value>
-  <description>Default class for storing data</description>
+  <value>org.apache.gora.memory.store.MemStore</value>
+  <description>Default in-memory datastore class for temp test data.</description>
 </property>
 
 </configuration>

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Sun 
Sep 20 12:50:51 2015
@@ -37,9 +37,13 @@ import java.util.Comparator;
 import static org.junit.Assert.assertEquals;
 
 /**
- * Basic generator test. 1. Insert entries in webtable 2. Generates entries to
- * fetch 3. Verifies that number of generated urls match 4. Verifies that
- * highest scoring urls are generated
+ * Basic generator test. 
+ * <ol>
+ * <li>Insert entries in webtable</li>
+ * <li>Generates entries to fetch</li>
+ * <li>Verifies that number of generated urls match, and finally </li>
+ * <li>Verifies that highest scoring urls are generated.</li>
+ * </ol>
  * 
  */
 public class TestGenerator extends AbstractNutchTest {
@@ -67,7 +71,7 @@ public class TestGenerator extends Abstr
    * @throws Exception
    */
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+  @Ignore("GORA-240 Tests for MemStore")
   public void testGenerateHighest() throws Exception {
 
     final int NUM_RESULTS = 2;
@@ -130,7 +134,7 @@ public class TestGenerator extends Abstr
    * @throws Exception
    */
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+  @Ignore("GORA-240 Tests for MemStore")
   public void testGenerateHostLimit() throws Exception {
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
 
@@ -183,7 +187,7 @@ public class TestGenerator extends Abstr
    * @throws Exception
    */
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+  @Ignore("GORA-240 Tests for MemStore")
   public void testGenerateDomainLimit() throws Exception {
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
 
@@ -240,7 +244,7 @@ public class TestGenerator extends Abstr
    * @throws IOException
    */
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+  @Ignore("GORA-240 Tests for MemStore")
   public void testFilter() throws IOException, Exception {
 
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();

Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Sun 
Sep 20 12:50:51 2015
@@ -51,7 +51,6 @@ public class TestInjector extends Abstra
   }
 
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
   public void testInject() throws Exception {
     ArrayList<String> urls = new ArrayList<String>();
     for (int i = 0; i < 100; i++) {

Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun 
Sep 20 12:50:51 2015
@@ -66,7 +66,6 @@ public class TestFetcher extends Abstrac
   }
 
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
   public void testFetch() throws Exception {
 
     // generate seedlist

Modified: 
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
 (original)
+++ 
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
 Sun Sep 20 12:50:51 2015
@@ -22,7 +22,9 @@ import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.Properties;
+
 import org.junit.Test;
+
 import static org.junit.Assert.*;
 
 /**
@@ -258,6 +260,7 @@ public class TestSpellCheckedMetadata {
    */
   @Test
   public final void testHandlingSpeed() {
+    @SuppressWarnings("unused")
     SpellCheckedMetadata result;
     long start = System.currentTimeMillis();
     for (int i = 0; i < NUM_ITERATIONS; i++) {

Modified: 
nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java 
Sun Sep 20 12:50:51 2015
@@ -65,7 +65,6 @@ public class TestGoraStorage extends Abs
    * @throws Exception
    */
   @Test
-  @Ignore("GORA-326 Removal of _g_dirty field from _ALL_FIELDS array and Field 
Enum in Persistent classes")
   public void testSinglethreaded() throws Exception {
     String id = "singlethread";
     readWrite(id, webPageStore);
@@ -113,7 +112,6 @@ public class TestGoraStorage extends Abs
    * @throws Exception
    */
   @Test
-  @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
   public void testMultithreaded() throws Exception {
     // create a fixed thread pool
     int numThreads = 8;
@@ -155,7 +153,6 @@ public class TestGoraStorage extends Abs
    * @throws Exception
    */
   @Test
-  @Ignore("GORA-326 Removal of _g_dirty field from _ALL_FIELDS array and Field 
Enum in Persistent classes")
   public void testMultiProcess() throws Exception {
     // create and start a hsql server, a stand-alone (memory backed) db
     // (important: a stand-alone server should be used because simple

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java 
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java Sun 
Sep 20 12:50:51 2015
@@ -31,9 +31,6 @@ import org.mortbay.jetty.Server;
 import org.mortbay.jetty.handler.DefaultHandler;
 import org.mortbay.jetty.handler.HandlerList;
 import org.mortbay.jetty.handler.ResourceHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import java.io.IOException;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
@@ -42,9 +39,6 @@ import java.util.List;
 
 public class CrawlTestUtil {
 
-  private static final Logger LOG = LoggerFactory
-      .getLogger(CrawlTestUtil.class);
-
   /**
    * For now we need to manually construct our Configuration, because we need 
to
    * override the default one and it is currently not possible to use


Reply via email to