Author: lewismc
Date: Sun Sep 20 12:50:51 2015
New Revision: 1704128
URL: http://svn.apache.org/viewvc?rev=1704128&view=rev
Log:
NUTCH-1946 Upgrade to Gora 0.6.1
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/ivy/ivy.xml
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
nutch/branches/2.x/src/test/crawl-tests.xml
nutch/branches/2.x/src/test/gora.properties
nutch/branches/2.x/src/test/nutch-site.xml
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sun Sep 20 12:50:51 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.4-SNAPSHOT
+* NUTCH-1946 Upgrade to Gora 0.6.1 (lewismc, hsaputra, Jeroen Vlek)
+
* NUTCH-2094 Stopping and Restarting a crawl has issues in the Web UI (Prerna
Satija via mattmann)
* NUTCH-1679 UpdateDb using batchId, link may override crawled page (Tien
Nguyen Manh, Koen Smets, Alfonso Nishikawa, Alexander Kingson via lewismc)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Sun Sep 20 12:50:51 2015
@@ -418,13 +418,13 @@
<property>
<name>db.fetch.schedule.adaptive.min_interval</name>
- <value>60.0</value>
+ <value>60</value>
<description>Minimum fetchInterval, in seconds.</description>
</property>
<property>
<name>db.fetch.schedule.adaptive.max_interval</name>
- <value>31536000.0</value>
+ <value>31536000</value>
<description>Maximum fetchInterval, in seconds (365 days).
NOTE: this is limited by db.fetch.interval.max. Pages with
fetchInterval larger than db.fetch.interval.max
@@ -890,7 +890,7 @@
<property>
<name>plugin.folders</name>
- <value>plugins</value>
+ <value>/usr/local/2scot/build/plugins</value>
<description>Directories where nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
@@ -1198,6 +1198,16 @@
</description>
</property>
+<property>
+ <name>io.serializations</name>
+
<value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+ <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+ <description>A list of serialization classes that can be used for
+ obtaining serializers and deserializers.</description>
+</property>
+
<!-- solr index properties -->
<property>
@@ -1323,6 +1333,12 @@
org.apache.gora.memory.store.MemStore
Gora class for storing data in a Memory based implementation for tests.
+
+ org.apache.gora.mongodb.store.MongoStore
+ Gora class for storing data in MongoDB.
+
+ org.apache.gora.solr.store.SolrStore
+ Gora class for storing data in Apache Solr.
</description>
</property>
Modified: nutch/branches/2.x/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Sun Sep 20 12:50:51 2015
@@ -46,13 +46,19 @@
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-core"
- rev="1.2.0" conf="*->default">
+ <!-- Hadoop Dependencies -->
+ <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.5.2"
conf="*->default">
+ <exclude org="hsqldb" name="hsqldb" />
<exclude org="net.sf.kosmosfs" name="kfs" />
<exclude org="net.java.dev.jets3t" name="jets3t" />
<exclude org="org.eclipse.jdt" name="core" />
<exclude org="org.mortbay.jetty" name="jsp-*" />
+ <exclude org="ant" name="ant" />
</dependency>
+ <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.5.2"
conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core"
rev="2.5.2" conf="*->default"/>
+ <dependency org="org.apache.hadoop"
name="hadoop-mapreduce-client-jobclient" rev="2.5.2" conf="*->default"/>
+ <!-- End of Hadoop Dependencies -->
<dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
<dependency org="org.apache.tika" name="tika-core" rev="1.10" />
@@ -70,31 +76,21 @@
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons" name="crawler-commons"
rev="0.5" />
- <dependency org="org.restlet.jse" name="org.restlet" rev="2.2.3"
conf="*->default" />
- <dependency org="org.restlet.jse" name="org.restlet.ext.jackson"
rev="2.2.3"
- conf="*->default" />
- <dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3"
- conf="*->default" />
- <!--Configuration: test -->
+ <dependency org="org.restlet.jse" name="org.restlet" rev="2.2.3"
conf="*->default" />
+ <dependency org="org.restlet.jse" name="org.restlet.ext.jackson"
rev="2.2.3" conf="*->default" />
+ <dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3"
conf="*->default" />
<!--artifacts needed for testing -->
<dependency org="junit" name="junit" rev="4.11" conf="*->default" />
- <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0"
conf="test->default">
- <exclude org="net.sf.kosmosfs" name="kfs" />
- <exclude org="net.java.dev.jets3t" name="jets3t" />
- <exclude org="org.eclipse.jdt" name="core" />
- <exclude org="org.mortbay.jetty" name="jsp-*" />
- </dependency>
-
<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26"
conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.26"
conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.26" />
<dependency org="org.hsqldb" name="hsqldb" rev="2.2.8" conf="*->default" />
<dependency org="org.jdom" name="jdom" rev="1.1" conf="test->default"/>
- <dependency org="org.mockito" name="mockito-all" rev="1.9.5"
conf="test->default"/>
- <dependency org="org.springframework" name="spring-test"
rev="4.0.4.RELEASE" conf="test->default"/>
+ <dependency org="org.mockito" name="mockito-all" rev="1.9.5"
conf="test->default"/>
+ <dependency org="org.springframework" name="spring-test"
rev="4.0.4.RELEASE" conf="test->default"/>
<!--================-->
@@ -103,11 +99,11 @@
<!-- N.B. To use Gora SNAPSHOTs, merely replace the 'rev' value with the
SNAPSHOT version
and add changing="true" alongside the dependency declaration. An example
has been
provided for the gora-core dependency as below -->
- <dependency org="org.apache.gora" name="gora-core" rev="0.5"
conf="*->default"/>
+ <dependency org="org.apache.gora" name="gora-core" rev="0.6.1"
conf="*->default"/>
<!-- Uncomment this to use SQL as Gora backend. It should be noted that
the
gora-sql 0.1.1-incubating artifact is NOT compatible with gora-core 0.3.
Users should
- downgrade to gora-core 0.2.1 in order to use SQL as a backend. -->
+ downgrade to gora-core 0.2.1 in order to use SQL as a backend; however, this
is not recommended. -->
<!--
<dependency org="org.apache.gora" name="gora-sql" rev="0.1.1-incubating"
conf="*->default" />
-->
@@ -117,29 +113,29 @@
-->
<!-- Uncomment this to use HBase as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-hbase" rev="0.5"
conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-hbase" rev="0.6.1"
conf="*->default" />
-->
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-accumulo" rev="0.5"
conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-accumulo" rev="0.6.1"
conf="*->default" />
-->
<!-- Uncomment this to use Cassandra as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-cassandra" rev="0.5"
conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-cassandra" rev="0.6.1"
conf="*->default" />
-->
<!-- Uncomment this to use MongoDB as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-mongodb" rev="0.5"
conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-mongodb" rev="0.6.1"
conf="*->default" />
-->
<!-- Uncomment this to use Solr as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-solr" rev="0.5"
conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-solr" rev="0.6.1"
conf="*->default" />
-->
<!-- The gora-compiler is used within the 'ant generate-gora-src' target
to compile
the Gora .avsc files within ./src/gora
-->
- <dependency org="org.apache.gora" name="gora-compiler-cli" rev="0.5"
conf="*->default"/>
- <dependency org="org.apache.gora" name="gora-compiler" rev="0.5"
conf="*->default"/>
+ <dependency org="org.apache.gora" name="gora-compiler-cli" rev="0.6.1"
conf="*->default"/>
+ <dependency org="org.apache.gora" name="gora-compiler" rev="0.6.1"
conf="*->default"/>
<!-- web app dependencies -->
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Sun
Sep 20 12:50:51 2015
@@ -25,9 +25,7 @@ import java.util.Map.Entry;
import org.apache.avro.util.Utf8;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.Mark;
-import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
-import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.scoring.ScoreDatum;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -45,6 +43,7 @@ public class DbUpdateMapper extends
private final List<ScoreDatum> scoreData = new ArrayList<ScoreDatum>();
+ @SuppressWarnings("unused")
private Utf8 batchId;
// reuse writables
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Sun
Sep 20 12:50:51 2015
@@ -24,7 +24,6 @@ import java.util.Map;
import org.apache.avro.util.Utf8;
import org.apache.gora.filter.FilterOp;
import org.apache.gora.filter.MapFieldValueFilter;
-import org.apache.gora.filter.SingleFieldValueFilter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -92,7 +91,7 @@ public class DbUpdaterJob extends NutchT
HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
fields.addAll(scoringFilters.getFields());
- currentJob = new NutchJob(getConf(), "update-table");
+ currentJob = NutchJob.getInstance(getConf(), "update-table");
if (crawlId != null) {
currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Sun
Sep 20 12:50:51 2015
@@ -167,6 +167,12 @@ public class GeneratorJob extends NutchT
String batchId = (String) args.get(Nutch.ARG_BATCH);
if (batchId != null) {
getConf().set(GeneratorJob.BATCH_ID, batchId);
+ } else {
+ // generate batchId
+ long curTime = System.currentTimeMillis();
+ int randomSeed = Math.abs(new Random().nextInt());
+ batchId = (curTime / 1000) + "-" + randomSeed;
+ getConf().set(BATCH_ID, batchId);
}
// map to inverted subset due for fetch, sort by score
@@ -209,7 +215,7 @@ public class GeneratorJob extends NutchT
}
numJobs = 1;
currentJobNum = 0;
- currentJob = new NutchJob(getConf(), "generate: " +
getConf().get(BATCH_ID));
+ currentJob = NutchJob.getInstance(getConf(), "generate: " +
getConf().get(BATCH_ID));
Collection<WebPage.Field> fields = getFields(currentJob);
StorageUtils.initMapperJob(currentJob, fields, SelectorEntry.class,
WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class,
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java Sun
Sep 20 12:50:51 2015
@@ -16,7 +16,6 @@
******************************************************************************/
package org.apache.nutch.crawl;
-import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -31,8 +30,6 @@ import org.apache.nutch.util.TableUtil;
import java.io.IOException;
import java.net.MalformedURLException;
-import java.nio.ByteBuffer;
-import java.util.HashMap;
public class GeneratorMapper extends
GoraMapper<String, WebPage, SelectorEntry, WebPage> {
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
Sun Sep 20 12:50:51 2015
@@ -23,9 +23,6 @@ import java.util.Map;
import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraReducer;
-import org.apache.gora.query.Query;
-import org.apache.gora.query.Result;
-import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
import org.apache.nutch.fetcher.FetcherJob.FetcherMapper;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java Sun Sep
20 12:50:51 2015
@@ -208,7 +208,7 @@ public class InjectorJob extends NutchTo
}
numJobs = 1;
currentJobNum = 0;
- currentJob = new NutchJob(getConf(), "inject " + input);
+ currentJob = NutchJob.getInstance(getConf(), "inject " + input);
FileInputFormat.addInputPath(currentJob, input);
currentJob.setMapperClass(UrlMapper.class);
currentJob.setMapOutputKeyClass(String.class);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Sun
Sep 20 12:50:51 2015
@@ -314,7 +314,7 @@ public class WebTableReader extends Nutc
}
Path outFolder = new Path(output);
- Job job = new NutchJob(getConf(), "db_dump");
+ Job job = NutchJob.getInstance(getConf(), "db_dump");
Configuration cfg = job.getConfiguration();
cfg.set(WebTableRegexMapper.regexParamName, regex);
cfg.setBoolean(WebTableRegexMapper.contentParamName, content);
@@ -339,6 +339,7 @@ public class WebTableReader extends Nutc
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
+ @SuppressWarnings("unused")
boolean success = job.waitForCompletion(true);
if (LOG.isInfoEnabled()) {
@@ -540,7 +541,7 @@ public class WebTableReader extends Nutc
+ "stat_tmp" + System.currentTimeMillis());
numJobs = 1;
- currentJob = new NutchJob(getConf(), "db_stats");
+ currentJob = NutchJob.getInstance(getConf(), "db_stats");
currentJob.getConfiguration().setBoolean(
"mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Sun
Sep 20 12:50:51 2015
@@ -18,13 +18,10 @@ package org.apache.nutch.fetcher;
import java.io.IOException;
import java.text.SimpleDateFormat;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
-import java.util.StringTokenizer;
-
import org.apache.avro.util.Utf8;
import org.apache.gora.filter.FilterOp;
import org.apache.gora.filter.MapFieldValueFilter;
@@ -188,7 +185,7 @@ public class FetcherJob extends NutchToo
LOG.info("FetcherJob : timelimit set for : "
+ getConf().getLong("fetcher.timelimit", -1));
numJobs = 1;
- currentJob = new NutchJob(getConf(), "fetch");
+ currentJob = NutchJob.getInstance(getConf(), "fetch");
// for politeness, don't permit parallel execution of a single task
currentJob.setReduceSpeculativeExecution(false);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java Sun
Sep 20 12:50:51 2015
@@ -94,7 +94,7 @@ public class HostDbUpdateJob implements
FIELDS.add(WebPage.Field.OUTLINKS);
}
- NutchJob job = new NutchJob(getConf(), "hostdb-update");
+ NutchJob job = NutchJob.getInstance(getConf(), "hostdb-update");
// === Map ===
DataStore<String, WebPage> pageStore = StorageUtils.createWebStore(
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java Sun
Sep 20 12:50:51 2015
@@ -140,7 +140,7 @@ public class HostInjectorJob implements
public boolean inject(Path hostDir) throws Exception {
LOG.info("HostInjectorJob: starting");
LOG.info("HostInjectorJob: hostDir: " + hostDir);
- Job job = new NutchJob(getConf(), "inject-hosts " + hostDir);
+ Job job = NutchJob.getInstance(getConf(), "inject-hosts " + hostDir);
FileInputFormat.addInputPath(job, hostDir);
job.setMapperClass(UrlMapper.class);
job.setMapOutputKeyClass(String.class);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java Sun
Sep 20 12:50:51 2015
@@ -99,7 +99,6 @@ public class CleaningJob extends NutchTo
public static class CleanReducer extends
Reducer<String, WebPage, NullWritable, NullWritable> {
private int numDeletes = 0;
- private static final int NUM_MAX_DELETE_REQUEST = 1000;
private boolean commit;
IndexWriters writers = null;
@@ -135,7 +134,7 @@ public class CleaningJob extends NutchTo
@Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
- currentJob = new NutchJob(getConf(), "CleaningJob");
+ currentJob = NutchJob.getInstance(getConf(), "CleaningJob");
currentJob.getConfiguration().setClass(
"mapred.output.key.comparator.class", StringComparator.class,
RawComparator.class);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java Sun
Sep 20 12:50:51 2015
@@ -139,7 +139,7 @@ public class IndexingJob extends NutchTo
Configuration conf = getConf();
conf.set(GeneratorJob.BATCH_ID, batchId);
- Job job = new NutchJob(conf, "Indexer");
+ Job job = NutchJob.getInstance(conf, "Indexer");
// TODO: Figure out why this needs to be here
job.getConfiguration().setClass("mapred.output.key.comparator.class",
StringComparator.class, RawComparator.class);
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
Sun Sep 20 12:50:51 2015
@@ -42,6 +42,7 @@ import org.apache.hadoop.mapreduce.lib.o
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
@@ -370,7 +371,7 @@ public class SolrDeleteDuplicates
getConf().set(SolrConstants.SERVER_URL, solrUrl);
- Job job = new Job(getConf(), "solrdedup");
+ Job job = NutchJob.getInstance(getConf(), "solrdedup");
job.setInputFormatClass(SolrInputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class);
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
Sun Sep 20 12:50:51 2015
@@ -142,113 +142,4 @@ public class OutlinkExtractor {
return retval;
}
- /**
- * Extracts outlinks from a plain text. <br />
- * This Method takes the Jakarta Regexp API.
- *
- * @param plainText
- *
- * @return Array of <code>Outlink</code> s within found in plainText
- * @deprecated only for tests
- */
- @Deprecated
- private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
-
- throw new UnsupportedOperationException(
- "Implementation commented out. Please uncomment to use it.");
-
- // final List outlinks = new ArrayList();
- // String url;
- // Outlink link;
- //
- // RE re = new RE(URL_PATTERN);
- //
- // int pos = 0;
- //
- // while (re.match(plainText, pos)) {
- //
- // url = re.getParen(0);
- //
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("Extracted url: " + url);
- // }
- //
- // try {
- //
- // link = new Outlink(url, null);
- // outlinks.add(link);
- //
- // } catch (MalformedURLException ex) {
- // // if it is a malformed URL we just throw it away and continue with
- // // extraction.
- // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
- // }
- //
- // pos = re.getParenEnd(0);
- // }
- //
- // final Outlink[] retval;
- //
- // if (pos > 0) {
- // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
- // } else {
- // retval = new Outlink[0];
- // }
- //
- // return retval;
-
- }
-
- /**
- * Extracts outlinks from a plain text. </p> This Method takes the JDK5
Regexp
- * API.
- *
- * @param plainText
- *
- * @return Array of <code>Outlink</code> s within found in plainText
- * @deprecated only for tests
- */
- @Deprecated
- private Outlink[] getOutlinksJDK5Impl(final String plainText) {
-
- throw new UnsupportedOperationException(
- "Implementation commented out. Please uncomment to use it.");
-
- // final List outlinks = new ArrayList();
- // String url;
- // Outlink link;
- //
- // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
- // final RE re = new RE(urlPattern);
- //
- // int pos = 0;
- //
- // while (re.match(plainText, pos)) {
- //
- // url = re.getParen(0);
- //
- // try {
- //
- // link = new Outlink(url, null);
- // outlinks.add(link);
- // } catch (MalformedURLException ex) {
- // // if it is a malformed URL we just throw it away and continue with
- // // extraction.
- // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
- // }
- //
- // pos = re.getParenEnd(0);
- // }
- //
- // final Outlink[] retval;
- //
- // if (pos > 0) {
- // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
- // } else {
- // retval = new Outlink[0];
- // }
- //
- // return retval;
- }
-
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
Sun Sep 20 12:50:51 2015
@@ -16,8 +16,6 @@
******************************************************************************/
package org.apache.nutch.parse;
-import java.util.HashMap;
-
public interface ParseStatusCodes {
// Primary status codes:
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
Sun Sep 20 12:50:51 2015
@@ -16,7 +16,6 @@
******************************************************************************/
package org.apache.nutch.parse;
-import org.apache.avro.generic.GenericArray;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.storage.ParseStatus;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Sun Sep
20 12:50:51 2015
@@ -33,7 +33,6 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.GeneratorJob;
import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.crawl.URLWebPage;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.Mark;
@@ -49,7 +48,6 @@ import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.ToolUtil;
import org.apache.gora.filter.FilterOp;
-import org.apache.gora.filter.SingleFieldValueFilter;
import org.apache.gora.mapreduce.GoraMapper;
public class ParserJob extends NutchTool implements Tool {
@@ -250,7 +248,7 @@ public class ParserJob extends NutchTool
} else {
LOG.info("ParserJob: batchId:\t" + batchId);
}
- currentJob = new NutchJob(getConf(), "parse");
+ currentJob = NutchJob.getInstance(getConf(), "parse");
Collection<WebPage.Field> fields = getFields(currentJob);
MapFieldValueFilter<String, WebPage> batchIdFilter =
getBatchIdFilter(batchId);
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
Sun Sep 20 12:50:51 2015
@@ -16,7 +16,6 @@
******************************************************************************/
package org.apache.nutch.protocol;
-import org.apache.avro.generic.GenericArray;
import org.apache.avro.util.Utf8;
import org.apache.nutch.storage.ProtocolStatus;
import org.apache.nutch.util.TableUtil;
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
Sun Sep 20 12:50:51 2015
@@ -22,7 +22,6 @@ import java.io.File;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.net.URL;
-import java.util.ArrayList;
import java.util.Hashtable;
import java.util.StringTokenizer;
@@ -33,8 +32,6 @@ import org.slf4j.LoggerFactory;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.Text;
-
import com.google.common.io.Files;
import crawlercommons.robots.BaseRobotRules;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java Sun
Sep 20 12:50:51 2015
@@ -91,7 +91,7 @@ public class StorageUtils {
public static <K, V extends Persistent> Class<? extends DataStore<K, V>>
getDataStoreClass(
Configuration conf) throws ClassNotFoundException {
return (Class<? extends DataStore<K, V>>) Class.forName(conf.get(
- "storage.data.store.class", "org.apache.gora.sql.store.SqlStore"));
+ "storage.data.store.class", "org.apache.gora.memory.store.MemStore"));
}
public static <K, V> void initMapperJob(Job job,
Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java Sun Sep
20 12:50:51 2015
@@ -186,7 +186,7 @@ public class Benchmark extends Configure
conf.setInt(GeneratorJob.GENERATOR_MAX_COUNT, maxPerHost);
conf.set(GeneratorJob.GENERATOR_COUNT_MODE,
GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
- Job job = new NutchJob(conf);
+ Job job = NutchJob.getInstance(conf, "Benchmark");
FileSystem fs = FileSystem.get(job.getConfiguration());
Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-"
+ System.currentTimeMillis());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java Sun Sep
20 12:50:51 2015
@@ -100,8 +100,7 @@ public class DmozParser {
*/
private class RDFProcessor extends DefaultHandler {
String curURL = null, curSection = null;
- boolean titlePending = false, descPending = false,
- insideAdultSection = false;
+ boolean titlePending = false, descPending = false;
Pattern topicPattern = null;
StringBuffer title = new StringBuffer(), desc = new StringBuffer();
XMLReader reader;
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
Sun Sep 20 12:50:51 2015
@@ -147,7 +147,6 @@ public class FakeHandler extends Abstrac
baseDomain = u.getHost();
// chop off the TLD
int pos = baseDomain.lastIndexOf('.');
- String tld = baseDomain.substring(pos);
baseDomain = baseDomain.substring(0, pos);
String link;
// external links
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java Sun Sep
20 12:50:51 2015
@@ -43,7 +43,7 @@ public class HadoopFSUtil {
return new PathFilter() {
public boolean accept(final Path path) {
try {
- return fs.getFileStatus(path).isDir();
+ return fs.getFileStatus(path).isDirectory();
} catch (IOException ioe) {
return false;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java Sun Sep 20
12:50:51 2015
@@ -48,7 +48,7 @@ public class LockUtil {
if (fs.exists(lockFile)) {
if (!accept)
throw new IOException("lock file " + lockFile + " already exists.");
- if (fs.getFileStatus(lockFile).isDir())
+ if (fs.getFileStatus(lockFile).isDirectory())
throw new IOException("lock file " + lockFile
+ " already exists and is a directory.");
// do nothing - the file already exists.
@@ -76,7 +76,7 @@ public class LockUtil {
throws IOException {
if (!fs.exists(lockFile))
return false;
- if (fs.getFileStatus(lockFile).isDir())
+ if (fs.getFileStatus(lockFile).isDirectory())
throw new IOException("lock file " + lockFile
+ " exists but is a directory!");
return fs.delete(lockFile, false);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java Sun Sep 20
12:50:51 2015
@@ -28,11 +28,26 @@ import java.io.IOException;
/** A {@link Job} for Nutch jobs. */
public class NutchJob extends Job {
+ /**
+ *
+ * @param conf
+ * @throws IOException
+ * @deprecated use {@link NutchJob#getInstance(Configuration)}
+ */
+ @Deprecated
public NutchJob(Configuration conf) throws IOException {
super(conf);
- setJarByClass(this.getClass());
+ super.setJarByClass(this.getClass());
}
+ /**
+ *
+ * @param conf
+ * @param jobName
+ * @throws IOException
+ * @deprecated use {@link NutchJob#getInstance(Configuration, String)}
+ */
+ @Deprecated
public NutchJob(Configuration conf, String jobName) throws IOException {
super(conf, jobName);
// prefix jobName with crawlId if not empty
@@ -41,7 +56,57 @@ public class NutchJob extends Job {
jobName = "[" + crawlId + "]" + jobName;
setJobName(jobName);
}
- setJarByClass(this.getClass());
+ super.setJarByClass(this.getClass());
+ }
+
+ /**
+ * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster} and a
+ * given {@link org.apache.hadoop.conf.Configuration}.
+ *
+ * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so
+ * that any necessary internal modifications do not reflect on the incoming
+ * parameter.
+ *
+ * A Cluster will be created from the conf parameter only when it's needed.
+ *
+ * This code heavily mimics that of Hadoop's.
+ *
+ * @param conf the configuration
+ * @return the {@link NutchJob} , with no connection to a cluster yet.
+ * @throws IOException
+ */
+ public static NutchJob getInstance(Configuration conf) throws IOException {
+ // create with a null Cluster
+ NutchJobConf jobConf = new NutchJobConf(conf);
+ return new NutchJob(jobConf);
+ }
+
+
+ /**
+ * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster}
+ * and a given jobName.
+ * A Cluster will be created from the conf parameter only when it's needed.
+ *
+ * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so
+ * that any necessary internal modifications do not reflect on the incoming
+ * parameter.
+ *
+ * @param conf the configuration
+ * @param jobName the name given to this NutchJob
+ * @return the {@link NutchJob} , with no connection to a cluster yet.
+ * @throws IOException
+ */
+ public static NutchJob getInstance(Configuration conf, String jobName)
+ throws IOException {
+ // create with a null Cluster
+ NutchJob result = getInstance(conf);
+ // prefix jobName with crawlId if not empty
+ String crawlId = conf.get("storage.crawl.id");
+ if (!StringUtils.isEmpty(crawlId)) {
+ jobName = "[" + crawlId + "]" + jobName;
+ result.setJobName(jobName);
+ }
+ return result;
}
@Override
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java Sun Sep 20
12:50:51 2015
@@ -16,7 +16,6 @@
******************************************************************************/
package org.apache.nutch.util;
-import org.apache.avro.util.Utf8;
import org.apache.commons.lang.StringUtils;
import java.net.MalformedURLException;
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
Sun Sep 20 12:50:51 2015
@@ -101,7 +101,7 @@ public class DomainStatistics extends Co
long start = System.currentTimeMillis();
LOG.info("DomainStatistics: starting at " + sdf.format(start));
- Job job = new NutchJob(getConf(), "Domain statistics");
+ Job job = NutchJob.getInstance(getConf(), "Domain statistics");
DataStore<String, WebPage> store = StorageUtils.createWebStore(
job.getConfiguration(), String.class, WebPage.class);
Modified: nutch/branches/2.x/src/test/crawl-tests.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/crawl-tests.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/crawl-tests.xml (original)
+++ nutch/branches/2.x/src/test/crawl-tests.xml Sun Sep 20 12:50:51 2015
@@ -38,14 +38,24 @@
<value>true</value>
</property>
-<property>
- <name>http.robots.agents</name>
- <value>test-nutch,*</value>
- <description>The agent strings we'll look for in robots.txt files,
- comma-separated, in decreasing order of precedence. You should
- put the value of http.agent.name as the first agent name, and keep the
- default * at the end of the list. E.g.: BlurflDev,Blurfl,*
- </description>
+<property>
+ <name>http.robots.agents</name>
+ <value>test-nutch,*</value>
+ <description>The agent strings we'll look for in robots.txt files,
+ comma-separated, in decreasing order of precedence. You should
+ put the value of http.agent.name as the first agent name, and keep the
+ default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+ </description>
+</property>
+
+<property>
+ <name>io.serializations</name>
+ <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+ <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+ <description>A list of serialization classes that can be used for
+ obtaining serializers and deserializers.</description>
</property>
</configuration>
Modified: nutch/branches/2.x/src/test/gora.properties
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/gora.properties?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/gora.properties (original)
+++ nutch/branches/2.x/src/test/gora.properties Sun Sep 20 12:50:51 2015
@@ -15,11 +15,8 @@
###############################
-# Default SqlStore properties #
+# Default MemStore properties #
###############################
-gora.sqlstore.jdbc.driver=org.hsqldb.jdbc.JDBCDriver
-gora.sqlstore.jdbc.url=jdbc:hsqldb:mem:test
-gora.sqlstore.jdbc.user=sa
-gora.sqlstore.jdbc.password=
+gora.datastore.default=org.apache.gora.memory.store.MemStore
Modified: nutch/branches/2.x/src/test/nutch-site.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/nutch-site.xml?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/nutch-site.xml (original)
+++ nutch/branches/2.x/src/test/nutch-site.xml Sun Sep 20 12:50:51 2015
@@ -18,8 +18,8 @@
<property>
<name>storage.data.store.class</name>
- <value>org.apache.gora.sql.store.SqlStore</value>
- <description>Default class for storing data</description>
+ <value>org.apache.gora.memory.store.MemStore</value>
+ <description>Default in-memory datastore class for temp test data.</description>
</property>
</configuration>
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Sun
Sep 20 12:50:51 2015
@@ -37,9 +37,13 @@ import java.util.Comparator;
import static org.junit.Assert.assertEquals;
/**
- * Basic generator test. 1. Insert entries in webtable 2. Generates entries to
- * fetch 3. Verifies that number of generated urls match 4. Verifies that
- * highest scoring urls are generated
+ * Basic generator test.
+ * <ol>
+ * <li>Insert entries in webtable</li>
+ * <li>Generates entries to fetch</li>
+ * <li>Verifies that number of generated urls match, and finally </li>
+ * <li>Verifies that highest scoring urls are generated.</li>
+ * <ol>
*
*/
public class TestGenerator extends AbstractNutchTest {
@@ -67,7 +71,7 @@ public class TestGenerator extends Abstr
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+ @Ignore("GORA-240 Tests for MemStore")
public void testGenerateHighest() throws Exception {
final int NUM_RESULTS = 2;
@@ -130,7 +134,7 @@ public class TestGenerator extends Abstr
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+ @Ignore("GORA-240 Tests for MemStore")
public void testGenerateHostLimit() throws Exception {
ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
@@ -183,7 +187,7 @@ public class TestGenerator extends Abstr
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+ @Ignore("GORA-240 Tests for MemStore")
public void testGenerateDomainLimit() throws Exception {
ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
@@ -240,7 +244,7 @@ public class TestGenerator extends Abstr
* @throws IOException
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
+ @Ignore("GORA-240 Tests for MemStore")
public void testFilter() throws IOException, Exception {
ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Sun
Sep 20 12:50:51 2015
@@ -51,7 +51,6 @@ public class TestInjector extends Abstra
}
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testInject() throws Exception {
ArrayList<String> urls = new ArrayList<String>();
for (int i = 0; i < 100; i++) {
Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun
Sep 20 12:50:51 2015
@@ -66,7 +66,6 @@ public class TestFetcher extends Abstrac
}
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testFetch() throws Exception {
// generate seedlist
Modified:
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
---
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
(original)
+++
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
Sun Sep 20 12:50:51 2015
@@ -22,7 +22,9 @@ import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Properties;
+
import org.junit.Test;
+
import static org.junit.Assert.*;
/**
@@ -258,6 +260,7 @@ public class TestSpellCheckedMetadata {
*/
@Test
public final void testHandlingSpeed() {
+ @SuppressWarnings("unused")
SpellCheckedMetadata result;
long start = System.currentTimeMillis();
for (int i = 0; i < NUM_ITERATIONS; i++) {
Modified:
nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
Sun Sep 20 12:50:51 2015
@@ -65,7 +65,6 @@ public class TestGoraStorage extends Abs
* @throws Exception
*/
@Test
- @Ignore("GORA-326 Removal of _g_dirty field from _ALL_FIELDS array and Field Enum in Persistent classes")
public void testSinglethreaded() throws Exception {
String id = "singlethread";
readWrite(id, webPageStore);
@@ -113,7 +112,6 @@ public class TestGoraStorage extends Abs
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testMultithreaded() throws Exception {
// create a fixed thread pool
int numThreads = 8;
@@ -155,7 +153,6 @@ public class TestGoraStorage extends Abs
* @throws Exception
*/
@Test
- @Ignore("GORA-326 Removal of _g_dirty field from _ALL_FIELDS array and Field Enum in Persistent classes")
public void testMultiProcess() throws Exception {
// create and start a hsql server, a stand-alone (memory backed) db
// (important: a stand-alone server should be used because simple
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java?rev=1704128&r1=1704127&r2=1704128&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java Sun
Sep 20 12:50:51 2015
@@ -31,9 +31,6 @@ import org.mortbay.jetty.Server;
import org.mortbay.jetty.handler.DefaultHandler;
import org.mortbay.jetty.handler.HandlerList;
import org.mortbay.jetty.handler.ResourceHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
@@ -42,9 +39,6 @@ import java.util.List;
public class CrawlTestUtil {
- private static final Logger LOG = LoggerFactory
- .getLogger(CrawlTestUtil.class);
-
/**
* For now we need to manually construct our Configuration, because we need to
* override the default one and it is currently not possible to use