This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/2.x by this push:
new 8eb2e0b Nutch 2.X GeneratorJob creates NullPointerException when using DataFileAvroStore
8eb2e0b is described below
commit 8eb2e0b521c4321621a1649cf16be8306d8f77d5
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Thu Dec 21 14:49:31 2017 +0000
Nutch 2.X GeneratorJob creates NullPointerException when using DataFileAvroStore
---
conf/gora.properties | 32 ----------------------
src/java/org/apache/nutch/crawl/GeneratorJob.java | 21 ++++++++------
.../org/apache/nutch/crawl/GeneratorMapper.java | 8 ++----
.../org/apache/nutch/crawl/GeneratorReducer.java | 2 +-
src/java/org/apache/nutch/util/TableUtil.java | 8 ++----
5 files changed, 19 insertions(+), 52 deletions(-)
diff --git a/conf/gora.properties b/conf/gora.properties
index d49ca77..644ec0f 100644
--- a/conf/gora.properties
+++ b/conf/gora.properties
@@ -19,31 +19,6 @@
#gora.datastore.default=org.apache.gora.mock.store.MockDataStore
#gora.datastore.autocreateschema=true
-###############################
-# Default SqlStore properties #
-###############################
-
-#gora.sqlstore.jdbc.driver=org.hsqldb.jdbc.JDBCDriver
-#gora.sqlstore.jdbc.url=jdbc:hsqldb:hsql://localhost/nutchtest
-#gora.sqlstore.jdbc.user=sa
-#gora.sqlstore.jdbc.password=
-
-################################
-# Default AvroStore properties #
-################################
-
-# gora.avrostore.codec.type=BINARY||JSON
-# gora.avrostore.output.path=file:///tmp/gora.avrostore.test.output
-
-################################
-# DatafileAvroStore properties #
-################################
-# DataFileAvroStore is file based store which uses Avro's
-# DataFile{Writer,Reader}'s as a backend. This datastore supports
-# mapreduce.
-
-# gora.datafileavrostore.###=
-
#########################
# HBaseStore properties #
#########################
@@ -68,13 +43,6 @@
# gora.cassandrastore.servers=localhost:9160
-#######################
-# MemStore properties #
-#######################
-# This is a memory based {@link DataStore} implementation for tests.
-
-# gora.memstore.###=
-
############################
# AccumuloStore properties #
############################
diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index c4058c1..aa6fec7 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java
@@ -66,7 +66,7 @@ public class GeneratorJob extends NutchTool implements Tool {
public static final String BATCH_ID = "generate.batch.id";
public static final String GENERATE_COUNT = "generate.count";
- private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+ private static final Set<WebPage.Field> FIELDS = new HashSet<>();
static {
FIELDS.add(WebPage.Field.FETCH_TIME);
@@ -85,6 +85,7 @@ public class GeneratorJob extends NutchTool implements Tool {
float score;
public SelectorEntry() {
+ //default constructor
}
public SelectorEntry(String url, float score) {
@@ -92,11 +93,13 @@ public class GeneratorJob extends NutchTool implements Tool {
this.score = score;
}
+ @Override
public void readFields(DataInput in) throws IOException {
url = Text.readString(in);
score = in.readFloat();
}
+ @Override
public void write(DataOutput out) throws IOException {
Text.writeString(out, url);
out.writeFloat(score);
@@ -161,7 +164,7 @@ public class GeneratorJob extends NutchTool implements Tool {
}
public GeneratorJob() {
-
+ //default constructor
}
public GeneratorJob(Configuration conf) {
@@ -169,7 +172,7 @@ public class GeneratorJob extends NutchTool implements Tool {
}
public Collection<WebPage.Field> getFields(Job job) {
- Collection<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
+ Collection<WebPage.Field> fields = new HashSet<>(FIELDS);
fields.addAll(FetchScheduleFactory.getFetchSchedule(job.getConfiguration())
.getFields());
return fields;
@@ -183,8 +186,7 @@ public class GeneratorJob extends NutchTool implements Tool {
public static String randomBatchId() {
long curTime = System.currentTimeMillis();
int randomSeed = Math.abs(new Random().nextInt());
- String batchId = (curTime / 1000) + "-" + randomSeed;
- return batchId;
+ return (curTime / 1000) + "-" + randomSeed;
}
/**
@@ -194,6 +196,7 @@ public class GeneratorJob extends NutchTool implements Tool {
* @return results
* @throws Exception
*/
+ @Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
String batchId = (String) args.get(Nutch.ARG_BATCH);
if (batchId == null) {
@@ -342,8 +345,10 @@ public class GeneratorJob extends NutchTool implements Tool {
return -1;
}
- long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
- boolean filter = true, norm = true;
+ long curTime = System.currentTimeMillis();
+ long topN = Long.MAX_VALUE;
+ boolean filter = true;
+ boolean norm = true;
boolean sitemap = false;
for (int i = 0; i < args.length; i++) {
@@ -376,7 +381,7 @@ public class GeneratorJob extends NutchTool implements Tool {
}
}
- public static void main(String args[]) throws Exception {
+ public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new GeneratorJob(),
args);
System.exit(res);
diff --git a/src/java/org/apache/nutch/crawl/GeneratorMapper.java b/src/java/org/apache/nutch/crawl/GeneratorMapper.java
index d07b0b5..cc02c91 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorMapper.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorMapper.java
@@ -78,13 +78,9 @@ GoraMapper<String, WebPage, SelectorEntry, WebPage> {
if ((sitemap && !URLFilters.isSitemap(page)) || !sitemap && URLFilters
.isSitemap(page))
return;
- } catch (URLFilterException e) {
+ } catch (URLFilterException | MalformedURLException e) {
GeneratorJob.LOG
- .warn("Couldn't filter url: {} ({})", url, e.getMessage());
- return;
- } catch (MalformedURLException e) {
- GeneratorJob.LOG
- .warn("Couldn't filter url: {} ({})", url, e.getMessage());
+ .warn("Couldn't filter url: {} ({})", url, e);
return;
}
diff --git a/src/java/org/apache/nutch/crawl/GeneratorReducer.java b/src/java/org/apache/nutch/crawl/GeneratorReducer.java
index ed4d1e5..852adaf 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorReducer.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorReducer.java
@@ -45,7 +45,7 @@ public class GeneratorReducer extends
private long maxCount;
protected static long count = 0;
private boolean byDomain = false;
- private Map<String, Integer> hostCountMap = new HashMap<String, Integer>();
+ private Map<String, Integer> hostCountMap = new HashMap<>();
private Utf8 batchId;
@Override
diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java
index e6ccbbc..21718eb 100644
--- a/src/java/org/apache/nutch/util/TableUtil.java
+++ b/src/java/org/apache/nutch/util/TableUtil.java
@@ -92,10 +92,8 @@ public class TableUtil {
pathBegin = reversedUrl.length();
String sub = reversedUrl.substring(0, pathBegin);
- String[] splits = StringUtils.splitPreserveAllTokens(sub, ':'); // {<reversed
- // host>,
- // <port>,
- // <protocol>}
+ // {<reversed host>, <port>, <protocol>}
+ String[] splits = StringUtils.splitPreserveAllTokens(sub, ':');
buf.append(splits[1]); // add protocol
buf.append("://");
@@ -155,7 +153,7 @@ public class TableUtil {
* @return string-ifed Utf8 object or null if Utf8 instance is null
*/
public static String toString(CharSequence utf8) {
- return (utf8 == null ? null : StringUtil.cleanField(utf8.toString()));
+ return utf8 == null ? null : StringUtil.cleanField(utf8.toString());
}
}
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].