[ 
https://issues.apache.org/jira/browse/NUTCH-2597?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16519437#comment-16519437
 ] 

ASF GitHub Bot commented on NUTCH-2597:
---------------------------------------

sebastian-nagel closed pull request #349: NUTCH-2597: fixed cleanup()
URL: https://github.com/apache/nutch/pull/349
 
 
   

This pull request was merged from a forked repository. Because GitHub
hides the original diff of a fork's pull request once it is merged, the
diff is reproduced below for the sake of provenance:

diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 73dd1b402..7b8b13aa6 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -192,14 +192,13 @@ public synchronized void close(TaskAttemptContext context) throws IOException {
     NutchWritable COUNT_1 = new NutchWritable(new LongWritable(1));
     private boolean sort = false;
 
+    @Override
     public void setup(Mapper<Text, CrawlDatum, Text, NutchWritable>.Context context) {
       Configuration conf = context.getConfiguration();
       sort = conf.getBoolean("db.reader.stats.sort", false);
     }
 
-    public void close() {
-    }
-
+    @Override
     public void map(Text key, CrawlDatum value, Context context)
         throws IOException, InterruptedException {
       context.write(new Text("T"), COUNT_1);
@@ -242,9 +241,7 @@ public void map(Text key, CrawlDatum value, Context context)
     public void setup(Reducer<Text, NutchWritable, Text, NutchWritable>.Context context) {
     }
 
-    public void close() {
-    }
-
+    @Override
     public void reduce(Text key, Iterable<NutchWritable> values,
         Context context)
         throws IOException, InterruptedException {
@@ -329,14 +326,13 @@ public void reduce(Text key, Iterable<NutchWritable> values,
     private static final FloatWritable fw = new FloatWritable();
     private float min = 0.0f;
 
+    @Override
     public void setup(Mapper<Text, CrawlDatum, FloatWritable, Text>.Context context) {
       Configuration conf = context.getConfiguration();
       min = conf.getFloat("db.reader.topn.min", 0.0f);
     }
 
-    public void close() {
-    }
-
+    @Override
     public void map(Text key, CrawlDatum value,
         Context context)
         throws IOException, InterruptedException {
@@ -352,6 +348,7 @@ public void map(Text key, CrawlDatum value,
     private long topN;
     private long count = 0L;
 
+    @Override
     public void reduce(FloatWritable key, Iterable<Text> values,
         Context context)
         throws IOException, InterruptedException {
@@ -364,13 +361,11 @@ public void reduce(FloatWritable key, Iterable<Text> values,
       }
     }
 
+    @Override
     public void setup(Reducer<FloatWritable, Text, FloatWritable, Text>.Context context) {
       Configuration conf = context.getConfiguration();
       topN = conf.getLong("db.reader.topn", 100) / Integer.parseInt(conf.get("mapreduce.job.reduces"));
     }
-
-    public void close() {
-    }
   }
 
   public void close() {
@@ -401,24 +396,25 @@ public void close() {
 
          // https://issues.apache.org/jira/browse/NUTCH-1029
          config.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
-          FileSystem fileSystem = tmpFolder.getFileSystem(config);
-          try {
-            boolean success = job.waitForCompletion(true);
-            if (!success) {
-              String message = "CrawlDbReader job did not succeed, job status:"
-                  + job.getStatus().getState() + ", reason: "
-                  + job.getStatus().getFailureInfo();
-              LOG.error(message);
-              fileSystem.delete(tmpFolder, true);
-              throw new RuntimeException(message);
-            }
-          } catch (IOException | InterruptedException | ClassNotFoundException e) {
-            LOG.error(StringUtils.stringifyException(e));
-            fileSystem.delete(tmpFolder, true);
-            throw e;
-          }
-         // reading the result
-          SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(tmpFolder, config);
+    FileSystem fileSystem = tmpFolder.getFileSystem(config);
+    try {
+      boolean success = job.waitForCompletion(true);
+      if (!success) {
+        String message = "CrawlDbReader job did not succeed, job status:"
+            + job.getStatus().getState() + ", reason: "
+            + job.getStatus().getFailureInfo();
+        LOG.error(message);
+        fileSystem.delete(tmpFolder, true);
+        throw new RuntimeException(message);
+      }
+    } catch (IOException | InterruptedException | ClassNotFoundException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      fileSystem.delete(tmpFolder, true);
+      throw e;
+    }
+
+    // reading the result
+    SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(tmpFolder, config);
 
          Text key = new Text();
          NutchWritable value = new NutchWritable();
@@ -629,6 +625,7 @@ public CrawlDatum get(String crawlDb, String url, Configuration config)
     return res;
   }
 
+  @Override
   protected int process(String line, StringBuilder output) throws Exception {
     Job job = NutchJob.getInstance(getConf());
     Configuration config = job.getConfiguration();
@@ -720,6 +717,7 @@ public void processDumpJob(String crawlDb, String output,
     Expression expr = null;
     float sample;
 
+    @Override
     public void setup(Mapper<Text, CrawlDatum, Text, CrawlDatum>.Context context) {
       Configuration config = context.getConfiguration();
       if (config.get("regex", null) != null) {
@@ -734,9 +732,7 @@ public void setup(Mapper<Text, CrawlDatum, Text, CrawlDatum>.Context context) {
       sample = config.getFloat("sample", 1);
     }
 
-    public void close() {
-    }
-
+    @Override
     public void map(Text key, CrawlDatum value,
         Context context)
         throws IOException, InterruptedException {
diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java
index 3cf08ff8c..2d1160234 100644
--- a/src/java/org/apache/nutch/crawl/LinkDb.java
+++ b/src/java/org/apache/nutch/crawl/LinkDb.java
@@ -99,9 +99,6 @@ public void setup(Mapper<Text, ParseData, Text, Inlinks>.Context context) {
       }
     } 
 
-    public void cleanup(){
-    }
-
     public void map(Text key, ParseData parseData,
             Context context)
                     throws IOException, InterruptedException {
diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java
index cc0b6f614..9ee57ba1e 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbReader.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -109,6 +109,7 @@ public void close() throws IOException {
     Pattern pattern = null;
     Matcher matcher = null;
     
+    @Override
     public void setup(Mapper<Text, Inlinks, Text, Inlinks>.Context context) {
       Configuration conf = context.getConfiguration();
       if (conf.get("linkdb.regex", null) != null) {
@@ -116,8 +117,7 @@ public void setup(Mapper<Text, Inlinks, Text, Inlinks>.Context context) {
       }
     }
 
-    public void cleanup() {}
-
+    @Override
     public void map(Text key, Inlinks value, Context context)
             throws IOException, InterruptedException {
 
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
index 1c3c8c323..15a7e3793 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -58,11 +58,10 @@
   protected URLFilters filters = null;
   protected URLNormalizers normalizers = null;
 
-  public void cleanup() {}
-
   /**
    * @param job
    */
+  @Override
   public void setup(Mapper<Text, Writable, Text, NutchWritable>.Context context) {
     Configuration conf = context.getConfiguration();
     readingCrawlDb = conf.getBoolean("hostdb.reading.crawldb", false);
@@ -110,6 +109,7 @@ protected String filterNormalize(String url) {
     * @param value
     * @param context
     */
+  @Override
   public void map(Text key, Writable value,
     Context context)
     throws IOException, InterruptedException {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 21c847db8..70ce3eb3d 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -74,6 +74,7 @@
     *
     * @param job
     */
+  @Override
   public void setup(Reducer<Text, NutchWritable, Text, HostDatum>.Context context) {
     Configuration conf = context.getConfiguration();
     purgeFailedHostsThreshold = conf.getInt(UpdateHostDb.HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD, -1);
@@ -113,6 +114,7 @@ public void setup(Reducer<Text, NutchWritable, Text, HostDatum>.Context context)
   /**
     *
     */
+  @Override
   public void reduce(Text key, Iterable<NutchWritable> values,
     Context context) throws IOException, InterruptedException {
 
@@ -401,7 +403,8 @@ protected boolean isEligibleForCheck(HostDatum datum) {
   /**
     * Shut down all running threads and wait for completion.
     */
-  public void cleanup() {
+  @Override
+  public void cleanup(Context context) {
     LOG.info("UpdateHostDb: feeder finished, waiting for shutdown");
 
     // If we're here all keys have been fed and we can issue a shut down
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java
index e8dab4031..7a0f70e78 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -64,12 +64,6 @@ public void setConf(Configuration conf) {
       Mapper<Text, CrawlDatum, ByteWritable, Text> {
     private ByteWritable OUT = new ByteWritable(CrawlDatum.STATUS_DB_GONE);
 
-    public void setup(Mapper<Text, CrawlDatum, ByteWritable, Text>.Context context) {
-    }
-
-    public void cleanup() throws IOException {
-    }
-
     @Override
     public void map(Text key, CrawlDatum value,
         Context context) throws IOException, InterruptedException {
@@ -91,6 +85,7 @@ public void map(Text key, CrawlDatum value,
 
     IndexWriters writers = null;
 
+    @Override
     public void setup(Reducer<ByteWritable, Text, Text, ByteWritable>.Context context) {
       Configuration conf = context.getConfiguration();
       writers = IndexWriters.get(conf);
@@ -102,7 +97,8 @@ public void setup(Reducer<ByteWritable, Text, Text, ByteWritable>.Context contex
       noCommit = conf.getBoolean("noCommit", false);
     }
 
-    public void cleanup() throws IOException {
+    @Override
+    public void cleanup(Context context) throws IOException {
       // BUFFERING OF CALLS TO INDEXER SHOULD BE HANDLED AT INDEXER LEVEL
       // if (numDeletes > 0) {
       // LOG.info("CleaningJob: deleting " + numDeletes + " documents");
@@ -119,6 +115,7 @@ public void cleanup() throws IOException {
       LOG.info("CleaningJob: deleted a total of " + totalDeleted + " documents");
     }
 
+    @Override
     public void reduce(ByteWritable key, Iterable<Text> values,
         Context context) throws IOException {
       for (Text document : values) {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
index bde24cc33..71fe42f52 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -272,9 +272,6 @@ public void reduce(Text key, Iterable<ObjectWritable> values,
         }
       }
     }
-
-    public void cleanup() {
-    }
   }
 
   /**


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> NPE in updatehostdb
> -------------------
>
>                 Key: NUTCH-2597
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2597
>             Project: Nutch
>          Issue Type: Bug
>          Components: hostdb
>    Affects Versions: 1.15
>            Reporter: Jurian Broertjes
>            Priority: Critical
>             Fix For: 1.15
>
>
> I get an NPE on updatehostdb. I start with a clean crawlDB & hostDB. After an 
> inject, I do an updatehostdb with -checkAll and get the following stacktrace:
> {code}
> 2018-06-13 10:45:21,958 WARN hostdb.ResolverThread - java.lang.NullPointerException
>  at org.apache.hadoop.io.SequenceFile$Writer.checkAndWriteSync(SequenceFile.java:1359)
>  at org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1400)
>  at org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat$1.write(SequenceFileOutputFormat.java:83)
>  at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:558)
>  at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
>  at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:105)
>  at org.apache.nutch.hostdb.ResolverThread.run(ResolverThread.java:82)
>  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  at java.lang.Thread.run(Thread.java:748)
> {code}
> Is this related to NUTCH-2375?
> If further testing is needed, please let me know!



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to