[ https://issues.apache.org/jira/browse/NUTCH-2551?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16435513#comment-16435513 ]

ASF GitHub Bot commented on NUTCH-2551:
---------------------------------------

sebastian-nagel closed pull request #316: fix for NUTCH-2551 contributed by Hans Brende
URL: https://github.com/apache/nutch/pull/316

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 92b337cfd..a3ef91c89 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -30,6 +30,7 @@
 import java.util.Map;
 import java.util.Random;
 
+import org.apache.hadoop.conf.Configurable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.commons.jexl2.Expression;
@@ -138,9 +139,27 @@ public String toString() {
 
   /** Selects entries due for fetch. */
   public static class Selector extends
-      Partitioner<FloatWritable, Writable> {
+      Partitioner<FloatWritable, Writable> implements Configurable {
 
-    private static URLPartitioner partitioner = new URLPartitioner();
+    private final URLPartitioner partitioner = new URLPartitioner();
+
+    /** Partition by host / domain or IP. */
+    public int getPartition(FloatWritable key, Writable value,
+                            int numReduceTasks) {
+      return partitioner.getPartition(((SelectorEntry) value).url, key,
+              numReduceTasks);
+    }
+
+    @Override
+    public Configuration getConf() {
+      return partitioner.getConf();
+    }
+
+    @Override
+    public void setConf(Configuration conf) {
+      partitioner.setConf(conf);
+    }
+  }
 
     /** Select and invert subset due for fetch. */
 
@@ -165,11 +184,9 @@ public String toString() {
       @Override 
      public void setup(Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>.Context context) throws IOException{
         conf = context.getConfiguration();
-        Job job = Job.getInstance(conf);
         curTime = conf.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
         filters = new URLFilters(conf);
         scfilters = new ScoringFilters(conf);
-        partitioner.configure(job);
         filter = conf.getBoolean(GENERATOR_FILTER, true);
         genDelay = conf.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
         long time = conf.getLong(Nutch.GENERATE_TIME_KEY, 0L);
@@ -257,12 +274,7 @@ public void map(Text key, CrawlDatum value,
       }
     }
 
-    /** Partition by host / domain or IP. */
-    public int getPartition(FloatWritable key, Writable value,
-        int numReduceTasks) {
-      return partitioner.getPartition(((SelectorEntry) value).url, key,
-          numReduceTasks);
-    }
+
     
     /** Collect until limit is reached. */
     public static class SelectorReducer extends
@@ -532,7 +544,6 @@ private HostDatum getHostDatum(String host) throws Exception {
         return null;
       }
     }
-  }
 
   public static class DecreasingFloatComparator extends
       FloatWritable.Comparator {
@@ -606,20 +617,20 @@ private static int hash(byte[] bytes, int start, int length) {
    * Update the CrawlDB so that the next generate won't include the same URLs.
    */
   public static class CrawlDbUpdater {
-    
+
     public static class CrawlDbUpdateMapper extends
-           Mapper<Text, CrawlDatum, Text, CrawlDatum> {
+            Mapper<Text, CrawlDatum, Text, CrawlDatum> {
       @Override
       public void map(Text key, CrawlDatum value,
-          Context context)
-          throws IOException, InterruptedException {
+                      Context context)
+              throws IOException, InterruptedException {
         context.write(key, value);
       }
     }
 
-    public static class CrawlDbUpdateReducer extends 
-           Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-      
+    public static class CrawlDbUpdateReducer extends
+            Reducer<Text, CrawlDatum, Text, CrawlDatum> {
+
       private CrawlDatum orig = new CrawlDatum();
       private LongWritable genTime = new LongWritable(0L);
       private long generateTime;
@@ -632,13 +643,13 @@ public void setup(Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context) {
 
       @Override
       public void reduce(Text key, Iterable<CrawlDatum> values,
-          Context context)
-          throws IOException, InterruptedException {
+                         Context context)
+              throws IOException, InterruptedException {
         genTime.set(0L);
         for (CrawlDatum val : values) {
          if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
             LongWritable gt = (LongWritable) val.getMetaData().get(
-                Nutch.WRITABLE_GENERATE_TIME_KEY);
+                    Nutch.WRITABLE_GENERATE_TIME_KEY);
             genTime.set(gt.get());
             if (genTime.get() != generateTime) {
               orig.set(val);
@@ -770,9 +781,9 @@ public Generator(Configuration conf) {
     job.setInputFormatClass(SequenceFileInputFormat.class);
 
     job.setJarByClass(Selector.class);
-    job.setMapperClass(Selector.SelectorMapper.class);
+    job.setMapperClass(SelectorMapper.class);
     job.setPartitionerClass(Selector.class);
-    job.setReducerClass(Selector.SelectorReducer.class);
+    job.setReducerClass(SelectorReducer.class);
 
     FileOutputFormat.setOutputPath(job, tempDir);
     job.setOutputKeyClass(FloatWritable.class);
diff --git a/src/java/org/apache/nutch/crawl/URLPartitioner.java b/src/java/org/apache/nutch/crawl/URLPartitioner.java
index cc508962e..3d4437618 100644
--- a/src/java/org/apache/nutch/crawl/URLPartitioner.java
+++ b/src/java/org/apache/nutch/crawl/URLPartitioner.java
@@ -23,9 +23,9 @@
 import java.net.MalformedURLException;
 import java.net.UnknownHostException;
 
+import org.apache.hadoop.conf.Configurable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
@@ -37,7 +37,7 @@
  * Partition urls by host, domain name or IP depending on the value of the
  * parameter 'partition.url.mode' which can be 'byHost', 'byDomain' or 'byIP'
  */
-public class URLPartitioner extends Partitioner<Text, Writable> {
+public class URLPartitioner extends Partitioner<Text, Writable> implements Configurable {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
@@ -51,8 +51,11 @@
   private URLNormalizers normalizers;
   private String mode = PARTITION_MODE_HOST;
 
-  public void configure(Job job) {
-    Configuration conf = job.getConfiguration();
+  private Configuration conf;
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
     seed = conf.getInt("partition.url.seed", 0);
     mode = conf.get(PARTITION_MODE_KEY, PARTITION_MODE_HOST);
     // check that the mode is known
@@ -64,6 +67,11 @@ public void configure(Job job) {
     normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION);
   }
 
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
   public void close() {
   }
 
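For context on why the change works: under the new mapreduce API the framework
instantiates the partitioner reflectively and configures it only when the class
implements Configurable, so the old configure(Job) method was never invoked and
{{normalizers}} stayed null on the cluster. A minimal sketch of the
framework-side path (assuming Hadoop's standard ReflectionUtils semantics;
{{jobContext}} and {{conf}} stand in for the task's actual context and
configuration):

{code}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.util.ReflectionUtils;

// Roughly what the map task does when it sets up its output collector:
Partitioner<?, ?> partitioner =
    ReflectionUtils.newInstance(jobContext.getPartitionerClass(), conf);
// ReflectionUtils.newInstance() calls setConf(conf) if and only if the new
// instance implements Configurable. With this patch, URLPartitioner (and
// Selector, which delegates to it) are configured here, so URLNormalizers
// is initialized before getPartition() is ever called.
{code}
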
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


> NullPointerException in generator
> ---------------------------------
>
>                 Key: NUTCH-2551
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2551
>             Project: Nutch
>          Issue Type: Bug
>          Components: generator
>    Affects Versions: 1.15
>            Reporter: Hans Brende
>            Priority: Blocker
>             Fix For: 1.15
>
>
> A NullPointerException is thrown during the crawl generate stage when I
> deploy to a Hadoop cluster (though, for some reason, it works fine locally).
> It looks like this is caused by the fact that the URLPartitioner class still
> has the old {{configure()}} method (which is never called, leaving the
> {{normalizers}} field null) rather than implementing the {{Configurable}}
> interface as specified by the newer mapreduce API's Partitioner contract
> (see the sketch below this quoted report).
> Stack trace:
> {code}
> java.lang.NullPointerException
>  at org.apache.nutch.crawl.URLPartitioner.getPartition(URLPartitioner.java:76)
>  at org.apache.nutch.crawl.URLPartitioner.getPartition(URLPartitioner.java:40)
>  at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:716)
>  at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
>  at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
>  at org.apache.nutch.crawl.Generator$SelectorInverseMapper.map(Generator.java:553)
>  at org.apache.nutch.crawl.Generator$SelectorInverseMapper.map(Generator.java:546)
>  at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
>  at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
>  at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
>  at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:175)
>  at java.security.AccessController.doPrivileged(Native Method)
>  at javax.security.auth.Subject.doAs(Subject.java:422)
>  at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1836)
>  at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:169)
> {code}
>
> It might also be related to the fact that a *static* URLPartitioner instance
> is used in the Generator.Selector class but only initialized in the
> {{setup()}} method of the Generator.Selector.SelectorMapper class, so the
> whole arrangement looks fragile.
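
For reference, the fix is easy to sanity-check by constructing the partitioner
the same way the framework does and confirming it comes back configured. A
minimal sketch (assuming Hadoop's ReflectionUtils semantics and Nutch's
NutchConfiguration helper; the assertion is illustrative, not part of the
patch):

{code}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.nutch.crawl.URLPartitioner;
import org.apache.nutch.util.NutchConfiguration;

Configuration conf = NutchConfiguration.create();
// Instantiate the partitioner exactly as the framework would:
URLPartitioner partitioner =
    ReflectionUtils.newInstance(URLPartitioner.class, conf);
// Because URLPartitioner now implements Configurable, setConf(conf) has
// already run, so the configuration (and the URLNormalizers built from it)
// is in place before any getPartition() call:
assert partitioner.getConf() == conf;
{code}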



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
