Author: snagel
Date: Thu Oct 11 21:04:14 2012
New Revision: 1397308

URL: http://svn.apache.org/viewvc?rev=1397308&view=rev
Log:
NUTCH-1383 IndexingFiltersChecker to show error message instead of null pointer exception

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1397308&r1=1397307&r2=1397308&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 11 21:04:14 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1383 IndexingFiltersChecker to show error message instead of null pointer exception (snagel)
+
 * NUTCH-1476 SegmentReader getStats should set parsed = -1 if no parsing took place (snagel)
 
 * NUTCH-1252 SegmentReader -get shows wrong data (snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1397308&r1=1397307&r2=1397308&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Oct 11 21:04:14 2012
@@ -84,9 +84,6 @@ public class IndexingFiltersChecker exte
     Content content = protocol.getProtocolOutput(new Text(url), datum)
         .getContent();
 
-    // store the guessed content type in the crawldatum
-    if (content.getContentType() != null) datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
-
     if (content == null) {
       System.out.println("No content for " + url);
       return 0;
@@ -98,6 +95,9 @@ public class IndexingFiltersChecker exte
       return -1;
     }
 
+    // store the guessed content type in the crawldatum
+    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
+
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);
@@ -111,11 +111,16 @@ public class IndexingFiltersChecker exte
     Inlinks inlinks = null;
     Parse parse = parseResult.get(urlText);
     try {
-      indexers.filter(doc, parse, urlText, datum, inlinks);
+      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
     } catch (IndexingException e) {
       e.printStackTrace();
     }
 
+    if (doc == null) {
+      System.out.println("Document discarded by indexing filter");
+      return 0;
+    }
+    
     for (String fname : doc.getFieldNames()) {
       List<Object> values = doc.getField(fname).getValues();
       if (values != null) {


Reply via email to