Author: snagel
Date: Thu Oct 11 21:04:14 2012
New Revision: 1397308
URL: http://svn.apache.org/viewvc?rev=1397308&view=rev
Log:
NUTCH-1383 IndexingFiltersChecker to show error message instead of null pointer
exception
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1397308&r1=1397307&r2=1397308&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 11 21:04:14 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1383 IndexingFiltersChecker to show error message instead of null pointer exception (snagel)
+
* NUTCH-1476 SegmentReader getStats should set parsed = -1 if no parsing took
place (snagel)
* NUTCH-1252 SegmentReader -get shows wrong data (snagel)
Modified:
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1397308&r1=1397307&r2=1397308&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Oct 11 21:04:14 2012
@@ -84,9 +84,6 @@ public class IndexingFiltersChecker exte
Content content = protocol.getProtocolOutput(new Text(url), datum)
.getContent();
- // store the guessed content type in the crawldatum
- if (content.getContentType() != null) datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
-
if (content == null) {
System.out.println("No content for " + url);
return 0;
@@ -98,6 +95,9 @@ public class IndexingFiltersChecker exte
return -1;
}
+ // store the guessed content type in the crawldatum
+ datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
+
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
@@ -111,11 +111,16 @@ public class IndexingFiltersChecker exte
Inlinks inlinks = null;
Parse parse = parseResult.get(urlText);
try {
- indexers.filter(doc, parse, urlText, datum, inlinks);
+ doc = indexers.filter(doc, parse, urlText, datum, inlinks);
} catch (IndexingException e) {
e.printStackTrace();
}
+ if (doc == null) {
+ System.out.println("Document discarded by indexing filter");
+ return 0;
+ }
+
for (String fname : doc.getFieldNames()) {
List<Object> values = doc.getField(fname).getValues();
if (values != null) {