This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit a7bc1a8c5a3a5ab9c72574afd98089a354bf0484
Author: YossiTamari <[email protected]>
AuthorDate: Tue Nov 7 12:13:41 2017 +0200

    NUTCH-2456: Redirected documents are not indexed

    This is a defensive, minimal approach for fixing this issue.
---
 .../org/apache/nutch/indexer/IndexerMapReduce.java | 43 +++++++++++-----------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index cb6e121..9598a89 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -256,20 +256,19 @@ public class IndexerMapReduce extends Configured implements
       }
     }
 
-    if (fetchDatum == null || dbDatum == null || parseText == null
-        || parseData == null) {
+    if (fetchDatum == null || parseText == null || parseData == null) {
       return; // only have inlinks
     }
 
     // Whether to delete pages marked as duplicates
-    if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+    if (delete && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
       reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
       output.collect(key, DELETE_ACTION);
       return;
     }
 
     // Whether to skip DB_NOTMODIFIED pages
-    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+    if (skip && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
       reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
       return;
     }
@@ -309,23 +308,25 @@ public class IndexerMapReduce extends Configured implements
     doc.add("boost", Float.toString(boost));
 
     try {
-      // Indexing filters may also be interested in the signature
-      fetchDatum.setSignature(dbDatum.getSignature());
-
-      // extract information from dbDatum and pass it to
-      // fetchDatum so that indexing filters can use it
-      final Text url = (Text) dbDatum.getMetaData().get(
-          Nutch.WRITABLE_REPR_URL_KEY);
-      if (url != null) {
-        // Representation URL also needs normalization and filtering.
-        // If repr URL is excluded by filters we still accept this document
-        // but represented by its primary URL ("key") which has passed URL
-        // filters.
-        String urlString = filterUrl(normalizeUrl(url.toString()));
-        if (urlString != null) {
-          url.set(urlString);
-          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
-        }
+      if (dbDatum!=null) {
+        // Indexing filters may also be interested in the signature
+        fetchDatum.setSignature(dbDatum.getSignature());
+
+        // extract information from dbDatum and pass it to
+        // fetchDatum so that indexing filters can use it
+        final Text url = (Text) dbDatum.getMetaData().get(
+            Nutch.WRITABLE_REPR_URL_KEY);
+        if (url != null) {
+          // Representation URL also needs normalization and filtering.
+          // If repr URL is excluded by filters we still accept this document
+          // but represented by its primary URL ("key") which has passed URL
+          // filters.
+          String urlString = filterUrl(normalizeUrl(url.toString()));
+          if (urlString != null) {
+            url.set(urlString);
+            fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+          }
+        }
       }
       // run indexing filters
       doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.
