[nutch] 01/05: NUTCH-2456: Redirected documents are not indexed

snagel Tue, 05 Dec 2017 01:40:06 -0800

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


commit a7bc1a8c5a3a5ab9c72574afd98089a354bf0484
Author: YossiTamari <[email protected]>
AuthorDate: Tue Nov 7 12:13:41 2017 +0200

    NUTCH-2456: Redirected documents are not indexed
    
    This is a defensive, minimal approach for fixing this issue.
---
 .../org/apache/nutch/indexer/IndexerMapReduce.java | 43 +++++++++++-----------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index cb6e121..9598a89 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -256,20 +256,19 @@ public class IndexerMapReduce extends Configured implements
       }
     }
 
-    if (fetchDatum == null || dbDatum == null || parseText == null
-        || parseData == null) {
+    if (fetchDatum == null || parseText == null || parseData == null) {
       return; // only have inlinks
     }
 
     // Whether to delete pages marked as duplicates
-    if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+    if (delete && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
       reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
       output.collect(key, DELETE_ACTION);
       return;
     }
 
     // Whether to skip DB_NOTMODIFIED pages
-    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+    if (skip && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
       reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
       return;
     }
@@ -309,23 +308,25 @@ public class IndexerMapReduce extends Configured implements
     doc.add("boost", Float.toString(boost));
 
     try {
-      // Indexing filters may also be interested in the signature
-      fetchDatum.setSignature(dbDatum.getSignature());
-      
-      // extract information from dbDatum and pass it to
-      // fetchDatum so that indexing filters can use it
-      final Text url = (Text) dbDatum.getMetaData().get(
-          Nutch.WRITABLE_REPR_URL_KEY);
-      if (url != null) {
-        // Representation URL also needs normalization and filtering.
-        // If repr URL is excluded by filters we still accept this document
-        // but represented by its primary URL ("key") which has passed URL
-        // filters.
-        String urlString = filterUrl(normalizeUrl(url.toString()));
-        if (urlString != null) {
-          url.set(urlString);
-          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
-        }
+      if (dbDatum!=null) {
+             // Indexing filters may also be interested in the signature
+             fetchDatum.setSignature(dbDatum.getSignature());
+             
+             // extract information from dbDatum and pass it to
+             // fetchDatum so that indexing filters can use it
+             final Text url = (Text) dbDatum.getMetaData().get(
+                 Nutch.WRITABLE_REPR_URL_KEY);
+             if (url != null) {
+               // Representation URL also needs normalization and filtering.
+               // If repr URL is excluded by filters we still accept this document
+               // but represented by its primary URL ("key") which has passed URL
+               // filters.
+               String urlString = filterUrl(normalizeUrl(url.toString()));
+               if (urlString != null) {
+                 url.set(urlString);
+                 fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+               }
+             }
       }
       // run indexing filters
       doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

[nutch] 01/05: NUTCH-2456: Redirected documents are not indexed

Reply via email to