Author: snagel
Date: Fri Sep 20 21:12:10 2013
New Revision: 1525148

URL: http://svn.apache.org/r1525148
Log:
NUTCH-1636 Indexer to normalize and filter repr URL

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1525148&r1=1525147&r2=1525148&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Sep 20 21:12:10 2013
@@ -2,7 +2,9 @@ Nutch Change Log
 
 Nutch Development Trunk
 
-* NUTCH-1637 URLUtil is mising getProtocol (markus)
+* NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel)
+
+* NUTCH-1637 URLUtil is missing getProtocol (markus)
 
 * NUTCH-1622 Create Outlinks with metadata (jnioche)
 

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1525148&r1=1525147&r2=1525148&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Sep 20 21:12:10 2013
@@ -89,7 +89,7 @@ implements Mapper<Text, Writable, Text, 
     filter = job.getBoolean(URL_FILTERING, false);
 
     if (normalize) {
-      urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+      urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_INDEXER);
     }
 
     if (filter) {
@@ -267,7 +267,14 @@ implements Mapper<Text, Writable, Text, 
       // fetchDatum so that indexing filters can use it
       final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
       if (url != null) {
-        fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+        // Representation URL also needs normalization and filtering.
+        // If repr URL is excluded by filters we still accept this document
+        // but represented by its primary URL ("key") which has passed URL filters.
+        String urlString = filterUrl(normalizeUrl(url.toString()));
+        if (urlString != null) {
+          url.set(urlString);
+          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+        }
       }
       // run indexing filters
       doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);


Reply via email to