Author: snagel
Date: Fri Sep 20 21:12:10 2013
New Revision: 1525148
URL: http://svn.apache.org/r1525148
Log:
NUTCH-1636 Indexer to normalize and filter repr URL
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1525148&r1=1525147&r2=1525148&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Sep 20 21:12:10 2013
@@ -2,7 +2,9 @@ Nutch Change Log
Nutch Development Trunk
-* NUTCH-1637 URLUtil is mising getProtocol (markus)
+* NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel)
+
+* NUTCH-1637 URLUtil is missing getProtocol (markus)
* NUTCH-1622 Create Outlinks with metadata (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1525148&r1=1525147&r2=1525148&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Sep 20 21:12:10 2013
@@ -89,7 +89,7 @@ implements Mapper<Text, Writable, Text,
filter = job.getBoolean(URL_FILTERING, false);
if (normalize) {
- urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+ urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_INDEXER);
}
if (filter) {
@@ -267,7 +267,14 @@ implements Mapper<Text, Writable, Text,
// fetchDatum so that indexing filters can use it
final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (url != null) {
- fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+ // Representation URL also needs normalization and filtering.
+ // If repr URL is excluded by filters we still accept this document
+ // but represented by its primary URL ("key") which has passed URL filters.
+ String urlString = filterUrl(normalizeUrl(url.toString()));
+ if (urlString != null) {
+ url.set(urlString);
+ fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+ }
}
// run indexing filters
doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);