Author: markus Date: Wed Feb 24 14:12:42 2016 New Revision: 1732160 URL: http://svn.apache.org/viewvc?rev=1732160&view=rev Log: NUTCH-2232 DeduplicationJob should decode URLs before length is compared
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732160&r1=1732159&r2=1732160&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 14:12:42 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus) + * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) * NUTCH-2227 RegexParseFilter (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1732160&r1=1732159&r2=1732160&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Feb 24 14:12:42 2016 @@ -17,6 +17,8 @@ package org.apache.nutch.crawl; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Iterator; @@ -193,8 +195,15 @@ public class DeduplicationJob extends Nu break; case "urlLength": // same time? 
keep the one which has the shortest URL - String urlExisting = existingDoc.getMetaData().get(urlKey).toString(); - String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); + String urlExisting; + String urlnewDoc; + try { + urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8"); + urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8"); + } catch (UnsupportedEncodingException e) { + LOG.error("Error decoding: " + urlKey); + throw new IOException("UnsupportedEncodingException for " + urlKey); + } if (urlExisting.length() < urlnewDoc.length()) { // mark new one as duplicate writeOutAsDuplicate(newDoc, output, reporter);