Author: dogacan
Date: Tue Jul 31 05:07:30 2007
New Revision: 561306

URL: http://svn.apache.org/viewvc?view=rev&rev=561306
Log:
NUTCH-533 - LinkDbMerger: url normalized is not updated in the key and inlinks list. Contributed by Emmanuel Joke.

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=561306&r1=561305&r2=561306 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Jul 31 05:07:30 2007 @@ -107,6 +107,9 @@ with redirected pages, and this issue can be considered as a band-aid for the time being. See NUTCH-273 and NUTCH-353 for more details. +36. NUTCH-533 - LinkDbMerger: url normalized is not updated in the key and + inlinks list. (Emmanuel Joke via dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java?view=diff&rev=561306&r1=561305&r2=561306 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java Tue Jul 31 05:07:30 2007 @@ -22,6 +22,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.JobConf; @@ -57,6 +58,8 @@ private String scope; public static final Log LOG = LogFactory.getLog(LinkDbFilter.class); + + private Text newKey = new Text(); public void configure(JobConf job) { this.jobConf = job; @@ -75,6 +78,7 @@ public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { String url = key.toString(); + Inlinks result = new Inlinks(); if (normalize) { try { url = 
normalizers.normalize(url, scope); // normalize the url @@ -114,11 +118,13 @@ fromUrl = null; } } - if (fromUrl == null) { // should be discarded - it.remove(); + if (fromUrl != null) { + result.add(new Inlink(fromUrl, inlink.getAnchor())); } } - if (inlinks.size() == 0) return; // don't collect empy inlinks - output.collect(key, inlinks); + if (result.size() > 0) { // don't collect empty inlinks + newKey.set(url); + output.collect(newKey, result); + } } } ------------------------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Still grepping through log files to find problems? Stop. Now Search log events and configuration files using AJAX and a browser. Download your FREE copy of Splunk now >> http://get.splunk.com/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs