Author: ab
Date: Thu Aug 17 07:56:35 2006
New Revision: 432256

URL: http://svn.apache.org/viewvc?rev=432256&view=rev
Log:
Apply patch in NUTCH-348 - Generator used the lowest score instead of the highest. Contributed by Chris Schneider and Stefan Groschupf.
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=432256&r1=432255&r2=432256&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Aug 17 07:56:35 2006
@@ -59,7 +59,11 @@
     public void write(DataOutput out) throws IOException {
       url.write(out);
       datum.write(out);
-    }
+    }
+
+    public String toString() {
+      return "url=" + url.toString() + ", datum=" + datum.toString();
+    }
   }
 
   /** Selects entries due for fetch. */
@@ -118,7 +122,7 @@
           LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
         }
       }
-      // sort by decreasing score
+      // sort by decreasing score, using DecreasingFloatComparator
       sortValue.set(sort);
       entry.datum = crawlDatum;
       entry.url = (UTF8)key;
@@ -196,6 +200,20 @@
 
   }
 
+  public static class DecreasingFloatComparator extends WritableComparator {
+
+    public DecreasingFloatComparator() {
+      super(FloatWritable.class);
+    }
+
+    /** Compares two FloatWritables decreasing. */
+    public int compare(WritableComparable o1, WritableComparable o2) {
+      float thisValue = ((FloatWritable) o1).get();
+      float thatValue = ((FloatWritable) o2).get();
+      return (thisValue<thatValue ? 1 : (thisValue == thatValue ? 0 : -1));
+    }
+  }
+
   public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
 
     public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
@@ -270,7 +288,7 @@
     if (LOG.isInfoEnabled()) {
       LOG.info("Generator: starting");
       LOG.info("Generator: segment: " + segment);
-      LOG.info("Generator: Selecting most-linked urls due for fetch.");
+      LOG.info("Generator: Selecting best-scoring urls due for fetch.");
     }
 
     // map to inverted subset due for fetch, sort by link count
@@ -296,6 +314,7 @@
     job.setOutputPath(tempDir);
     job.setOutputFormat(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(FloatWritable.class);
+    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
     job.setOutputValueClass(SelectorEntry.class);
     JobClient.runJob(job);
 