Author: ab
Date: Fri Oct 9 12:43:44 2009
New Revision: 823531

URL: http://svn.apache.org/viewvc?rev=823531&view=rev
Log:
NUTCH-707 Generation of multiple segments in multiple runs returns only 1 segment.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823531&r1=823530&r2=823531&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 12:43:44 2009 @@ -2,12 +2,15 @@ Unreleased Changes - 1. NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when - invoked using crawl command (Susam Pal via dogacan) +* NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when + invoked using crawl command (Susam Pal via dogacan) - 2. NUTCH-721 - Fetcher2 Slow (Julien Nioche via dogacan) +* NUTCH-721 - Fetcher2 Slow (Julien Nioche via dogacan) - 3. NUTCH-702 - Lazy Instanciation of Metadata in CrawlDatum (Julien Nioche via dogacan) +* NUTCH-702 - Lazy Instanciation of Metadata in CrawlDatum (Julien Nioche via dogacan) + +* NUTCH-707 - Generation of multiple segments in multiple runs returns only 1 segment + (Michael Chen, ab) Release 1.0 - 2009-03-23 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=823531&r1=823530&r2=823531&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Oct 9 12:43:44 2009 @@ -339,25 +339,22 @@ /** * Update the CrawlDB so that the next generate won't include the same URLs. 
*/ - public static class CrawlDbUpdater extends MapReduceBase implements Mapper<WritableComparable, Writable, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> { + public static class CrawlDbUpdater extends MapReduceBase implements Mapper<Text, CrawlDatum, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> { long generateTime; public void configure(JobConf job) { generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); } - public void map(WritableComparable key, Writable value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { - if (key instanceof FloatWritable) { // tempDir source - SelectorEntry se = (SelectorEntry)value; - output.collect(se.url, se.datum); - } else { - output.collect((Text)key, (CrawlDatum)value); - } + public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { + output.collect(key, value); } + private CrawlDatum orig = new CrawlDatum(); private LongWritable genTime = new LongWritable(0L); public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { + genTime.set(0L); while (values.hasNext()) { CrawlDatum val = values.next(); if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) { @@ -536,7 +533,7 @@ job = new NutchJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); - FileInputFormat.addInputPath(job, tempDir); + FileInputFormat.addInputPath(job, output); FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(CrawlDbUpdater.class);