Author: ab
Date: Fri Oct 9 12:43:44 2009
New Revision: 823531

URL: http://svn.apache.org/viewvc?rev=823531&view=rev
Log:
NUTCH-707 Generation of multiple segments in multiple runs returns only 1 segment.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823531&r1=823530&r2=823531&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 12:43:44 2009 @@ -2,12 +2,15 @@ Unreleased Changes - 1. NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when - invoked using crawl command (Susam Pal via dogacan) +* NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when + invoked using crawl command (Susam Pal via dogacan) - 2. NUTCH-721 - Fetcher2 Slow (Julien Nioche via dogacan) +* NUTCH-721 - Fetcher2 Slow (Julien Nioche via dogacan) - 3. NUTCH-702 - Lazy Instanciation of Metadata in CrawlDatum (Julien Nioche via dogacan) +* NUTCH-702 - Lazy Instanciation of Metadata in CrawlDatum (Julien Nioche via dogacan) + +* NUTCH-707 - Generation of multiple segments in multiple runs returns only 1 segment + (Michael Chen, ab) Release 1.0 - 2009-03-23 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=823531&r1=823530&r2=823531&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Oct 9 12:43:44 2009 @@ -339,25 +339,22 @@ /** * Update the CrawlDB so that the next generate won't include the same URLs. 
*/ - public static class CrawlDbUpdater extends MapReduceBase implements Mapper<WritableComparable, Writable, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> { + public static class CrawlDbUpdater extends MapReduceBase implements Mapper<Text, CrawlDatum, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> { long generateTime; public void configure(JobConf job) { generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); } - public void map(WritableComparable key, Writable value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { - if (key instanceof FloatWritable) { // tempDir source - SelectorEntry se = (SelectorEntry)value; - output.collect(se.url, se.datum); - } else { - output.collect((Text)key, (CrawlDatum)value); - } + public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { + output.collect(key, value); } + private CrawlDatum orig = new CrawlDatum(); private LongWritable genTime = new LongWritable(0L); public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { + genTime.set(0L); while (values.hasNext()) { CrawlDatum val = values.next(); if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) { @@ -536,7 +533,7 @@ job = new NutchJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); - FileInputFormat.addInputPath(job, tempDir); + FileInputFormat.addInputPath(job, output); FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(CrawlDbUpdater.class);