Author: markus
Date: Fri Jan 13 16:43:42 2012
New Revision: 1231168

URL: http://svn.apache.org/viewvc?rev=1231168&view=rev
Log:
NUTCH-1248 Generator to select on status

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1231168&r1=1231167&r2=1231168&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 13 16:43:42 2012
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1248 Generator to select on status (markus)
+
 * NUTCH-1177 Generator to select on retry interval (markus)
 
 * NUTCH-1246 Upgrade to Hadoop 1.0.0 (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1231168&r1=1231167&r2=1231168&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 13 16:43:42 2012
@@ -62,6 +62,7 @@ public class Generator extends Configure
   public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
   public static final String GENERATOR_MIN_SCORE = "generate.min.score";
   public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
+  public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status";
   public static final String GENERATOR_FILTER = "generate.filter";
   public static final String GENERATOR_NORMALISE = "generate.normalise";
   public static final String GENERATOR_MAX_COUNT = "generate.max.count";
@@ -131,6 +132,7 @@ public class Generator extends Configure
     private FetchSchedule schedule;
     private float scoreThreshold = 0f;
     private int intervalThreshold = -1;
+    private String restrictStatus = null;
     private int maxNumSegments = 1;
     int currentsegmentnum = 1;
 
@@ -158,6 +160,7 @@ public class Generator extends Configure
       schedule = FetchScheduleFactory.getFetchSchedule(job);
       scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
       intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
+      restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null);
       maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
       segCounts = new int[maxNumSegments];
     }
@@ -205,6 +208,9 @@ public class Generator extends Configure
         }
       }
 
+      if (restrictStatus != null
+        && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return;
+
       // consider only entries with a score superior to the threshold
       if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;
 


Reply via email to