Author: markus
Date: Fri Jan 13 16:43:42 2012
New Revision: 1231168
URL: http://svn.apache.org/viewvc?rev=1231168&view=rev
Log:
NUTCH-1248 Generator to select on status
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1231168&r1=1231167&r2=1231168&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 13 16:43:42 2012
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1248 Generator to select on status (markus)
+
* NUTCH-1177 Generator to select on retry interval (markus)
* NUTCH-1246 Upgrade to Hadoop 1.0.0 (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1231168&r1=1231167&r2=1231168&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 13 16:43:42 2012
@@ -62,6 +62,7 @@ public class Generator extends Configure
public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
public static final String GENERATOR_MIN_SCORE = "generate.min.score";
public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
+ public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status";
public static final String GENERATOR_FILTER = "generate.filter";
public static final String GENERATOR_NORMALISE = "generate.normalise";
public static final String GENERATOR_MAX_COUNT = "generate.max.count";
@@ -131,6 +132,7 @@ public class Generator extends Configure
private FetchSchedule schedule;
private float scoreThreshold = 0f;
private int intervalThreshold = -1;
+ private String restrictStatus = null;
private int maxNumSegments = 1;
int currentsegmentnum = 1;
@@ -158,6 +160,7 @@ public class Generator extends Configure
schedule = FetchScheduleFactory.getFetchSchedule(job);
scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
+ restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null);
maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
segCounts = new int[maxNumSegments];
}
@@ -205,6 +208,9 @@ public class Generator extends Configure
}
}
+ if (restrictStatus != null
+ && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return;
+
// consider only entries with a score superior to the threshold
if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;