Author: markus
Date: Fri Jan 13 16:43:42 2012
New Revision: 1231168

URL: http://svn.apache.org/viewvc?rev=1231168&view=rev
Log:
NUTCH-1248 Generator to select on status
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1231168&r1=1231167&r2=1231168&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 13 16:43:42 2012 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1248 Generator to select on status (markus) + * NUTCH-1177 Generator to select on retry interval (markus) * NUTCH-1246 Upgrade to Hadoop 1.0.0 (jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1231168&r1=1231167&r2=1231168&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 13 16:43:42 2012 @@ -62,6 +62,7 @@ public class Generator extends Configure public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb"; public static final String GENERATOR_MIN_SCORE = "generate.min.score"; public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval"; + public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status"; public static final String GENERATOR_FILTER = "generate.filter"; public static final String GENERATOR_NORMALISE = "generate.normalise"; public static final String GENERATOR_MAX_COUNT = "generate.max.count"; @@ -131,6 +132,7 @@ public class Generator extends Configure private FetchSchedule schedule; private float scoreThreshold = 0f; private int intervalThreshold = -1; + private String restrictStatus = null; private int maxNumSegments = 1; int currentsegmentnum = 1; @@ -158,6 +160,7 @@ public class Generator extends Configure schedule = FetchScheduleFactory.getFetchSchedule(job); 
scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN); intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1); + restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null); maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1); segCounts = new int[maxNumSegments]; } @@ -205,6 +208,9 @@ public class Generator extends Configure } } + if (restrictStatus != null + && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return; + // consider only entries with a score superior to the threshold if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;