Author: tejasp
Date: Mon Mar 11 02:12:58 2013
New Revision: 1454974
URL: http://svn.apache.org/r1454974
Log:
NUTCH-1542 adddays param for generator not present in 2.x
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/bin/crawl
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1454974&r1=1454973&r2=1454974&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Mar 11 02:12:58 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1542 "adddays" param for generator not present in 2.x (tejasp)
+
* NUTCH-1393 Display consistent usage of GeneratorJob with 1.X (Lufeng +via
lewismc)
* NUTCH-1540 Add Gora buffered read and write maximum limits to
nutch-default.xml configuration. (lewismc)
Modified: nutch/branches/2.x/src/bin/crawl
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1454974&r1=1454973&r2=1454974&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Mon Mar 11 02:12:58 2013
@@ -64,6 +64,10 @@ sizeFetchlist=`expr $numSlaves \* 50000`
# time limit for feching
timeLimitFetch=180
+# Adds <days> to the current time to facilitate
+# crawling urls already fetched sooner than
+# db.default.fetch.interval.
+addDays=0
#############################################
# determines whether mode based on presence of job file
@@ -108,7 +112,7 @@ do
echo `date` ": Iteration $a of $LIMIT"
echo "Generating a new fetchlist"
- $bin/nutch generate $commonOptions $CRAWL_ID/crawldb $CRAWL_ID/segments
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+ $bin/nutch generate $commonOptions $CRAWL_ID/crawldb $CRAWL_ID/segments
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter -adddays $addDays
if [ $? -ne 0 ]
then exit $?
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1454974&r1=1454973&r2=1454974&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Mon
Mar 11 02:12:58 2013
@@ -230,12 +230,13 @@ public class GeneratorJob extends NutchT
public int run(String[] args) throws Exception {
if (args.length <= 0) {
- System.out.println("Usage: GeneratorJob [-topN N] [-crawlId id]
[-noFilter] [-noNorm]");
+ System.out.println("Usage: GeneratorJob [-topN N] [-crawlId id]
[-noFilter] [-noNorm] [-adddays numDays]");
System.out.println(" -topN <N> - number of top URLs to be
selected, default is Long.MAX_VALUE ");
System.out.println(" -crawlId <id> - the id to prefix the schemas to
operate on, \n \t \t (default: storage.crawl.id)\");");
System.out.println(" -noFilter - do not activate the filter
plugin to filter the url, default is true ");
System.out.println(" -noNorm - do not activate the normalizer
plugin to normalize the url, default is true ");
-
+ System.out.println(" -adddays - Adds numDays to the current
time to facilitate crawling urls already");
+ System.out.println(" fetched sooner then
db.default.fetch.interval. Default value is 0.");
System.out.println("----------------------");
System.out.println("Please set the params.");
return -1;
@@ -253,6 +254,9 @@ public class GeneratorJob extends NutchT
norm = false;
} else if ("-crawlId".equals(args[i])) {
getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+ } else if ("-adddays".equals(args[i])) {
+ long numDays = Integer.parseInt(args[++i]);
+ curTime += numDays * 1000L * 60 * 60 * 24;
}
}