GeneratorJob.java

tejasp Sun, 10 Mar 2013 19:13:21 -0700

Author: tejasp
Date: Mon Mar 11 02:12:58 2013
New Revision: 1454974

URL: http://svn.apache.org/r1454974
Log:
NUTCH-1542 adddays param for generator not present in 2.x


Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/bin/crawl
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1454974&r1=1454973&r2=1454974&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Mar 11 02:12:58 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1542 "adddays" param for generator not present in 2.x (tejasp)
+
 * NUTCH-1393 Display consistent usage of GeneratorJob with 1.X (Lufeng via lewismc)
 
 * NUTCH-1540 Add Gora buffered read and write maximum limits to 
nutch-default.xml configuration. (lewismc)

Modified: nutch/branches/2.x/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1454974&r1=1454973&r2=1454974&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Mon Mar 11 02:12:58 2013
@@ -64,6 +64,10 @@ sizeFetchlist=`expr $numSlaves \* 50000`
 # time limit for fetching
 timeLimitFetch=180
 
+# Adds <days> to the current time to facilitate 
+# crawling urls already fetched sooner than 
+# db.default.fetch.interval.
+addDays=0
 #############################################
 
 # determines whether mode based on presence of job file
@@ -108,7 +112,7 @@ do
   echo `date` ": Iteration $a of $LIMIT"
 
   echo "Generating a new fetchlist"
-  $bin/nutch generate $commonOptions $CRAWL_ID/crawldb $CRAWL_ID/segments 
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter
+  $bin/nutch generate $commonOptions $CRAWL_ID/crawldb $CRAWL_ID/segments 
-topN $sizeFetchlist -numFetchers $numSlaves -noFilter -adddays $addDays
   
   if [ $? -ne 0 ] 
   then exit $? 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1454974&r1=1454973&r2=1454974&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Mon 
Mar 11 02:12:58 2013
@@ -230,12 +230,13 @@ public class GeneratorJob extends NutchT
 
   public int run(String[] args) throws Exception {
     if (args.length <= 0) {
-      System.out.println("Usage: GeneratorJob [-topN N] [-crawlId id] 
[-noFilter] [-noNorm]");
+      System.out.println("Usage: GeneratorJob [-topN N] [-crawlId id] 
[-noFilter] [-noNorm] [-adddays numDays]");
       System.out.println("    -topN <N>      - number of top URLs to be 
selected, default is Long.MAX_VALUE ");
       System.out.println("    -crawlId <id>  - the id to prefix the schemas to 
operate on, \n \t \t    (default: storage.crawl.id)\");");
       System.out.println("    -noFilter      - do not activate the filter 
plugin to filter the url, default is true ");
       System.out.println("    -noNorm        - do not activate the normalizer 
plugin to normalize the url, default is true ");
-
+      System.out.println("    -adddays       - Adds numDays to the current 
time to facilitate crawling urls already");
+      System.out.println("                     fetched sooner then 
db.default.fetch.interval. Default value is 0.");
       System.out.println("----------------------");
       System.out.println("Please set the params.");
       return -1;
@@ -253,6 +254,9 @@ public class GeneratorJob extends NutchT
         norm = false;
       } else if ("-crawlId".equals(args[i])) {
         getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+      } else if ("-adddays".equals(args[i])) {
+        long numDays = Integer.parseInt(args[++i]);
+        curTime += numDays * 1000L * 60 * 60 * 24;
       }
     }

svn commit: r1454974 - in /nutch/branches/2.x: CHANGES.txt src/bin/crawl src/java/org/apache/nutch/crawl/GeneratorJob.java

Reply via email to