Author: lewismc
Date: Fri Mar 8 19:38:06 2013
New Revision: 1454508
URL: http://svn.apache.org/r1454508
Log:
NUTCH-1393 Display consistent usage of GeneratorJob with 1.X
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1454508&r1=1454507&r2=1454508&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Mar 8 19:38:06 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1393 Display consistent usage of GeneratorJob with 1.X (Lufeng via lewismc)
+
* NUTCH-1540 Add Gora buffered read and write maximum limits to
nutch-default.xml configuration. (lewismc)
* NUTCH-842 AutoGenerate WebPage code (jnioche via lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1454508&r1=1454507&r2=1454508&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Fri Mar 8 19:38:06 2013
@@ -19,6 +19,7 @@ package org.apache.nutch.crawl;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
@@ -40,6 +41,7 @@ import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.ToolUtil;
public class GeneratorJob extends NutchTool implements Tool {
@@ -204,9 +206,13 @@ public class GeneratorJob extends NutchT
public String generate(long topN, long curTime, boolean filter, boolean norm)
throws Exception {
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ long start = System.currentTimeMillis();
+ LOG.info("GeneratorJob: starting at " + sdf.format(start));
LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
LOG.info("GeneratorJob: starting");
LOG.info("GeneratorJob: filtering: " + filter);
+ LOG.info("GeneratorJob: normalizing: " + norm);
if (topN != Long.MAX_VALUE) {
LOG.info("GeneratorJob: topN: " + topN);
}
@@ -216,12 +222,25 @@ public class GeneratorJob extends NutchT
Nutch.ARG_FILTER, filter,
Nutch.ARG_NORMALIZE, norm));
batchId = getConf().get(BATCH_ID);
- LOG.info("GeneratorJob: done");
+ long finish = System.currentTimeMillis();
+ LOG.info("GeneratorJob: finished at " + sdf.format(finish) + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
LOG.info("GeneratorJob: generated batch id: " + batchId);
return batchId;
}
public int run(String[] args) throws Exception {
+ if (args.length <= 0) {
+ System.out.println("Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm]");
+ System.out.println(" -topN <N> - number of top URLs to be selected, default is Long.MAX_VALUE ");
+ System.out.println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\");");
+ System.out.println(" -noFilter - do not activate the filter plugin to filter the url, default is true ");
+ System.out.println(" -noNorm - do not activate the normalizer plugin to normalize the url, default is true ");
+
+ System.out.println("----------------------");
+ System.out.println("Please set the params.");
+ return -1;
+ }
+
long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
boolean filter = true, norm = true;