Author: markus
Date: Mon Jan 20 09:29:42 2014
New Revision: 1559657
URL: http://svn.apache.org/r1559657
Log:
NUTCH-1680 CrawlDbReader to dump minRetry value
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1559657&r1=1559656&r2=1559657&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jan 20 09:29:42 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1680 CrawlDbReader to dump minRetry value (markus)
+
* NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel
via lewismc)
* NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1559657&r1=1559656&r2=1559657&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Jan 20 09:29:42 2014
@@ -407,7 +407,7 @@ public class CrawlDbReader implements Cl
}
}
- public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status) throws IOException {
+ public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb dump: starting");
LOG.info("CrawlDb db: " + crawlDb);
@@ -433,7 +433,8 @@ public class CrawlDbReader implements Cl
if (status != null) job.set("status", status);
if (regex != null) job.set("regex", regex);
-
+ if (retry != null) job.setInt("retry", retry);
+
job.setMapperClass(CrawlDbDumpMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
@@ -446,17 +447,26 @@ public class CrawlDbReader implements Cl
Pattern pattern = null;
Matcher matcher = null;
String status = null;
+ Integer retry = null;
public void configure(JobConf job) {
if (job.get("regex", null) != null) {
pattern = Pattern.compile(job.get("regex"));
}
status = job.get("status", null);
+ retry = job.getInt("retry", -1);
}
public void close() {}
public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
+
+ // check retry
+ if (retry != -1) {
+ if (value.getRetriesSinceFetch() < retry) {
+ return;
+ }
+ }
// check status
if (status != null
@@ -542,6 +552,7 @@ public class CrawlDbReader implements Cl
System.err.println("\t\t[-format normal]\tdump in standard format
(default option)");
System.err.println("\t\t[-format crawldb]\tdump as CrawlDB");
System.err.println("\t\t[-regex <expr>]\tfilter records with
expression");
+ System.err.println("\t\t[-retry <num>]\tminimum retry count");
System.err.println("\t\t[-status <status>]\tfilter records by CrawlDatum
status");
System.err.println("\t-url <url>\tprint information on <url> to
System.out");
System.err.println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn>
urls sorted by score to <out_dir>");
@@ -564,6 +575,7 @@ public class CrawlDbReader implements Cl
param = args[++i];
String format = "normal";
String regex = null;
+ Integer retry = null;
String status = null;
for (int j = i + 1; j < args.length; j++) {
if (args[j].equals("-format")) {
@@ -574,12 +586,16 @@ public class CrawlDbReader implements Cl
regex = args[++j];
i=i+2;
}
+ if (args[j].equals("-retry")) {
+ retry = Integer.parseInt(args[++j]);
+ i=i+2;
+ }
if (args[j].equals("-status")) {
status = args[++j];
i=i+2;
}
}
- dbr.processDumpJob(crawlDb, param, conf, format, regex, status);
+ dbr.processDumpJob(crawlDb, param, conf, format, regex, status, retry);
} else if (args[i].equals("-url")) {
param = args[++i];
dbr.readUrl(crawlDb, param, conf);
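
The change above adds a -retry option to the -dump command: the mapper skips any record whose retriesSinceFetch is below the given value, so only entries with at least that many retries are written out. A minimal usage sketch, assuming the standard bin/nutch readdb entry point and hypothetical paths:

    bin/nutch readdb crawl/crawldb -dump crawl/dump_retry2 -retry 2

The same filter can be applied programmatically through the extended processDumpJob signature. A sketch, assuming NutchConfiguration for the job configuration and hypothetical CrawlDb/output paths:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.crawl.CrawlDbReader;
    import org.apache.nutch.util.NutchConfiguration;

    public class DumpRetryExample {
      public static void main(String[] args) throws Exception {
        // Default Nutch configuration (nutch-default.xml / nutch-site.xml).
        Configuration conf = NutchConfiguration.create();
        CrawlDbReader dbr = new CrawlDbReader();
        // format "normal", no regex/status filter; retry = 2 keeps only records
        // with at least 2 retries since the last fetch, per the mapper check above.
        dbr.processDumpJob("crawl/crawldb", "crawl/dump_retry2", conf, "normal", null, null, 2);
      }
    }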