This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 6b6e74c NUTCH-2791 Handle GCS URLs in stats commands
new 59d0d95 Merge pull request #533 from pmezard/NUTCH-2791
6b6e74c is described below
commit 6b6e74c5a33c74ec30e3691c04797e6742350456
Author: Patrick Mezard <[email protected]>
AuthorDate: Tue Jun 9 17:39:41 2020 +0200
NUTCH-2791 Handle GCS URLs in stats commands
- Handle Google Cloud Storage URLs as crawldb inputs in domainstats,
protocolstats and crawlcomplete commands.
- Correctly resolve numReducers in protocolstats: read it from args[2]
instead of args[3].
- Align crawlcomplete -inputDirs behaviour with the other commands: expect
directories containing "current", not "crawldb/current".
---
src/java/org/apache/nutch/util/CrawlCompletionStats.java | 6 ++----
src/java/org/apache/nutch/util/ProtocolStatusStatistics.java | 7 +++----
src/java/org/apache/nutch/util/domain/DomainStatistics.java | 3 +--
3 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index f3e808b..8a23fbf 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -74,7 +74,7 @@ public class CrawlCompletionStats extends Configured
implements Tool {
Option inDirs = OptionBuilder
.withArgName("inputDirs")
.isRequired()
- .withDescription("Comma separated list of crawl directories (e.g.,
\"./crawl1,./crawl2\")")
+ .withDescription("Comma separated list of crawldb directories (e.g.,
\"./crawl1/crawldb,./crawl2/crawldb\")")
.hasArgs()
.create("inputDirs");
@SuppressWarnings("static-access")
@@ -153,9 +153,7 @@ public class CrawlCompletionStats extends Configured
implements Tool {
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]),
"crawldb/current");
- FileInputFormat.addInputPath(job, new
Path(completeInputPath.toString()));
-
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i],
"current"));
}
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index f52a9c5..213c1c2 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -82,8 +82,8 @@ public class ProtocolStatusStatistics extends Configured
implements Tool {
int numOfReducers = 1;
- if (args.length > 3) {
- numOfReducers = Integer.parseInt(args[3]);
+ if (args.length > 2) {
+ numOfReducers = Integer.parseInt(args[2]);
}
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
@@ -100,8 +100,7 @@ public class ProtocolStatusStatistics extends Configured
implements Tool {
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]),
"current");
- FileInputFormat.addInputPath(job, new
Path(completeInputPath.toString()));
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i],
"current"));
}
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index fd2f940..24e7a1c 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -119,8 +119,7 @@ public class DomainStatistics extends Configured implements
Tool {
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]),
"current");
- FileInputFormat.addInputPath(job, new
Path(completeInputPath.toString()));
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i],
"current"));
}
job.setInputFormatClass(SequenceFileInputFormat.class);