This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 564951340758147b19660b169dfe763f65b8ed7b Author: Patrick Mezard <[email protected]> AuthorDate: Tue Jun 9 17:39:41 2020 +0200 NUTCH-2791 Handle GCS URLs in stats commands - Handle Google Cloud Storage URLs as crawldb inputs in domainstats, protocolstats and crawlcomplete commands. - Correctly resolve numReducers in protocolstats. - Align crawlcomplete -inputDirs behaviour with the other commands: expect directories containing "current", not "crawldb/current". --- src/java/org/apache/nutch/util/CrawlCompletionStats.java | 6 ++---- src/java/org/apache/nutch/util/ProtocolStatusStatistics.java | 7 +++---- src/java/org/apache/nutch/util/domain/DomainStatistics.java | 3 +-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index f3e808b..8a23fbf 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -74,7 +74,7 @@ public class CrawlCompletionStats extends Configured implements Tool { Option inDirs = OptionBuilder .withArgName("inputDirs") .isRequired() - .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")") + .withDescription("Comma separated list of crawldb directories (e.g., \"./crawl1/crawldb,./crawl2/crawldb\")") .hasArgs() .create("inputDirs"); @SuppressWarnings("static-access") @@ -153,9 +153,7 @@ public class CrawlCompletionStats extends Configured implements Tool { String[] inputDirsSpecs = inputDir.split(","); for (int i = 0; i < inputDirsSpecs.length; i++) { - File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current"); - FileInputFormat.addInputPath(job, new Path(completeInputPath.toString())); - + FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current")); } job.setInputFormatClass(SequenceFileInputFormat.class); diff --git 
a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java index f52a9c5..213c1c2 100644 --- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java +++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -82,8 +82,8 @@ public class ProtocolStatusStatistics extends Configured implements Tool { int numOfReducers = 1; - if (args.length > 3) { - numOfReducers = Integer.parseInt(args[3]); + if (args.length > 2) { + numOfReducers = Integer.parseInt(args[2]); } SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); @@ -100,8 +100,7 @@ public class ProtocolStatusStatistics extends Configured implements Tool { String[] inputDirsSpecs = inputDir.split(","); for (int i = 0; i < inputDirsSpecs.length; i++) { - File completeInputPath = new File(new File(inputDirsSpecs[i]), "current"); - FileInputFormat.addInputPath(job, new Path(completeInputPath.toString())); + FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current")); } job.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index fd2f940..24e7a1c 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -119,8 +119,7 @@ public class DomainStatistics extends Configured implements Tool { String[] inputDirsSpecs = inputDir.split(","); for (int i = 0; i < inputDirsSpecs.length; i++) { - File completeInputPath = new File(new File(inputDirsSpecs[i]), "current"); - FileInputFormat.addInputPath(job, new Path(completeInputPath.toString())); + FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current")); } job.setInputFormatClass(SequenceFileInputFormat.class);
