This is an automated email from the ASF dual-hosted git repository. lewismc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new bbf086726 NUTCH-3014 Standardize Job names (#789) bbf086726 is described below commit bbf0867263ed1764c56fe7794c17942d0e8bf1c4 Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com> AuthorDate: Thu Nov 2 20:36:43 2023 -0700 NUTCH-3014 Standardize Job names (#789) --- src/java/org/apache/nutch/crawl/CrawlDb.java | 3 +- src/java/org/apache/nutch/crawl/CrawlDbMerger.java | 3 +- src/java/org/apache/nutch/crawl/CrawlDbReader.java | 20 +++++-------- .../org/apache/nutch/crawl/DeduplicationJob.java | 3 +- src/java/org/apache/nutch/crawl/Generator.java | 13 ++++----- src/java/org/apache/nutch/crawl/Injector.java | 2 +- src/java/org/apache/nutch/crawl/LinkDb.java | 3 +- src/java/org/apache/nutch/crawl/LinkDbMerger.java | 3 +- src/java/org/apache/nutch/crawl/LinkDbReader.java | 3 +- src/java/org/apache/nutch/fetcher/Fetcher.java | 2 +- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 3 +- src/java/org/apache/nutch/hostdb/UpdateHostDb.java | 3 +- src/java/org/apache/nutch/indexer/CleaningJob.java | 4 +-- src/java/org/apache/nutch/indexer/IndexingJob.java | 3 +- src/java/org/apache/nutch/parse/ParseSegment.java | 3 +- .../apache/nutch/scoring/webgraph/LinkDumper.java | 6 ++-- .../apache/nutch/scoring/webgraph/LinkRank.java | 15 ++++------ .../apache/nutch/scoring/webgraph/NodeDumper.java | 3 +- .../nutch/scoring/webgraph/ScoreUpdater.java | 3 +- .../apache/nutch/scoring/webgraph/WebGraph.java | 9 ++---- .../org/apache/nutch/segment/SegmentMerger.java | 3 +- .../org/apache/nutch/segment/SegmentReader.java | 3 +- src/java/org/apache/nutch/tools/FreeGenerator.java | 2 +- .../apache/nutch/tools/arc/ArcSegmentCreator.java | 9 ++---- .../org/apache/nutch/tools/warc/WARCExporter.java | 3 +- .../apache/nutch/util/CrawlCompletionStats.java | 6 ++-- src/java/org/apache/nutch/util/NutchJob.java | 4 --- .../nutch/util/ProtocolStatusStatistics.java | 2 +- .../org/apache/nutch/util/SitemapProcessor.java | 34 ++++++++++------------ .../apache/nutch/util/domain/DomainStatistics.java | 10 +++---- .../org/apache/nutch/crawl/TestCrawlDbFilter.java | 3 +- .../org/apache/nutch/plugin/TestPluginSystem.java | 5 ++-- 32 files changed, 74 insertions(+), 117 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 16394832b..2b609c0a6 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -165,8 +165,7 @@ public class CrawlDb extends NutchTool implements Tool { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random() .nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(config); - job.setJobName("crawldb " + crawlDb); + Job job = Job.getInstance(config, "Nutch CrawlDb: " + crawlDb); Path current = new Path(crawlDb, CURRENT_NAME); if (current.getFileSystem(job.getConfiguration()).exists(current)) { diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java index 1bf7243d3..6ee4b43cd 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java @@ -165,9 +165,8 @@ public class CrawlDbMerger extends Configured implements Tool { Path newCrawlDb = new Path(output, "merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(conf); + Job job = Job.getInstance(conf, "Nutch CrawlDbMerger: " + output); conf = job.getConfiguration(); - job.setJobName("crawldb merge " + output); job.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index bd3e6f38d..29e8efe17 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -564,9 +564,8 @@ public class CrawlDbReader extends AbstractChecker implements Closeable { throws IOException, InterruptedException, ClassNotFoundException { Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis()); - Job job = NutchJob.getInstance(config); + Job job = Job.getInstance(config, "Nutch CrawlDbReader: " + crawlDb); config = job.getConfiguration(); - job.setJobName("stats " + crawlDb); config.setBoolean("db.reader.stats.sort", sort); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); @@ -812,7 +811,7 @@ public class CrawlDbReader extends AbstractChecker implements Closeable { @Override protected int process(String line, StringBuilder output) throws Exception { - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch CrawlDbReader: process " + crawlDb); Configuration config = job.getConfiguration(); readUrl(this.crawlDb, line, config, output); return 0; @@ -839,8 +838,7 @@ public class CrawlDbReader extends AbstractChecker implements Closeable { Path outFolder = new Path(output); - Job job = NutchJob.getInstance(config); - job.setJobName("dump " + crawlDb); + Job job = Job.getInstance(config, "Nutch CrawlDbReader: dump " + crawlDb); Configuration jobConf = job.getConfiguration(); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); @@ -958,18 +956,15 @@ public class CrawlDbReader extends AbstractChecker implements Closeable { String output, Configuration config) throws IOException, ClassNotFoundException, InterruptedException { - if (LOG.isInfoEnabled()) { - LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")"); - LOG.info("CrawlDb db: {}", crawlDb); - } + LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")"); + LOG.info("CrawlDb db: {}", crawlDb); Path outFolder = new Path(output); Path tempDir = new Path( config.get("mapreduce.cluster.temp.dir", ".") + "/readdb-topN-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(config); - job.setJobName("topN prepare " + crawlDb); + Job job = Job.getInstance(config, "Nutch CrawlDbReader: topN prepare " + crawlDb); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(SequenceFileInputFormat.class); @@ -1000,8 +995,7 @@ public class CrawlDbReader extends AbstractChecker implements Closeable { } LOG.info("CrawlDb topN: collecting topN scores."); - job = NutchJob.getInstance(config); - job.setJobName("topN collect " + crawlDb); + job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " + crawlDb); job.getConfiguration().setLong("db.reader.topn", topN); FileInputFormat.addInputPath(job, tempDir); diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index 217005d41..e37001354 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -305,9 +305,8 @@ public class DeduplicationJob extends NutchTool implements Tool { Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb); Configuration conf = job.getConfiguration(); - job.setJobName("Deduplication on " + crawlDb); conf.set(DEDUPLICATION_GROUP_MODE, group); conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder); job.setJarByClass(DeduplicationJob.class); diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 1b62314e7..33f743a37 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -388,7 +388,7 @@ public class Generator extends NutchTool implements Tool { public void setup(Context context) throws IOException { conf = context.getConfiguration(); mos = new MultipleOutputs<FloatWritable, SelectorEntry>(context); - Job job = Job.getInstance(conf); + Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer"); limit = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks(); maxNumSegments = conf.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1); @@ -695,7 +695,7 @@ public class Generator extends NutchTool implements Tool { long curTime) throws IOException, InterruptedException, ClassNotFoundException { - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir); Configuration conf = job.getConfiguration(); boolean filter = conf.getBoolean(GENERATOR_FILTER, true); boolean normalise = conf.getBoolean(GENERATOR_NORMALISE, true); @@ -839,8 +839,7 @@ public class Generator extends NutchTool implements Tool { } // map to inverted subset due for fetch, sort by score - Job job = NutchJob.getInstance(getConf()); - job.setJobName("generate: select from " + dbDir); + Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir); Configuration conf = job.getConfiguration(); if (numLists == -1) { /* for politeness create exactly one partition per fetch task */ @@ -942,8 +941,7 @@ public class Generator extends NutchTool implements Tool { Path tempDir2 = new Path(dbDir, "generate-temp-" + java.util.UUID.randomUUID().toString()); - job = NutchJob.getInstance(getConf()); - job.setJobName("generate: updatedb " + dbDir); + job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir); job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime); for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); @@ -1001,8 +999,7 @@ public class Generator extends NutchTool implements Tool { LOG.info("Generator: segment: " + segment); - Job job = NutchJob.getInstance(getConf()); - job.setJobName("generate: partition " + segment); + Job job = Job.getInstance(getConf(), "Nutch Generator: partition segment " + segment); Configuration conf = job.getConfiguration(); conf.setInt("partition.url.seed", RANDOM.nextInt()); diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 9bfd1b454..0d3740eb4 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -404,7 +404,7 @@ public class Injector extends NutchTool implements Tool { Path lock = CrawlDb.lock(conf, crawlDb, false); // configure job - Job job = Job.getInstance(conf, "inject " + urlDir); + Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir); job.setJarByClass(Injector.class); job.setMapperClass(InjectMapper.class); job.setReducerClass(InjectReducer.class); diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java index 3c752ab1d..2f4a0dda4 100644 --- a/src/java/org/apache/nutch/crawl/LinkDb.java +++ b/src/java/org/apache/nutch/crawl/LinkDb.java @@ -270,9 +270,8 @@ public class LinkDb extends NutchTool implements Tool { Path newLinkDb = new Path(linkDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(config); + Job job = Job.getInstance(config, "Nutch LinkDb: " + linkDb); Configuration conf = job.getConfiguration(); - job.setJobName("linkdb " + linkDb); job.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java index d6a41ab48..c3da2031e 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java +++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java @@ -147,8 +147,7 @@ public class LinkDbMerger extends Configured implements Tool { Path newLinkDb = new Path(linkDb, "merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job job = NutchJob.getInstance(config); - job.setJobName("linkdb merge " + linkDb); + Job job = Job.getInstance(config, "Nutch LinkDbMerger: " + linkDb); Configuration conf = job.getConfiguration(); job.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java index fa01f20bf..9ae356683 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbReader.java +++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java @@ -159,8 +159,7 @@ public class LinkDbReader extends AbstractChecker implements Closeable { Path outFolder = new Path(output); - Job job = NutchJob.getInstance(getConf()); - job.setJobName("read " + linkdb); + Job job = Job.getInstance(getConf(), "Nutch LinkDbReader: " + linkdb); job.setJarByClass(LinkDbReader.class); Configuration conf = job.getConfiguration(); diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 92aef6f10..d1774f530 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -498,7 +498,7 @@ public class Fetcher extends NutchTool implements Tool { totalOutlinksToFollow); } - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch Fetcher: " + segment.getName()); job.setJobName("FetchData"); Configuration conf = job.getConfiguration(); diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 0321a8652..036b78650 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -181,8 +181,7 @@ public class ReadHostDb extends Configured implements Tool { conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); conf.set("mapreduce.output.textoutputformat.separator", "\t"); - Job job = Job.getInstance(conf); - job.setJobName("ReadHostDb"); + Job job = Job.getInstance(conf, "Nutch ReadHostDb"); job.setJarByClass(ReadHostDb.class); FileInputFormat.addInputPath(job, new Path(hostDb, "current")); diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java index 65e45c55d..5148a6be1 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java @@ -77,11 +77,10 @@ public class UpdateHostDb extends Configured implements Tool { stopWatch.start(); LOG.info("UpdateHostDb: starting"); - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch UpdateHostDb"); Configuration conf = job.getConfiguration(); boolean preserveBackup = conf.getBoolean("db.preserve.backup", true); job.setJarByClass(UpdateHostDb.class); - job.setJobName("UpdateHostDb"); FileSystem fs = hostDb.getFileSystem(conf); Path old = new Path(hostDb, "old"); diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index 04b9c2efa..8334ac353 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -144,7 +144,7 @@ public class CleaningJob implements Tool { stopWatch.start(); LOG.info("CleaningJob: starting"); - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch CleaningJob: " + crawldb); Configuration conf = job.getConfiguration(); FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME)); @@ -157,8 +157,6 @@ public class CleaningJob implements Tool { job.setReducerClass(DeleterReducer.class); job.setJarByClass(CleaningJob.class); - job.setJobName("CleaningJob"); - // need to expicitely allow deletions conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true); diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index d2115230c..c3ddb4ae9 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -108,7 +108,8 @@ public class IndexingJob extends NutchTool implements Tool { stopWatch.start(); LOG.info("Indexer: starting"); - final Job job = NutchJob.getInstance(getConf()); + final Job job = Job.getInstance(getConf(), + "Nutch IndexingJob: crawldb: " + crawlDb + " segment(s): " + segments); job.setJobName("Indexer"); Configuration conf = job.getConfiguration(); diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index de45c463b..1995a880e 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -232,8 +232,7 @@ public class ParseSegment extends NutchTool implements Tool { LOG.info("ParseSegment: starting"); LOG.info("ParseSegment: segment: {}", segment); - Job job = NutchJob.getInstance(getConf()); - job.setJobName("parse " + segment); + Job job = Job.getInstance(getConf(), "Nutch ParseSegment: " + segment); Configuration conf = job.getConfiguration(); FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME)); diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java index 4831d73f3..439d7438c 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java @@ -341,8 +341,7 @@ public class LinkDumper extends Configured implements Tool { // run the inverter job Path tempInverted = new Path(webGraphDb, "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job inverter = NutchJob.getInstance(conf); - inverter.setJobName("LinkDumper: inverter"); + Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " + webGraphDb); FileInputFormat.addInputPath(inverter, nodeDb); FileInputFormat.addInputPath(inverter, outlinkDb); inverter.setInputFormatClass(SequenceFileInputFormat.class); @@ -372,8 +371,7 @@ public class LinkDumper extends Configured implements Tool { } // run the merger job - Job merger = NutchJob.getInstance(conf); - merger.setJobName("LinkDumper: merger"); + Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " + tempInverted); FileInputFormat.addInputPath(merger, tempInverted); merger.setJarByClass(Merger.class); merger.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java index c226ad130..e48f04acd 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java @@ -93,9 +93,8 @@ public class LinkRank extends Configured implements Tool { // configure the counter job Path numLinksPath = new Path(webGraphDb, NUM_NODES); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); - Job counter = NutchJob.getInstance(getConf()); + Job counter = Job.getInstance(getConf(), "Nutch LinkRank: counter " + webGraphDb); Configuration conf = counter.getConfiguration(); - counter.setJobName("LinkRank Counter"); FileInputFormat.addInputPath(counter, nodeDb); FileOutputFormat.setOutputPath(counter, numLinksPath); counter.setInputFormatClass(SequenceFileInputFormat.class); @@ -194,9 +193,8 @@ public class LinkRank extends Configured implements Tool { InterruptedException, ClassNotFoundException { // configure the initializer - Job initializer = NutchJob.getInstance(getConf()); + Job initializer = Job.getInstance(getConf(), "Nutch LinkRank: initializer " + nodeDb); Configuration conf = initializer.getConfiguration(); - initializer.setJobName("LinkAnalysis Initializer"); FileInputFormat.addInputPath(initializer, nodeDb); FileOutputFormat.setOutputPath(initializer, output); initializer.setJarByClass(Initializer.class); @@ -245,9 +243,9 @@ public class LinkRank extends Configured implements Tool { throws IOException, InterruptedException, ClassNotFoundException { // configure the inverter - Job inverter = NutchJob.getInstance(getConf()); + Job inverter = Job.getInstance(getConf(), + "Nutch Linkrank: inverter nodedb: " + nodeDb + " outlinkdb: " + outlinkDb); Configuration conf = inverter.getConfiguration(); - inverter.setJobName("LinkAnalysis Inverter"); FileInputFormat.addInputPath(inverter, nodeDb); FileInputFormat.addInputPath(inverter, outlinkDb); FileOutputFormat.setOutputPath(inverter, output); @@ -305,11 +303,10 @@ public class LinkRank extends Configured implements Tool { int iteration, int numIterations, float rankOne) throws IOException, InterruptedException, ClassNotFoundException { - Job analyzer = NutchJob.getInstance(getConf()); + Job analyzer = Job.getInstance(getConf(), + "Nutch LinkRank: analysis iteration" + (iteration + 1) + " of " + numIterations); Configuration conf = analyzer.getConfiguration(); conf.set("link.analyze.iteration", String.valueOf(iteration + 1)); - analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) - + " of " + numIterations); FileInputFormat.addInputPath(analyzer, nodeDb); FileInputFormat.addInputPath(analyzer, inverted); FileOutputFormat.setOutputPath(analyzer, output); diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java index dfccccc19..9277df8f6 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java @@ -298,9 +298,8 @@ public class NodeDumper extends Configured implements Tool { LOG.info("NodeDumper: starting"); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); - Job dumper = NutchJob.getInstance(getConf()); + Job dumper = Job.getInstance(getConf(), "Nutch NodeDumper: " + webGraphDb); Configuration conf = dumper.getConfiguration(); - dumper.setJobName("NodeDumper: " + webGraphDb); FileInputFormat.addInputPath(dumper, nodeDb); dumper.setInputFormatClass(SequenceFileInputFormat.class); diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java index c10a6e37b..bcd534274 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java +++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java @@ -170,8 +170,7 @@ public class ScoreUpdater extends Configured implements Tool{ .nextInt(Integer.MAX_VALUE))); // run the updater job outputting to the temp crawl database - Job updater = NutchJob.getInstance(conf); - updater.setJobName("Update CrawlDb from WebGraph"); + Job updater = Job.getInstance(conf, "Nutch ScoreUpdater: " + crawlDb); FileInputFormat.addInputPath(updater, crawlDbCurrent); FileInputFormat.addInputPath(updater, nodeDb); FileOutputFormat.setOutputPath(updater, newCrawlDb); diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index b98329d1e..25e3cf230 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -545,9 +545,8 @@ public class WebGraph extends Configured implements Tool { Path tempOutlinkDb = new Path(outlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job outlinkJob = NutchJob.getInstance(getConf()); + Job outlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: outlinkdb " + outlinkDb); Configuration outlinkJobConf = outlinkJob.getConfiguration(); - outlinkJob.setJobName("Outlinkdb: " + outlinkDb); boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false); boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true); @@ -625,9 +624,8 @@ public class WebGraph extends Configured implements Tool { Path tempInlinkDb = new Path(inlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job inlinkJob = NutchJob.getInstance(getConf()); + Job inlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: inlinkdb " + inlinkDb); Configuration inlinkJobConf = inlinkJob.getConfiguration(); - inlinkJob.setJobName("Inlinkdb " + inlinkDb); LOG.info("InlinkDb: adding input: " + outlinkDb); FileInputFormat.addInputPath(inlinkJob, outlinkDb); inlinkJob.setInputFormatClass(SequenceFileInputFormat.class); @@ -669,9 +667,8 @@ public class WebGraph extends Configured implements Tool { Path tempNodeDb = new Path(nodeDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - Job nodeJob = NutchJob.getInstance(getConf()); + Job nodeJob = Job.getInstance(getConf(), "Nutch WebGraph: nodedb " + nodeDb); Configuration nodeJobConf = nodeJob.getConfiguration(); - nodeJob.setJobName("NodeDb " + nodeDb); LOG.info("NodeDb: adding input: " + outlinkDb); LOG.info("NodeDb: adding input: " + inlinkDb); FileInputFormat.addInputPath(nodeJob, outlinkDb); diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java b/src/java/org/apache/nutch/segment/SegmentMerger.java index c884dfedf..53bdee22e 100644 --- a/src/java/org/apache/nutch/segment/SegmentMerger.java +++ b/src/java/org/apache/nutch/segment/SegmentMerger.java @@ -625,9 +625,8 @@ public class SegmentMerger extends Configured implements Tool{ long slice) throws IOException, ClassNotFoundException, InterruptedException { String segmentName = Generator.generateSegmentName(); LOG.info("Merging {} segments to {}/{}", segs.length, out, segmentName); - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" + segmentName); Configuration conf = job.getConfiguration(); - job.setJobName("mergesegs " + out + "/" + segmentName); conf.setBoolean("segment.merger.filter", filter); conf.setBoolean("segment.merger.normalizer", normalize); conf.setLong("segment.merger.slice", slice); diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java index ee5c266fd..bef980060 100644 --- a/src/java/org/apache/nutch/segment/SegmentReader.java +++ b/src/java/org/apache/nutch/segment/SegmentReader.java @@ -200,8 +200,7 @@ public class SegmentReader extends Configured implements Tool { LOG.info("SegmentReader: dump segment: {}", segment); - Job job = NutchJob.getInstance(getConf()); - job.setJobName("read " + segment); + Job job = Job.getInstance(getConf(), "Nutch SegmentReader: " + segment); Configuration conf = job.getConfiguration(); if (ge) diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java index e9f5c8761..9ace8f192 100644 --- a/src/java/org/apache/nutch/tools/FreeGenerator.java +++ b/src/java/org/apache/nutch/tools/FreeGenerator.java @@ -184,7 +184,7 @@ public class FreeGenerator extends Configured implements Tool { stopWatch.start(); LOG.info("FreeGenerator: starting"); - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch FreeGenerator: " + args[0]); Configuration conf = job.getConfiguration(); conf.setBoolean(FILTER_KEY, filter); conf.setBoolean(NORMALIZE_KEY, normalize); diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java index 825e752cc..311675310 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java +++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java @@ -371,14 +371,11 @@ public class ArcSegmentCreator extends Configured implements Tool { StopWatch stopWatch = new StopWatch(); stopWatch.start(); - if (LOG.isInfoEnabled()) { - LOG.info("ArcSegmentCreator: starting"); - LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles); - } + LOG.info("ArcSegmentCreator: starting"); + LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles); - Job job = NutchJob.getInstance(getConf()); + Job job = Job.getInstance(getConf(), "Nutch ArcSegmentCreator: " + arcFiles); Configuration conf = job.getConfiguration(); - job.setJobName("ArcSegmentCreator " + arcFiles); String segName = generateSegmentName(); conf.set(Nutch.SEGMENT_NAME_KEY, segName); FileInputFormat.addInputPath(job, arcFiles); diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index 6d8a38557..4e80aac5f 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -433,8 +433,7 @@ public class WARCExporter extends Configured implements Tool { stopWatch.start(); LOG.info("WARCExporter: starting"); - final Job job = NutchJob.getInstance(getConf()); - job.setJobName("warc-exporter " + output); + final Job job = Job.getInstance(getConf(), "Nutch WARCExporter: " + output); job.getConfiguration().setBoolean(ONLY_SUCCESSFUL_RESPONSES, onlySuccessfulResponses); diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 8696d2822..e5ee5f643 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -133,12 +133,12 @@ public class CrawlCompletionStats extends Configured implements Tool { LOG.info("CrawlCompletionStats: starting"); int mode = 0; - String jobName = "CrawlCompletionStats"; + String jobName = "Nutch CrawlCompletionStats: "; if (cli.getOptionValue("mode").equals("host")) { - jobName = "Host CrawlCompletionStats"; + jobName = jobName + "Host statistics"; mode = MODE_HOST; } else if (cli.getOptionValue("mode").equals("domain")) { - jobName = "Domain CrawlCompletionStats"; + jobName = jobName + "Domain statistics"; mode = MODE_DOMAIN; } diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index 068c64fef..25b894550 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -56,10 +56,6 @@ public class NutchJob extends Job { } } - public static Job getInstance(Configuration conf) throws IOException { - return Job.getInstance(conf); - } - /** * Clean up the file system in case of a job failure. * @param tempDir The temporary directory which needs to be diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java index 0fe6c57d0..f4e8a1b91 100644 --- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java +++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -89,7 +89,7 @@ public class ProtocolStatusStatistics extends Configured implements Tool { stopWatch.start(); LOG.info("ProtocolStatistics: starting"); - String jobName = "ProtocolStatistics"; + String jobName = "Nutch ProtocolStatusStatistics: " + inputDir; Configuration conf = getConf(); conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 66fa9b0e7..043e77f69 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -383,7 +383,7 @@ public class SitemapProcessor extends Configured implements Tool { conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize); conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); - Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString()); + Job job = Job.getInstance(conf, "Nutch SitemapProcessor: " + crawldb.toString()); job.setJarByClass(SitemapProcessor.class); // add crawlDb, sitemap url directory and hostDb to input paths @@ -431,23 +431,21 @@ public class SitemapProcessor extends Configured implements Tool { FSUtils.replace(fs, current, tempCrawlDb, true); LockUtil.removeLockFile(fs, lock); - if (LOG.isInfoEnabled()) { - long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue(); - long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue(); - long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue(); - long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue(); - long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue(); - - LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords); - LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname); - LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds); - LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches); - LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries); - - stopWatch.stop(); - LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime( - TimeUnit.MILLISECONDS)); - } + long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue(); + long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue(); + long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue(); + long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue(); + long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue(); + + LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords); + LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname); + LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds); + LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches); + LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries); + + stopWatch.stop(); + LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("SitemapProcessor_" + crawldb.toString(), e); NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs); diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index f77b72bc5..1843c424d 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -97,18 +97,18 @@ public class DomainStatistics extends Configured implements Tool { LOG.info("DomainStatistics: starting"); int mode = 0; - String jobName = "DomainStatistics"; + String jobName = "Nutch DomainStatistics: "; if (args[2].equals("host")) { - jobName = "Host statistics"; + jobName = jobName + "Host statistics"; mode = MODE_HOST; } else if (args[2].equals("domain")) { - jobName = "Domain statistics"; + jobName = jobName + "Domain statistics"; mode = MODE_DOMAIN; } else if (args[2].equals("suffix")) { - jobName = "Suffix statistics"; + jobName = jobName + "Suffix statistics"; mode = MODE_SUFFIX; } else if (args[2].equals("tld")) { - jobName = "TLD statistics"; + jobName = jobName + "Top Level Directory statistics"; mode = MODE_TLD; } diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java index 82fefaf16..812d4a6a8 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java @@ -31,7 +31,6 @@ import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum; -import org.apache.nutch.util.NutchJob; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -94,7 +93,7 @@ public class TestCrawlDbFilter { conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true); conf.setBoolean(CrawlDbFilter.URL_FILTERING, false); conf.setInt("urlnormalizer.loop.count", 2); - Job job = NutchJob.getInstance(conf); + Job job = Job.getInstance(conf); job.setJobName("Test CrawlDbFilter"); Path current = new Path(dbDir, "current"); if (FileSystem.get(conf).exists(current)) { diff --git a/src/test/org/apache/nutch/plugin/TestPluginSystem.java b/src/test/org/apache/nutch/plugin/TestPluginSystem.java index dba7c6606..7c1362aa5 100644 --- a/src/test/org/apache/nutch/plugin/TestPluginSystem.java +++ b/src/test/org/apache/nutch/plugin/TestPluginSystem.java @@ -28,7 +28,6 @@ import java.util.Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Job; import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.NutchJob; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -102,7 +101,7 @@ public class TestPluginSystem { public void testRepositoryCache() throws IOException { Configuration config = NutchConfiguration.create(); PluginRepository repo = PluginRepository.get(config); - Job job = NutchJob.getInstance(config); + Job job = Job.getInstance(config); config = job.getConfiguration(); PluginRepository repo1 = PluginRepository.get(config); Assert.assertTrue(repo == repo1); @@ -111,7 +110,7 @@ public class TestPluginSystem { config.addResource("nutch-default.xml"); config.addResource("nutch-site.xml"); repo = PluginRepository.get(config); - job = NutchJob.getInstance(config); + job = Job.getInstance(config); config = job.getConfiguration(); repo1 = PluginRepository.get(config); Assert.assertTrue(repo1 != repo);