This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new bbf086726 NUTCH-3014 Standardize Job names (#789)
bbf086726 is described below
commit bbf0867263ed1764c56fe7794c17942d0e8bf1c4
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Thu Nov 2 20:36:43 2023 -0700
NUTCH-3014 Standardize Job names (#789)
---
src/java/org/apache/nutch/crawl/CrawlDb.java | 3 +-
src/java/org/apache/nutch/crawl/CrawlDbMerger.java | 3 +-
src/java/org/apache/nutch/crawl/CrawlDbReader.java | 20 +++++--------
.../org/apache/nutch/crawl/DeduplicationJob.java | 3 +-
src/java/org/apache/nutch/crawl/Generator.java | 13 ++++-----
src/java/org/apache/nutch/crawl/Injector.java | 2 +-
src/java/org/apache/nutch/crawl/LinkDb.java | 3 +-
src/java/org/apache/nutch/crawl/LinkDbMerger.java | 3 +-
src/java/org/apache/nutch/crawl/LinkDbReader.java | 3 +-
src/java/org/apache/nutch/fetcher/Fetcher.java | 2 +-
src/java/org/apache/nutch/hostdb/ReadHostDb.java | 3 +-
src/java/org/apache/nutch/hostdb/UpdateHostDb.java | 3 +-
src/java/org/apache/nutch/indexer/CleaningJob.java | 4 +--
src/java/org/apache/nutch/indexer/IndexingJob.java | 3 +-
src/java/org/apache/nutch/parse/ParseSegment.java | 3 +-
.../apache/nutch/scoring/webgraph/LinkDumper.java | 6 ++--
.../apache/nutch/scoring/webgraph/LinkRank.java | 15 ++++------
.../apache/nutch/scoring/webgraph/NodeDumper.java | 3 +-
.../nutch/scoring/webgraph/ScoreUpdater.java | 3 +-
.../apache/nutch/scoring/webgraph/WebGraph.java | 9 ++----
.../org/apache/nutch/segment/SegmentMerger.java | 3 +-
.../org/apache/nutch/segment/SegmentReader.java | 3 +-
src/java/org/apache/nutch/tools/FreeGenerator.java | 2 +-
.../apache/nutch/tools/arc/ArcSegmentCreator.java | 9 ++----
.../org/apache/nutch/tools/warc/WARCExporter.java | 3 +-
.../apache/nutch/util/CrawlCompletionStats.java | 6 ++--
src/java/org/apache/nutch/util/NutchJob.java | 4 ---
.../nutch/util/ProtocolStatusStatistics.java | 2 +-
.../org/apache/nutch/util/SitemapProcessor.java | 34 ++++++++++------------
.../apache/nutch/util/domain/DomainStatistics.java | 10 +++----
.../org/apache/nutch/crawl/TestCrawlDbFilter.java | 3 +-
.../org/apache/nutch/plugin/TestPluginSystem.java | 5 ++--
32 files changed, 74 insertions(+), 117 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java
b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 16394832b..2b609c0a6 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -165,8 +165,7 @@ public class CrawlDb extends NutchTool implements Tool {
Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
.nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
- job.setJobName("crawldb " + crawlDb);
+ Job job = Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);
Path current = new Path(crawlDb, CURRENT_NAME);
if (current.getFileSystem(job.getConfiguration()).exists(current)) {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index 1bf7243d3..6ee4b43cd 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -165,9 +165,8 @@ public class CrawlDbMerger extends Configured implements
Tool {
Path newCrawlDb = new Path(output,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(conf);
+ Job job = Job.getInstance(conf, "Nutch CrawlDbMerger: " + output);
conf = job.getConfiguration();
- job.setJobName("crawldb merge " + output);
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index bd3e6f38d..29e8efe17 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -564,9 +564,8 @@ public class CrawlDbReader extends AbstractChecker
implements Closeable {
throws IOException, InterruptedException, ClassNotFoundException {
Path tmpFolder = new Path(crawlDb, "stat_tmp" +
System.currentTimeMillis());
- Job job = NutchJob.getInstance(config);
+ Job job = Job.getInstance(config, "Nutch CrawlDbReader: " + crawlDb);
config = job.getConfiguration();
- job.setJobName("stats " + crawlDb);
config.setBoolean("db.reader.stats.sort", sort);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -812,7 +811,7 @@ public class CrawlDbReader extends AbstractChecker
implements Closeable {
@Override
protected int process(String line, StringBuilder output) throws Exception {
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch CrawlDbReader: process " +
crawlDb);
Configuration config = job.getConfiguration();
readUrl(this.crawlDb, line, config, output);
return 0;
@@ -839,8 +838,7 @@ public class CrawlDbReader extends AbstractChecker
implements Closeable {
Path outFolder = new Path(output);
- Job job = NutchJob.getInstance(config);
- job.setJobName("dump " + crawlDb);
+ Job job = Job.getInstance(config, "Nutch CrawlDbReader: dump " + crawlDb);
Configuration jobConf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -958,18 +956,15 @@ public class CrawlDbReader extends AbstractChecker
implements Closeable {
String output, Configuration config)
throws IOException, ClassNotFoundException, InterruptedException {
- if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
- LOG.info("CrawlDb db: {}", crawlDb);
- }
+ LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
+ LOG.info("CrawlDb db: {}", crawlDb);
Path outFolder = new Path(output);
Path tempDir = new Path(
config.get("mapreduce.cluster.temp.dir", ".") + "/readdb-topN-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
- job.setJobName("topN prepare " + crawlDb);
+ Job job = Job.getInstance(config, "Nutch CrawlDbReader: topN prepare " +
crawlDb);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormatClass(SequenceFileInputFormat.class);
@@ -1000,8 +995,7 @@ public class CrawlDbReader extends AbstractChecker
implements Closeable {
}
LOG.info("CrawlDb topN: collecting topN scores.");
- job = NutchJob.getInstance(config);
- job.setJobName("topN collect " + crawlDb);
+ job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " +
crawlDb);
job.getConfiguration().setLong("db.reader.topn", topN);
FileInputFormat.addInputPath(job, tempDir);
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 217005d41..e37001354 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -305,9 +305,8 @@ public class DeduplicationJob extends NutchTool implements
Tool {
Path tempDir = new Path(crawlDb, "dedup-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb);
Configuration conf = job.getConfiguration();
- job.setJobName("Deduplication on " + crawlDb);
conf.set(DEDUPLICATION_GROUP_MODE, group);
conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
job.setJarByClass(DeduplicationJob.class);
diff --git a/src/java/org/apache/nutch/crawl/Generator.java
b/src/java/org/apache/nutch/crawl/Generator.java
index 1b62314e7..33f743a37 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -388,7 +388,7 @@ public class Generator extends NutchTool implements Tool {
public void setup(Context context) throws IOException {
conf = context.getConfiguration();
mos = new MultipleOutputs<FloatWritable, SelectorEntry>(context);
- Job job = Job.getInstance(conf);
+ Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer");
limit = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
/ job.getNumReduceTasks();
maxNumSegments = conf.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
@@ -695,7 +695,7 @@ public class Generator extends NutchTool implements Tool {
long curTime)
throws IOException, InterruptedException, ClassNotFoundException {
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " +
dbDir);
Configuration conf = job.getConfiguration();
boolean filter = conf.getBoolean(GENERATOR_FILTER, true);
boolean normalise = conf.getBoolean(GENERATOR_NORMALISE, true);
@@ -839,8 +839,7 @@ public class Generator extends NutchTool implements Tool {
}
// map to inverted subset due for fetch, sort by score
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("generate: select from " + dbDir);
+ Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " +
dbDir);
Configuration conf = job.getConfiguration();
if (numLists == -1) {
/* for politeness create exactly one partition per fetch task */
@@ -942,8 +941,7 @@ public class Generator extends NutchTool implements Tool {
Path tempDir2 = new Path(dbDir,
"generate-temp-" + java.util.UUID.randomUUID().toString());
- job = NutchJob.getInstance(getConf());
- job.setJobName("generate: updatedb " + dbDir);
+ job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir);
job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime);
for (Path segmpaths : generatedSegments) {
Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
@@ -1001,8 +999,7 @@ public class Generator extends NutchTool implements Tool {
LOG.info("Generator: segment: " + segment);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("generate: partition " + segment);
+ Job job = Job.getInstance(getConf(), "Nutch Generator: partition segment "
+ segment);
Configuration conf = job.getConfiguration();
conf.setInt("partition.url.seed", RANDOM.nextInt());
diff --git a/src/java/org/apache/nutch/crawl/Injector.java
b/src/java/org/apache/nutch/crawl/Injector.java
index 9bfd1b454..0d3740eb4 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -404,7 +404,7 @@ public class Injector extends NutchTool implements Tool {
Path lock = CrawlDb.lock(conf, crawlDb, false);
// configure job
- Job job = Job.getInstance(conf, "inject " + urlDir);
+ Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
job.setJarByClass(Injector.class);
job.setMapperClass(InjectMapper.class);
job.setReducerClass(InjectReducer.class);
diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java
b/src/java/org/apache/nutch/crawl/LinkDb.java
index 3c752ab1d..2f4a0dda4 100644
--- a/src/java/org/apache/nutch/crawl/LinkDb.java
+++ b/src/java/org/apache/nutch/crawl/LinkDb.java
@@ -270,9 +270,8 @@ public class LinkDb extends NutchTool implements Tool {
Path newLinkDb = new Path(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
+ Job job = Job.getInstance(config, "Nutch LinkDb: " + linkDb);
Configuration conf = job.getConfiguration();
- job.setJobName("linkdb " + linkDb);
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
index d6a41ab48..c3da2031e 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -147,8 +147,7 @@ public class LinkDbMerger extends Configured implements
Tool {
Path newLinkDb = new Path(linkDb,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
- job.setJobName("linkdb merge " + linkDb);
+ Job job = Job.getInstance(config, "Nutch LinkDbMerger: " + linkDb);
Configuration conf = job.getConfiguration();
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java
b/src/java/org/apache/nutch/crawl/LinkDbReader.java
index fa01f20bf..9ae356683 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbReader.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -159,8 +159,7 @@ public class LinkDbReader extends AbstractChecker
implements Closeable {
Path outFolder = new Path(output);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("read " + linkdb);
+ Job job = Job.getInstance(getConf(), "Nutch LinkDbReader: " + linkdb);
job.setJarByClass(LinkDbReader.class);
Configuration conf = job.getConfiguration();
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 92aef6f10..d1774f530 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -498,7 +498,7 @@ public class Fetcher extends NutchTool implements Tool {
totalOutlinksToFollow);
}
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch Fetcher: " +
segment.getName());
job.setJobName("FetchData");
Configuration conf = job.getConfiguration();
diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index 0321a8652..036b78650 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -181,8 +181,7 @@ public class ReadHostDb extends Configured implements Tool {
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
conf.set("mapreduce.output.textoutputformat.separator", "\t");
- Job job = Job.getInstance(conf);
- job.setJobName("ReadHostDb");
+ Job job = Job.getInstance(conf, "Nutch ReadHostDb");
job.setJarByClass(ReadHostDb.class);
FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
index 65e45c55d..5148a6be1 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -77,11 +77,10 @@ public class UpdateHostDb extends Configured implements
Tool {
stopWatch.start();
LOG.info("UpdateHostDb: starting");
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch UpdateHostDb");
Configuration conf = job.getConfiguration();
boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
job.setJarByClass(UpdateHostDb.class);
- job.setJobName("UpdateHostDb");
FileSystem fs = hostDb.getFileSystem(conf);
Path old = new Path(hostDb, "old");
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java
b/src/java/org/apache/nutch/indexer/CleaningJob.java
index 04b9c2efa..8334ac353 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -144,7 +144,7 @@ public class CleaningJob implements Tool {
stopWatch.start();
LOG.info("CleaningJob: starting");
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch CleaningJob: " + crawldb);
Configuration conf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
@@ -157,8 +157,6 @@ public class CleaningJob implements Tool {
job.setReducerClass(DeleterReducer.class);
job.setJarByClass(CleaningJob.class);
- job.setJobName("CleaningJob");
-
// need to explicitly allow deletions
conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java
b/src/java/org/apache/nutch/indexer/IndexingJob.java
index d2115230c..c3ddb4ae9 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -108,7 +108,8 @@ public class IndexingJob extends NutchTool implements Tool {
stopWatch.start();
LOG.info("Indexer: starting");
- final Job job = NutchJob.getInstance(getConf());
+ final Job job = Job.getInstance(getConf(),
+ "Nutch IndexingJob: crawldb: " + crawlDb + " segment(s): " + segments);
job.setJobName("Indexer");
Configuration conf = job.getConfiguration();
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java
b/src/java/org/apache/nutch/parse/ParseSegment.java
index de45c463b..1995a880e 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -232,8 +232,7 @@ public class ParseSegment extends NutchTool implements Tool
{
LOG.info("ParseSegment: starting");
LOG.info("ParseSegment: segment: {}", segment);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("parse " + segment);
+ Job job = Job.getInstance(getConf(), "Nutch ParseSegment: " + segment);
Configuration conf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
index 4831d73f3..439d7438c 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -341,8 +341,7 @@ public class LinkDumper extends Configured implements Tool {
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job inverter = NutchJob.getInstance(conf);
- inverter.setJobName("LinkDumper: inverter");
+ Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " +
webGraphDb);
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormatClass(SequenceFileInputFormat.class);
@@ -372,8 +371,7 @@ public class LinkDumper extends Configured implements Tool {
}
// run the merger job
- Job merger = NutchJob.getInstance(conf);
- merger.setJobName("LinkDumper: merger");
+ Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " +
tempInverted);
FileInputFormat.addInputPath(merger, tempInverted);
merger.setJarByClass(Merger.class);
merger.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
index c226ad130..e48f04acd 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
@@ -93,9 +93,8 @@ public class LinkRank extends Configured implements Tool {
// configure the counter job
Path numLinksPath = new Path(webGraphDb, NUM_NODES);
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
- Job counter = NutchJob.getInstance(getConf());
+ Job counter = Job.getInstance(getConf(), "Nutch LinkRank: counter " +
webGraphDb);
Configuration conf = counter.getConfiguration();
- counter.setJobName("LinkRank Counter");
FileInputFormat.addInputPath(counter, nodeDb);
FileOutputFormat.setOutputPath(counter, numLinksPath);
counter.setInputFormatClass(SequenceFileInputFormat.class);
@@ -194,9 +193,8 @@ public class LinkRank extends Configured implements Tool {
InterruptedException, ClassNotFoundException {
// configure the initializer
- Job initializer = NutchJob.getInstance(getConf());
+ Job initializer = Job.getInstance(getConf(), "Nutch LinkRank: initializer
" + nodeDb);
Configuration conf = initializer.getConfiguration();
- initializer.setJobName("LinkAnalysis Initializer");
FileInputFormat.addInputPath(initializer, nodeDb);
FileOutputFormat.setOutputPath(initializer, output);
initializer.setJarByClass(Initializer.class);
@@ -245,9 +243,9 @@ public class LinkRank extends Configured implements Tool {
throws IOException, InterruptedException, ClassNotFoundException {
// configure the inverter
- Job inverter = NutchJob.getInstance(getConf());
+ Job inverter = Job.getInstance(getConf(),
+ "Nutch Linkrank: inverter nodedb: " + nodeDb + " outlinkdb: " +
outlinkDb);
Configuration conf = inverter.getConfiguration();
- inverter.setJobName("LinkAnalysis Inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
FileOutputFormat.setOutputPath(inverter, output);
@@ -305,11 +303,10 @@ public class LinkRank extends Configured implements Tool {
int iteration, int numIterations, float rankOne)
throws IOException, InterruptedException, ClassNotFoundException {
- Job analyzer = NutchJob.getInstance(getConf());
+ Job analyzer = Job.getInstance(getConf(),
+ "Nutch LinkRank: analysis iteration" + (iteration + 1) + " of " +
numIterations);
Configuration conf = analyzer.getConfiguration();
conf.set("link.analyze.iteration", String.valueOf(iteration + 1));
- analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1)
- + " of " + numIterations);
FileInputFormat.addInputPath(analyzer, nodeDb);
FileInputFormat.addInputPath(analyzer, inverted);
FileOutputFormat.setOutputPath(analyzer, output);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
index dfccccc19..9277df8f6 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
@@ -298,9 +298,8 @@ public class NodeDumper extends Configured implements Tool {
LOG.info("NodeDumper: starting");
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
- Job dumper = NutchJob.getInstance(getConf());
+ Job dumper = Job.getInstance(getConf(), "Nutch NodeDumper: " + webGraphDb);
Configuration conf = dumper.getConfiguration();
- dumper.setJobName("NodeDumper: " + webGraphDb);
FileInputFormat.addInputPath(dumper, nodeDb);
dumper.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
index c10a6e37b..bcd534274 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
@@ -170,8 +170,7 @@ public class ScoreUpdater extends Configured implements
Tool{
.nextInt(Integer.MAX_VALUE)));
// run the updater job outputting to the temp crawl database
- Job updater = NutchJob.getInstance(conf);
- updater.setJobName("Update CrawlDb from WebGraph");
+ Job updater = Job.getInstance(conf, "Nutch ScoreUpdater: " + crawlDb);
FileInputFormat.addInputPath(updater, crawlDbCurrent);
FileInputFormat.addInputPath(updater, nodeDb);
FileOutputFormat.setOutputPath(updater, newCrawlDb);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index b98329d1e..25e3cf230 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -545,9 +545,8 @@ public class WebGraph extends Configured implements Tool {
Path tempOutlinkDb = new Path(outlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job outlinkJob = NutchJob.getInstance(getConf());
+ Job outlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: outlinkdb " +
outlinkDb);
Configuration outlinkJobConf = outlinkJob.getConfiguration();
- outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false);
boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup",
true);
@@ -625,9 +624,8 @@ public class WebGraph extends Configured implements Tool {
Path tempInlinkDb = new Path(inlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job inlinkJob = NutchJob.getInstance(getConf());
+ Job inlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: inlinkdb " +
inlinkDb);
Configuration inlinkJobConf = inlinkJob.getConfiguration();
- inlinkJob.setJobName("Inlinkdb " + inlinkDb);
LOG.info("InlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(inlinkJob, outlinkDb);
inlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
@@ -669,9 +667,8 @@ public class WebGraph extends Configured implements Tool {
Path tempNodeDb = new Path(nodeDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job nodeJob = NutchJob.getInstance(getConf());
+ Job nodeJob = Job.getInstance(getConf(), "Nutch WebGraph: nodedb " +
nodeDb);
Configuration nodeJobConf = nodeJob.getConfiguration();
- nodeJob.setJobName("NodeDb " + nodeDb);
LOG.info("NodeDb: adding input: " + outlinkDb);
LOG.info("NodeDb: adding input: " + inlinkDb);
FileInputFormat.addInputPath(nodeJob, outlinkDb);
diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java
b/src/java/org/apache/nutch/segment/SegmentMerger.java
index c884dfedf..53bdee22e 100644
--- a/src/java/org/apache/nutch/segment/SegmentMerger.java
+++ b/src/java/org/apache/nutch/segment/SegmentMerger.java
@@ -625,9 +625,8 @@ public class SegmentMerger extends Configured implements
Tool{
long slice) throws IOException, ClassNotFoundException,
InterruptedException {
String segmentName = Generator.generateSegmentName();
LOG.info("Merging {} segments to {}/{}", segs.length, out, segmentName);
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" +
segmentName);
Configuration conf = job.getConfiguration();
- job.setJobName("mergesegs " + out + "/" + segmentName);
conf.setBoolean("segment.merger.filter", filter);
conf.setBoolean("segment.merger.normalizer", normalize);
conf.setLong("segment.merger.slice", slice);
diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java
b/src/java/org/apache/nutch/segment/SegmentReader.java
index ee5c266fd..bef980060 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -200,8 +200,7 @@ public class SegmentReader extends Configured implements
Tool {
LOG.info("SegmentReader: dump segment: {}", segment);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("read " + segment);
+ Job job = Job.getInstance(getConf(), "Nutch SegmentReader: " + segment);
Configuration conf = job.getConfiguration();
if (ge)
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java
b/src/java/org/apache/nutch/tools/FreeGenerator.java
index e9f5c8761..9ace8f192 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -184,7 +184,7 @@ public class FreeGenerator extends Configured implements
Tool {
stopWatch.start();
LOG.info("FreeGenerator: starting");
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch FreeGenerator: " + args[0]);
Configuration conf = job.getConfiguration();
conf.setBoolean(FILTER_KEY, filter);
conf.setBoolean(NORMALIZE_KEY, normalize);
diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
index 825e752cc..311675310 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -371,14 +371,11 @@ public class ArcSegmentCreator extends Configured
implements Tool {
StopWatch stopWatch = new StopWatch();
stopWatch.start();
- if (LOG.isInfoEnabled()) {
- LOG.info("ArcSegmentCreator: starting");
- LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
- }
+ LOG.info("ArcSegmentCreator: starting");
+ LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch ArcSegmentCreator: " +
arcFiles);
Configuration conf = job.getConfiguration();
- job.setJobName("ArcSegmentCreator " + arcFiles);
String segName = generateSegmentName();
conf.set(Nutch.SEGMENT_NAME_KEY, segName);
FileInputFormat.addInputPath(job, arcFiles);
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index 6d8a38557..4e80aac5f 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -433,8 +433,7 @@ public class WARCExporter extends Configured implements
Tool {
stopWatch.start();
LOG.info("WARCExporter: starting");
- final Job job = NutchJob.getInstance(getConf());
- job.setJobName("warc-exporter " + output);
+ final Job job = Job.getInstance(getConf(), "Nutch WARCExporter: " +
output);
job.getConfiguration().setBoolean(ONLY_SUCCESSFUL_RESPONSES,
onlySuccessfulResponses);
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index 8696d2822..e5ee5f643 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -133,12 +133,12 @@ public class CrawlCompletionStats extends Configured
implements Tool {
LOG.info("CrawlCompletionStats: starting");
int mode = 0;
- String jobName = "CrawlCompletionStats";
+ String jobName = "Nutch CrawlCompletionStats: ";
if (cli.getOptionValue("mode").equals("host")) {
- jobName = "Host CrawlCompletionStats";
+ jobName = jobName + "Host statistics";
mode = MODE_HOST;
} else if (cli.getOptionValue("mode").equals("domain")) {
- jobName = "Domain CrawlCompletionStats";
+ jobName = jobName + "Domain statistics";
mode = MODE_DOMAIN;
}
diff --git a/src/java/org/apache/nutch/util/NutchJob.java
b/src/java/org/apache/nutch/util/NutchJob.java
index 068c64fef..25b894550 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -56,10 +56,6 @@ public class NutchJob extends Job {
}
}
- public static Job getInstance(Configuration conf) throws IOException {
- return Job.getInstance(conf);
- }
-
/**
* Clean up the file system in case of a job failure.
* @param tempDir The temporary directory which needs to be
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index 0fe6c57d0..f4e8a1b91 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -89,7 +89,7 @@ public class ProtocolStatusStatistics extends Configured
implements Tool {
stopWatch.start();
LOG.info("ProtocolStatistics: starting");
- String jobName = "ProtocolStatistics";
+ String jobName = "Nutch ProtocolStatusStatistics: " + inputDir;
Configuration conf = getConf();
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java
b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 66fa9b0e7..043e77f69 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -383,7 +383,7 @@ public class SitemapProcessor extends Configured implements
Tool {
conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
- Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
+ Job job = Job.getInstance(conf, "Nutch SitemapProcessor: " +
crawldb.toString());
job.setJarByClass(SitemapProcessor.class);
// add crawlDb, sitemap url directory and hostDb to input paths
@@ -431,23 +431,21 @@ public class SitemapProcessor extends Configured
implements Tool {
FSUtils.replace(fs, current, tempCrawlDb, true);
LockUtil.removeLockFile(fs, lock);
- if (LOG.isInfoEnabled()) {
- long filteredRecords = job.getCounters().findCounter("Sitemap",
"filtered_records").getValue();
- long fromHostname = job.getCounters().findCounter("Sitemap",
"sitemaps_from_hostname").getValue();
- long fromSeeds = job.getCounters().findCounter("Sitemap",
"sitemap_seeds").getValue();
- long failedFetches = job.getCounters().findCounter("Sitemap",
"failed_fetches").getValue();
- long newSitemapEntries = job.getCounters().findCounter("Sitemap",
"new_sitemap_entries").getValue();
-
- LOG.info("SitemapProcessor: Total records rejected by filters: {}",
filteredRecords);
- LOG.info("SitemapProcessor: Total sitemaps from host name: {}",
fromHostname);
- LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}",
fromSeeds);
- LOG.info("SitemapProcessor: Total failed sitemap fetches: {}",
failedFetches);
- LOG.info("SitemapProcessor: Total new sitemap entries added: {}",
newSitemapEntries);
-
- stopWatch.stop();
- LOG.info("SitemapProcessor: finished, elapsed: {} ms",
stopWatch.getTime(
- TimeUnit.MILLISECONDS));
- }
+ long filteredRecords = job.getCounters().findCounter("Sitemap",
"filtered_records").getValue();
+ long fromHostname = job.getCounters().findCounter("Sitemap",
"sitemaps_from_hostname").getValue();
+ long fromSeeds = job.getCounters().findCounter("Sitemap",
"sitemap_seeds").getValue();
+ long failedFetches = job.getCounters().findCounter("Sitemap",
"failed_fetches").getValue();
+ long newSitemapEntries = job.getCounters().findCounter("Sitemap",
"new_sitemap_entries").getValue();
+
+ LOG.info("SitemapProcessor: Total records rejected by filters: {}",
filteredRecords);
+ LOG.info("SitemapProcessor: Total sitemaps from host name: {}",
fromHostname);
+ LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}",
fromSeeds);
+ LOG.info("SitemapProcessor: Total failed sitemap fetches: {}",
failedFetches);
+ LOG.info("SitemapProcessor: Total new sitemap entries added: {}",
newSitemapEntries);
+
+ stopWatch.stop();
+ LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("SitemapProcessor_" + crawldb.toString(), e);
NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index f77b72bc5..1843c424d 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -97,18 +97,18 @@ public class DomainStatistics extends Configured implements
Tool {
LOG.info("DomainStatistics: starting");
int mode = 0;
- String jobName = "DomainStatistics";
+ String jobName = "Nutch DomainStatistics: ";
if (args[2].equals("host")) {
- jobName = "Host statistics";
+ jobName = jobName + "Host statistics";
mode = MODE_HOST;
} else if (args[2].equals("domain")) {
- jobName = "Domain statistics";
+ jobName = jobName + "Domain statistics";
mode = MODE_DOMAIN;
} else if (args[2].equals("suffix")) {
- jobName = "Suffix statistics";
+ jobName = jobName + "Suffix statistics";
mode = MODE_SUFFIX;
} else if (args[2].equals("tld")) {
- jobName = "TLD statistics";
+ jobName = jobName + "Top Level Directory statistics";
mode = MODE_TLD;
}
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
index 82fefaf16..812d4a6a8 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -31,7 +31,6 @@ import
org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
-import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@@ -94,7 +93,7 @@ public class TestCrawlDbFilter {
conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
conf.setInt("urlnormalizer.loop.count", 2);
- Job job = NutchJob.getInstance(conf);
+ Job job = Job.getInstance(conf);
job.setJobName("Test CrawlDbFilter");
Path current = new Path(dbDir, "current");
if (FileSystem.get(conf).exists(current)) {
diff --git a/src/test/org/apache/nutch/plugin/TestPluginSystem.java
b/src/test/org/apache/nutch/plugin/TestPluginSystem.java
index dba7c6606..7c1362aa5 100644
--- a/src/test/org/apache/nutch/plugin/TestPluginSystem.java
+++ b/src/test/org/apache/nutch/plugin/TestPluginSystem.java
@@ -28,7 +28,6 @@ import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@@ -102,7 +101,7 @@ public class TestPluginSystem {
public void testRepositoryCache() throws IOException {
Configuration config = NutchConfiguration.create();
PluginRepository repo = PluginRepository.get(config);
- Job job = NutchJob.getInstance(config);
+ Job job = Job.getInstance(config);
config = job.getConfiguration();
PluginRepository repo1 = PluginRepository.get(config);
Assert.assertTrue(repo == repo1);
@@ -111,7 +110,7 @@ public class TestPluginSystem {
config.addResource("nutch-default.xml");
config.addResource("nutch-site.xml");
repo = PluginRepository.get(config);
- job = NutchJob.getInstance(config);
+ job = Job.getInstance(config);
config = job.getConfiguration();
repo1 = PluginRepository.get(config);
Assert.assertTrue(repo1 != repo);