This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 3495472ca Unlock database when Injector finishes - regardless of result
3495472ca is described below
commit 3495472ca9f3b54867e90eeb1bdfa64be36f731e
Author: cube <[email protected]>
AuthorDate: Tue Oct 15 08:04:34 2024 +0200
Unlock database when Injector finishes - regardless of result
---
.gitignore | 1 +
src/java/org/apache/nutch/crawl/Injector.java | 70 +++++++++++++++------------
2 files changed, 39 insertions(+), 32 deletions(-)
diff --git a/.gitignore b/.gitignore
index 8c521aa68..9cac3379c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
ivy/apache-rat-*
+.vscode
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 0d3740eb4..314cf448d 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -358,6 +358,39 @@ public class Injector extends NutchTool implements Tool {
setConf(conf);
}
+ private Job prepareJob(Configuration conf, Path urlDir, Path current, Path tempCrawlDb) throws IOException {
+ Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
+ job.setJarByClass(Injector.class);
+ job.setMapperClass(InjectMapper.class);
+ job.setReducerClass(InjectReducer.class);
+ job.setOutputFormatClass(MapFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+ job.setSpeculativeExecution(false);
+
+ // set input and output paths of the job
+ MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
+ FileStatus[] seedFiles = urlDir.getFileSystem(conf).listStatus(urlDir);
+ int numSeedFiles = 0;
+ for (FileStatus seedFile : seedFiles) {
+ if (seedFile.isFile()) {
+ MultipleInputs.addInputPath(job, seedFile.getPath(),
+ KeyValueTextInputFormat.class);
+ numSeedFiles++;
+ LOG.info("Injecting seed URL file {}", seedFile.getPath());
+ } else {
+ LOG.warn("Skipped non-file input in {}: {}", urlDir,
+ seedFile.getPath());
+ }
+ }
+ if (numSeedFiles == 0) {
+ LOG.error("No seed files to inject found in {}", urlDir);
+ throw new IllegalStateException("No seed files found");
+ }
+ FileOutputFormat.setOutputPath(job, tempCrawlDb);
+ return job;
+ }
+
public void inject(Path crawlDb, Path urlDir)
throws IOException, ClassNotFoundException, InterruptedException {
inject(crawlDb, urlDir, false, false);
@@ -400,40 +433,11 @@ public class Injector extends NutchTool implements Tool {
Path tempCrawlDb = new Path(crawlDb,
"crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- // lock an existing crawldb to prevent multiple simultaneous updates
- Path lock = CrawlDb.lock(conf, crawlDb, false);
-
// configure job
- Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
- job.setJarByClass(Injector.class);
- job.setMapperClass(InjectMapper.class);
- job.setReducerClass(InjectReducer.class);
- job.setOutputFormatClass(MapFileOutputFormat.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(CrawlDatum.class);
- job.setSpeculativeExecution(false);
+ Job job = prepareJob(conf, urlDir, current, tempCrawlDb);
- // set input and output paths of the job
- MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
- FileStatus[] seedFiles = urlDir.getFileSystem(getConf()).listStatus(urlDir);
- int numSeedFiles = 0;
- for (FileStatus seedFile : seedFiles) {
- if (seedFile.isFile()) {
- MultipleInputs.addInputPath(job, seedFile.getPath(),
- KeyValueTextInputFormat.class);
- numSeedFiles++;
- LOG.info("Injecting seed URL file {}", seedFile.getPath());
- } else {
- LOG.warn("Skipped non-file input in {}: {}", urlDir,
- seedFile.getPath());
- }
- }
- if (numSeedFiles == 0) {
- LOG.error("No seed files to inject found in {}", urlDir);
- LockUtil.removeLockFile(fs, lock);
- return;
- }
- FileOutputFormat.setOutputPath(job, tempCrawlDb);
+ // lock an existing crawldb to prevent multiple simultaneous updates
+ Path lock = CrawlDb.lock(conf, crawlDb, false);
try {
// run the job
@@ -487,6 +491,8 @@ public class Injector extends NutchTool implements Tool {
LOG.error("Injector job failed: {}", e.getMessage());
NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
throw e;
+ } finally {
+ LockUtil.removeLockFile(fs, lock);
}
}