(nutch) branch master updated: Unlock database when Injector finishes - regardless of result

snagel Wed, 23 Oct 2024 13:08:40 -0700

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git



The following commit(s) were added to refs/heads/master by this push:
     new 3495472ca Unlock database when Injector finishes - regardless of result
3495472ca is described below

commit 3495472ca9f3b54867e90eeb1bdfa64be36f731e
Author: cube <[email protected]>
AuthorDate: Tue Oct 15 08:04:34 2024 +0200

    Unlock database when Injector finishes - regardless of result
---
 .gitignore                                    |  1 +
 src/java/org/apache/nutch/crawl/Injector.java | 70 +++++++++++++++------------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8c521aa68..9cac3379c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ lib/spotbugs-*
 ivy/dependency-check-ant/*
 .gradle*
 ivy/apache-rat-*
+.vscode
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 0d3740eb4..314cf448d 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -358,6 +358,39 @@ public class Injector extends NutchTool implements Tool {
     setConf(conf);
   }
 
+  private Job prepareJob(Configuration conf, Path urlDir, Path current, Path tempCrawlDb) throws IOException {
+    Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
+    job.setJarByClass(Injector.class);
+    job.setMapperClass(InjectMapper.class);
+    job.setReducerClass(InjectReducer.class);
+    job.setOutputFormatClass(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    job.setSpeculativeExecution(false);
+
+    // set input and output paths of the job
+    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
+    FileStatus[] seedFiles = urlDir.getFileSystem(conf).listStatus(urlDir);
+    int numSeedFiles = 0;
+    for (FileStatus seedFile : seedFiles) {
+      if (seedFile.isFile()) {
+        MultipleInputs.addInputPath(job, seedFile.getPath(),
+            KeyValueTextInputFormat.class);
+        numSeedFiles++;
+        LOG.info("Injecting seed URL file {}", seedFile.getPath());
+      } else {
+        LOG.warn("Skipped non-file input in {}: {}", urlDir,
+            seedFile.getPath());
+      }
+    }
+    if (numSeedFiles == 0) {
+      LOG.error("No seed files to inject found in {}", urlDir);
+      throw new IllegalStateException("No seed files found");
+    }
+    FileOutputFormat.setOutputPath(job, tempCrawlDb);
+    return job;
+  }
+
   public void inject(Path crawlDb, Path urlDir)
       throws IOException, ClassNotFoundException, InterruptedException {
     inject(crawlDb, urlDir, false, false);
@@ -400,40 +433,11 @@ public class Injector extends NutchTool implements Tool {
     Path tempCrawlDb = new Path(crawlDb,
         "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
-    // lock an existing crawldb to prevent multiple simultaneous updates
-    Path lock = CrawlDb.lock(conf, crawlDb, false);
-
     // configure job
-    Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
-    job.setJarByClass(Injector.class);
-    job.setMapperClass(InjectMapper.class);
-    job.setReducerClass(InjectReducer.class);
-    job.setOutputFormatClass(MapFileOutputFormat.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(CrawlDatum.class);
-    job.setSpeculativeExecution(false);
+    Job job = prepareJob(conf, urlDir, current, tempCrawlDb);
 
-    // set input and output paths of the job
-    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
-    FileStatus[] seedFiles = urlDir.getFileSystem(getConf()).listStatus(urlDir);
-    int numSeedFiles = 0;
-    for (FileStatus seedFile : seedFiles) {
-      if (seedFile.isFile()) {
-        MultipleInputs.addInputPath(job, seedFile.getPath(),
-            KeyValueTextInputFormat.class);
-        numSeedFiles++;
-        LOG.info("Injecting seed URL file {}", seedFile.getPath());
-      } else {
-        LOG.warn("Skipped non-file input in {}: {}", urlDir,
-            seedFile.getPath());
-      }
-    }
-    if (numSeedFiles == 0) {
-      LOG.error("No seed files to inject found in {}", urlDir);
-      LockUtil.removeLockFile(fs, lock);
-      return;
-    }
-    FileOutputFormat.setOutputPath(job, tempCrawlDb);
+    // lock an existing crawldb to prevent multiple simultaneous updates
+    Path lock = CrawlDb.lock(conf, crawlDb, false);
 
     try {
       // run the job
@@ -487,6 +491,8 @@ public class Injector extends NutchTool implements Tool {
       LOG.error("Injector job failed: {}", e.getMessage());
       NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
       throw e;
+    } finally {
+      LockUtil.removeLockFile(fs, lock);
     }
   }

(nutch) branch master updated: Unlock database when Injector finishes - regardless of result

Reply via email to