This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 11eea5a NUTCH-1194 Generator: CrawlDB lock should be released earlier
- release the CrawlDb lock right after the select step if generated items
are not marked in CrawlDb (generate.update.crawldb is false)
new 04e1592 Merge pull request #514 from
sebastian-nagel/NUTCH-1194-generator-release-crawldb-lock-earlier
11eea5a is described below
commit 11eea5aea89599e2c35e577d15623f1278ded8e4
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Apr 23 15:55:32 2020 +0200
NUTCH-1194 Generator: CrawlDB lock should be released earlier
- release the CrawlDb lock right after the select step if generated items
are not marked in CrawlDb (generate.update.crawldb is false)
---
src/java/org/apache/nutch/crawl/Generator.java | 18 +++++++++++++-----
src/java/org/apache/nutch/util/NutchJob.java | 14 ++++++++++++--
2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 5dcd2ea..04c2ae8 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -841,6 +841,14 @@ public class Generator extends NutchTool implements Tool {
String.format(Locale.ROOT, "%6d", counter.getValue()),
counter.getName());
}
+ if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
+ /*
+ * generated items are not marked in CrawlDb, and CrawlDb will not be
+ * accessed anymore: we can already release the lock
+ */
+ LockUtil.removeLockFile(getConf(), lock);
+ lock = null;
+ }
// read the subdirectories generated in the temp
// output and turn them into segments
@@ -858,15 +866,13 @@ public class Generator extends NutchTool implements Tool {
}
} catch (Exception e) {
LOG.warn("Generator: exception while partitioning segments, exiting
...");
- LockUtil.removeLockFile(getConf(), lock);
- fs.delete(tempDir, true);
+ NutchJob.cleanupAfterFailure(tempDir, lock, fs);
return null;
}
if (generatedSegments.size() == 0) {
LOG.warn("Generator: 0 records selected for fetching, exiting ...");
- LockUtil.removeLockFile(getConf(), lock);
- fs.delete(tempDir, true);
+ NutchJob.cleanupAfterFailure(tempDir, lock, fs);
return null;
}
@@ -913,7 +919,9 @@ public class Generator extends NutchTool implements Tool {
fs.delete(tempDir2, true);
}
- LockUtil.removeLockFile(getConf(), lock);
+ if (lock != null) {
+ LockUtil.removeLockFile(getConf(), lock);
+ }
fs.delete(tempDir, true);
long end = System.currentTimeMillis();
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 991e506..13257d2 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -41,7 +41,15 @@ public class NutchJob extends Job {
return Job.getInstance(conf);
}
- /*
+ /**
+ * Clean up the file system in case of a job failure.
+ */
+ public static void cleanupAfterFailure(Path tempDir, FileSystem fs)
+ throws IOException {
+ cleanupAfterFailure(tempDir, null, fs);
+ }
+
+ /**
* Clean up the file system in case of a job failure.
*/
public static void cleanupAfterFailure(Path tempDir, Path lock, FileSystem
fs)
@@ -50,7 +58,9 @@ public class NutchJob extends Job {
if (fs.exists(tempDir)) {
fs.delete(tempDir, true);
}
- LockUtil.removeLockFile(fs, lock);
+ if (lock != null) {
+ LockUtil.removeLockFile(fs, lock);
+ }
} catch (IOException e) {
LOG.error("NutchJob cleanup failed: {}", e.getMessage());
throw e;