This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new ceb23e111 NUTCH-3145 Upgrade to JUnit 6 (#883)
ceb23e111 is described below
commit ceb23e111caa8eb079f703a0fca04ca934cfb29f
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Sat Feb 21 08:26:32 2026 -0800
NUTCH-3145 Upgrade to JUnit 6 (#883)
---
.github/workflows/master-build.yml | 108 ++++++++++-
build.xml | 20 +-
default.properties | 12 +-
docker/Dockerfile | 8 +-
ivy/ivy.xml | 20 +-
src/plugin/build-plugin.xml | 2 +
src/test/junit-platform.properties | 31 +++
.../org/apache/nutch/crawl/CrawlDBTestUtil.java | 41 +++-
src/test/org/apache/nutch/fetcher/TestFetcher.java | 93 ++++++---
.../apache/nutch/segment/TestSegmentMerger.java | 57 ++++--
.../nutch/util/CancellationAwareTestUtils.java | 209 +++++++++++++++++++++
.../org/apache/nutch/util/WritableTestUtils.java | 13 +-
12 files changed, 541 insertions(+), 73 deletions(-)
diff --git a/.github/workflows/master-build.yml
b/.github/workflows/master-build.yml
index d73bb3a69..1fe9da252 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -20,6 +20,16 @@ on:
pull_request:
types: [opened, synchronize, reopened]
branches: [master]
+
+# Java Version Strategy:
+# - BUILD: Requires Java 17+ (JUnit 6 dependency)
+# - RUNTIME: Supports Java 11+ (javac.version=11 produces Java 11 bytecode)
+#
+# The 'build' job verifies bytecode compilation for both Java 11 and 17 targets.
+# The 'runtime-java11' job verifies the built artifacts actually run on Java 11.
+# The 'tests' job runs on JDK 17 (required by JUnit 6) with the default
+# javac.version=11 bytecode target for backward compatibility.
+
jobs:
javadoc:
strategy:
@@ -43,6 +53,7 @@ jobs:
${{ runner.os }}-ivy-
- name: Javadoc
run: ant clean javadoc -buildfile build.xml
+
rat:
strategy:
matrix:
@@ -73,19 +84,108 @@ jobs:
- name: Fail if any unknown licenses
if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }}
run: exit 1
+
+ # Build verification with Java bytecode target matrix
+ # Verifies bytecode compatibility for both Java 11 and Java 17 targets
+ build:
+ strategy:
+ fail-fast: false
+ matrix:
+ javac-version: ['11', '17']
+ os: [ubuntu-latest]
+ runs-on: ${{ matrix.os }}
+ name: build (javac.version=${{ matrix.javac-version }})
+ steps:
+ - uses: actions/checkout@v5
+ - name: Set up JDK 17
+ uses: actions/setup-java@v5
+ with:
+ java-version: '17'
+ distribution: 'temurin'
+ - name: Cache Ivy dependencies
+ uses: actions/cache@v4
+ with:
+ path: ~/.ivy2/cache
+ key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml',
'src/plugin/**/ivy.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-ivy-
+ - name: Build with javac.version=${{ matrix.javac-version }}
+ run: ant clean runtime -Djavac.version=${{ matrix.javac-version }}
-buildfile build.xml
+ - name: Verify bytecode version
+ run: |
+ # Extract and verify the bytecode version of compiled classes
+ # Java 11 = major version 55, Java 17 = major version 61
+ EXPECTED_VERSION=${{ matrix.javac-version == '11' && '55' || '61' }}
+ echo "Expected major version: $EXPECTED_VERSION (Java ${{
matrix.javac-version }})"
+
+ # Find a real class file (exclude package-info.class which may have
different version)
+ cd build/classes
+ CLASS_FILE=$(find . -name "*.class" ! -name "package-info.class" |
head -1)
+ if [ -n "$CLASS_FILE" ]; then
+ echo "Checking: $CLASS_FILE"
+ ACTUAL_VERSION=$(javap -verbose "$CLASS_FILE" 2>/dev/null | grep
"major version" | awk '{print $NF}')
+ echo "Actual major version: $ACTUAL_VERSION"
+ if [ "$ACTUAL_VERSION" != "$EXPECTED_VERSION" ]; then
+ echo "ERROR: Bytecode version mismatch!"
+ exit 1
+ fi
+ echo "Bytecode version verified successfully"
+ else
+ echo "ERROR: No class files found"
+ exit 1
+ fi
+
+ # Verify runtime compatibility on Java 11
+ # This ensures the built artifacts can actually run on Java 11
+ runtime-java11:
+ needs: build
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v5
+ - name: Set up JDK 17 for building
+ uses: actions/setup-java@v5
+ with:
+ java-version: '17'
+ distribution: 'temurin'
+ - name: Cache Ivy dependencies
+ uses: actions/cache@v4
+ with:
+ path: ~/.ivy2/cache
+ key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml',
'src/plugin/**/ivy.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-ivy-
+ - name: Build with Java 11 target
+ run: ant clean runtime -Djavac.version=11 -buildfile build.xml
+ - name: Set up JDK 11 for runtime verification
+ uses: actions/setup-java@v5
+ with:
+ java-version: '11'
+ distribution: 'temurin'
+ - name: Verify runtime on Java 11
+ run: |
+ echo "Verifying Nutch can run on Java 11..."
+ java -version
+ cd runtime/local
+ # Actually load Java classes by running showproperties
+ # This invokes org.apache.nutch.tools.ShowProperties and verifies the JAR loads
+ bin/nutch showproperties | head -20
+ echo "Java 11 runtime verification complete"
+
+ # Tests run on JDK 17 (required by JUnit 6) with default javac.version=11
+ # Java 11 runtime compatibility is verified by the runtime-java11 job
tests:
strategy:
+ fail-fast: false
matrix:
- java: ['17']
os: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.os }}
timeout-minutes: 45
steps:
- uses: actions/checkout@v5
- - name: Set up JDK ${{ matrix.java }}
+ - name: Set up JDK 17
uses: actions/setup-java@v5
with:
- java-version: ${{ matrix.java }}
+ java-version: '17'
distribution: 'temurin'
- name: Cache Ivy dependencies
uses: actions/cache@v4
@@ -139,4 +239,4 @@ jobs:
path: |
./build/test/TEST-*.xml
./build/**/test/TEST-*.xml
- retention-days: 1
\ No newline at end of file
+ retention-days: 1
diff --git a/build.xml b/build.xml
index 38e549797..277225d24 100644
--- a/build.xml
+++ b/build.xml
@@ -50,9 +50,19 @@
<property name="ant-eclipse.jar"
value="${ivy.dir}/lib/ant-eclipse-1.0-jvm1.2.jar" />
- <condition property="using.jdk.11">
- <matches string="${java.version}" pattern="11.+" casesensitive="false" />
- </condition>
+ <!--
+ Java Version Strategy (see HADOOP-18887 for similar approach):
+
+ BUILD REQUIREMENT: Java 17+ is required to build Nutch and run unit tests
+ because JUnit 6 (Jupiter) requires Java 17+.
+
+ RUNTIME COMPATIBILITY: The compiled bytecode targets Java 11 by default
+ (javac.version=11 in default.properties), allowing the binary package
+ to run on Java 11+ environments. This is important for Hadoop clusters
+ that may not yet support Java 17 runtime.
+
+ To build with Java 17 bytecode target: ant -Djavac.version=17 ...
+ -->
<!-- the normal classpath -->
<path id="classpath">
@@ -201,7 +211,6 @@
otherwise the Javascript search is broken,
see https://bugs.openjdk.org/browse/JDK-8215291
-->
- <arg value="--no-module-directories" if:set="using.jdk.11"/>
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
@@ -501,6 +510,7 @@
<sysproperty key="test.build.data" value="${test.build.data}"/>
<sysproperty key="test.src.dir" value="${test.src.dir}"/>
<sysproperty key="test.include.slow" value="${test.include.slow}"/>
+ <sysproperty key="junit.platform.execution.failfast.enabled"
value="${test.failfast}"/>
<sysproperty key="javax.xml.parsers.DocumentBuilderFactory"
value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
</fork>
<fileset dir="${test.build.classes}">
@@ -516,6 +526,7 @@
<sysproperty key="test.build.data" value="${test.build.data}"/>
<sysproperty key="test.src.dir" value="${test.src.dir}"/>
<sysproperty key="test.include.slow" value="${test.include.slow}"/>
+ <sysproperty key="junit.platform.execution.failfast.enabled"
value="${test.failfast}"/>
<sysproperty key="javax.xml.parsers.DocumentBuilderFactory"
value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
</fork>
<fileset dir="${test.build.classes}">
@@ -653,7 +664,6 @@
otherwise the Javascript search is broken,
see https://bugs.openjdk.org/browse/JDK-8215291
-->
- <arg value="--no-module-directories" if:set="using.jdk.11"/>
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
diff --git a/default.properties b/default.properties
index 09bfa5594..68a9b304d 100644
--- a/default.properties
+++ b/default.properties
@@ -39,10 +39,14 @@ test.build.data = ${test.build.dir}/data
test.build.classes = ${test.build.dir}/classes
test.build.javadoc = ${test.build.dir}/docs/api
+# JUnit 6 fail-fast mode - stop on first test failure (true/false)
+# Enable with: ant test -Dtest.failfast=true
+test.failfast = false
+
# Proxy Host and Port to use for building JavaDoc
javadoc.proxy.host=-J-DproxyHost=
javadoc.proxy.port=-J-DproxyPort=
-javadoc.link.java=https://docs.oracle.com/en/java/javase/11/docs/api/
+javadoc.link.java=https://docs.oracle.com/en/java/javase/17/docs/api/
javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.4.2/api/
javadoc.packages=org.apache.nutch.*
@@ -53,6 +57,12 @@ bin.dist.version.dir=${dist.dir}/${final.name}-bin
javac.debug=on
javac.optimize=on
javac.deprecation=on
+
+# Java bytecode target version for compiled classes.
+# Set to 11 for backward-compatible runtime (works on Java 11+).
+# Note: Building and running tests requires Java 17+ (JUnit 6 requirement),
+# but the compiled artifacts will run on Java 11+.
+# Override with: ant -Djavac.version=17 to target Java 17 bytecode.
javac.version=11
runtime.dir=./runtime
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2eb218bad..93985f228 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -42,12 +42,12 @@ WORKDIR /root/
# Install dependencies
RUN apk update
-RUN apk --no-cache add apache-ant bash git openjdk11 supervisor
+RUN apk --no-cache add apache-ant bash git openjdk17 supervisor
# Establish environment variables
-RUN echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk' >> $HOME/.bashrc
-RUN echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk' >> $HOME/.ashrc
-ENV JAVA_HOME='/usr/lib/jvm/java-11-openjdk'
+RUN echo 'export JAVA_HOME=/usr/lib/jvm/java-17-openjdk' >> $HOME/.bashrc
+RUN echo 'export JAVA_HOME=/usr/lib/jvm/java-17-openjdk' >> $HOME/.ashrc
+ENV JAVA_HOME='/usr/lib/jvm/java-17-openjdk'
ENV NUTCH_HOME='/root/nutch_source/runtime/local'
# Checkout and build the Nutch master branch (1.x)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 9b38d2fa9..06e269bf5 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -116,15 +116,17 @@
<exclude org="org.gnu.inet" module="libidn" /><!-- LGPL
2.1 -->
</dependency>
- <dependency org="org.hamcrest" name="hamcrest" rev="3.0"
conf="test->default"/>
- <!-- Required for <junitlauncher> task -->
- <dependency org="org.junit.platform" name="junit-platform-launcher"
rev="1.14.1" conf="test->default"/>
- <!-- Required for JUnit 5 (Jupiter) test execution -->
- <dependency org="org.junit.jupiter" name="junit-jupiter-engine"
rev="5.14.1" conf="test->default"/>
- <dependency org="org.junit.jupiter" name="junit-jupiter-api"
rev="5.14.1" conf="test->default"/>
- <!-- Mockito for mocking in tests -->
- <dependency org="org.mockito" name="mockito-core" rev="5.18.0"
conf="test->default"/>
- <dependency org="org.mockito" name="mockito-junit-jupiter"
rev="5.18.0" conf="test->default"/>
+ <dependency org="org.hamcrest" name="hamcrest" rev="3.0"
conf="test->default"/>
+ <!-- JSpecify nullability annotations for improved null safety -->
+ <dependency org="org.jspecify" name="jspecify" rev="1.0.0"
conf="*->default"/>
+ <!-- Required for <junitlauncher> task -->
+ <dependency org="org.junit.platform" name="junit-platform-launcher"
rev="6.0.3" conf="test->default"/>
+ <!-- Required for JUnit 6 (Jupiter) test execution -->
+ <dependency org="org.junit.jupiter" name="junit-jupiter-engine"
rev="6.0.3" conf="test->default"/>
+ <dependency org="org.junit.jupiter" name="junit-jupiter-api" rev="6.0.3"
conf="test->default"/>
+ <!-- Mockito for mocking in tests -->
+ <dependency org="org.mockito" name="mockito-core" rev="5.18.0"
conf="test->default"/>
+ <dependency org="org.mockito" name="mockito-junit-jupiter" rev="5.18.0"
conf="test->default"/>
<!-- Jetty used to serve test pages for unit tests, but is also
provided as dependency of Hadoop -->
<dependency org="org.eclipse.jetty" name="jetty-server"
rev="12.1.5" conf="test->default">
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index b0aca7103..f1787ed03 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -218,6 +218,7 @@
<jvmarg value="-Xmx1000m"/>
<sysproperty key="test.data" value="${build.test}/data"/>
<sysproperty key="test.input" value="${root}/data"/>
+ <sysproperty key="junit.platform.execution.failfast.enabled"
value="${test.failfast}"/>
<sysproperty key="javax.xml.parsers.DocumentBuilderFactory"
value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
</fork>
<fileset dir="${build.test}">
@@ -232,6 +233,7 @@
<jvmarg value="-Xmx1000m"/>
<sysproperty key="test.data" value="${build.test}/data"/>
<sysproperty key="test.input" value="${root}/data"/>
+ <sysproperty key="junit.platform.execution.failfast.enabled"
value="${test.failfast}"/>
<sysproperty key="javax.xml.parsers.DocumentBuilderFactory"
value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/>
</fork>
<fileset dir="${basedir}">
diff --git a/src/test/junit-platform.properties
b/src/test/junit-platform.properties
new file mode 100644
index 000000000..b2f5ccaea
--- /dev/null
+++ b/src/test/junit-platform.properties
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# JUnit Platform Configuration
+# See: https://junit.org/junit5/docs/current/user-guide/#running-tests-config-params
+
+# Fail-fast mode - when true, test execution stops on the first failure
+# (faster feedback during development). It is disabled (false) by default here.
+# Set to true to enable it; leave false or remove this line to run all tests
+junit.platform.execution.failfast.enabled=false
+
+# Display names for tests - use method names by default
+junit.jupiter.displayname.generator.default=org.junit.jupiter.api.DisplayNameGenerator$Standard
+
+# Timeout configuration for individual tests (can be overridden with @Timeout)
+# junit.jupiter.execution.timeout.default=5m
+
+# Parallel execution configuration (disabled by default for deterministic results)
+junit.jupiter.execution.parallel.enabled=false
diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
index 9e96071a0..581b528f3 100644
--- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
+++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -24,6 +24,8 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import org.jspecify.annotations.NonNull;
+import org.jspecify.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -53,6 +55,10 @@ import org.eclipse.jetty.server.ServerConnector;
import org.eclipse.jetty.server.handler.ContextHandler;
import org.eclipse.jetty.server.handler.ResourceHandler;
+/**
+ * Test utility for creating and manipulating CrawlDb instances.
+ * Uses JSpecify annotations for null safety.
+ */
public class CrawlDBTestUtil {
private static final Logger LOG = LoggerFactory
@@ -62,6 +68,8 @@ public class CrawlDBTestUtil {
/**
* Creates synthetic crawldb
*
+ * @param conf
+ * configuration to use
* @param fs
* filesystem where db will be created
* @param crawldb
@@ -70,8 +78,8 @@ public class CrawlDBTestUtil {
* urls to be inserted, objects are of type URLCrawlDatum
* @throws Exception
*/
- public static void createCrawlDb(Configuration conf, FileSystem fs,
- Path crawldb, List<URLCrawlDatum> init) throws Exception {
+ public static void createCrawlDb(@NonNull Configuration conf, @NonNull
FileSystem fs,
+ @NonNull Path crawldb, @NonNull List<URLCrawlDatum> init) throws
Exception {
LOG.trace("* creating crawldb: {}", crawldb);
Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
@@ -366,8 +374,9 @@ public class CrawlDBTestUtil {
* override the default one and it is currently not possible to use
* dynamically set values.
*
- * @return
+ * @return a new Reducer Context with test configuration
*/
+ @NonNull
public static Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context
createContext() {
DummyContext context = new DummyContext();
Configuration conf = context.getConfiguration();
@@ -376,13 +385,16 @@ public class CrawlDBTestUtil {
return (Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context) context;
}
+ /** Container for URL and CrawlDatum pairs used in test data. */
public static class URLCrawlDatum {
+ @NonNull
public Text url;
+ @NonNull
public CrawlDatum datum;
- public URLCrawlDatum(Text url, CrawlDatum datum) {
+ public URLCrawlDatum(@NonNull Text url, @NonNull CrawlDatum datum) {
this.url = url;
this.datum = datum;
}
@@ -391,20 +403,27 @@ public class CrawlDBTestUtil {
/**
* Generate seedlist
*
+ * @param fs filesystem to use
+ * @param urlPath path where seed file will be created
+ * @param urls list of URLs to write
* @throws IOException
*/
- public static void generateSeedList(FileSystem fs, Path urlPath,
- List<String> urls) throws IOException {
+ public static void generateSeedList(@NonNull FileSystem fs, @NonNull Path
urlPath,
+ @NonNull List<String> urls) throws IOException {
generateSeedList(fs, urlPath, urls, new ArrayList<String>());
}
/**
- * Generate seedlist
+ * Generate seedlist with optional metadata
*
+ * @param fs filesystem to use
+ * @param urlPath path where seed file will be created
+ * @param urls list of URLs to write
+ * @param metadata optional metadata for each URL
* @throws IOException
*/
- public static void generateSeedList(FileSystem fs, Path urlPath,
- List<String> urls, List<String> metadata) throws IOException {
+ public static void generateSeedList(@NonNull FileSystem fs, @NonNull Path
urlPath,
+ @NonNull List<String> urls, @NonNull List<String> metadata) throws
IOException {
FSDataOutputStream out;
Path file = new Path(urlPath, "urls.txt");
fs.mkdirs(urlPath);
@@ -439,9 +458,11 @@ public class CrawlDBTestUtil {
* port to listen to
* @param staticContent
* folder where static content lives
+ * @return configured Jetty server instance
* @throws UnknownHostException
*/
- public static Server getServer(int port, String staticContent)
+ @NonNull
+ public static Server getServer(int port, @NonNull String staticContent)
throws UnknownHostException {
Server webServer = new Server();
diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java
b/src/test/org/apache/nutch/fetcher/TestFetcher.java
index f25cab545..176a88a52 100644
--- a/src/test/org/apache/nutch/fetcher/TestFetcher.java
+++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java
@@ -28,14 +28,18 @@ import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.CancellationAwareTestUtils;
+import org.apache.nutch.util.CancellationAwareTestUtils.CancellationToken;
import org.eclipse.jetty.server.Server;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -44,6 +48,8 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
* Basic fetcher test 1. generate seedlist 2. inject 3. generate 3. fetch 4.
* Verify contents
*
+ * <p>This test is cancellation-aware and will exit gracefully if the test
+ * suite is stopped early (e.g., due to fail-fast mode).</p>
*/
public class TestFetcher {
@@ -81,7 +87,10 @@ public class TestFetcher {
}
@Test
+ @Timeout(value = 5, unit = TimeUnit.MINUTES)
public void testFetch() throws IOException, ClassNotFoundException,
InterruptedException {
+ // Create cancellation token for graceful shutdown support
+ CancellationToken cancellationToken =
CancellationAwareTestUtils.createToken();
// generate seedlist
ArrayList<String> urls = new ArrayList<String>();
@@ -95,15 +104,22 @@ public class TestFetcher {
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+ // Check for cancellation before long-running operations
+ cancellationToken.throwIfCancelled();
+
// inject
Injector injector = new Injector(conf);
injector.inject(crawldbPath, urlPath);
+ cancellationToken.throwIfCancelled();
+
// generate
Generator g = new Generator(conf);
Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
Long.MAX_VALUE, Long.MAX_VALUE, false, false, false, 1, null);
+ cancellationToken.throwIfCancelled();
+
long time = System.currentTimeMillis();
// fetch
Fetcher fetcher = new Fetcher(conf);
@@ -115,6 +131,11 @@ public class TestFetcher {
time = System.currentTimeMillis() - time;
+ // Skip verification if cancelled
+ if (cancellationToken.isCancelled()) {
+ return;
+ }
+
// verify politeness, time taken should be more than (num_of_pages
+1)*delay
int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
"fetcher.server.delay", 5));
@@ -127,18 +148,28 @@ public class TestFetcher {
ArrayList<String> handledurls = new ArrayList<String>();
- READ_CONTENT: do {
- Text key = new Text();
- Content value = new Content();
- if (!reader.next(key, value))
- break READ_CONTENT;
- String contentString = new String(value.getContent());
- if (contentString.indexOf("Nutch fetcher test page") != -1) {
- handledurls.add(key.toString());
- }
- } while (true);
+ try {
+ READ_CONTENT: do {
+ // Check for cancellation periodically during I/O operations
+ if (cancellationToken.isCancelled()) break READ_CONTENT;
+
+ Text key = new Text();
+ Content value = new Content();
+ if (!reader.next(key, value))
+ break READ_CONTENT;
+ String contentString = new String(value.getContent());
+ if (contentString.indexOf("Nutch fetcher test page") != -1) {
+ handledurls.add(key.toString());
+ }
+ } while (true);
+ } finally {
+ reader.close();
+ }
- reader.close();
+ // Skip remaining verification if cancelled
+ if (cancellationToken.isCancelled()) {
+ return;
+ }
Collections.sort(urls);
Collections.sort(handledurls);
@@ -157,22 +188,32 @@ public class TestFetcher {
new Path(generatedSegment[0], ParseData.DIR_NAME),
"part-r-00000/data");
reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(parseData));
- READ_PARSE_DATA: do {
- Text key = new Text();
- ParseData value = new ParseData();
- if (!reader.next(key, value))
- break READ_PARSE_DATA;
- // make sure they all contain "nutch.segment.name" and
- // "nutch.content.digest"
- // keys in parse metadata
- Metadata contentMeta = value.getContentMeta();
- if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
- && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
- handledurls.add(key.toString());
- }
- } while (true);
+ try {
+ READ_PARSE_DATA: do {
+ // Check for cancellation periodically
+ if (cancellationToken.isCancelled()) break READ_PARSE_DATA;
+
+ Text key = new Text();
+ ParseData value = new ParseData();
+ if (!reader.next(key, value))
+ break READ_PARSE_DATA;
+ // make sure they all contain "nutch.segment.name" and
+ // "nutch.content.digest"
+ // keys in parse metadata
+ Metadata contentMeta = value.getContentMeta();
+ if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
+ && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
+ handledurls.add(key.toString());
+ }
+ } while (true);
+ } finally {
+ reader.close();
+ }
- reader.close();
+ // Skip final assertions if cancelled
+ if (cancellationToken.isCancelled()) {
+ return;
+ }
Collections.sort(handledurls);
diff --git a/src/test/org/apache/nutch/segment/TestSegmentMerger.java
b/src/test/org/apache/nutch/segment/TestSegmentMerger.java
index 0df88a2de..9cc076ad6 100644
--- a/src/test/org/apache/nutch/segment/TestSegmentMerger.java
+++ b/src/test/org/apache/nutch/segment/TestSegmentMerger.java
@@ -17,6 +17,7 @@
package org.apache.nutch.segment;
import java.text.DecimalFormat;
+import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
@@ -28,16 +29,23 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.util.CancellationAwareTestUtils;
+import org.apache.nutch.util.CancellationAwareTestUtils.CancellationToken;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.api.BeforeEach;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
+/**
+ * Tests for SegmentMerger functionality.
+ * This test is cancellation-aware for graceful shutdown during fail-fast mode.
+ */
public class TestSegmentMerger {
Configuration conf;
FileSystem fs;
@@ -106,9 +114,19 @@ public class TestSegmentMerger {
}
@Test
+ @Timeout(value = 10, unit = TimeUnit.MINUTES)
public void testLargeMerge() throws Exception {
+ // Create cancellation token for graceful shutdown support
+ CancellationToken cancellationToken =
CancellationAwareTestUtils.createToken();
+
SegmentMerger merger = new SegmentMerger(conf);
merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1);
+
+ // Check for cancellation before verification
+ if (cancellationToken.isCancelled()) {
+ return;
+ }
+
// verify output
FileStatus[] stats = fs.listStatus(out);
// there should be just one path
@@ -119,20 +137,37 @@ public class TestSegmentMerger {
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(new Path(
outSeg, ParseText.DIR_NAME), conf);
int cnt1 = 0, cnt2 = 0;
- for (MapFile.Reader r : readers) {
- while (r.next(k, v)) {
- String ks = k.toString();
- String vs = v.getText();
- if (ks.startsWith("seg1-")) {
- cnt1++;
- assertTrue(vs.startsWith("seg1 "));
- } else if (ks.startsWith("seg2-")) {
- cnt2++;
- assertTrue(vs.startsWith("seg2 "));
+ try {
+ for (MapFile.Reader r : readers) {
+ while (r.next(k, v)) {
+ // Check for cancellation periodically during I/O
+ if (cancellationToken.isCancelled()) {
+ return;
+ }
+
+ String ks = k.toString();
+ String vs = v.getText();
+ if (ks.startsWith("seg1-")) {
+ cnt1++;
+ assertTrue(vs.startsWith("seg1 "));
+ } else if (ks.startsWith("seg2-")) {
+ cnt2++;
+ assertTrue(vs.startsWith("seg2 "));
+ }
}
}
- r.close();
+ } finally {
+ // Ensure readers are closed even on cancellation
+ for (MapFile.Reader r : readers) {
+ r.close();
+ }
+ }
+
+ // Skip final assertions if cancelled
+ if (cancellationToken.isCancelled()) {
+ return;
}
+
assertEquals(countSeg1, cnt1);
assertEquals(countSeg2, cnt2);
}
diff --git a/src/test/org/apache/nutch/util/CancellationAwareTestUtils.java
b/src/test/org/apache/nutch/util/CancellationAwareTestUtils.java
new file mode 100644
index 000000000..055a64261
--- /dev/null
+++ b/src/test/org/apache/nutch/util/CancellationAwareTestUtils.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import org.jspecify.annotations.NonNull;
+import org.jspecify.annotations.Nullable;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.BooleanSupplier;
+
+/**
+ * Utility class for making long-running tests cancellation-aware.
+ *
+ * <p>This supports JUnit 6's fail-fast mode by allowing tests to check
+ * for cancellation requests and exit gracefully, ensuring proper resource
+ * cleanup even when the test suite is stopped early.</p>
+ *
+ * <p>Usage example:</p>
+ * <pre>{@code
+ * @Test
+ * @Timeout(value = 5, unit = TimeUnit.MINUTES)
+ * void testLongRunningOperation() throws Exception {
+ * CancellationAwareTestUtils.CancellationToken token =
+ * CancellationAwareTestUtils.createToken();
+ *
+ * try {
+ * while (hasMoreWork() && !token.isCancelled()) {
+ * doWork();
+ * }
+ * } finally {
+ * cleanup();
+ * }
+ * }
+ * }</pre>
+ */
+public class CancellationAwareTestUtils {
+
+ /**
+ * A simple cancellation token that can be checked during long-running operations.
+ * Cancellation is also reported once isCancelled() observes that the calling thread is interrupted.
+ */
+ public static class CancellationToken {
+ private final AtomicBoolean cancelled = new AtomicBoolean(false);
+ @Nullable
+ private final BooleanSupplier additionalCheck;
+
+ CancellationToken(@Nullable BooleanSupplier additionalCheck) {
+ this.additionalCheck = additionalCheck;
+ }
+
+ /**
+ * Check if cancellation has been requested.
+ * This checks both explicit cancellation and thread interruption.
+ *
+ * @return true if the operation should be cancelled
+ */
+ public boolean isCancelled() {
+ if (cancelled.get()) {
+ return true;
+ }
+ if (Thread.currentThread().isInterrupted()) {
+ cancelled.set(true);
+ return true;
+ }
+ if (additionalCheck != null && additionalCheck.getAsBoolean()) {
+ cancelled.set(true);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Explicitly request cancellation.
+ */
+ public void cancel() {
+ cancelled.set(true);
+ }
+
+ /**
+ * Throws InterruptedException if cancellation has been requested.
+ * Useful for cooperative cancellation in loops.
+ *
+ * @throws InterruptedException if cancelled
+ */
+ public void throwIfCancelled() throws InterruptedException {
+ if (isCancelled()) {
+ throw new InterruptedException("Test cancelled");
+ }
+ }
+ }
+
+ /**
+ * Creates a new cancellation token.
+ *
+ * @return a new CancellationToken instance
+ */
+ @NonNull
+ public static CancellationToken createToken() {
+ return new CancellationToken(null);
+ }
+
+ /**
+ * Creates a cancellation token with an additional cancellation condition.
+ *
+ * @param additionalCheck additional condition that triggers cancellation
+ * @return a new CancellationToken instance
+ */
+ @NonNull
+ public static CancellationToken createToken(@NonNull BooleanSupplier
additionalCheck) {
+ return new CancellationToken(additionalCheck);
+ }
+
+ /**
+ * Executes an operation with periodic cancellation checks.
+ *
+ * @param token the cancellation token to check
+ * @param operation the operation to execute (should be short-lived)
+ * @param iterations number of times to execute the operation
+ * @param checkInterval how often to check for cancellation (every N iterations); must be positive
+ * @return the number of iterations actually completed
+ * @throws InterruptedException if cancelled during execution
+ */
+ public static int executeWithCancellation(
+ @NonNull CancellationToken token,
+ @NonNull Runnable operation,
+ int iterations,
+ int checkInterval) throws InterruptedException {
+
+ int completed = 0;
+ for (int i = 0; i < iterations; i++) {
+ if (i % checkInterval == 0) {
+ token.throwIfCancelled();
+ }
+ operation.run();
+ completed++;
+ }
+ return completed;
+ }
+
+ /**
+ * Sleeps for the specified duration while remaining cancellation-aware.
+ * Checks for cancellation every 100ms.
+ *
+ * @param token the cancellation token
+ * @param millis total milliseconds to sleep
+ * @throws InterruptedException if cancelled or interrupted
+ */
+ public static void sleepWithCancellation(@NonNull CancellationToken token,
long millis)
+ throws InterruptedException {
+ long remaining = millis;
+ while (remaining > 0) {
+ token.throwIfCancelled();
+ long sleepTime = Math.min(remaining, 100);
+ Thread.sleep(sleepTime);
+ remaining -= sleepTime;
+ }
+ }
+
+ /**
+ * Interface for operations that can be interrupted and resumed.
+ *
+ * @param <T> the result type
+ */
+ @FunctionalInterface
+ public interface CancellableOperation<T> {
+ /**
+ * Execute a portion of the operation.
+ *
+ * @param token cancellation token to check
+ * @return the result, or null if more work is needed
+ * @throws Exception if the operation fails
+ */
+ @Nullable
+ T execute(@NonNull CancellationToken token) throws Exception;
+ }
+
+ /**
+ * Runs a cancellable operation, returning null if cancelled before completion.
+ *
+ * @param <T> the result type
+ * @param operation the operation to run
+ * @return the result, or null if cancelled
+ * @throws Exception if the operation fails (not due to cancellation)
+ */
+ @Nullable
+ public static <T> T runCancellable(@NonNull CancellableOperation<T>
operation) throws Exception {
+ CancellationToken token = createToken();
+ try {
+ return operation.execute(token);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ return null;
+ }
+ }
+}
diff --git a/src/test/org/apache/nutch/util/WritableTestUtils.java
b/src/test/org/apache/nutch/util/WritableTestUtils.java
index d4429dbf3..632849e71 100644
--- a/src/test/org/apache/nutch/util/WritableTestUtils.java
+++ b/src/test/org/apache/nutch/util/WritableTestUtils.java
@@ -18,24 +18,31 @@ package org.apache.nutch.util;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
+import org.jspecify.annotations.NonNull;
+import org.jspecify.annotations.Nullable;
import static org.junit.jupiter.api.Assertions.assertEquals;
+/**
+ * Utility methods for testing Hadoop Writable implementations.
+ * Uses JSpecify annotations for null safety.
+ */
public class WritableTestUtils {
/** Utility method for testing writables. */
- public static void testWritable(Writable before) throws Exception {
+ public static void testWritable(@NonNull Writable before) throws Exception {
testWritable(before, null);
}
/** Utility method for testing writables. */
- public static void testWritable(Writable before, Configuration conf)
+ public static void testWritable(@NonNull Writable before, @Nullable
Configuration conf)
throws Exception {
assertEquals(before, writeRead(before, conf));
}
/** Utility method for testing writables. */
- public static Writable writeRead(Writable before, Configuration conf)
+ @NonNull
+ public static Writable writeRead(@NonNull Writable before, @Nullable
Configuration conf)
throws Exception {
DataOutputBuffer dob = new DataOutputBuffer();