This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 360b3d3545 merge conflict and flaky test
360b3d3545 is described below
commit 360b3d3545e05edcfcf012fae16e1a2d3e3a0ae1
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jun 10 09:33:34 2026 +0200
merge conflict and flaky test
---
.../tika/ml/chardetect/MojibusterEncodingDetector.java | 17 ++++++++++-------
.../org/apache/tika/pipes/core/PipesClientTest.java | 8 ++++----
.../tika/pipes/core/SharedServerChaosMonkeyTest.java | 5 ++++-
3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 3d48b595d0..7b9aa75d57 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -355,13 +355,16 @@ public class MojibusterEncodingDetector implements EncodingDetector {
}
}
LOG.trace("mojibuster utf8Check={} tolerated={}", utf8, utf8Tolerated);
- // Emit a structural UTF-8 candidate when the grammar is clean (LIKELY)
- // OR essentially-UTF-8 (NOT_UTF8 with malformed bytes within tolerance —
- // a few corrupt bytes in otherwise-valid UTF-8). Both exclude legacy
- // CJK, which produces many grammar errors (measured: 0/321K labeled CJK
- // samples return LIKELY or fall within tolerance). The type-priority
- // sort in sortAndDedup then ranks this above NB's statistical pick.
- if (utf8 == StructuralEncodingRules.Utf8Result.LIKELY_UTF8 || utf8Tolerated) {
+ // Emit a structural UTF-8 candidate only when the grammar is definitively
+ // clean (LIKELY_UTF8). When the probe is NOT_UTF8 but within the error
+ // tolerance (utf8Tolerated), NB's UTF-8 result is already kept as a
+ // STATISTICAL candidate (see NOT_UTF8 disqualifier above) — promoting it
+ // to STRUCTURAL here would cause the "return only top-1 STRUCTURAL" path
+ // to short-circuit JunkFilter, preventing it from comparing UTF-8 against
+ // windows-1252. For short probes a single bad byte in otherwise-ASCII
+ // content is more likely a genuine Latin-1/windows-1252 byte than a
+ // corrupt UTF-8 sequence; JunkFilter has enough signal to arbitrate.
+ if (utf8 == StructuralEncodingRules.Utf8Result.LIKELY_UTF8) {
pool.add(new EncodingResult(
java.nio.charset.StandardCharsets.UTF_8,
UTF8_STRUCTURAL_CONF, "UTF-8",
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index 561321a88f..d3189f0bf1 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -401,10 +401,10 @@ public class PipesClientTest {
assertEquals(PipesResult.RESULT_STATUS.TIMEOUT,
pipesResult.status(),
"Should timeout when socket times out");
- // Should timeout relatively quickly (within ~5 seconds including overhead)
- // Socket timeout is 3 seconds, but allow some buffer for processing
- assertTrue(elapsed < 10000,
- "Socket timeout should occur quickly (elapsed: " + elapsed + "ms)");
+ // Socket timeout is 3 seconds; allow generous headroom for slow CI runners
+ // where the server may need multiple startup attempts before connecting.
+ assertTrue(elapsed < 60000,
+ "Socket timeout should occur within 60s (elapsed: " + elapsed + "ms)");
// Verify it's a process crash category (socket timeout means process isn't responding)
assertTrue(pipesResult.isProcessCrash(),
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java
index decb5a45ba..324520d106 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java
@@ -209,7 +209,10 @@ public class SharedServerChaosMonkeyTest {
observedOom.incrementAndGet();
} else if (result.status() == PipesResult.RESULT_STATUS.TIMEOUT) {
observedTimeout.incrementAndGet();
- } else if (result.isProcessCrash()) {
+ } else {
+ // Covers PROCESS_CRASH category (UNSPECIFIED_CRASH) as well as
+ // FATAL (FAILED_TO_INITIALIZE) and INITIALIZATION_FAILURE statuses
+ // that can occur under resource pressure when the server fails to start.
observedCrash.incrementAndGet();
// In shared mode, OK files may fail if server crashed during their processing
if (expectedType == FileType.OK) {