This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 360b3d3545 merge conflict and flaky test
360b3d3545 is described below

commit 360b3d3545e05edcfcf012fae16e1a2d3e3a0ae1
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jun 10 09:33:34 2026 +0200

    merge conflict and flaky test
---
 .../tika/ml/chardetect/MojibusterEncodingDetector.java  | 17 ++++++++++-------
 .../org/apache/tika/pipes/core/PipesClientTest.java     |  8 ++++----
 .../tika/pipes/core/SharedServerChaosMonkeyTest.java    |  5 ++++-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 3d48b595d0..7b9aa75d57 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -355,13 +355,16 @@ public class MojibusterEncodingDetector implements EncodingDetector {
             }
         }
         LOG.trace("mojibuster utf8Check={} tolerated={}", utf8, utf8Tolerated);
-        // Emit a structural UTF-8 candidate when the grammar is clean (LIKELY)
-        // OR essentially-UTF-8 (NOT_UTF8 with malformed bytes within tolerance —
-        // a few corrupt bytes in otherwise-valid UTF-8).  Both exclude legacy
-        // CJK, which produces many grammar errors (measured: 0/321K labeled CJK
-        // samples return LIKELY or fall within tolerance).  The type-priority
-        // sort in sortAndDedup then ranks this above NB's statistical pick.
-        if (utf8 == StructuralEncodingRules.Utf8Result.LIKELY_UTF8 || utf8Tolerated) {
+        // Emit a structural UTF-8 candidate only when the grammar is definitively
+        // clean (LIKELY_UTF8).  When the probe is NOT_UTF8 but within the error
+        // tolerance (utf8Tolerated), NB's UTF-8 result is already kept as a
+        // STATISTICAL candidate (see NOT_UTF8 disqualifier above) — promoting it
+        // to STRUCTURAL here would cause the "return only top-1 STRUCTURAL" path
+        // to short-circuit JunkFilter, preventing it from comparing UTF-8 against
+        // windows-1252.  For short probes a single bad byte in otherwise-ASCII
+        // content is more likely a genuine Latin-1/windows-1252 byte than a
+        // corrupt UTF-8 sequence; JunkFilter has enough signal to arbitrate.
+        if (utf8 == StructuralEncodingRules.Utf8Result.LIKELY_UTF8) {
             pool.add(new EncodingResult(
                     java.nio.charset.StandardCharsets.UTF_8,
                     UTF8_STRUCTURAL_CONF, "UTF-8",
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index 561321a88f..d3189f0bf1 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -401,10 +401,10 @@ public class PipesClientTest {
             assertEquals(PipesResult.RESULT_STATUS.TIMEOUT, pipesResult.status(),
                     "Should timeout when socket times out");
 
-            // Should timeout relatively quickly (within ~5 seconds including overhead)
-            // Socket timeout is 3 seconds, but allow some buffer for processing
-            assertTrue(elapsed < 10000,
-                    "Socket timeout should occur quickly (elapsed: " + elapsed + "ms)");
+            // Socket timeout is 3 seconds; allow generous headroom for slow CI runners
+            // where the server may need multiple startup attempts before connecting.
+            assertTrue(elapsed < 60000,
+                    "Socket timeout should occur within 60s (elapsed: " + elapsed + "ms)");
 
             // Verify it's a process crash category (socket timeout means process isn't responding)
             assertTrue(pipesResult.isProcessCrash(),
diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java
index decb5a45ba..324520d106 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/SharedServerChaosMonkeyTest.java
@@ -209,7 +209,10 @@ public class SharedServerChaosMonkeyTest {
                     observedOom.incrementAndGet();
                 } else if (result.status() == PipesResult.RESULT_STATUS.TIMEOUT) {
                     observedTimeout.incrementAndGet();
-                } else if (result.isProcessCrash()) {
+                } else {
+                    // Covers PROCESS_CRASH category (UNSPECIFIED_CRASH) as well as
+                    // FATAL (FAILED_TO_INITIALIZE) and INITIALIZATION_FAILURE statuses
+                    // that can occur under resource pressure when the server fails to start.
                     observedCrash.incrementAndGet();
                     // In shared mode, OK files may fail if server crashed during their processing
                     if (expectedType == FileType.OK) {

Reply via email to