This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4624
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 500ef0dbe1c35f261606abd5833e642ef00510e6
Author: tallison <[email protected]>
AuthorDate: Thu Jan 15 12:53:19 2026 -0500

    WIP: Checkpoint - CachingSource metadata update and cleanup
    
    - Remove tmp parameter from TikaInputSource.getPath()
    - Add Metadata to CachingSource constructor
    - Update CONTENT_LENGTH when spilling to disk
    - Add CachingSource unit tests
    - Remove markLimit from digesters and configs (obsolete now that
      streams are rewound via TikaInputStream#enableRewind/#rewind
      instead of mark/reset with a spool threshold)
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
    
    Co-Authored-By: Claude Opus 4.5 <[email protected]>
---
 .../main/java/org/apache/tika/digest/Digester.java |  11 +-
 .../java/org/apache/tika/io/ByteArraySource.java   |   6 +-
 .../java/org/apache/tika/io/CachingSource.java     |  17 ++-
 .../main/java/org/apache/tika/io/FileSource.java   |   2 +-
 .../java/org/apache/tika/io/TikaInputSource.java   |   3 +-
 .../java/org/apache/tika/io/TikaInputStream.java   |  10 +-
 .../org/apache/tika/io/TikaInputStreamTest.java    | 116 +++++++++++++++++++++
 .../parser/digestutils/BouncyCastleDigester.java   |  31 +++---
 .../digestutils/BouncyCastleDigesterFactory.java   |  14 +--
 .../tika/parser/digestutils/CommonsDigester.java   |  29 +++---
 .../parser/digestutils/CommonsDigesterFactory.java |  14 +--
 .../apache/tika/detect/ole/MiscOLEDetector.java    |  20 ----
 .../src/test/resources/configs/tika-4533.json      |   1 -
 .../configs/tika-config-bc-digests-base32.json     |   1 -
 .../configs/tika-config-bc-digests-basic.json      |   1 -
 .../configs/tika-config-bc-digests-multiple.json   |   1 -
 .../configs/tika-config-commons-digests-basic.json |   1 -
 .../configs/tika-config-digests-pdf-only.json      |   1 -
 .../tika-config-digests-skip-container.json        |   1 -
 .../resources/configs/tika-config-digests.json     |   1 -
 .../resources/configs/tika-config-md5-digest.json  |   1 -
 .../configs/tika-config-write-filter.json          |   1 -
 .../resources/configs/cxf-test-base-template.json  |   1 -
 .../resources/configs/cxf-test-base-template.json  |   1 -
 .../configs/tika-config-for-server-tests.json      |   1 -
 .../tika-config-langdetect-opennlp-filter.json     |   1 -
 .../tika-config-langdetect-optimaize-filter.json   |   1 -
 27 files changed, 175 insertions(+), 113 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/digest/Digester.java 
b/tika-core/src/main/java/org/apache/tika/digest/Digester.java
index ac6459607c..133d5dce09 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/Digester.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/Digester.java
@@ -30,14 +30,11 @@ import org.apache.tika.parser.ParseContext;
 public interface Digester {
     /**
      * Digests a TikaInputStream and sets the appropriate value(s) in the 
metadata.
-     * The Digester is also responsible for marking and resetting the stream.
+     * The Digester is responsible for calling {@link 
TikaInputStream#enableRewind()}
+     * and {@link TikaInputStream#rewind()} to ensure the stream can be read by
+     * subsequent processing after digesting.
      * <p>
-     * The given stream is guaranteed to support the
-     * {@link TikaInputStream#markSupported() mark feature} and the detector
-     * is expected to {@link TikaInputStream#mark(int) mark} the stream before
-     * reading any bytes from it, and to {@link TikaInputStream#reset() reset}
-     * the stream before returning. The stream must not be closed by the
-     * detector.
+     * The stream must not be closed by the digester.
      *
      * @param tis          TikaInputStream to digest
      * @param m            Metadata to set the values for
diff --git a/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java 
b/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
index 3d19a04a5d..148fcc04d8 100644
--- a/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
@@ -36,12 +36,14 @@ class ByteArraySource extends InputStream implements 
TikaInputSource {
 
     private final byte[] data;
     private final int length;
+    private final TemporaryResources tmp;
     private int position;
     private Path spilledPath;
 
-    ByteArraySource(byte[] data) {
+    ByteArraySource(byte[] data, TemporaryResources tmp) {
         this.data = data;
         this.length = data.length;
+        this.tmp = tmp;
         this.position = 0;
         this.spilledPath = null;
     }
@@ -97,7 +99,7 @@ class ByteArraySource extends InputStream implements 
TikaInputSource {
     }
 
     @Override
-    public Path getPath(TemporaryResources tmp, String suffix) throws 
IOException {
+    public Path getPath(String suffix) throws IOException {
         if (spilledPath == null) {
             // Spill to temp file on first call
             spilledPath = tmp.createTempFile(suffix);
diff --git a/tika-core/src/main/java/org/apache/tika/io/CachingSource.java 
b/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
index baf38c7cd8..15a43d0b33 100644
--- a/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
@@ -24,6 +24,9 @@ import java.nio.file.Path;
 
 import org.apache.commons.io.IOUtils;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.StringUtils;
+
 /**
  * Input source that caches bytes from a raw InputStream.
  * <p>
@@ -33,17 +36,17 @@ import org.apache.commons.io.IOUtils;
  */
 class CachingSource extends InputStream implements TikaInputSource {
 
-    private final TemporaryResources tmp;
     private CachingInputStream cachingStream;
     private long length;
+    private final Metadata metadata;
 
     // After spilling to file, we switch to file-backed mode
     private Path spilledPath;
     private InputStream fileStream;
 
-    CachingSource(InputStream source, TemporaryResources tmp, long length) {
-        this.tmp = tmp;
+    CachingSource(InputStream source, TemporaryResources tmp, long length, 
Metadata metadata) {
         this.length = length;
+        this.metadata = metadata;
         StreamCache cache = new StreamCache(tmp);
         this.cachingStream = new CachingInputStream(
                 source instanceof BufferedInputStream ? source : new 
BufferedInputStream(source),
@@ -104,7 +107,7 @@ class CachingSource extends InputStream implements 
TikaInputSource {
     }
 
     @Override
-    public Path getPath(TemporaryResources tmp, String suffix) throws 
IOException {
+    public Path getPath(String suffix) throws IOException {
         if (spilledPath == null) {
             // Spill to file and switch to file-backed mode
             spilledPath = cachingStream.spillToFile(suffix);
@@ -127,6 +130,12 @@ class CachingSource extends InputStream implements 
TikaInputSource {
                 length = fileSize;
             }
 
+            // Update metadata if not already set
+            if (metadata != null &&
+                    
StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
+                metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+            }
+
             cachingStream = null;
         }
         return spilledPath;
diff --git a/tika-core/src/main/java/org/apache/tika/io/FileSource.java 
b/tika-core/src/main/java/org/apache/tika/io/FileSource.java
index e89690a086..90907c9173 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FileSource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FileSource.java
@@ -102,7 +102,7 @@ class FileSource extends InputStream implements 
TikaInputSource {
     }
 
     @Override
-    public Path getPath(TemporaryResources tmp, String suffix) throws 
IOException {
+    public Path getPath(String suffix) throws IOException {
         // Already file-backed, just return the path
         return path;
     }
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java 
b/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
index 10e2b52dd8..a8293dca67 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
@@ -43,11 +43,10 @@ interface TikaInputSource extends Closeable {
 
     /**
      * Gets the file path, potentially spilling to a temp file if needed.
-     * @param tmp temporary resources for creating temp files
      * @param suffix file suffix for temp files
      * @return the file path
      */
-    Path getPath(TemporaryResources tmp, String suffix) throws IOException;
+    Path getPath(String suffix) throws IOException;
 
     /**
      * Returns the length of the content, or -1 if unknown.
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java 
b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index c87a5ea09d..c92d599310 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -103,7 +103,7 @@ public class TikaInputStream extends TaggedInputStream {
             return (TikaInputStream) stream;
         }
         String ext = getExtension(metadata);
-        TikaInputSource inputSource = new CachingSource(stream, tmp, -1);
+        TikaInputSource inputSource = new CachingSource(stream, tmp, -1, 
metadata);
         return new TikaInputStream(inputSource, tmp, ext);
     }
 
@@ -123,7 +123,7 @@ public class TikaInputStream extends TaggedInputStream {
         metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
         String ext = getExtension(metadata);
         TemporaryResources tmp = new TemporaryResources();
-        TikaInputSource inputSource = new ByteArraySource(data);
+        TikaInputSource inputSource = new ByteArraySource(data, tmp);
         return new TikaInputStream(inputSource, tmp, ext);
     }
 
@@ -180,7 +180,7 @@ public class TikaInputStream extends TaggedInputStream {
             String ext = getExtension(metadata);
             TemporaryResources tmp = new TemporaryResources();
             TikaInputSource inputSource = new CachingSource(
-                    new BufferedInputStream(blob.getBinaryStream()), tmp, 
length);
+                    new BufferedInputStream(blob.getBinaryStream()), tmp, 
length, metadata);
             return new TikaInputStream(inputSource, tmp, ext);
         }
     }
@@ -241,7 +241,7 @@ public class TikaInputStream extends TaggedInputStream {
         String ext = getExtension(metadata);
         TemporaryResources tmp = new TemporaryResources();
         TikaInputSource inputSource = new CachingSource(
-                new BufferedInputStream(connection.getInputStream()), tmp, 
length);
+                new BufferedInputStream(connection.getInputStream()), tmp, 
length, metadata);
         return new TikaInputStream(inputSource, tmp, ext);
     }
 
@@ -383,7 +383,7 @@ public class TikaInputStream extends TaggedInputStream {
         if (source == null) {
             throw new IOException("No TikaInputSource available");
         }
-        return source.getPath(tmp, suffix);
+        return source.getPath(suffix);
     }
 
     public File getFile() throws IOException {
diff --git 
a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java 
b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
index 6976a0bba4..9c0c05dcc4 100644
--- a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
@@ -669,6 +669,122 @@ public class TikaInputStreamTest {
         }
     }
 
+    // ========== CachingSource Tests ==========
+
+    @Test
+    public void testCachingSourceUpdatesMetadataOnSpill() throws IOException {
+        byte[] data = bytes("Hello, World!");
+        Metadata metadata = new Metadata();
+        // Don't set CONTENT_LENGTH - let CachingSource set it on spill
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            CachingSource source = new CachingSource(
+                    new ByteArrayInputStream(data), tmp, -1, metadata);
+
+            // Read all data
+            byte[] buffer = new byte[data.length];
+            int totalRead = 0;
+            int n;
+            while ((n = source.read(buffer, totalRead, buffer.length - 
totalRead)) != -1) {
+                totalRead += n;
+                if (totalRead >= buffer.length) break;
+            }
+
+            // Before spill, metadata should not have length
+            assertNull(metadata.get(Metadata.CONTENT_LENGTH));
+
+            // Force spill to file
+            Path path = source.getPath(".tmp");
+            assertNotNull(path);
+            assertTrue(Files.exists(path));
+
+            // After spill, metadata should have length
+            assertEquals("13", metadata.get(Metadata.CONTENT_LENGTH));
+
+            source.close();
+        }
+    }
+
+    @Test
+    public void testCachingSourceDoesNotOverwriteExistingMetadata() throws 
IOException {
+        byte[] data = bytes("Hello, World!");
+        Metadata metadata = new Metadata();
+        // Pre-set CONTENT_LENGTH
+        metadata.set(Metadata.CONTENT_LENGTH, "999");
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            CachingSource source = new CachingSource(
+                    new ByteArrayInputStream(data), tmp, -1, metadata);
+
+            // Read and spill
+            IOUtils.toByteArray(source);
+            source.seekTo(0);
+            Path path = source.getPath(".tmp");
+
+            // Existing value should not be overwritten
+            assertEquals("999", metadata.get(Metadata.CONTENT_LENGTH));
+
+            source.close();
+        }
+    }
+
+    @Test
+    public void testCachingSourceSeekTo() throws IOException {
+        byte[] data = bytes("ABCDEFGHIJ");
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            CachingSource source = new CachingSource(
+                    new ByteArrayInputStream(data), tmp, -1, null);
+
+            // Read first 5 bytes
+            byte[] buf = new byte[5];
+            source.read(buf);
+            assertEquals("ABCDE", str(buf));
+
+            // Seek back to position 2
+            source.seekTo(2);
+
+            // Read again
+            buf = new byte[3];
+            source.read(buf);
+            assertEquals("CDE", str(buf));
+
+            source.close();
+        }
+    }
+
+    @Test
+    public void testCachingSourceAfterSpill() throws IOException {
+        byte[] data = bytes("ABCDEFGHIJ");
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            CachingSource source = new CachingSource(
+                    new ByteArrayInputStream(data), tmp, -1, null);
+
+            // Read first 5 bytes
+            byte[] buf = new byte[5];
+            source.read(buf);
+            assertEquals("ABCDE", str(buf));
+
+            // Force spill
+            Path path = source.getPath(".tmp");
+            assertTrue(Files.exists(path));
+
+            // Continue reading after spill
+            buf = new byte[5];
+            source.read(buf);
+            assertEquals("FGHIJ", str(buf));
+
+            // Seek back and read again
+            source.seekTo(0);
+            buf = new byte[10];
+            source.read(buf);
+            assertEquals("ABCDEFGHIJ", str(buf));
+
+            source.close();
+        }
+    }
+
     // ========== Helper Methods ==========
 
     private TikaInputStream createTikaInputStream(byte[] data, boolean 
fileBacked) throws IOException {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
index d3d1465dfd..68a7280f83 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
@@ -37,35 +37,35 @@ import org.apache.tika.digest.InputStreamDigester;
  * <p>
  * BouncyCastle supports additional algorithms beyond the standard Java ones,
  * such as SHA3-256, SHA3-384, SHA3-512.
+ * <p>
+ * This digester uses {@link 
org.apache.tika.io.TikaInputStream#enableRewind()} and
+ * {@link org.apache.tika.io.TikaInputStream#rewind()} to read the entire 
stream,
+ * compute the digest, and then rewind for subsequent processing.
  */
 public class BouncyCastleDigester extends CompositeDigester {
 
     /**
-     * @param markLimit limit for mark/reset; after this limit is hit, the
-     *                  stream is reset and spooled to disk
-     * @param digests   list of digest definitions (algorithm + encoding pairs)
+     * @param digests list of digest definitions (algorithm + encoding pairs)
      */
-    public BouncyCastleDigester(int markLimit, List<DigestDef> digests) {
-        super(buildDigesters(markLimit, digests));
+    public BouncyCastleDigester(List<DigestDef> digests) {
+        super(buildDigesters(digests));
     }
 
     /**
      * Convenience constructor using Algorithm enum with HEX encoding.
      *
-     * @param markLimit  limit for mark/reset; after this limit is hit, the
-     *                   stream is reset and spooled to disk
      * @param algorithms algorithms to run (uses HEX encoding for all)
      */
-    public BouncyCastleDigester(int markLimit, DigestDef.Algorithm... 
algorithms) {
-        super(buildDigesters(markLimit, algorithms));
+    public BouncyCastleDigester(DigestDef.Algorithm... algorithms) {
+        super(buildDigesters(algorithms));
     }
 
-    private static Digester[] buildDigesters(int markLimit, List<DigestDef> 
digests) {
+    private static Digester[] buildDigesters(List<DigestDef> digests) {
         Digester[] digesters = new Digester[digests.size()];
         int i = 0;
         for (DigestDef def : digests) {
             Encoder encoder = getEncoder(def.getEncoding());
-            digesters[i++] = new BCInputStreamDigester(markLimit,
+            digesters[i++] = new BCInputStreamDigester(
                     def.getAlgorithm().getJavaName(),
                     def.getMetadataKey(),
                     encoder);
@@ -73,13 +73,13 @@ public class BouncyCastleDigester extends CompositeDigester 
{
         return digesters;
     }
 
-    private static Digester[] buildDigesters(int markLimit, 
DigestDef.Algorithm[] algorithms) {
+    private static Digester[] buildDigesters(DigestDef.Algorithm[] algorithms) 
{
         Digester[] digesters = new Digester[algorithms.length];
         Encoder encoder = getEncoder(DigestDef.Encoding.HEX);
         int i = 0;
         for (DigestDef.Algorithm algorithm : algorithms) {
             DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX);
-            digesters[i++] = new BCInputStreamDigester(markLimit,
+            digesters[i++] = new BCInputStreamDigester(
                     algorithm.getJavaName(),
                     def.getMetadataKey(),
                     encoder);
@@ -123,9 +123,8 @@ public class BouncyCastleDigester extends CompositeDigester 
{
 
     private static class BCInputStreamDigester extends InputStreamDigester {
 
-        public BCInputStreamDigester(int markLimit, String algorithm, String 
algorithmKeyName,
-                                     Encoder encoder) {
-            super(markLimit, algorithm, algorithmKeyName, encoder);
+        public BCInputStreamDigester(String algorithm, String 
algorithmKeyName, Encoder encoder) {
+            super(algorithm, algorithmKeyName, encoder);
             try {
                 MessageDigest.getInstance(algorithm, getProvider());
             } catch (NoSuchAlgorithmException e) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
index 895880f246..a8a1894586 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
@@ -27,7 +27,7 @@ import org.apache.tika.digest.DigesterFactory;
 /**
  * Factory for {@link BouncyCastleDigester} with configurable algorithms and 
encodings.
  * <p>
- * Default: markLimit = 1000000, MD5 with HEX encoding.
+ * Default: MD5 with HEX encoding.
  * <p>
  * BouncyCastle supports additional algorithms beyond the standard Java ones,
  * such as SHA3-256, SHA3-384, SHA3-512.
@@ -37,7 +37,6 @@ import org.apache.tika.digest.DigesterFactory;
  * {
  *   "digesterFactory": {
  *     "bouncy-castle-digester-factory": {
- *       "markLimit": 1000000,
  *       "digests": [
  *         { "algorithm": "MD5" },
  *         { "algorithm": "SHA3_256", "encoding": "BASE32" }
@@ -50,7 +49,6 @@ import org.apache.tika.digest.DigesterFactory;
 @TikaComponent
 public class BouncyCastleDigesterFactory implements DigesterFactory {
 
-    private int markLimit = 1000000;
     private List<DigestDef> digests = new ArrayList<>();
 
     public BouncyCastleDigesterFactory() {
@@ -59,15 +57,7 @@ public class BouncyCastleDigesterFactory implements 
DigesterFactory {
 
     @Override
     public Digester build() {
-        return new BouncyCastleDigester(markLimit, digests);
-    }
-
-    public int getMarkLimit() {
-        return markLimit;
-    }
-
-    public void setMarkLimit(int markLimit) {
-        this.markLimit = markLimit;
+        return new BouncyCastleDigester(digests);
     }
 
     public List<DigestDef> getDigests() {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
index 4ae544ff64..0f5185b0f5 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
@@ -32,38 +32,33 @@ import org.apache.tika.digest.InputStreamDigester;
  * Implementation of {@link Digester}
  * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
  * <p>
- * This digester tries to use the regular mark/reset protocol on the 
InputStream.
- * However, this wraps an internal BoundedInputStream, and if the InputStream
- * is not fully read, then this will reset the stream and
- * spool the InputStream to disk (via TikaInputStream) and then digest the 
file.
+ * This digester uses {@link 
org.apache.tika.io.TikaInputStream#enableRewind()} and
+ * {@link org.apache.tika.io.TikaInputStream#rewind()} to read the entire 
stream,
+ * compute the digest, and then rewind for subsequent processing.
  */
 public class CommonsDigester extends CompositeDigester {
 
     /**
-     * @param markLimit limit for mark/reset; after this limit is hit, the
-     *                  stream is reset and spooled to disk
-     * @param digests   list of digest definitions (algorithm + encoding pairs)
+     * @param digests list of digest definitions (algorithm + encoding pairs)
      */
-    public CommonsDigester(int markLimit, List<DigestDef> digests) {
-        super(buildDigesters(markLimit, digests));
+    public CommonsDigester(List<DigestDef> digests) {
+        super(buildDigesters(digests));
     }
 
     /**
-     * @param markLimit  limit for mark/reset; after this limit is hit, the
-     *                   stream is reset and spooled to disk
      * @param algorithms algorithms to run (uses HEX encoding for all)
      */
-    public CommonsDigester(int markLimit, DigestDef.Algorithm... algorithms) {
-        super(buildDigesters(markLimit, algorithms));
+    public CommonsDigester(DigestDef.Algorithm... algorithms) {
+        super(buildDigesters(algorithms));
     }
 
-    private static Digester[] buildDigesters(int markLimit, List<DigestDef> 
digests) {
+    private static Digester[] buildDigesters(List<DigestDef> digests) {
         Digester[] digesters = new Digester[digests.size()];
         int i = 0;
         for (DigestDef def : digests) {
             checkSupported(def.getAlgorithm());
             Encoder encoder = getEncoder(def.getEncoding());
-            digesters[i++] = new InputStreamDigester(markLimit,
+            digesters[i++] = new InputStreamDigester(
                     def.getAlgorithm().getJavaName(),
                     def.getMetadataKey(),
                     encoder);
@@ -71,14 +66,14 @@ public class CommonsDigester extends CompositeDigester {
         return digesters;
     }
 
-    private static Digester[] buildDigesters(int markLimit, 
DigestDef.Algorithm[] algorithms) {
+    private static Digester[] buildDigesters(DigestDef.Algorithm[] algorithms) 
{
         Digester[] digesters = new Digester[algorithms.length];
         Encoder encoder = getEncoder(DigestDef.Encoding.HEX);
         int i = 0;
         for (DigestDef.Algorithm algorithm : algorithms) {
             checkSupported(algorithm);
             DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX);
-            digesters[i++] = new InputStreamDigester(markLimit,
+            digesters[i++] = new InputStreamDigester(
                     algorithm.getJavaName(),
                     def.getMetadataKey(),
                     encoder);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
index 8d26fbce16..b141c7340e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
@@ -27,14 +27,13 @@ import org.apache.tika.digest.DigesterFactory;
 /**
  * Factory for {@link CommonsDigester} with configurable algorithms and 
encodings.
  * <p>
- * Default: markLimit = 1000000, MD5 with HEX encoding.
+ * Default: MD5 with HEX encoding.
  * <p>
  * Example JSON configuration:
  * <pre>
  * {
  *   "digesterFactory": {
  *     "commons-digester": {
- *       "markLimit": 1000000,
  *       "digests": [
  *         { "algorithm": "MD5" },
  *         { "algorithm": "SHA256", "encoding": "BASE32" }
@@ -47,7 +46,6 @@ import org.apache.tika.digest.DigesterFactory;
 @TikaComponent
 public class CommonsDigesterFactory implements DigesterFactory {
 
-    private int markLimit = 1000000;
     private List<DigestDef> digests = new ArrayList<>();
 
     public CommonsDigesterFactory() {
@@ -56,15 +54,7 @@ public class CommonsDigesterFactory implements 
DigesterFactory {
 
     @Override
     public Digester build() {
-        return new CommonsDigester(markLimit, digests);
-    }
-
-    public int getMarkLimit() {
-        return markLimit;
-    }
-
-    public void setMarkLimit(int markLimit) {
-        this.markLimit = markLimit;
+        return new CommonsDigester(digests);
     }
 
     public List<DigestDef> getDigests() {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
index f887b2d7e7..fc37999a48 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
@@ -64,9 +64,6 @@ public class MiscOLEDetector implements Detector {
      */
     public static final MediaType QUATTROPRO = application("x-quattro-pro");
 
-
-    private int markLimit = 16 * 1024 * 1024;
-
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
      * names of the top level streams within the file.
@@ -117,23 +114,6 @@ public class MiscOLEDetector implements Detector {
         return names;
     }
 
-    /**
-     * If a TikaInputStream is passed in to {@link #detect(InputStream, 
Metadata)},
-     * and there is not an underlying file, this detector will spool up to 
{@link #markLimit}
-     * to disk.  If the stream was read in entirety (e.g. the spooled file is 
not truncated),
-     * this detector will open the file with POI and perform detection.
-     * If the spooled file is truncated, the detector will return {@link #OLE} 
(or
-     * {@link MediaType#OCTET_STREAM} if there's no OLE header).
-     * <p>
-     * As of Tika 1.21, this detector respects the legacy behavior of not 
performing detection
-     * on a non-TikaInputStream.
-     *
-     * @param markLimit
-     */
-    public void setMarkLimit(int markLimit) {
-        this.markLimit = markLimit;
-    }
-
     private Set<String> getTopLevelNames(TikaInputStream stream) throws 
IOException {
         // Force the document stream to a (possibly temporary) file
         // so we don't modify the current position of the stream.
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
index 9245331ec5..12b49d6267 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
@@ -6,7 +6,6 @@
     "throwOnZeroBytes": false,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "SHA256" }
         ]
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
index fed21bc5af..5ac209517f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
@@ -3,7 +3,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "bouncy-castle-digester-factory": {
-        "markLimit": 1000000,
         "digests": [
           { "algorithm": "SHA1", "encoding": "BASE32" }
         ]
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
index 770fba7ffe..53bfd01732 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
@@ -3,7 +3,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "bouncy-castle-digester-factory": {
-        "markLimit": 1000000,
         "digests": [
           { "algorithm": "MD2" },
           { "algorithm": "MD5" },
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
index 830d8c0809..b2e23ad974 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
@@ -3,7 +3,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "bouncy-castle-digester-factory": {
-        "markLimit": 1000000,
         "digests": [
           { "algorithm": "MD5" },
           { "algorithm": "SHA256" },
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
index 2a2634a88e..c37e6965f2 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
@@ -3,7 +3,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 1000000,
         "digests": [
           { "algorithm": "MD2" },
           { "algorithm": "MD5" },
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
index cf7c3874a0..60825fe974 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
@@ -12,7 +12,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "SHA256", "encoding": "BASE32" },
           { "algorithm": "MD5" }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
index ed2145a404..8ed562166a 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
@@ -4,7 +4,6 @@
     "skipContainerDocumentDigest": true,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "SHA256", "encoding": "BASE32" },
           { "algorithm": "MD5" }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
index 004e6ea753..50bbd90b99 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
@@ -3,7 +3,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "SHA256", "encoding": "BASE32" },
           { "algorithm": "MD5" }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
index caffd0c709..a13a80c7db 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
@@ -2,7 +2,6 @@
   "auto-detect-parser": {
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "MD5" }
         ]
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
index 1872313a9c..3ca9aa461a 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
@@ -4,7 +4,6 @@
     "skipContainerDocumentDigest": true,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "SHA256", "encoding": "BASE32" },
           { "algorithm": "MD5" }
diff --git 
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
 
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
index f8284e5e4d..bcae4fb7e6 100644
--- 
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
+++ 
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
@@ -50,7 +50,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "MD5" },
           { "algorithm": "SHA1", "encoding": "BASE32" }
diff --git 
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
 
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
index f8284e5e4d..bcae4fb7e6 100644
--- 
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
+++ 
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
@@ -50,7 +50,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "MD5" },
           { "algorithm": "SHA1", "encoding": "BASE32" }
diff --git 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
index fdf80cb998..d134099806 100644
--- 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
+++ 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
@@ -13,7 +13,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "MD5" },
           { "algorithm": "SHA1", "encoding": "BASE32" }
diff --git 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
index 97646bc879..dd199e46d2 100644
--- 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
+++ 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
@@ -18,7 +18,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "MD5" },
           { "algorithm": "SHA1", "encoding": "BASE32" }
diff --git 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
index 8d3f74ed3c..4f30e99b4b 100644
--- 
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
+++ 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
@@ -18,7 +18,6 @@
     "outputThreshold": 1000000,
     "digesterFactory": {
       "commons-digester-factory": {
-        "markLimit": 100000,
         "digests": [
           { "algorithm": "MD5" },
           { "algorithm": "SHA1", "encoding": "BASE32" }


Reply via email to