This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 066412ea14 WIP: Checkpoint - CachingSource metadata update and cleanup (#2535)
066412ea14 is described below
commit 066412ea149f71f01ae92473c6bbe13fee433c6c
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jan 15 18:29:09 2026 -0500
WIP: Checkpoint - CachingSource metadata update and cleanup (#2535)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: Claude Opus 4.5 <[email protected]>
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../main/java/org/apache/tika/digest/Digester.java | 11 +-
.../org/apache/tika/digest/DigesterFactory.java | 3 +-
.../apache/tika/digest/InputStreamDigester.java | 115 ++++----------------
.../java/org/apache/tika/io/ByteArraySource.java | 6 +-
.../java/org/apache/tika/io/CachingSource.java | 15 ++-
.../main/java/org/apache/tika/io/FileSource.java | 2 +-
.../java/org/apache/tika/io/TikaInputSource.java | 3 +-
.../java/org/apache/tika/io/TikaInputStream.java | 10 +-
.../org/apache/tika/io/TikaInputStreamTest.java | 120 +++++++++++++++++++++
.../parser/digestutils/BouncyCastleDigester.java | 31 +++---
.../digestutils/BouncyCastleDigesterFactory.java | 14 +--
.../tika/parser/digestutils/CommonsDigester.java | 29 +++--
.../parser/digestutils/CommonsDigesterFactory.java | 14 +--
.../apache/tika/detect/ole/MiscOLEDetector.java | 20 ----
.../digest/SkipContainerDocumentDigestTest.java | 7 +-
.../src/test/resources/configs/tika-4533.json | 1 -
.../configs/tika-config-bc-digests-base32.json | 1 -
.../configs/tika-config-bc-digests-basic.json | 1 -
.../configs/tika-config-bc-digests-multiple.json | 1 -
.../configs/tika-config-commons-digests-basic.json | 1 -
.../configs/tika-config-digests-pdf-only.json | 1 -
.../tika-config-digests-skip-container.json | 1 -
.../resources/configs/tika-config-digests.json | 1 -
.../resources/configs/tika-config-md5-digest.json | 1 -
.../configs/tika-config-write-filter.json | 1 -
.../apache/tika/digest/MockDigesterFactory.java | 2 +-
.../org/apache/tika/server/core/CXFTestBase.java | 1 -
.../resources/configs/cxf-test-base-template.json | 1 -
.../resources/configs/cxf-test-base-template.json | 1 -
.../configs/tika-config-for-server-tests.json | 1 -
.../tika-config-langdetect-opennlp-filter.json | 1 -
.../tika-config-langdetect-optimaize-filter.json | 1 -
33 files changed, 207 insertions(+), 213 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index cb25539678..9bf9990271 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -433,7 +433,7 @@ public class TikaCLI {
} else if (arg.startsWith("--digest=")) {
String algorithmName =
arg.substring("--digest=".length()).toUpperCase(Locale.ROOT);
DigestDef.Algorithm algorithm =
DigestDef.Algorithm.valueOf(algorithmName);
- digester = new CommonsDigester(MAX_MARK, algorithm);
+ digester = new CommonsDigester(algorithm);
} else if (arg.startsWith("-e")) {
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
diff --git a/tika-core/src/main/java/org/apache/tika/digest/Digester.java
b/tika-core/src/main/java/org/apache/tika/digest/Digester.java
index ac6459607c..133d5dce09 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/Digester.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/Digester.java
@@ -30,14 +30,11 @@ import org.apache.tika.parser.ParseContext;
public interface Digester {
/**
* Digests a TikaInputStream and sets the appropriate value(s) in the
metadata.
- * The Digester is also responsible for marking and resetting the stream.
+ * The Digester is responsible for calling {@link
TikaInputStream#enableRewind()}
+ * and {@link TikaInputStream#rewind()} to ensure the stream can be read by
+ * subsequent processing after digesting.
* <p>
- * The given stream is guaranteed to support the
- * {@link TikaInputStream#markSupported() mark feature} and the detector
- * is expected to {@link TikaInputStream#mark(int) mark} the stream before
- * reading any bytes from it, and to {@link TikaInputStream#reset() reset}
- * the stream before returning. The stream must not be closed by the
- * detector.
+ * The stream must not be closed by the digester.
*
* @param tis TikaInputStream to digest
* @param m Metadata to set the values for
diff --git
a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
index 66744718aa..1b9215d226 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/DigesterFactory.java
@@ -19,7 +19,7 @@ package org.apache.tika.digest;
/**
* Factory interface for creating Digester instances.
* Implementations should be annotated with {@code @TikaComponent} and
- * provide bean properties for configuration (e.g., markLimit, digests).
+ * provide bean properties for configuration (e.g., digests).
* <p>
* This is used in {@link org.apache.tika.parser.AutoDetectParserConfig} to
* configure digesting in the AutoDetectParser.
@@ -29,7 +29,6 @@ package org.apache.tika.digest;
* "auto-detect-parser": {
* "digesterFactory": {
* "commons-digester-factory": {
- * "markLimit": 1000000,
* "digests": [
* { "algorithm": "MD5" },
* { "algorithm": "SHA256", "encoding": "BASE32" }
diff --git
a/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
b/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
index a384137300..2d1180435b 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
@@ -16,69 +16,37 @@
*/
package org.apache.tika.digest;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.security.Provider;
-import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.StringUtils;
-// TODO: TIKA-FOLLOWUP - With TikaInputStream.rewind(), markLimit is no longer
needed.
-// The digester can simply read the entire stream, then call tis.rewind().
-// This would simplify this class and allow removing markLimit from:
-// - InputStreamDigester, CommonsDigester, BouncyCastleDigester
-// - CommonsDigesterFactory, BouncyCastleDigesterFactory
(setMarkLimit/getMarkLimit)
-// - All JSON config files that specify markLimit for digesters
+/**
+ * Digester that uses {@link TikaInputStream#enableRewind()} and {@link
TikaInputStream#rewind()}
+ * to read the entire stream for digesting, then rewind for subsequent
processing.
+ */
public class InputStreamDigester implements Digester {
private final String algorithm;
private final String metadataKey;
private final Encoder encoder;
- private final int markLimit;
/**
- * @param markLimit limit in bytes to allow for mark/reset. If the
inputstream is longer
- * than this limit, the stream will be reset and then
spooled to a
- * temporary file.
- * Throws IllegalArgumentException if < 0.
* @param algorithm name of the digest algorithm to retrieve from the
Provider
* @param metadataKey the full metadata key to use when storing the digest
* (e.g., "X-TIKA:digest:MD5" or
"X-TIKA:digest:SHA256:BASE32")
* @param encoder encoder to convert the byte array returned from the
digester to a
* string
*/
- public InputStreamDigester(int markLimit, String algorithm, String
metadataKey,
- Encoder encoder) {
+ public InputStreamDigester(String algorithm, String metadataKey, Encoder
encoder) {
this.algorithm = algorithm;
this.metadataKey = metadataKey;
this.encoder = encoder;
- this.markLimit = markLimit;
-
- if (markLimit < 0) {
- throw new IllegalArgumentException("markLimit must be >= 0");
- }
- }
-
- /**
- * Copied from commons-codec
- */
- private static MessageDigest updateDigest(MessageDigest digest,
InputStream data,
- Metadata metadata) throws
IOException {
- byte[] buffer = new byte[1024];
- long total = 0;
- for (int read = data.read(buffer, 0, 1024); read > -1; read =
data.read(buffer, 0, 1024)) {
- digest.update(buffer, 0, read);
- total += read;
- }
- setContentLength(total, metadata);
- return digest;
}
private static void setContentLength(long length, Metadata metadata) {
@@ -113,6 +81,12 @@ public class InputStreamDigester implements Digester {
}
/**
+ * Digests the TikaInputStream and stores the result in metadata.
+ * <p>
+ * Uses {@link TikaInputStream#enableRewind()} to ensure the stream can be
+ * rewound after digesting, then calls {@link TikaInputStream#rewind()} to
+ * reset the stream for subsequent processing.
+ *
* @param tis TikaInputStream to digest
* @param metadata metadata in which to store the digest information
* @param parseContext ParseContext -- not actually used yet, but there
for future expansion
@@ -121,66 +95,21 @@ public class InputStreamDigester implements Digester {
@Override
public void digest(TikaInputStream tis, Metadata metadata, ParseContext
parseContext)
throws IOException {
- if (tis.hasFile()) {
- long sz = tis.getLength();
- //if the inputstream has a file,
- //and its size is greater than its mark limit,
- //just digest the underlying file.
- if (sz > markLimit) {
- digestFile(tis.getFile(), sz, metadata);
- return;
- }
- }
-
- //try the usual mark/reset stuff.
- //however, if you actually hit the bound,
- //then stop and spool to file via TikaInputStream
- BoundedInputStream bis = new BoundedInputStream(markLimit, tis);
- boolean finishedStream = false;
- bis.mark(markLimit + 1);
- finishedStream = digestStream(bis, metadata);
- bis.reset();
- if (finishedStream) {
- return;
- }
- //if the stream wasn't finished -- if the stream was longer than the
mark limit --
- //spool to File and digest that.
- digestFile(tis.getFile(), -1, metadata);
- }
-
- private void digestFile(File f, long sz, Metadata m) throws IOException {
- //only add it if it hasn't been populated already
- if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) {
- if (sz < 0) {
- sz = f.length();
- }
- setContentLength(sz, m);
- }
- try (InputStream is = new FileInputStream(f)) {
- digestStream(is, m);
- }
- }
+ tis.enableRewind();
- /**
- * @param is input stream to read from
- * @param metadata metadata for reporting the digest
- * @return whether or not this finished the input stream
- * @throws IOException
- */
- private boolean digestStream(InputStream is, Metadata metadata) throws
IOException {
- byte[] digestBytes;
MessageDigest messageDigest = newMessageDigest();
+ byte[] buffer = new byte[8192];
+ long total = 0;
+ int read;
+ while ((read = tis.read(buffer)) != -1) {
+ messageDigest.update(buffer, 0, read);
+ total += read;
+ }
- updateDigest(messageDigest, is, metadata);
- digestBytes = messageDigest.digest();
+ setContentLength(total, metadata);
+ metadata.set(metadataKey, encoder.encode(messageDigest.digest()));
- if (is instanceof BoundedInputStream) {
- if (((BoundedInputStream) is).hasHitBound()) {
- return false;
- }
- }
- metadata.set(metadataKey, encoder.encode(digestBytes));
- return true;
+ tis.rewind();
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
b/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
index a9dcd8da96..6a2046cdf4 100644
--- a/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java
@@ -36,12 +36,14 @@ class ByteArraySource extends InputStream implements
TikaInputSource {
private final byte[] data;
private final int length;
+ private final TemporaryResources tmp;
private int position;
private Path spilledPath;
- ByteArraySource(byte[] data) {
+ ByteArraySource(byte[] data, TemporaryResources tmp) {
this.data = data;
this.length = data.length;
+ this.tmp = tmp;
this.position = 0;
this.spilledPath = null;
}
@@ -97,7 +99,7 @@ class ByteArraySource extends InputStream implements
TikaInputSource {
}
@Override
- public Path getPath(TemporaryResources tmp, String suffix) throws
IOException {
+ public Path getPath(String suffix) throws IOException {
if (spilledPath == null) {
// Spill to temp file on first call
spilledPath = tmp.createTempFile(suffix);
diff --git a/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
b/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
index 07c6f0fdc3..d84b98fdd9 100644
--- a/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/CachingSource.java
@@ -24,6 +24,9 @@ import java.nio.file.Path;
import org.apache.commons.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.StringUtils;
+
/**
* Input source that wraps a raw InputStream with optional caching.
* <p>
@@ -38,6 +41,7 @@ import org.apache.commons.io.IOUtils;
class CachingSource extends InputStream implements TikaInputSource {
private final TemporaryResources tmp;
+ private final Metadata metadata;
private long length;
// Passthrough mode: just a BufferedInputStream
@@ -52,9 +56,10 @@ class CachingSource extends InputStream implements
TikaInputSource {
private InputStream fileStream;
private long filePosition; // Track position in file mode
- CachingSource(InputStream source, TemporaryResources tmp, long length) {
+ CachingSource(InputStream source, TemporaryResources tmp, long length,
Metadata metadata) {
this.tmp = tmp;
this.length = length;
+ this.metadata = metadata;
// Start in passthrough mode
this.passthroughStream = source instanceof BufferedInputStream
? (BufferedInputStream) source
@@ -222,7 +227,7 @@ class CachingSource extends InputStream implements
TikaInputSource {
}
@Override
- public Path getPath(TemporaryResources tmp, String suffix) throws
IOException {
+ public Path getPath(String suffix) throws IOException {
if (spilledPath == null) {
// If still in passthrough mode, enable caching first
if (cachingStream == null) {
@@ -256,6 +261,12 @@ class CachingSource extends InputStream implements
TikaInputSource {
length = fileSize;
}
+ // Update metadata if not already set
+ if (metadata != null &&
+
StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+ }
+
cachingStream = null;
}
return spilledPath;
diff --git a/tika-core/src/main/java/org/apache/tika/io/FileSource.java
b/tika-core/src/main/java/org/apache/tika/io/FileSource.java
index 95f6458574..79163f0ab2 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FileSource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FileSource.java
@@ -102,7 +102,7 @@ class FileSource extends InputStream implements
TikaInputSource {
}
@Override
- public Path getPath(TemporaryResources tmp, String suffix) throws
IOException {
+ public Path getPath(String suffix) throws IOException {
// Already file-backed, just return the path
return path;
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
b/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
index 7a8da5d703..1620614f99 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java
@@ -43,11 +43,10 @@ interface TikaInputSource extends Closeable {
/**
* Gets the file path, potentially spilling to a temp file if needed.
- * @param tmp temporary resources for creating temp files
* @param suffix file suffix for temp files
* @return the file path
*/
- Path getPath(TemporaryResources tmp, String suffix) throws IOException;
+ Path getPath(String suffix) throws IOException;
/**
* Returns the length of the content, or -1 if unknown.
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 7eee791e3a..eb4c3cca76 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -103,7 +103,7 @@ public class TikaInputStream extends TaggedInputStream {
return (TikaInputStream) stream;
}
String ext = getExtension(metadata);
- TikaInputSource inputSource = new CachingSource(stream, tmp, -1);
+ TikaInputSource inputSource = new CachingSource(stream, tmp, -1,
metadata);
return new TikaInputStream(inputSource, tmp, ext);
}
@@ -123,7 +123,7 @@ public class TikaInputStream extends TaggedInputStream {
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
String ext = getExtension(metadata);
TemporaryResources tmp = new TemporaryResources();
- TikaInputSource inputSource = new ByteArraySource(data);
+ TikaInputSource inputSource = new ByteArraySource(data, tmp);
return new TikaInputStream(inputSource, tmp, ext);
}
@@ -180,7 +180,7 @@ public class TikaInputStream extends TaggedInputStream {
String ext = getExtension(metadata);
TemporaryResources tmp = new TemporaryResources();
TikaInputSource inputSource = new CachingSource(
- new BufferedInputStream(blob.getBinaryStream()), tmp,
length);
+ new BufferedInputStream(blob.getBinaryStream()), tmp,
length, metadata);
return new TikaInputStream(inputSource, tmp, ext);
}
}
@@ -241,7 +241,7 @@ public class TikaInputStream extends TaggedInputStream {
String ext = getExtension(metadata);
TemporaryResources tmp = new TemporaryResources();
TikaInputSource inputSource = new CachingSource(
- new BufferedInputStream(connection.getInputStream()), tmp,
length);
+ new BufferedInputStream(connection.getInputStream()), tmp,
length, metadata);
return new TikaInputStream(inputSource, tmp, ext);
}
@@ -379,7 +379,7 @@ public class TikaInputStream extends TaggedInputStream {
if (source == null) {
throw new IOException("No TikaInputSource available");
}
- return source.getPath(tmp, suffix);
+ return source.getPath(suffix);
}
public File getFile() throws IOException {
diff --git
a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
index bf8e0b7ba0..84676683a5 100644
--- a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
@@ -673,6 +673,126 @@ public class TikaInputStreamTest {
}
}
+ // ========== CachingSource Tests ==========
+
+ @Test
+ public void testCachingSourceUpdatesMetadataOnSpill() throws IOException {
+ byte[] data = bytes("Hello, World!");
+ Metadata metadata = new Metadata();
+ // Don't set CONTENT_LENGTH - let CachingSource set it on spill
+
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ CachingSource source = new CachingSource(
+ new ByteArrayInputStream(data), tmp, -1, metadata);
+ source.enableRewind(); // Enable caching for spill support
+
+ // Read all data
+ byte[] buffer = new byte[data.length];
+ int totalRead = 0;
+ int n;
+ while ((n = source.read(buffer, totalRead, buffer.length -
totalRead)) != -1) {
+ totalRead += n;
+ if (totalRead >= buffer.length) break;
+ }
+
+ // Before spill, metadata should not have length
+ assertNull(metadata.get(Metadata.CONTENT_LENGTH));
+
+ // Force spill to file
+ Path path = source.getPath(".tmp");
+ assertNotNull(path);
+ assertTrue(Files.exists(path));
+
+ // After spill, metadata should have length
+ assertEquals("13", metadata.get(Metadata.CONTENT_LENGTH));
+
+ source.close();
+ }
+ }
+
+ @Test
+ public void testCachingSourceDoesNotOverwriteExistingMetadata() throws
IOException {
+ byte[] data = bytes("Hello, World!");
+ Metadata metadata = new Metadata();
+ // Pre-set CONTENT_LENGTH
+ metadata.set(Metadata.CONTENT_LENGTH, "999");
+
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ CachingSource source = new CachingSource(
+ new ByteArrayInputStream(data), tmp, -1, metadata);
+ source.enableRewind(); // Enable caching for seek/spill support
+
+ // Read and spill
+ IOUtils.toByteArray(source);
+ source.seekTo(0);
+ Path path = source.getPath(".tmp");
+
+ // Existing value should not be overwritten
+ assertEquals("999", metadata.get(Metadata.CONTENT_LENGTH));
+
+ source.close();
+ }
+ }
+
+ @Test
+ public void testCachingSourceSeekTo() throws IOException {
+ byte[] data = bytes("ABCDEFGHIJ");
+
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ CachingSource source = new CachingSource(
+ new ByteArrayInputStream(data), tmp, -1, null);
+ source.enableRewind(); // Enable caching for seek support
+
+ // Read first 5 bytes
+ byte[] buf = new byte[5];
+ source.read(buf);
+ assertEquals("ABCDE", str(buf));
+
+ // Seek back to position 2
+ source.seekTo(2);
+
+ // Read again
+ buf = new byte[3];
+ source.read(buf);
+ assertEquals("CDE", str(buf));
+
+ source.close();
+ }
+ }
+
+ @Test
+ public void testCachingSourceAfterSpill() throws IOException {
+ byte[] data = bytes("ABCDEFGHIJ");
+
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ CachingSource source = new CachingSource(
+ new ByteArrayInputStream(data), tmp, -1, null);
+ source.enableRewind(); // Enable caching for spill/seek support
+
+ // Read first 5 bytes
+ byte[] buf = new byte[5];
+ source.read(buf);
+ assertEquals("ABCDE", str(buf));
+
+ // Force spill
+ Path path = source.getPath(".tmp");
+ assertTrue(Files.exists(path));
+
+ // Continue reading after spill
+ buf = new byte[5];
+ source.read(buf);
+ assertEquals("FGHIJ", str(buf));
+
+ // Seek back and read again
+ source.seekTo(0);
+ buf = new byte[10];
+ source.read(buf);
+ assertEquals("ABCDEFGHIJ", str(buf));
+
+ source.close();
+ }
+ }
+
// ========== enableRewind() Tests ==========
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
index d3d1465dfd..68a7280f83 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java
@@ -37,35 +37,35 @@ import org.apache.tika.digest.InputStreamDigester;
* <p>
* BouncyCastle supports additional algorithms beyond the standard Java ones,
* such as SHA3-256, SHA3-384, SHA3-512.
+ * <p>
+ * This digester uses {@link
org.apache.tika.io.TikaInputStream#enableRewind()} and
+ * {@link org.apache.tika.io.TikaInputStream#rewind()} to read the entire
stream,
+ * compute the digest, and then rewind for subsequent processing.
*/
public class BouncyCastleDigester extends CompositeDigester {
/**
- * @param markLimit limit for mark/reset; after this limit is hit, the
- * stream is reset and spooled to disk
- * @param digests list of digest definitions (algorithm + encoding pairs)
+ * @param digests list of digest definitions (algorithm + encoding pairs)
*/
- public BouncyCastleDigester(int markLimit, List<DigestDef> digests) {
- super(buildDigesters(markLimit, digests));
+ public BouncyCastleDigester(List<DigestDef> digests) {
+ super(buildDigesters(digests));
}
/**
* Convenience constructor using Algorithm enum with HEX encoding.
*
- * @param markLimit limit for mark/reset; after this limit is hit, the
- * stream is reset and spooled to disk
* @param algorithms algorithms to run (uses HEX encoding for all)
*/
- public BouncyCastleDigester(int markLimit, DigestDef.Algorithm...
algorithms) {
- super(buildDigesters(markLimit, algorithms));
+ public BouncyCastleDigester(DigestDef.Algorithm... algorithms) {
+ super(buildDigesters(algorithms));
}
- private static Digester[] buildDigesters(int markLimit, List<DigestDef>
digests) {
+ private static Digester[] buildDigesters(List<DigestDef> digests) {
Digester[] digesters = new Digester[digests.size()];
int i = 0;
for (DigestDef def : digests) {
Encoder encoder = getEncoder(def.getEncoding());
- digesters[i++] = new BCInputStreamDigester(markLimit,
+ digesters[i++] = new BCInputStreamDigester(
def.getAlgorithm().getJavaName(),
def.getMetadataKey(),
encoder);
@@ -73,13 +73,13 @@ public class BouncyCastleDigester extends CompositeDigester
{
return digesters;
}
- private static Digester[] buildDigesters(int markLimit,
DigestDef.Algorithm[] algorithms) {
+ private static Digester[] buildDigesters(DigestDef.Algorithm[] algorithms)
{
Digester[] digesters = new Digester[algorithms.length];
Encoder encoder = getEncoder(DigestDef.Encoding.HEX);
int i = 0;
for (DigestDef.Algorithm algorithm : algorithms) {
DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX);
- digesters[i++] = new BCInputStreamDigester(markLimit,
+ digesters[i++] = new BCInputStreamDigester(
algorithm.getJavaName(),
def.getMetadataKey(),
encoder);
@@ -123,9 +123,8 @@ public class BouncyCastleDigester extends CompositeDigester
{
private static class BCInputStreamDigester extends InputStreamDigester {
- public BCInputStreamDigester(int markLimit, String algorithm, String
algorithmKeyName,
- Encoder encoder) {
- super(markLimit, algorithm, algorithmKeyName, encoder);
+ public BCInputStreamDigester(String algorithm, String
algorithmKeyName, Encoder encoder) {
+ super(algorithm, algorithmKeyName, encoder);
try {
MessageDigest.getInstance(algorithm, getProvider());
} catch (NoSuchAlgorithmException e) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
index 895880f246..a8a1894586 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java
@@ -27,7 +27,7 @@ import org.apache.tika.digest.DigesterFactory;
/**
* Factory for {@link BouncyCastleDigester} with configurable algorithms and
encodings.
* <p>
- * Default: markLimit = 1000000, MD5 with HEX encoding.
+ * Default: MD5 with HEX encoding.
* <p>
* BouncyCastle supports additional algorithms beyond the standard Java ones,
* such as SHA3-256, SHA3-384, SHA3-512.
@@ -37,7 +37,6 @@ import org.apache.tika.digest.DigesterFactory;
* {
* "digesterFactory": {
* "bouncy-castle-digester-factory": {
- * "markLimit": 1000000,
* "digests": [
* { "algorithm": "MD5" },
* { "algorithm": "SHA3_256", "encoding": "BASE32" }
@@ -50,7 +49,6 @@ import org.apache.tika.digest.DigesterFactory;
@TikaComponent
public class BouncyCastleDigesterFactory implements DigesterFactory {
- private int markLimit = 1000000;
private List<DigestDef> digests = new ArrayList<>();
public BouncyCastleDigesterFactory() {
@@ -59,15 +57,7 @@ public class BouncyCastleDigesterFactory implements
DigesterFactory {
@Override
public Digester build() {
- return new BouncyCastleDigester(markLimit, digests);
- }
-
- public int getMarkLimit() {
- return markLimit;
- }
-
- public void setMarkLimit(int markLimit) {
- this.markLimit = markLimit;
+ return new BouncyCastleDigester(digests);
}
public List<DigestDef> getDigests() {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
index 4ae544ff64..0f5185b0f5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
@@ -32,38 +32,33 @@ import org.apache.tika.digest.InputStreamDigester;
* Implementation of {@link Digester}
* that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
* <p>
- * This digester tries to use the regular mark/reset protocol on the
InputStream.
- * However, this wraps an internal BoundedInputStream, and if the InputStream
- * is not fully read, then this will reset the stream and
- * spool the InputStream to disk (via TikaInputStream) and then digest the
file.
+ * This digester uses {@link
org.apache.tika.io.TikaInputStream#enableRewind()} and
+ * {@link org.apache.tika.io.TikaInputStream#rewind()} to read the entire
stream,
+ * compute the digest, and then rewind for subsequent processing.
*/
public class CommonsDigester extends CompositeDigester {
/**
- * @param markLimit limit for mark/reset; after this limit is hit, the
- * stream is reset and spooled to disk
- * @param digests list of digest definitions (algorithm + encoding pairs)
+ * @param digests list of digest definitions (algorithm + encoding pairs)
*/
- public CommonsDigester(int markLimit, List<DigestDef> digests) {
- super(buildDigesters(markLimit, digests));
+ public CommonsDigester(List<DigestDef> digests) {
+ super(buildDigesters(digests));
}
/**
- * @param markLimit limit for mark/reset; after this limit is hit, the
- * stream is reset and spooled to disk
* @param algorithms algorithms to run (uses HEX encoding for all)
*/
- public CommonsDigester(int markLimit, DigestDef.Algorithm... algorithms) {
- super(buildDigesters(markLimit, algorithms));
+ public CommonsDigester(DigestDef.Algorithm... algorithms) {
+ super(buildDigesters(algorithms));
}
- private static Digester[] buildDigesters(int markLimit, List<DigestDef>
digests) {
+ private static Digester[] buildDigesters(List<DigestDef> digests) {
Digester[] digesters = new Digester[digests.size()];
int i = 0;
for (DigestDef def : digests) {
checkSupported(def.getAlgorithm());
Encoder encoder = getEncoder(def.getEncoding());
- digesters[i++] = new InputStreamDigester(markLimit,
+ digesters[i++] = new InputStreamDigester(
def.getAlgorithm().getJavaName(),
def.getMetadataKey(),
encoder);
@@ -71,14 +66,14 @@ public class CommonsDigester extends CompositeDigester {
return digesters;
}
- private static Digester[] buildDigesters(int markLimit,
DigestDef.Algorithm[] algorithms) {
+ private static Digester[] buildDigesters(DigestDef.Algorithm[] algorithms)
{
Digester[] digesters = new Digester[algorithms.length];
Encoder encoder = getEncoder(DigestDef.Encoding.HEX);
int i = 0;
for (DigestDef.Algorithm algorithm : algorithms) {
checkSupported(algorithm);
DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX);
- digesters[i++] = new InputStreamDigester(markLimit,
+ digesters[i++] = new InputStreamDigester(
algorithm.getJavaName(),
def.getMetadataKey(),
encoder);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
index 8d26fbce16..b141c7340e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
@@ -27,14 +27,13 @@ import org.apache.tika.digest.DigesterFactory;
/**
* Factory for {@link CommonsDigester} with configurable algorithms and
encodings.
* <p>
- * Default: markLimit = 1000000, MD5 with HEX encoding.
+ * Default: MD5 with HEX encoding.
* <p>
* Example JSON configuration:
* <pre>
* {
* "digesterFactory": {
* "commons-digester": {
- * "markLimit": 1000000,
* "digests": [
* { "algorithm": "MD5" },
* { "algorithm": "SHA256", "encoding": "BASE32" }
@@ -47,7 +46,6 @@ import org.apache.tika.digest.DigesterFactory;
@TikaComponent
public class CommonsDigesterFactory implements DigesterFactory {
- private int markLimit = 1000000;
private List<DigestDef> digests = new ArrayList<>();
public CommonsDigesterFactory() {
@@ -56,15 +54,7 @@ public class CommonsDigesterFactory implements
DigesterFactory {
@Override
public Digester build() {
- return new CommonsDigester(markLimit, digests);
- }
-
- public int getMarkLimit() {
- return markLimit;
- }
-
- public void setMarkLimit(int markLimit) {
- this.markLimit = markLimit;
+ return new CommonsDigester(digests);
}
public List<DigestDef> getDigests() {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
index f887b2d7e7..fc37999a48 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java
@@ -64,9 +64,6 @@ public class MiscOLEDetector implements Detector {
*/
public static final MediaType QUATTROPRO = application("x-quattro-pro");
-
- private int markLimit = 16 * 1024 * 1024;
-
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top level streams within the file.
@@ -117,23 +114,6 @@ public class MiscOLEDetector implements Detector {
return names;
}
- /**
- * If a TikaInputStream is passed in to {@link #detect(InputStream,
Metadata)},
- * and there is not an underlying file, this detector will spool up to
{@link #markLimit}
- * to disk. If the stream was read in entirety (e.g. the spooled file is
not truncated),
- * this detector will open the file with POI and perform detection.
- * If the spooled file is truncated, the detector will return {@link #OLE}
(or
- * {@link MediaType#OCTET_STREAM} if there's no OLE header).
- * <p>
- * As of Tika 1.21, this detector respects the legacy behavior of not
performing detection
- * on a non-TikaInputStream.
- *
- * @param markLimit
- */
- public void setMarkLimit(int markLimit) {
- this.markLimit = markLimit;
- }
-
private Set<String> getTopLevelNames(TikaInputStream stream) throws
IOException {
// Force the document stream to a (possibly temporary) file
// so we don't modify the current position of the stream.
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
index 09397ff1f0..a211165f56 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/digest/SkipContainerDocumentDigestTest.java
@@ -47,7 +47,7 @@ public class SkipContainerDocumentDigestTest extends TikaTest
{
public void testDigestContainerAndEmbedded() throws Exception {
// skipContainerDocumentDigest = false means digest everything
AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.digester(new CommonsDigester(100000, DigestDef.Algorithm.MD5));
+ config.digester(new CommonsDigester(DigestDef.Algorithm.MD5));
config.setSkipContainerDocumentDigest(false);
AutoDetectParser parser = new AutoDetectParser();
@@ -71,7 +71,7 @@ public class SkipContainerDocumentDigestTest extends TikaTest
{
public void testSkipContainerDigestOnly() throws Exception {
// skipContainerDocumentDigest = true means skip container, digest
only embedded
AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.digester(new CommonsDigester(100000, DigestDef.Algorithm.MD5));
+ config.digester(new CommonsDigester(DigestDef.Algorithm.MD5));
config.setSkipContainerDocumentDigest(true);
AutoDetectParser parser = new AutoDetectParser();
@@ -95,7 +95,7 @@ public class SkipContainerDocumentDigestTest extends TikaTest
{
public void testSkipContainerDocumentDigestMarkerInParseContext() throws
Exception {
// Test that the SkipContainerDocumentDigest marker in ParseContext
works
AutoDetectParserConfig config = new AutoDetectParserConfig();
- config.digester(new CommonsDigester(100000, DigestDef.Algorithm.MD5));
+ config.digester(new CommonsDigester(DigestDef.Algorithm.MD5));
config.setSkipContainerDocumentDigest(false); // Config says digest all
AutoDetectParser parser = new AutoDetectParser();
@@ -145,7 +145,6 @@ public class SkipContainerDocumentDigestTest extends
TikaTest {
public void testDigestWithFactory() throws Exception {
// Test using the factory pattern
CommonsDigesterFactory factory = new CommonsDigesterFactory();
- factory.setMarkLimit(100000);
AutoDetectParserConfig config = new AutoDetectParserConfig();
config.setDigesterFactory(factory);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
index 9245331ec5..12b49d6267 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json
@@ -6,7 +6,6 @@
"throwOnZeroBytes": false,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "SHA256" }
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
index fed21bc5af..5ac209517f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
@@ -3,7 +3,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"bouncy-castle-digester-factory": {
- "markLimit": 1000000,
"digests": [
{ "algorithm": "SHA1", "encoding": "BASE32" }
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
index 770fba7ffe..53bfd01732 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
@@ -3,7 +3,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"bouncy-castle-digester-factory": {
- "markLimit": 1000000,
"digests": [
{ "algorithm": "MD2" },
{ "algorithm": "MD5" },
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
index 830d8c0809..b2e23ad974 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
@@ -3,7 +3,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"bouncy-castle-digester-factory": {
- "markLimit": 1000000,
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA256" },
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
index 2a2634a88e..c37e6965f2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
@@ -3,7 +3,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 1000000,
"digests": [
{ "algorithm": "MD2" },
{ "algorithm": "MD5" },
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
index cf7c3874a0..60825fe974 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
@@ -12,7 +12,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
index ed2145a404..8ed562166a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
@@ -4,7 +4,6 @@
"skipContainerDocumentDigest": true,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
index 004e6ea753..50bbd90b99 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
@@ -3,7 +3,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
index caffd0c709..a13a80c7db 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json
@@ -2,7 +2,6 @@
"auto-detect-parser": {
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" }
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
index 1872313a9c..3ca9aa461a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
@@ -4,7 +4,6 @@
"skipContainerDocumentDigest": true,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "SHA256", "encoding": "BASE32" },
{ "algorithm": "MD5" }
diff --git
a/tika-serialization/src/test/java/org/apache/tika/digest/MockDigesterFactory.java
b/tika-serialization/src/test/java/org/apache/tika/digest/MockDigesterFactory.java
index a8cbc69301..857267d74b 100644
---
a/tika-serialization/src/test/java/org/apache/tika/digest/MockDigesterFactory.java
+++
b/tika-serialization/src/test/java/org/apache/tika/digest/MockDigesterFactory.java
@@ -23,7 +23,7 @@ public class MockDigesterFactory implements DigesterFactory {
@Override
public Digester build() {
- return new InputStreamDigester(1000000, "SHA-256",
"X-TIKA:digest:SHA-256", new MockEncoder());
+ return new InputStreamDigester("SHA-256", "X-TIKA:digest:SHA-256", new
MockEncoder());
}
private static class MockEncoder implements Encoder {
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 3a1389b140..d2a290fe91 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -74,7 +74,6 @@ public abstract class CXFTestBase {
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" }
]
diff --git
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
index f8284e5e4d..bcae4fb7e6 100644
---
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
+++
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
@@ -50,7 +50,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
index f8284e5e4d..bcae4fb7e6 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
@@ -50,7 +50,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
index fdf80cb998..d134099806 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
@@ -13,7 +13,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
index 97646bc879..dd199e46d2 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
@@ -18,7 +18,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
index 8d3f74ed3c..4f30e99b4b 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
@@ -18,7 +18,6 @@
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
- "markLimit": 100000,
"digests": [
{ "algorithm": "MD5" },
{ "algorithm": "SHA1", "encoding": "BASE32" }