This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4624
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 500ef0dbe1c35f261606abd5833e642ef00510e6
Author: tallison <[email protected]>
AuthorDate: Thu Jan 15 12:53:19 2026 -0500

    WIP: Checkpoint - CachingSource metadata update and cleanup

    - Remove tmp parameter from TikaInputSource.getPath()
    - Add Metadata to CachingSource constructor
    - Update CONTENT_LENGTH when spilling to disk
    - Add CachingSource unit tests
    - Remove markLimit from digesters and configs (depends on enableRewind)

    🤖 Generated with [Claude Code](https://claude.com/claude-code)

    Co-Authored-By: Claude Opus 4.5 <[email protected]>
---
 .../main/java/org/apache/tika/digest/Digester.java | 11 +-
 .../java/org/apache/tika/io/ByteArraySource.java | 6 +-
 .../java/org/apache/tika/io/CachingSource.java | 17 ++-
 .../main/java/org/apache/tika/io/FileSource.java | 2 +-
 .../java/org/apache/tika/io/TikaInputSource.java | 3 +-
 .../java/org/apache/tika/io/TikaInputStream.java | 10 +-
 .../org/apache/tika/io/TikaInputStreamTest.java | 116 +++++++++++++++++++++
 .../parser/digestutils/BouncyCastleDigester.java | 31 +++---
 .../digestutils/BouncyCastleDigesterFactory.java | 14 +--
 .../tika/parser/digestutils/CommonsDigester.java | 29 +++---
 .../parser/digestutils/CommonsDigesterFactory.java | 14 +--
 .../apache/tika/detect/ole/MiscOLEDetector.java | 20 ----
 .../src/test/resources/configs/tika-4533.json | 1 -
 .../configs/tika-config-bc-digests-base32.json | 1 -
 .../configs/tika-config-bc-digests-basic.json | 1 -
 .../configs/tika-config-bc-digests-multiple.json | 1 -
 .../configs/tika-config-commons-digests-basic.json | 1 -
 .../configs/tika-config-digests-pdf-only.json | 1 -
 .../tika-config-digests-skip-container.json | 1 -
 .../resources/configs/tika-config-digests.json | 1 -
 .../resources/configs/tika-config-md5-digest.json | 1 -
 .../configs/tika-config-write-filter.json | 1 -
 .../resources/configs/cxf-test-base-template.json | 1 -
 .../resources/configs/cxf-test-base-template.json | 1 -
 .../configs/tika-config-for-server-tests.json | 1 -
 .../tika-config-langdetect-opennlp-filter.json | 1 -
 .../tika-config-langdetect-optimaize-filter.json | 1 -
 27 files changed, 175 insertions(+), 113 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/digest/Digester.java b/tika-core/src/main/java/org/apache/tika/digest/Digester.java
index ac6459607c..133d5dce09 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/Digester.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/Digester.java
@@ -30,14 +30,11 @@ import org.apache.tika.parser.ParseContext;
 public interface Digester {
     /**
      * Digests a TikaInputStream and sets the appropriate value(s) in the metadata.
-     * The Digester is also responsible for marking and resetting the stream.
+     * The Digester is responsible for calling {@link TikaInputStream#enableRewind()}
+     * and {@link TikaInputStream#rewind()} to ensure the stream can be read by
+     * subsequent processing after digesting.
      * <p>
-     * The given stream is guaranteed to support the
-     * {@link TikaInputStream#markSupported() mark feature} and the detector
-     * is expected to {@link TikaInputStream#mark(int) mark} the stream before
-     * reading any bytes from it, and to {@link TikaInputStream#reset() reset}
-     * the stream before returning. The stream must not be closed by the
-     * detector.
+     * The stream must not be closed by the digester.
* * @param tis TikaInputStream to digest * @param m Metadata to set the values for diff --git a/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java b/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java index 3d19a04a5d..148fcc04d8 100644 --- a/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java +++ b/tika-core/src/main/java/org/apache/tika/io/ByteArraySource.java @@ -36,12 +36,14 @@ class ByteArraySource extends InputStream implements TikaInputSource { private final byte[] data; private final int length; + private final TemporaryResources tmp; private int position; private Path spilledPath; - ByteArraySource(byte[] data) { + ByteArraySource(byte[] data, TemporaryResources tmp) { this.data = data; this.length = data.length; + this.tmp = tmp; this.position = 0; this.spilledPath = null; } @@ -97,7 +99,7 @@ class ByteArraySource extends InputStream implements TikaInputSource { } @Override - public Path getPath(TemporaryResources tmp, String suffix) throws IOException { + public Path getPath(String suffix) throws IOException { if (spilledPath == null) { // Spill to temp file on first call spilledPath = tmp.createTempFile(suffix); diff --git a/tika-core/src/main/java/org/apache/tika/io/CachingSource.java b/tika-core/src/main/java/org/apache/tika/io/CachingSource.java index baf38c7cd8..15a43d0b33 100644 --- a/tika-core/src/main/java/org/apache/tika/io/CachingSource.java +++ b/tika-core/src/main/java/org/apache/tika/io/CachingSource.java @@ -24,6 +24,9 @@ import java.nio.file.Path; import org.apache.commons.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.utils.StringUtils; + /** * Input source that caches bytes from a raw InputStream. * <p> @@ -33,17 +36,17 @@ import org.apache.commons.io.IOUtils; */ class CachingSource extends InputStream implements TikaInputSource { - private final TemporaryResources tmp; private CachingInputStream cachingStream; private long length; + private final Metadata metadata; // After spilling to file, we switch to file-backed mode private Path spilledPath; private InputStream fileStream; - CachingSource(InputStream source, TemporaryResources tmp, long length) { - this.tmp = tmp; + CachingSource(InputStream source, TemporaryResources tmp, long length, Metadata metadata) { this.length = length; + this.metadata = metadata; StreamCache cache = new StreamCache(tmp); this.cachingStream = new CachingInputStream( source instanceof BufferedInputStream ? 
source : new BufferedInputStream(source), @@ -104,7 +107,7 @@ class CachingSource extends InputStream implements TikaInputSource { } @Override - public Path getPath(TemporaryResources tmp, String suffix) throws IOException { + public Path getPath(String suffix) throws IOException { if (spilledPath == null) { // Spill to file and switch to file-backed mode spilledPath = cachingStream.spillToFile(suffix); @@ -127,6 +130,12 @@ class CachingSource extends InputStream implements TikaInputSource { length = fileSize; } + // Update metadata if not already set + if (metadata != null && + StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) { + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); + } + cachingStream = null; } return spilledPath; diff --git a/tika-core/src/main/java/org/apache/tika/io/FileSource.java b/tika-core/src/main/java/org/apache/tika/io/FileSource.java index e89690a086..90907c9173 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FileSource.java +++ b/tika-core/src/main/java/org/apache/tika/io/FileSource.java @@ -102,7 +102,7 @@ class FileSource extends InputStream implements TikaInputSource { } @Override - public Path getPath(TemporaryResources tmp, String suffix) throws IOException { + public Path getPath(String suffix) throws IOException { // Already file-backed, just return the path return path; } diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java index 10e2b52dd8..a8293dca67 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputSource.java @@ -43,11 +43,10 @@ interface TikaInputSource extends Closeable { /** * Gets the file path, potentially spilling to a temp file if needed. - * @param tmp temporary resources for creating temp files * @param suffix file suffix for temp files * @return the file path */ - Path getPath(TemporaryResources tmp, String suffix) throws IOException; + Path getPath(String suffix) throws IOException; /** * Returns the length of the content, or -1 if unknown. 
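The net effect of the TikaInputSource/CachingSource changes, seen from a caller: once a purely stream-backed TikaInputStream spills to disk via getPath(), the spill records Content-Length in the Metadata if nothing set it earlier. A minimal sketch, assuming the three-argument get(stream, tmp, metadata) factory visible in the TikaInputStream hunk below and a 13-byte payload:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Path;

    import org.apache.tika.io.TemporaryResources;
    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;

    public class SpillLengthSketch {
        public static void main(String[] args) throws Exception {
            byte[] data = "Hello, World!".getBytes(StandardCharsets.UTF_8);
            Metadata metadata = new Metadata();
            try (TemporaryResources tmp = new TemporaryResources();
                 TikaInputStream tis = TikaInputStream.get(
                         new ByteArrayInputStream(data), tmp, metadata)) {
                // Length of a raw stream is unknown up front (CachingSource gets -1).
                Path p = tis.getPath();  // forces the CachingSource to spill to a temp file
                // The spill measured the bytes written, so the metadata now carries it.
                System.out.println(p + " -> " + metadata.get(Metadata.CONTENT_LENGTH)); // ... -> 13
            }
        }
    }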
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index c87a5ea09d..c92d599310 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -103,7 +103,7 @@ public class TikaInputStream extends TaggedInputStream { return (TikaInputStream) stream; } String ext = getExtension(metadata); - TikaInputSource inputSource = new CachingSource(stream, tmp, -1); + TikaInputSource inputSource = new CachingSource(stream, tmp, -1, metadata); return new TikaInputStream(inputSource, tmp, ext); } @@ -123,7 +123,7 @@ public class TikaInputStream extends TaggedInputStream { metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length)); String ext = getExtension(metadata); TemporaryResources tmp = new TemporaryResources(); - TikaInputSource inputSource = new ByteArraySource(data); + TikaInputSource inputSource = new ByteArraySource(data, tmp); return new TikaInputStream(inputSource, tmp, ext); } @@ -180,7 +180,7 @@ public class TikaInputStream extends TaggedInputStream { String ext = getExtension(metadata); TemporaryResources tmp = new TemporaryResources(); TikaInputSource inputSource = new CachingSource( - new BufferedInputStream(blob.getBinaryStream()), tmp, length); + new BufferedInputStream(blob.getBinaryStream()), tmp, length, metadata); return new TikaInputStream(inputSource, tmp, ext); } } @@ -241,7 +241,7 @@ public class TikaInputStream extends TaggedInputStream { String ext = getExtension(metadata); TemporaryResources tmp = new TemporaryResources(); TikaInputSource inputSource = new CachingSource( - new BufferedInputStream(connection.getInputStream()), tmp, length); + new BufferedInputStream(connection.getInputStream()), tmp, length, metadata); return new TikaInputStream(inputSource, tmp, ext); } @@ -383,7 +383,7 @@ public class TikaInputStream extends TaggedInputStream { if (source == null) { throw new IOException("No TikaInputSource available"); } - return source.getPath(tmp, suffix); + return source.getPath(suffix); } public File getFile() throws IOException { diff --git a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java index 6976a0bba4..9c0c05dcc4 100644 --- a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java @@ -669,6 +669,122 @@ public class TikaInputStreamTest { } } + // ========== CachingSource Tests ========== + + @Test + public void testCachingSourceUpdatesMetadataOnSpill() throws IOException { + byte[] data = bytes("Hello, World!"); + Metadata metadata = new Metadata(); + // Don't set CONTENT_LENGTH - let CachingSource set it on spill + + try (TemporaryResources tmp = new TemporaryResources()) { + CachingSource source = new CachingSource( + new ByteArrayInputStream(data), tmp, -1, metadata); + + // Read all data + byte[] buffer = new byte[data.length]; + int totalRead = 0; + int n; + while ((n = source.read(buffer, totalRead, buffer.length - totalRead)) != -1) { + totalRead += n; + if (totalRead >= buffer.length) break; + } + + // Before spill, metadata should not have length + assertNull(metadata.get(Metadata.CONTENT_LENGTH)); + + // Force spill to file + Path path = source.getPath(".tmp"); + assertNotNull(path); + assertTrue(Files.exists(path)); + + // After spill, metadata should have length + assertEquals("13", 
metadata.get(Metadata.CONTENT_LENGTH)); + + source.close(); + } + } + + @Test + public void testCachingSourceDoesNotOverwriteExistingMetadata() throws IOException { + byte[] data = bytes("Hello, World!"); + Metadata metadata = new Metadata(); + // Pre-set CONTENT_LENGTH + metadata.set(Metadata.CONTENT_LENGTH, "999"); + + try (TemporaryResources tmp = new TemporaryResources()) { + CachingSource source = new CachingSource( + new ByteArrayInputStream(data), tmp, -1, metadata); + + // Read and spill + IOUtils.toByteArray(source); + source.seekTo(0); + Path path = source.getPath(".tmp"); + + // Existing value should not be overwritten + assertEquals("999", metadata.get(Metadata.CONTENT_LENGTH)); + + source.close(); + } + } + + @Test + public void testCachingSourceSeekTo() throws IOException { + byte[] data = bytes("ABCDEFGHIJ"); + + try (TemporaryResources tmp = new TemporaryResources()) { + CachingSource source = new CachingSource( + new ByteArrayInputStream(data), tmp, -1, null); + + // Read first 5 bytes + byte[] buf = new byte[5]; + source.read(buf); + assertEquals("ABCDE", str(buf)); + + // Seek back to position 2 + source.seekTo(2); + + // Read again + buf = new byte[3]; + source.read(buf); + assertEquals("CDE", str(buf)); + + source.close(); + } + } + + @Test + public void testCachingSourceAfterSpill() throws IOException { + byte[] data = bytes("ABCDEFGHIJ"); + + try (TemporaryResources tmp = new TemporaryResources()) { + CachingSource source = new CachingSource( + new ByteArrayInputStream(data), tmp, -1, null); + + // Read first 5 bytes + byte[] buf = new byte[5]; + source.read(buf); + assertEquals("ABCDE", str(buf)); + + // Force spill + Path path = source.getPath(".tmp"); + assertTrue(Files.exists(path)); + + // Continue reading after spill + buf = new byte[5]; + source.read(buf); + assertEquals("FGHIJ", str(buf)); + + // Seek back and read again + source.seekTo(0); + buf = new byte[10]; + source.read(buf); + assertEquals("ABCDEFGHIJ", str(buf)); + + source.close(); + } + } + // ========== Helper Methods ========== private TikaInputStream createTikaInputStream(byte[] data, boolean fileBacked) throws IOException { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java index d3d1465dfd..68a7280f83 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java @@ -37,35 +37,35 @@ import org.apache.tika.digest.InputStreamDigester; * <p> * BouncyCastle supports additional algorithms beyond the standard Java ones, * such as SHA3-256, SHA3-384, SHA3-512. + * <p> + * This digester uses {@link org.apache.tika.io.TikaInputStream#enableRewind()} and + * {@link org.apache.tika.io.TikaInputStream#rewind()} to read the entire stream, + * compute the digest, and then rewind for subsequent processing. 
*/ public class BouncyCastleDigester extends CompositeDigester { /** - * @param markLimit limit for mark/reset; after this limit is hit, the - * stream is reset and spooled to disk - * @param digests list of digest definitions (algorithm + encoding pairs) + * @param digests list of digest definitions (algorithm + encoding pairs) */ - public BouncyCastleDigester(int markLimit, List<DigestDef> digests) { - super(buildDigesters(markLimit, digests)); + public BouncyCastleDigester(List<DigestDef> digests) { + super(buildDigesters(digests)); } /** * Convenience constructor using Algorithm enum with HEX encoding. * - * @param markLimit limit for mark/reset; after this limit is hit, the - * stream is reset and spooled to disk * @param algorithms algorithms to run (uses HEX encoding for all) */ - public BouncyCastleDigester(int markLimit, DigestDef.Algorithm... algorithms) { - super(buildDigesters(markLimit, algorithms)); + public BouncyCastleDigester(DigestDef.Algorithm... algorithms) { + super(buildDigesters(algorithms)); } - private static Digester[] buildDigesters(int markLimit, List<DigestDef> digests) { + private static Digester[] buildDigesters(List<DigestDef> digests) { Digester[] digesters = new Digester[digests.size()]; int i = 0; for (DigestDef def : digests) { Encoder encoder = getEncoder(def.getEncoding()); - digesters[i++] = new BCInputStreamDigester(markLimit, + digesters[i++] = new BCInputStreamDigester( def.getAlgorithm().getJavaName(), def.getMetadataKey(), encoder); @@ -73,13 +73,13 @@ public class BouncyCastleDigester extends CompositeDigester { return digesters; } - private static Digester[] buildDigesters(int markLimit, DigestDef.Algorithm[] algorithms) { + private static Digester[] buildDigesters(DigestDef.Algorithm[] algorithms) { Digester[] digesters = new Digester[algorithms.length]; Encoder encoder = getEncoder(DigestDef.Encoding.HEX); int i = 0; for (DigestDef.Algorithm algorithm : algorithms) { DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX); - digesters[i++] = new BCInputStreamDigester(markLimit, + digesters[i++] = new BCInputStreamDigester( algorithm.getJavaName(), def.getMetadataKey(), encoder); @@ -123,9 +123,8 @@ public class BouncyCastleDigester extends CompositeDigester { private static class BCInputStreamDigester extends InputStreamDigester { - public BCInputStreamDigester(int markLimit, String algorithm, String algorithmKeyName, - Encoder encoder) { - super(markLimit, algorithm, algorithmKeyName, encoder); + public BCInputStreamDigester(String algorithm, String algorithmKeyName, Encoder encoder) { + super(algorithm, algorithmKeyName, encoder); try { MessageDigest.getInstance(algorithm, getProvider()); } catch (NoSuchAlgorithmException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java index 895880f246..a8a1894586 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigesterFactory.java @@ -27,7 +27,7 @@ import 
org.apache.tika.digest.DigesterFactory; /** * Factory for {@link BouncyCastleDigester} with configurable algorithms and encodings. * <p> - * Default: markLimit = 1000000, MD5 with HEX encoding. + * Default: MD5 with HEX encoding. * <p> * BouncyCastle supports additional algorithms beyond the standard Java ones, * such as SHA3-256, SHA3-384, SHA3-512. @@ -37,7 +37,6 @@ import org.apache.tika.digest.DigesterFactory; * { * "digesterFactory": { * "bouncy-castle-digester-factory": { - * "markLimit": 1000000, * "digests": [ * { "algorithm": "MD5" }, * { "algorithm": "SHA3_256", "encoding": "BASE32" } @@ -50,7 +49,6 @@ import org.apache.tika.digest.DigesterFactory; @TikaComponent public class BouncyCastleDigesterFactory implements DigesterFactory { - private int markLimit = 1000000; private List<DigestDef> digests = new ArrayList<>(); public BouncyCastleDigesterFactory() { @@ -59,15 +57,7 @@ public class BouncyCastleDigesterFactory implements DigesterFactory { @Override public Digester build() { - return new BouncyCastleDigester(markLimit, digests); - } - - public int getMarkLimit() { - return markLimit; - } - - public void setMarkLimit(int markLimit) { - this.markLimit = markLimit; + return new BouncyCastleDigester(digests); } public List<DigestDef> getDigests() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java index 4ae544ff64..0f5185b0f5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java @@ -32,38 +32,33 @@ import org.apache.tika.digest.InputStreamDigester; * Implementation of {@link Digester} * that relies on commons.codec.digest.DigestUtils to calculate digest hashes. * <p> - * This digester tries to use the regular mark/reset protocol on the InputStream. - * However, this wraps an internal BoundedInputStream, and if the InputStream - * is not fully read, then this will reset the stream and - * spool the InputStream to disk (via TikaInputStream) and then digest the file. + * This digester uses {@link org.apache.tika.io.TikaInputStream#enableRewind()} and + * {@link org.apache.tika.io.TikaInputStream#rewind()} to read the entire stream, + * compute the digest, and then rewind for subsequent processing. */ public class CommonsDigester extends CompositeDigester { /** - * @param markLimit limit for mark/reset; after this limit is hit, the - * stream is reset and spooled to disk - * @param digests list of digest definitions (algorithm + encoding pairs) + * @param digests list of digest definitions (algorithm + encoding pairs) */ - public CommonsDigester(int markLimit, List<DigestDef> digests) { - super(buildDigesters(markLimit, digests)); + public CommonsDigester(List<DigestDef> digests) { + super(buildDigesters(digests)); } /** - * @param markLimit limit for mark/reset; after this limit is hit, the - * stream is reset and spooled to disk * @param algorithms algorithms to run (uses HEX encoding for all) */ - public CommonsDigester(int markLimit, DigestDef.Algorithm... 
algorithms) { - super(buildDigesters(markLimit, algorithms)); + public CommonsDigester(DigestDef.Algorithm... algorithms) { + super(buildDigesters(algorithms)); } - private static Digester[] buildDigesters(int markLimit, List<DigestDef> digests) { + private static Digester[] buildDigesters(List<DigestDef> digests) { Digester[] digesters = new Digester[digests.size()]; int i = 0; for (DigestDef def : digests) { checkSupported(def.getAlgorithm()); Encoder encoder = getEncoder(def.getEncoding()); - digesters[i++] = new InputStreamDigester(markLimit, + digesters[i++] = new InputStreamDigester( def.getAlgorithm().getJavaName(), def.getMetadataKey(), encoder); @@ -71,14 +66,14 @@ public class CommonsDigester extends CompositeDigester { return digesters; } - private static Digester[] buildDigesters(int markLimit, DigestDef.Algorithm[] algorithms) { + private static Digester[] buildDigesters(DigestDef.Algorithm[] algorithms) { Digester[] digesters = new Digester[algorithms.length]; Encoder encoder = getEncoder(DigestDef.Encoding.HEX); int i = 0; for (DigestDef.Algorithm algorithm : algorithms) { checkSupported(algorithm); DigestDef def = new DigestDef(algorithm, DigestDef.Encoding.HEX); - digesters[i++] = new InputStreamDigester(markLimit, + digesters[i++] = new InputStreamDigester( algorithm.getJavaName(), def.getMetadataKey(), encoder); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java index 8d26fbce16..b141c7340e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java @@ -27,14 +27,13 @@ import org.apache.tika.digest.DigesterFactory; /** * Factory for {@link CommonsDigester} with configurable algorithms and encodings. * <p> - * Default: markLimit = 1000000, MD5 with HEX encoding. + * Default: MD5 with HEX encoding. 
* <p> * Example JSON configuration: * <pre> * { * "digesterFactory": { * "commons-digester": { - * "markLimit": 1000000, * "digests": [ * { "algorithm": "MD5" }, * { "algorithm": "SHA256", "encoding": "BASE32" } @@ -47,7 +46,6 @@ import org.apache.tika.digest.DigesterFactory; @TikaComponent public class CommonsDigesterFactory implements DigesterFactory { - private int markLimit = 1000000; private List<DigestDef> digests = new ArrayList<>(); public CommonsDigesterFactory() { @@ -56,15 +54,7 @@ public class CommonsDigesterFactory implements DigesterFactory { @Override public Digester build() { - return new CommonsDigester(markLimit, digests); - } - - public int getMarkLimit() { - return markLimit; - } - - public void setMarkLimit(int markLimit) { - this.markLimit = markLimit; + return new CommonsDigester(digests); } public List<DigestDef> getDigests() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java index f887b2d7e7..fc37999a48 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/detect/ole/MiscOLEDetector.java @@ -64,9 +64,6 @@ public class MiscOLEDetector implements Detector { */ public static final MediaType QUATTROPRO = application("x-quattro-pro"); - - private int markLimit = 16 * 1024 * 1024; - /** * Internal detection of the specific kind of OLE2 document, based on the * names of the top level streams within the file. @@ -117,23 +114,6 @@ public class MiscOLEDetector implements Detector { return names; } - /** - * If a TikaInputStream is passed in to {@link #detect(InputStream, Metadata)}, - * and there is not an underlying file, this detector will spool up to {@link #markLimit} - * to disk. If the stream was read in entirety (e.g. the spooled file is not truncated), - * this detector will open the file with POI and perform detection. - * If the spooled file is truncated, the detector will return {@link #OLE} (or - * {@link MediaType#OCTET_STREAM} if there's no OLE header). - * <p> - * As of Tika 1.21, this detector respects the legacy behavior of not performing detection - * on a non-TikaInputStream. - * - * @param markLimit - */ - public void setMarkLimit(int markLimit) { - this.markLimit = markLimit; - } - private Set<String> getTopLevelNames(TikaInputStream stream) throws IOException { // Force the document stream to a (possibly temporary) file // so we don't modify the current position of the stream. 
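With markLimit gone, a digester no longer marks/resets within a bound or spools to disk itself; it leans on TikaInputStream's rewind support, as the updated Digester javadoc describes. Roughly, in a minimal sketch (the InputStreamDigester body is not part of this patch, and the exact digest(...) signature and metadata key used here are assumptions):

    import java.io.IOException;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;

    public class RewindDigestSketch {

        // Roughly what a Digester.digest(tis, metadata, parseContext) implementation
        // is now expected to do: enable rewind, consume the whole stream, record the
        // digest, rewind for downstream detection/parsing, and never close the stream.
        static void digestMd5(TikaInputStream tis, Metadata metadata) throws IOException {
            tis.enableRewind();                    // make the stream replayable
            MessageDigest md5;
            try {
                md5 = MessageDigest.getInstance("MD5");
            } catch (NoSuchAlgorithmException e) {
                throw new IOException(e);
            }
            byte[] buffer = new byte[8192];
            int n;
            while ((n = tis.read(buffer)) != -1) { // read to EOF; no mark limit to respect
                md5.update(buffer, 0, n);
            }
            metadata.set("X-TIKA:digest:MD5", hex(md5.digest()));
            tis.rewind();                          // hand back a stream positioned at 0
        }

        private static String hex(byte[] bytes) {
            StringBuilder sb = new StringBuilder(bytes.length * 2);
            for (byte b : bytes) {
                sb.append(String.format("%02x", b));
            }
            return sb.toString();
        }
    }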
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json index 9245331ec5..12b49d6267 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json @@ -6,7 +6,6 @@ "throwOnZeroBytes": false, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "SHA256" } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json index fed21bc5af..5ac209517f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json @@ -3,7 +3,6 @@ "outputThreshold": 1000000, "digesterFactory": { "bouncy-castle-digester-factory": { - "markLimit": 1000000, "digests": [ { "algorithm": "SHA1", "encoding": "BASE32" } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json index 770fba7ffe..53bfd01732 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json @@ -3,7 +3,6 @@ "outputThreshold": 1000000, "digesterFactory": { "bouncy-castle-digester-factory": { - "markLimit": 1000000, "digests": [ { "algorithm": "MD2" }, { "algorithm": "MD5" }, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json index 830d8c0809..b2e23ad974 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json @@ -3,7 +3,6 @@ "outputThreshold": 1000000, "digesterFactory": { "bouncy-castle-digester-factory": { - "markLimit": 1000000, "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA256" }, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json index 2a2634a88e..c37e6965f2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json @@ -3,7 +3,6 @@ "outputThreshold": 
1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 1000000, "digests": [ { "algorithm": "MD2" }, { "algorithm": "MD5" }, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json index cf7c3874a0..60825fe974 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json @@ -12,7 +12,6 @@ "outputThreshold": 1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json index ed2145a404..8ed562166a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json @@ -4,7 +4,6 @@ "skipContainerDocumentDigest": true, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json index 004e6ea753..50bbd90b99 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json @@ -3,7 +3,6 @@ "outputThreshold": 1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json index caffd0c709..a13a80c7db 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-md5-digest.json @@ -2,7 +2,6 @@ "auto-detect-parser": { "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "MD5" } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json index 1872313a9c..3ca9aa461a 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@ -4,7 +4,6 @@ "skipContainerDocumentDigest": true, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } diff --git a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json index f8284e5e4d..bcae4fb7e6 100644 --- a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json @@ -50,7 +50,6 @@ "outputThreshold": 1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } diff --git a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json index f8284e5e4d..bcae4fb7e6 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json @@ -50,7 +50,6 @@ "outputThreshold": 1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json index fdf80cb998..d134099806 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json @@ -13,7 +13,6 @@ "outputThreshold": 1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json index 97646bc879..dd199e46d2 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json @@ -18,7 +18,6 @@ "outputThreshold": 1000000, "digesterFactory": { "commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" } diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json index 8d3f74ed3c..4f30e99b4b 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json @@ -18,7 +18,6 @@ "outputThreshold": 1000000, "digesterFactory": { 
"commons-digester-factory": { - "markLimit": 100000, "digests": [ { "algorithm": "MD5" }, { "algorithm": "SHA1", "encoding": "BASE32" }
