This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a34d52da20 TIKA-4618 -- improve spooling strategy configuration (#2533)
a34d52da20 is described below
commit a34d52da20815cdc19d8fc46cfd5284f219cd730
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jan 15 08:41:52 2026 -0500
TIKA-4618 -- improve spooling strategy configuration (#2533)
---
docs/spooling.adoc | 211 +++++++++++++++++++++
.../org/apache/tika/detect/DefaultDetector.java | 101 ++++++++--
.../apache/tika/digest/InputStreamDigester.java | 6 +
.../java/org/apache/tika/io/SpoolingStrategy.java | 140 ++++++++++++++
.../apache/tika/metadata/TikaCoreProperties.java | 7 +
.../org/apache/tika/parser/AutoDetectParser.java | 32 ----
.../apache/tika/parser/AutoDetectParserConfig.java | 21 +-
.../apache/tika/sax/BodyContentHandlerTest.java | 3 +-
.../tika/parser/apple/AppleSingleFileParser.java | 16 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 44 +++--
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
.../org/apache/tika/parser/pkg/tika-config.xml | 31 ---
.../apache/tika/parser/warc/WARCParserTest.java | 1 +
.../apache/tika/detect/TestDetectorLoading.java | 3 +-
.../apache/tika/parser/crypto/TSDParserTest.java | 3 +-
.../configs/tika-config-bc-digests-base32.json | 1 -
.../configs/tika-config-bc-digests-basic.json | 1 -
.../configs/tika-config-bc-digests-multiple.json | 1 -
.../configs/tika-config-commons-digests-basic.json | 1 -
.../configs/tika-config-digests-pdf-only.json | 1 -
.../tika-config-digests-skip-container.json | 1 -
.../resources/configs/tika-config-digests.json | 1 -
...a-config-doubling-custom-handler-decorator.json | 1 -
.../resources/configs/tika-config-no-names.json | 1 -
...a-config-upcasing-custom-handler-decorator.json | 1 -
.../resources/configs/tika-config-with-names.json | 1 -
.../configs/tika-config-write-filter.json | 1 -
.../test/resources/configs/tika-config-basic.json | 1 -
.../resources/configs/tika-config-passback.json | 1 -
.../resources/configs/tika-config-truncate.json | 1 -
.../resources/configs/tika-config-uppercasing.json | 1 -
.../apache/tika/config/loader/TikaJsonConfig.java | 2 +-
.../org/apache/tika/serialization/TikaModule.java | 16 ++
.../tika/config/loader/ConfigLoaderTest.java | 16 ++
.../tika/config/loader/TikaJsonConfigTest.java | 6 +-
.../apache/tika/config/loader/TikaLoaderTest.java | 52 +++--
.../test/resources/configs/TIKA-3695-exclude.json | 1 -
.../test/resources/configs/TIKA-3695-fields.json | 1 -
.../src/test/resources/configs/TIKA-3695.json | 1 -
.../configs/TIKA-4207-embedded-bytes-config.json | 1 -
.../test/resources/configs/test-config-loader.json | 4 +
.../org/apache/tika/server/core/CXFTestBase.java | 1 -
.../resources/configs/cxf-test-base-template.json | 1 -
.../tika/server/standard/TikaDetectorsTest.java | 19 +-
.../resources/configs/cxf-test-base-template.json | 1 -
.../configs/tika-config-for-server-tests.json | 1 -
.../tika-config-langdetect-opennlp-filter.json | 1 -
.../tika-config-langdetect-optimaize-filter.json | 1 -
48 files changed, 575 insertions(+), 187 deletions(-)
diff --git a/docs/spooling.adoc b/docs/spooling.adoc
new file mode 100644
index 0000000000..7e1959a5ec
--- /dev/null
+++ b/docs/spooling.adoc
@@ -0,0 +1,211 @@
+= Spooling in Apache Tika
+:toc:
+:toclevels: 3
+:sectnums:
+
+== Background
+
+=== What is Spooling?
+
+Spooling refers to the process of writing an input stream to a temporary file
on disk.
+This is necessary for certain file formats that require random access to the
underlying
+bytes during detection or parsing.
+
+=== Why Some Formats Benefit from Random Access
+
+Several file formats are most efficiently processed with random access versus
streaming:
+
+* **OLE2 (Microsoft Office legacy formats)**: The POI library needs to read
the file
+ as a random-access structure to navigate the OLE2 container.
+* **ZIP-based formats**: Container detection requires reading the ZIP central
directory,
+ which is located at the end of the file.
+* **Binary Property Lists (bplist)**: Apple's binary plist format requires
random access
+ for efficient parsing.
+* **PDF**: While detection works via magic bytes, parsing requires random
access for
+ the PDF cross-reference table.
+
+=== Architectural Decision: Decentralized Spooling
+
+==== The Problem with Centralized Spooling
+
+Earlier versions of Tika considered centralizing spooling decisions in
`DefaultDetector`.
+The detector would check the detected media type and spool to disk before
passing the
+stream to specialized detectors or parsers.
+
+This approach had several drawbacks:
+
+1. **Unnecessary spooling**: PDF files need spooling for _parsing_ but not for
_detection_
+ (magic bytes suffice). Centralized detection-time spooling would spool PDFs
unnecessarily
+ when only detecting.
+
+2. **Redundant logic**: Specialized detectors like `POIFSContainerDetector` and
+ `DefaultZipContainerDetector` already call `TikaInputStream.getFile()` or
`getPath()`
+ when they need random access. They know best when spooling is required.
+
+3. **Coupling**: Centralized spooling couples the detector to knowledge about
which
+ formats need random access, duplicating logic that already exists in
specialized
+ components.
+
+==== The Solution: Let Components Self-Spool
+
+The current architecture follows a simple principle: **each component that
needs random
+access is responsible for obtaining it**.
+
+When a detector or parser needs random access, it calls:
+
+[source,java]
+----
+Path path = TikaInputStream.get(inputStream).getPath();
+// or
+File file = TikaInputStream.get(inputStream).getFile();
+----
+
+`TikaInputStream` handles the spooling transparently:
+
+* If the stream is already backed by a file, it returns that file directly.
+* If the stream is in-memory or network-based, it spools to a temporary file.
+* The temporary file is automatically cleaned up when the stream is closed.
+
+==== Benefits of Decentralized Spooling
+
+1. **Efficiency**: Spooling happens only when actually needed, not
preemptively.
+2. **Simplicity**: No central configuration of "which types need spooling."
+3. **Correctness**: Each component knows its own requirements.
+4. **Flexibility**: New formats can be added without modifying central
spooling logic.
+
+=== TikaInputStream Backing Strategies
+
+`TikaInputStream` uses configurable backing strategies that handle caching and
temporary
+file management. This means:
+
+* Repeated calls to `getFile()` return the same temporary file (no
re-spooling).
+* The `rewind()` method efficiently resets the stream for re-reading.
+* Memory-mapped and disk-backed strategies can be selected based on use case.
+
+== User Guide
+
+=== Default Behavior
+
+By default, Tika handles spooling automatically. You don't need to configure
anything
+for most use cases. When a detector or parser needs random access to a file,
it will
+spool the input stream to a temporary file if necessary.
+
+=== SpoolingStrategy for Fine-Grained Control
+
+For advanced use cases, you can use `SpoolingStrategy` to control spooling
behavior.
+This is useful when you want to:
+
+* Restrict which file types are allowed to spool (e.g., for security reasons)
+* Customize spooling behavior based on metadata or stream properties
+
+==== Programmatic Configuration
+
+[source,java]
+----
+import org.apache.tika.io.SpoolingStrategy;
+import org.apache.tika.parser.ParseContext;
+
+// Create a custom spooling strategy
+SpoolingStrategy strategy = new SpoolingStrategy();
+strategy.setSpoolTypes(Set.of(
+ MediaType.application("zip"),
+ MediaType.application("pdf")
+));
+
+// Add to parse context
+ParseContext context = new ParseContext();
+context.set(SpoolingStrategy.class, strategy);
+
+// Parse with the custom context
+parser.parse(inputStream, handler, metadata, context);
+----
+
+==== SpoolingStrategy Methods
+
+[source,java]
+----
+// Check if spooling should occur for a given type
+boolean shouldSpool(TikaInputStream tis, Metadata metadata, MediaType
mediaType)
+
+// Configure which types should be spooled
+void setSpoolTypes(Set<MediaType> types)
+
+// Set the media type registry for specialization checking
+void setMediaTypeRegistry(MediaTypeRegistry registry)
+----
+
+==== How Type Matching Works
+
+The `shouldSpool()` method returns `true` if:
+
+1. The stream doesn't already have a backing file (`tis.hasFile()` is false),
AND
+2. The media type matches one of the configured spool types
+
+Type matching considers:
+
+* Exact matches (e.g., `application/zip`)
+* Base type matches (e.g., a detected `application/zip; charset=utf-8` matches
a configured `application/zip`, since parameters are stripped before comparison)
+* Specializations (e.g., `application/vnd.oasis.opendocument.text` is a
specialization of `application/zip`)
+
+==== Default Spool Types
+
+The default spool types are:
+
+* `application/zip` - ZIP archives and ZIP-based formats (OOXML, ODF, EPUB,
etc.)
+* `application/x-tika-msoffice` - OLE2 Microsoft Office formats
+* `application/x-bplist` - Apple binary property lists
+* `application/pdf` - PDF documents
+
+=== JSON Configuration
+
+SpoolingStrategy can be configured via JSON in your `tika-config.json` file.
+Place the configuration in the `other-configs` section:
+
+[source,json]
+----
+{
+ "other-configs": {
+ "spooling-strategy": {
+ "spoolTypes": [
+ "application/zip",
+ "application/x-tika-msoffice",
+ "application/pdf"
+ ]
+ }
+ }
+}
+----
+
+Load the configuration using `TikaLoader`:
+
+[source,java]
+----
+TikaLoader loader = TikaLoader.load(Path.of("tika-config.json"));
+SpoolingStrategy strategy = loader.configs().load(SpoolingStrategy.class);
+
+// Add to parse context
+ParseContext context = new ParseContext();
+context.set(SpoolingStrategy.class, strategy);
+----
+
+=== Best Practices
+
+1. **Let Tika handle it**: For most applications, the default behavior is
optimal.
+ Don't configure spooling unless you have a specific need.
+
+2. **Use TikaInputStream**: Always wrap your input streams with
`TikaInputStream`
+ to enable efficient spooling and rewind capabilities.
+
+3. **Close streams properly**: Use try-with-resources to ensure temporary files
+ are cleaned up:
++
+[source,java]
+----
+try (TikaInputStream tis = TikaInputStream.get(inputStream)) {
+ parser.parse(tis, handler, metadata, context);
+}
+----
+
+4. **Consider memory vs. disk tradeoffs**: For very large files, spooling to
disk
+ is necessary. For small files processed in bulk, keeping data in memory may
be
+ faster. `TikaInputStream` backing strategies can be tuned for your workload.
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
index 2d71c5b180..3c1e91138a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
@@ -16,50 +16,66 @@
*/
package org.apache.tika.detect;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
-import javax.imageio.spi.ServiceRegistry;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaComponent;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
- * A composite detector based on all the {@link Detector} implementations
- * available through the {@link ServiceRegistry service provider mechanism}.
+ * A composite detector that orchestrates the detection pipeline:
+ * <ol>
+ * <li>MimeTypes (magic byte) detection</li>
+ * <li>Container and other detectors loaded via SPI</li>
+ * <li>TextDetector as fallback for unknown types</li>
+ * <li>Returns the most specific type detected</li>
+ * </ol>
* <p>
* Detectors are loaded and returned in a specified order, of user supplied
- * followed by non-MimeType Tika, followed by the Tika MimeType class.
+ * followed by non-MimeType Tika detectors.
* If you need to control the order of the Detectors, you should instead
* construct your own {@link CompositeDetector} and pass in the list
* of Detectors in the required order.
+ * <p>
+ * Individual detectors that need random access (e.g., for container
inspection)
+ * handle their own spooling by calling {@link TikaInputStream#getFile()}.
*
* @since Apache Tika 0.9
*/
@TikaComponent(spi = false)
public class DefaultDetector extends CompositeDetector {
- /**
- * Serial version UID
- */
private static final long serialVersionUID = -8170114575326908027L;
+
private transient final ServiceLoader loader;
private final Collection<Class<? extends Detector>> excludedClasses;
+ private final MimeTypes mimeTypes;
+ private final TextDetector textDetector;
public DefaultDetector(MimeTypes types, ServiceLoader loader,
Collection<Class<? extends Detector>>
excludeDetectors) {
- super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader,
excludeDetectors));
+ super(types.getMediaTypeRegistry(), getDefaultDetectors(loader,
excludeDetectors));
this.loader = loader;
+ this.mimeTypes = types;
+ this.textDetector = new TextDetector();
this.excludedClasses = excludeDetectors != null ?
Collections.unmodifiableCollection(new
ArrayList<>(excludeDetectors)) :
Collections.emptySet();
}
public DefaultDetector(MimeTypes types, ServiceLoader loader) {
- this(types, loader, Collections.EMPTY_SET);
+ this(types, loader, Collections.emptySet());
}
public DefaultDetector(MimeTypes types, ClassLoader loader) {
@@ -86,11 +102,13 @@ public class DefaultDetector extends CompositeDetector {
* <p>
* If an {@link OverrideDetector} is loaded, it takes precedence over
* all other detectors.
+ * <p>
+ * Note: MimeTypes is handled separately in the detect() method, not
included here.
*
* @param loader service loader
* @return ordered list of statically loadable detectors
*/
- private static List<Detector> getDefaultDetectors(MimeTypes types,
ServiceLoader loader,
+ private static List<Detector> getDefaultDetectors(ServiceLoader loader,
Collection<Class<?
extends Detector>>
excludeDetectors) {
List<Detector> detectors =
@@ -111,16 +129,73 @@ public class DefaultDetector extends CompositeDetector {
Detector detector = detectors.remove(overrideIndex);
detectors.add(0, detector);
}
- // Finally the Tika MimeTypes as a fallback
- detectors.add(types);
return detectors;
}
+ @Override
+ public MediaType detect(TikaInputStream tis, Metadata metadata,
ParseContext parseContext)
+ throws IOException {
+ // 1. Magic detection via MimeTypes
+ MediaType magicType = mimeTypes.detect(tis, metadata, parseContext);
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_MAGIC_DETECTED,
magicType.toString());
+
+ // 2. Run other detectors (container detectors, etc.)
+ // Note: Container detectors that need random access handle their own
spooling
+ MediaType detectedType = super.detect(tis, metadata, parseContext);
+
+ // 3. Text detection - only if still unknown
+ MediaType textType = null;
+ if (MediaType.OCTET_STREAM.equals(detectedType) &&
+ MediaType.OCTET_STREAM.equals(magicType)) {
+ textType = textDetector.detect(tis, metadata, parseContext);
+ }
+
+ // 4. Return most specific
+ return mostSpecific(magicType, detectedType, textType);
+ }
+
+ private MediaType mostSpecific(MediaType magicType, MediaType
detectedType, MediaType textType) {
+ MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+
+ // Collect non-null, non-octet-stream candidates
+ MediaType best = MediaType.OCTET_STREAM;
+
+ // Start with magic type as baseline if valid
+ if (magicType != null && !MediaType.OCTET_STREAM.equals(magicType)) {
+ best = magicType;
+ }
+
+ // Container detectors may find more specific types (e.g., OLE ->
msword)
+ // or less specific (e.g., commons-compress tar vs magic gtar)
+ // Use the registry to determine which is more specific
+ if (detectedType != null &&
!MediaType.OCTET_STREAM.equals(detectedType)) {
+ if (MediaType.OCTET_STREAM.equals(best)) {
+ best = detectedType;
+ } else if (registry.isSpecializationOf(detectedType, best)) {
+ // detectedType is more specific than best
+ best = detectedType;
+ } else if (!registry.isSpecializationOf(best, detectedType)) {
+ // Neither is a specialization of the other - prefer container
detection
+ // for unrelated types (e.g., different format families)
+ best = detectedType;
+ }
+ // else: best is already more specific than detectedType, keep best
+ }
+
+ // Text detection as fallback only if still unknown
+ if (MediaType.OCTET_STREAM.equals(best) && textType != null &&
+ !MediaType.OCTET_STREAM.equals(textType)) {
+ best = textType;
+ }
+
+ return best;
+ }
+
@Override
public List<Detector> getDetectors() {
if (loader != null && loader.isDynamic()) {
List<Detector> detectors =
loader.loadDynamicServiceProviders(Detector.class);
- if (detectors.size() > 0) {
+ if (!detectors.isEmpty()) {
detectors.addAll(super.getDetectors());
return detectors;
} else {
diff --git
a/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
b/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
index 5a458fd6a2..a384137300 100644
--- a/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/digest/InputStreamDigester.java
@@ -30,6 +30,12 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.StringUtils;
+// TODO: TIKA-FOLLOWUP - With TikaInputStream.rewind(), markLimit is no longer
needed.
+// The digester can simply read the entire stream, then call tis.rewind().
+// This would simplify this class and allow removing markLimit from:
+// - InputStreamDigester, CommonsDigester, BouncyCastleDigester
+// - CommonsDigesterFactory, BouncyCastleDigesterFactory
(setMarkLimit/getMarkLimit)
+// - All JSON config files that specify markLimit for digesters
public class InputStreamDigester implements Digester {
private final String algorithm;
diff --git a/tika-core/src/main/java/org/apache/tika/io/SpoolingStrategy.java
b/tika-core/src/main/java/org/apache/tika/io/SpoolingStrategy.java
new file mode 100644
index 0000000000..0a3a45bad8
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/io/SpoolingStrategy.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+
+/**
+ * Strategy for determining when to spool a TikaInputStream to disk.
+ * <p>
+ * Components (detectors, parsers) can check this strategy before calling
+ * {@link TikaInputStream#getFile()} to determine if spooling is appropriate
+ * for the given media type.
+ * <p>
+ * Default behavior (when no strategy is in ParseContext): components spool
when needed.
+ * A strategy allows fine-grained control over spooling decisions.
+ * <p>
+ * Configure via JSON:
+ * <pre>
+ * {
+ * "spooling-strategy": {
+ * "spoolTypes": ["application/zip", "application/x-tika-msoffice",
"application/pdf"]
+ * }
+ * }
+ * </pre>
+ */
+@TikaComponent(spi = false)
+public class SpoolingStrategy {
+
+ private static final Set<MediaType> DEFAULT_SPOOL_TYPES;
+
+ static {
+ Set<MediaType> types = new HashSet<>();
+ types.add(MediaType.application("zip"));
+ types.add(MediaType.application("x-tika-msoffice"));
+ types.add(MediaType.application("x-bplist"));
+ types.add(MediaType.application("pdf"));
+ DEFAULT_SPOOL_TYPES = Set.copyOf(types);
+ }
+
+ private Set<MediaType> spoolTypes = new HashSet<>(DEFAULT_SPOOL_TYPES);
+ private MediaTypeRegistry mediaTypeRegistry;
+
+ /**
+ * Determines whether the stream should be spooled to disk.
+ *
+ * @param tis the TikaInputStream (can check hasFile(), getLength())
+ * @param metadata metadata (can check content-type hints, filename)
+ * @param mediaType the detected or declared media type
+ * @return true if the stream should be spooled to disk
+ */
+ public boolean shouldSpool(TikaInputStream tis, Metadata metadata,
MediaType mediaType) {
+ // Already has file? No need to spool
+ if (tis != null && tis.hasFile()) {
+ return false;
+ }
+ // Check type against spoolTypes
+ return matchesSpoolType(mediaType);
+ }
+
+ private boolean matchesSpoolType(MediaType type) {
+ if (type == null) {
+ return false;
+ }
+ // Exact match
+ if (spoolTypes.contains(type)) {
+ return true;
+ }
+ // Base type match (without parameters)
+ MediaType baseType = type.getBaseType();
+ if (spoolTypes.contains(baseType)) {
+ return true;
+ }
+ // Check if type is a specialization of any spool type
+ if (mediaTypeRegistry != null) {
+ for (MediaType spoolType : spoolTypes) {
+ if (mediaTypeRegistry.isSpecializationOf(type, spoolType)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Sets the media types that should be spooled to disk.
+ * Specializations of these types are also included.
+ *
+ * @param spoolTypes set of media types to spool
+ */
+ public void setSpoolTypes(Set<MediaType> spoolTypes) {
+ this.spoolTypes = spoolTypes != null ? new HashSet<>(spoolTypes) : new
HashSet<>();
+ }
+
+ /**
+ * Returns the media types that should be spooled to disk.
+ *
+ * @return set of media types to spool
+ */
+ public Set<MediaType> getSpoolTypes() {
+ return spoolTypes;
+ }
+
+ /**
+ * Sets the media type registry used for checking type specializations.
+ *
+ * @param registry the media type registry
+ */
+ public void setMediaTypeRegistry(MediaTypeRegistry registry) {
+ this.mediaTypeRegistry = registry;
+ }
+
+ /**
+ * Returns the media type registry.
+ *
+ * @return the media type registry, or null if not set
+ */
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return mediaTypeRegistry;
+ }
+}
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index a2ac99c2bb..b89323fc11 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -229,6 +229,13 @@ public interface TikaCoreProperties {
*/
Property CONTENT_TYPE_PARSER_OVERRIDE =
Property.internalText(HttpHeaders.CONTENT_TYPE +
"-Parser-Override");
+ /**
+ * This is set by DefaultDetector to store the result of MimeTypes (magic
byte)
+ * detection. This allows downstream detectors to use it as a hint without
+ * re-running magic detection.
+ */
+ Property CONTENT_TYPE_MAGIC_DETECTED =
+ Property.internalText(HttpHeaders.CONTENT_TYPE +
"-Magic-Detected");
/**
* @see DublinCore#FORMAT
*/
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index da423e8e13..d03eb89961 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -30,7 +30,6 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -151,8 +150,6 @@ public class AutoDetectParser extends CompositeParser {
metadata.setMetadataWriteFilter(
autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
}
- //figure out if we should spool to disk
- maybeSpool(tis, autoDetectParserConfig, metadata);
// Compute digests before type detection if configured
DigestHelper.maybeDigest(tis,
@@ -211,35 +208,6 @@ public class AutoDetectParser extends CompositeParser {
return handler;
}
- private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig
autoDetectParserConfig,
- Metadata metadata) throws IOException {
- if (tis.hasFile()) {
- return;
- }
- if (autoDetectParserConfig.getSpoolToDisk() == null) {
- return;
- }
- //whether or not a content-length has been sent in,
- //if spoolToDisk == 0, spool it
- if (autoDetectParserConfig.getSpoolToDisk() == 0) {
- tis.getPath();
- metadata.set(HttpHeaders.CONTENT_LENGTH,
Long.toString(tis.getLength()));
- return;
- }
- if (metadata.get(Metadata.CONTENT_LENGTH) != null) {
- long len = -1l;
- try {
- len = Long.parseLong(metadata.get(Metadata.CONTENT_LENGTH));
- if (len > autoDetectParserConfig.getSpoolToDisk()) {
- tis.getPath();
- metadata.set(HttpHeaders.CONTENT_LENGTH,
Long.toString(tis.getLength()));
- }
- } catch (NumberFormatException e) {
- //swallow...maybe log?
- }
- }
- }
-
private void initializeEmbeddedDocumentExtractor(Metadata metadata,
ParseContext context) {
if (context.get(EmbeddedDocumentExtractor.class) != null) {
return;
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 75ba7f6f79..0aba04ad61 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -52,13 +52,6 @@ public class AutoDetectParserConfig implements Serializable {
public static AutoDetectParserConfig DEFAULT = new
AutoDetectParserConfig();
- /**
- * If this is not null and greater than -1, the AutoDetectParser
- * will spool the stream to disk if the length of the stream is known
- * ahead of time.
- */
- private Long spoolToDisk = null;
-
/**
* SecureContentHandler -- Desired output threshold in characters.
*/
@@ -102,16 +95,14 @@ public class AutoDetectParserConfig implements
Serializable {
/**
* Creates a SecureContentHandlerConfig using the passed in parameters.
*
- * @param spoolToDisk
* @param outputThreshold SecureContentHandler - character output
threshold.
* @param maximumCompressionRatio SecureContentHandler - max compression
ratio allowed.
* @param maximumDepth SecureContentHandler - maximum XML
element nesting level.
* @param maximumPackageEntryDepth SecureContentHandler - maximum package
entry nesting level.
*/
- public AutoDetectParserConfig(Long spoolToDisk, Long outputThreshold,
+ public AutoDetectParserConfig(Long outputThreshold,
Long maximumCompressionRatio, Integer
maximumDepth,
Integer maximumPackageEntryDepth) {
- this.spoolToDisk = spoolToDisk;
this.outputThreshold = outputThreshold;
this.maximumCompressionRatio = maximumCompressionRatio;
this.maximumDepth = maximumDepth;
@@ -122,14 +113,6 @@ public class AutoDetectParserConfig implements
Serializable {
}
- public Long getSpoolToDisk() {
- return spoolToDisk;
- }
-
- public void setSpoolToDisk(Long spoolToDisk) {
- this.spoolToDisk = spoolToDisk;
- }
-
public Long getOutputThreshold() {
return outputThreshold;
}
@@ -264,7 +247,7 @@ public class AutoDetectParserConfig implements Serializable
{
@Override
public String toString() {
- return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ",
outputThreshold=" +
+ return "AutoDetectParserConfig{" + "outputThreshold=" +
outputThreshold + ", maximumCompressionRatio=" +
maximumCompressionRatio +
", maximumDepth=" + maximumDepth + ",
maximumPackageEntryDepth=" +
maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
diff --git
a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
index 80b6315e93..ac5b0b077f 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
@@ -62,8 +62,9 @@ public class BodyContentHandlerTest extends TikaTest {
@Test
public void testLimit() throws Exception {
//TIKA-2668 - java 11-ea
+ // Note: limit is 16 to account for metadata overhead (each metadata
field adds a newline)
Parser p = new MockParser();
- WriteOutContentHandler handler = new WriteOutContentHandler(15);
+ WriteOutContentHandler handler = new WriteOutContentHandler(16);
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
Parser[] parsers = new Parser[1];
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index b7e6752511..5ff9e0cf15 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -26,6 +26,7 @@ import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.BoundedInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -98,10 +99,17 @@ public class AppleSingleFileParser implements Parser {
long diff = contentFieldInfo.offset - bytesRead;
IOUtils.skipFully(tis, diff);
if (ex.shouldParseEmbedded(embeddedMetadata)) {
- // TODO: we should probably add a readlimiting wrapper around
this
- // stream to ensure that not more than contentFieldInfo.length
bytes
- // are read
- ex.parseEmbedded(tis, xhtml, embeddedMetadata, context, true);
+ // Use BoundedInputStream to limit bytes read, then spool to
temp file
+ // for complete isolation from parent stream (reset() goes to
embedded start)
+ BoundedInputStream bounded =
+ BoundedInputStream.builder()
+ .setInputStream(tis)
+ .setMaxCount(contentFieldInfo.length)
+ .get();
+ try (TikaInputStream inner = TikaInputStream.get(bounded)) {
+ inner.getPath();
+ ex.parseEmbedded(inner, xhtml, embeddedMetadata, context,
true);
+ }
}
}
xhtml.endDocument();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
index 0edcb0a6b3..0337729955 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
@@ -18,7 +18,10 @@ package org.apache.tika.parser.crypto;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
import java.math.BigInteger;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.security.NoSuchProviderException;
@@ -55,8 +58,10 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -167,10 +172,27 @@ public class TSDParser implements Parser {
EmbeddedDocumentExtractor edx =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (edx.shouldParseEmbedded(metadata)) {
- try {
+ try (TemporaryResources tmp = new TemporaryResources()) {
cmsTimeStampedDataParser = new CMSTimeStampedDataParser(stream);
- try (TikaInputStream inner = TikaInputStream.get(cmsTimeStampedDataParser.getContent())) {
+ // Spool content to temp file, catching any EOF from truncated files
+ Path tempFile = tmp.createTempFile();
+ try (InputStream content = cmsTimeStampedDataParser.getContent();
+ OutputStream out = Files.newOutputStream(tempFile)) {
+ byte[] buffer = new byte[8192];
+ int n;
+ while ((n = content.read(buffer)) != -1) {
+ out.write(buffer, 0, n);
+ }
+ } catch (IOException e) {
+ // Truncated file - record exception and work with what we got
+ metadata.set(TikaCoreProperties.EMBEDDED_EXCEPTION,
+ e.getClass().getName() + ": " + e.getMessage());
+ LOG.debug("Error reading TSD content (possibly truncated)", e);
+ }
+
+ // Parse whatever we managed to extract
+ try (TikaInputStream inner = TikaInputStream.get(tempFile)) {
edx.parseEmbedded(inner, handler, metadata, context, true);
}
@@ -180,17 +202,13 @@ public class TSDParser implements Parser {
WriteLimitReachedException.throwIfWriteLimitReached(ex);
LOG.error("Error in TSDParser.parseTSDContent {}", ex.getMessage());
} finally {
- this.closeCMSParser(cmsTimeStampedDataParser);
- }
- }
- }
-
- private void closeCMSParser(CMSTimeStampedDataParser
cmsTimeStampedDataParser) {
- if (cmsTimeStampedDataParser != null) {
- try {
- cmsTimeStampedDataParser.close();
- } catch (IOException ex) {
- LOG.error("Error in TSDParser.closeCMSParser {}",
ex.getMessage());
+ if (cmsTimeStampedDataParser != null) {
+ try {
+ cmsTimeStampedDataParser.close();
+ } catch (IOException e) {
+ LOG.debug("Error closing CMSTimeStampedDataParser", e);
+ }
+ }
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 72753ce022..a18c27c6c5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -237,7 +237,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("true", metadata.get("pdf:encrypted"));
//pdf:encrypted, X-Parsed-By and Content-Type
- assertEquals(8, metadata.names().length, "very little metadata should be parsed");
+ assertEquals(9, metadata.names().length, "very little metadata should be parsed");
assertEquals(0, handler.toString().length());
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
deleted file mode 100644
index 11b888c9f0..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
+++ /dev/null
@@ -1,31 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers/>
- <detectors>
- <detector class="org.apache.tika.detect.DefaultDetector">
- <detector-exclude class="ZipContainerDetector"/>
- </detector>
- <detector class="ZipContainerDetector">
- <params>
- <param name="markLimit" type="int">100000</param>
- </params>
- </detector>
- </detectors>
- <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index 2e70fe8315..d0718c9ac5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -59,6 +59,7 @@ public class WARCParserTest extends TikaTest {
Set<String> fieldsToIgnore = new HashSet<>();
fieldsToIgnore.add("X-TIKA:parse_time_millis");
fieldsToIgnore.add("Content-Type");
+ fieldsToIgnore.add("Content-Type-Magic-Detected");
assertMetadataListEquals(metadataList, gzMetadataList, fieldsToIgnore);
assertEquals("application/warc",
metadataList.get(0).get(Metadata.CONTENT_TYPE));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
index e419b18e59..cec44d39a9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
@@ -32,7 +32,7 @@ public class TestDetectorLoading {
//integration test - detectors should be sorted alphabetically by
class name
Detector detector = TikaLoader.loadDefault().loadDetectors();
List<Detector> detectors = ((CompositeDetector)
detector).getDetectors();
- assertEquals(8, detectors.size());
+ assertEquals(7, detectors.size());
// Sorted alphabetically by full class name (all are org.apache.tika.*)
assertEquals("org.apache.tika.detect.MatroskaDetector",
detectors.get(0).getClass().getName());
assertEquals("org.apache.tika.detect.apple.BPListDetector",
detectors.get(1).getClass().getName());
@@ -44,6 +44,5 @@ public class TestDetectorLoading {
assertEquals("org.apache.tika.detect.ole.MiscOLEDetector",
detectors.get(5).getClass().getName());
assertEquals("org.apache.tika.detect.zip.DefaultZipContainerDetector",
detectors.get(6).getClass().getName());
- assertEquals("org.apache.tika.mime.MimeTypes",
detectors.get(7).getClass().getName());
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 140a82d5f0..56c9f29572 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -44,7 +44,8 @@ public class TSDParserTest extends TikaTest {
assertEquals(2, list.size());
assertEquals("application/pdf",
list.get(1).get(Metadata.CONTENT_TYPE));
assertNotNull(list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
- assertContains("org.apache.pdfbox.io.RandomAccessReadBuffer.<init>",
+ // Exception occurs during TSD content extraction (truncated file)
+ assertContains("EOFException",
list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
index 3abbeaef18..fed21bc5af 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"bouncy-castle-digester-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
index 98714ec028..770fba7ffe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"bouncy-castle-digester-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
index 38b2a17bed..830d8c0809 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"bouncy-castle-digester-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
index 039384ea5e..2a2634a88e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
index 124b07adca..cf7c3874a0 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json
@@ -9,7 +9,6 @@
}
],
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
index d4f565519b..ed2145a404 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"skipContainerDocumentDigest": true,
"digesterFactory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
index d2e238f0f4..004e6ea753 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json
index 012142231b..c721b2df1a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000,
"outputThreshold": 1000,
"contentHandlerDecoratorFactory":
"doubling-content-handler-decorator-factory"
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
index 2f0ac2a2fe..b56a7d5d2d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 123450,
"outputThreshold": 678900,
"embeddedDocumentExtractorFactory": {
"runpack-extractor-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
index a58fa91fc3..6a466c1385 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000,
"outputThreshold": 1000,
"maximumCompressionRatio": 0.8,
"maximumDepth": 1000,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
index 0659adb852..17811c8dec 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 123450,
"outputThreshold": 678900,
"embeddedDocumentExtractorFactory": {
"runpack-extractor-factory": {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
index 1b6f13c1cb..1872313a9c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json
@@ -1,6 +1,5 @@
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"skipContainerDocumentDigest": true,
"digesterFactory": {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
index 5873c39a87..755c345dfa 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json
@@ -45,7 +45,6 @@
}
},
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"skipContainerDocumentDigest": false,
"digesterFactory": {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
index 529e878cb6..2e0748f854 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json
@@ -44,7 +44,6 @@
}
},
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"skipContainerDocumentDigest": false,
"digesterFactory": {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
index b58bfe269c..07a78edf3c 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
@@ -45,7 +45,6 @@
}
},
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"skipContainerDocumentDigest": false,
"digesterFactory": {
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
index e7d8a21c02..6498c15a7a 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json
@@ -40,7 +40,6 @@
}
},
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"skipContainerDocumentDigest": false,
"digesterFactory": {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 3d6a1ba473..a8160b1548 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -78,7 +78,7 @@ import org.apache.tika.exception.TikaConfigException;
* ],
* "detectors": [
* "poifs-container-detector", // String shorthand
- * { "mime-types": { "markLimit": 10000 } }
+ * { "default-detector": { "spoolTypes": ["application/zip", "application/pdf"] } }
* ],
*
* // Pipes components (validated by validateKeys())
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
index 181e6b90e1..8ad0d588ff 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java
@@ -129,6 +129,22 @@ public class TikaModule extends SimpleModule {
public TikaModule() {
super("TikaModule");
+ // Register MediaType serializers (string-based)
+ addSerializer(MediaType.class, new JsonSerializer<MediaType>() {
+ @Override
+ public void serialize(MediaType value, JsonGenerator gen,
SerializerProvider serializers)
+ throws IOException {
+ gen.writeString(value.toString());
+ }
+ });
+ addDeserializer(MediaType.class, new JsonDeserializer<MediaType>() {
+ @Override
+ public MediaType deserialize(JsonParser p, DeserializationContext
ctxt)
+ throws IOException {
+ return MediaType.parse(p.getValueAsString());
+ }
+ });
+
// Register Metadata serializers
addSerializer(Metadata.class, new MetadataSerializer());
addDeserializer(Metadata.class, new MetadataDeserializer());
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
index 1db87866e7..12695472c8 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java
@@ -31,6 +31,8 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.io.SpoolingStrategy;
+import org.apache.tika.mime.MediaType;
/**
* Unit tests for {@link ConfigLoader}.
@@ -211,6 +213,20 @@ public class ConfigLoaderTest {
assertEquals(30000, timeout.getMillis());
}
+ @Test
+ public void testLoadSpoolingStrategy() throws Exception {
+ // SpoolingStrategy -> "spooling-strategy"
+ // JSON has "spooling-strategy" with spoolTypes: ["application/zip",
"application/pdf"]
+ SpoolingStrategy strategy = configLoader.load(SpoolingStrategy.class);
+
+ assertNotNull(strategy);
+ assertEquals(2, strategy.getSpoolTypes().size());
+ assertTrue(strategy.getSpoolTypes().contains(MediaType.application("zip")));
+ assertTrue(strategy.getSpoolTypes().contains(MediaType.application("pdf")));
+ // Verify default types are NOT present (we replaced the set)
+ assertFalse(strategy.getSpoolTypes().contains(MediaType.application("x-tika-msoffice")));
+ }
+
@Test
public void testLoadByClassNameMyFeatureSettings() throws Exception {
// MyFeatureSettings -> "my-feature-settings" (full name, no suffix
stripping)
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
index edd8e55634..10578c1f0c 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
@@ -70,7 +70,7 @@ public class TikaJsonConfigTest {
{
"detectors": [
"poifs-container-detector",
- { "mime-types": { "markLimit": 10000 } },
+ { "default-detector": { "spoolTypes": ["application/zip", "application/pdf"] } },
"zip-container-detector"
]
}
@@ -85,8 +85,8 @@ public class TikaJsonConfigTest {
assertEquals("poifs-container-detector", detectors.get(0).getKey());
assertTrue(detectors.get(0).getValue().isEmpty());
- assertEquals("mime-types", detectors.get(1).getKey());
- assertEquals(10000,
detectors.get(1).getValue().get("markLimit").asInt());
+ assertEquals("default-detector", detectors.get(1).getKey());
+ assertTrue(detectors.get(1).getValue().get("spoolTypes").isArray());
assertEquals("zip-container-detector", detectors.get(2).getKey());
assertTrue(detectors.get(2).getValue().isEmpty());
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
index 33e493d509..5b8e44c788 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
@@ -322,42 +322,40 @@ public class TikaLoaderTest {
"Should NOT support application/test+optin (opt-in only, not
in SPI)");
}
- // TODO: TIKA-SERIALIZATION-FOLLOWUP - Implement validation for common
typos
- @Disabled("TIKA-SERIALIZATION-FOLLOWUP: Validation for excludes typo not
yet implemented")
+ // TODO: TIKA-SERIALIZATION-FOLLOWUP - Jackson may need configuration to
fail on unknown properties
+ @Disabled("TIKA-SERIALIZATION-FOLLOWUP")
@Test
- public void testExcludesInsteadOfExcludeThrowsException() throws Exception
{
- // Create a config with the common mistake: "excludes" instead of
"exclude"
- String invalidConfig = "{\n" +
- " \"parsers\": [\n" +
- " {\n" +
- " \"default-parser\": {\n" +
- " \"excludes\": [\"pdf-parser\"]\n" +
- " }\n" +
- " }\n" +
- " ]\n" +
- "}";
-
- // Write to a temp file
- Path tempFile = Files.createTempFile("test-invalid-excludes", ".json");
+ public void testInvalidBeanPropertyThrowsException() throws Exception {
+ // Config with a property that doesn't exist on DefaultDetector
+ String invalidConfig = """
+ {
+ "detectors": [
+ {
+ "default-detector": {
+ "nonExistentProperty": 12345
+ }
+ }
+ ]
+ }
+ """;
+
+ Path tempFile = Files.createTempFile("test-invalid-property", ".json");
try {
Files.write(tempFile,
invalidConfig.getBytes(StandardCharsets.UTF_8));
- // Attempt to load should throw TikaConfigException
+ TikaLoader loader = TikaLoader.load(tempFile);
try {
- TikaLoader loader = TikaLoader.load(tempFile);
- loader.get(Parser.class);
- throw new AssertionError("Expected TikaConfigException to be
thrown");
+ loader.loadDetectors();
+ throw new AssertionError("Expected TikaConfigException for
invalid property");
} catch (org.apache.tika.exception.TikaConfigException e) {
- // Expected - verify the error message is helpful
- assertTrue(e.getMessage().contains("excludes"),
- "Error message should mention 'excludes'");
- assertTrue(e.getMessage().contains("exclude"),
- "Error message should mention the correct field
'exclude'");
- assertTrue(e.getMessage().contains("singular"),
- "Error message should explain it should be singular");
+ // Expected - Jackson should fail on unknown property
+ assertTrue(e.getMessage().contains("nonExistentProperty") ||
+
e.getCause().getMessage().contains("nonExistentProperty"),
+ "Error should mention the invalid property name");
}
} finally {
Files.deleteIfExists(tempFile);
}
}
+
}
diff --git
a/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json
b/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json
index e40af02044..6675ef1d3c 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json
@@ -3,7 +3,6 @@
"default-parser"
],
"auto-detect-parser": {
- "spoolToDisk": 12345,
"outputThreshold": 6789,
"metadataWriteFilterFactory": {
"standard-write-filter-factory": {
diff --git
a/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json
b/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json
index 9d38adbdd2..27b73f7e9a 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json
@@ -3,7 +3,6 @@
"default-parser"
],
"auto-detect-parser": {
- "spoolToDisk": 12345,
"outputThreshold": 6789,
"metadataWriteFilterFactory": {
"standard-write-filter-factory": {
diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695.json
b/tika-serialization/src/test/resources/configs/TIKA-3695.json
index f7c05313ba..f24ce43246 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3695.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3695.json
@@ -3,7 +3,6 @@
"default-parser"
],
"auto-detect-parser": {
- "spoolToDisk": 12345,
"outputThreshold": 6789,
"metadataWriteFilterFactory": {
"standard-write-filter-factory": {
diff --git
a/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
b/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
index d32516e877..b014152172 100644
---
a/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
+++
b/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
@@ -3,7 +3,6 @@
"default-parser"
],
"auto-detect-parser": {
- "spoolToDisk": 123450,
"outputThreshold": 678900,
"embeddedDocumentExtractorFactory": {
"runpack-extractor-factory": {
diff --git
a/tika-serialization/src/test/resources/configs/test-config-loader.json
b/tika-serialization/src/test/resources/configs/test-config-loader.json
index dd657c81e0..c5c24254eb 100644
--- a/tika-serialization/src/test/resources/configs/test-config-loader.json
+++ b/tika-serialization/src/test/resources/configs/test-config-loader.json
@@ -21,6 +21,10 @@
"millis": 30000
},
+ "spooling-strategy": {
+ "spoolTypes": ["application/zip", "application/pdf"]
+ },
+
"my-feature-settings": {
"featureName": "test-feature",
"priority": 10
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 8004fb79ae..3a1389b140 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -71,7 +71,6 @@ public abstract class CXFTestBase {
public final static String BASIC_CONFIG = """
{
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
index 355e34ecdd..f8284e5e4d 100644
---
a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
+++
b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json
@@ -47,7 +47,6 @@
}
},
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaDetectorsTest.java
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaDetectorsTest.java
index 8f50d35901..ae76831340 100644
---
a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaDetectorsTest.java
+++
b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaDetectorsTest.java
@@ -33,7 +33,6 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.detect.microsoft.POIFSContainerDetector;
import org.apache.tika.detect.ogg.OggDetector;
import org.apache.tika.detect.zip.DefaultZipContainerDetector;
-import org.apache.tika.mime.MimeTypes;
import org.apache.tika.server.core.CXFTestBase;
import org.apache.tika.server.core.resource.TikaDetectors;
@@ -64,7 +63,7 @@ public class TikaDetectorsTest extends CXFTestBase {
assertContains(OggDetector.class.getName(), text);
assertContains(POIFSContainerDetector.class.getName(), text);
assertContains(DefaultZipContainerDetector.class.getName(), text);
- assertContains(MimeTypes.class.getName(), text);
+ // Note: MimeTypes is now handled internally by DefaultDetector, not
as a child detector
}
@Test
@@ -81,12 +80,11 @@ public class TikaDetectorsTest extends CXFTestBase {
assertContains("<h3>OggDetector", text);
assertContains("<h3>POIFSContainerDetector", text);
- assertContains("<h3>MimeTypes", text);
+ // Note: MimeTypes is now handled internally by DefaultDetector, not
as a child detector
assertContains(OggDetector.class.getName(), text);
assertContains(POIFSContainerDetector.class.getName(), text);
assertContains(DefaultZipContainerDetector.class.getName(), text);
- assertContains(MimeTypes.class.getName(), text);
}
@Test
@@ -110,10 +108,11 @@ public class TikaDetectorsTest extends CXFTestBase {
assertEquals("org.apache.tika.detect.DefaultDetector",
json.get("name"));
assertEquals(Boolean.TRUE, json.get("composite"));
- // At least 4 child detectors, none of them composite
+ // At least 3 child detectors, none of them composite
+ // Note: MimeTypes is now handled internally by DefaultDetector, not
as a child detector
List<Object> children = (List) json.get("children");
- assertTrue(children.size() >= 4);
- boolean hasOgg = false, hasPOIFS = false, hasZIP = false, hasMime =
false;
+ assertTrue(children.size() >= 3);
+ boolean hasOgg = false, hasPOIFS = false, hasZIP = false;
for (Object o : children) {
Map<String, Object> d = (Map<String, Object>) o;
assertTrue(d.containsKey("name"));
@@ -137,16 +136,10 @@ public class TikaDetectorsTest extends CXFTestBase {
.equals(name)) {
hasZIP = true;
}
- if (MimeTypes.class
- .getName()
- .equals(name)) {
- hasMime = true;
- }
}
assertTrue(hasOgg);
assertTrue(hasPOIFS);
assertTrue(hasZIP);
- assertTrue(hasMime);
}
}
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
index 355e34ecdd..f8284e5e4d 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json
@@ -47,7 +47,6 @@
}
},
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
index e96b3b7f71..fdf80cb998 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json
@@ -10,7 +10,6 @@
}
],
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
index e2b779035a..97646bc879 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json
@@ -15,7 +15,6 @@
}
],
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {
diff --git
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
index 52a9a4a871..8d3f74ed3c 100644
---
a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
+++
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json
@@ -15,7 +15,6 @@
}
],
"auto-detect-parser": {
- "spoolToDisk": 1000000,
"outputThreshold": 1000000,
"digesterFactory": {
"commons-digester-factory": {