This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4618 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 76f19d5b9695cf8726f1540e1c10f1e33f6d3b66 Author: tallison <[email protected]> AuthorDate: Wed Jan 14 19:30:21 2026 -0500 TIKA-4618 -- improve spooling strategies --- .../org/apache/tika/detect/DefaultDetector.java | 164 +++++++++++++++++++-- .../apache/tika/metadata/TikaCoreProperties.java | 7 + .../apache/tika/sax/BodyContentHandlerTest.java | 3 +- .../tika/parser/apple/AppleSingleFileParser.java | 16 +- .../org/apache/tika/parser/crypto/TSDParser.java | 44 ++++-- .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../apache/tika/parser/warc/WARCParserTest.java | 1 + .../apache/tika/detect/TestDetectorLoading.java | 3 +- .../apache/tika/parser/crypto/TSDParserTest.java | 4 +- 9 files changed, 209 insertions(+), 35 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java index 2d71c5b180..46c0656e6e 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java @@ -16,23 +16,37 @@ */ package org.apache.tika.detect; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; -import javax.imageio.spi.ServiceRegistry; +import java.util.Set; import org.apache.tika.config.ServiceLoader; import org.apache.tika.config.TikaComponent; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ParseContext; import org.apache.tika.utils.ServiceLoaderUtils; /** - * A composite detector based on all the {@link Detector} implementations - * available through the {@link ServiceRegistry service provider mechanism}. + * A composite detector that orchestrates the detection pipeline: + * <ol> + * <li>MimeTypes (magic byte) detection</li> + * <li>Spooling to temp file if needed for random access formats</li> + * <li>Container and other detectors loaded via SPI</li> + * <li>TextDetector as fallback for unknown types</li> + * <li>Returns the most specific type detected</li> + * </ol> * <p> * Detectors are loaded and returned in a specified order, of user supplied - * followed by non-MimeType Tika, followed by the Tika MimeType class. + * followed by non-MimeType Tika detectors. * If you need to control the order of the Detectors, you should instead * construct your own {@link CompositeDetector} and pass in the list * of Detectors in the required order. @@ -42,24 +56,28 @@ import org.apache.tika.utils.ServiceLoaderUtils; @TikaComponent(spi = false) public class DefaultDetector extends CompositeDetector { - /** - * Serial version UID - */ private static final long serialVersionUID = -8170114575326908027L; + private transient final ServiceLoader loader; private final Collection<Class<? extends Detector>> excludedClasses; + private final MimeTypes mimeTypes; + private final TextDetector textDetector; + private Set<MediaType> spoolTypes; public DefaultDetector(MimeTypes types, ServiceLoader loader, Collection<Class<? extends Detector>> excludeDetectors) { - super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader, excludeDetectors)); + super(types.getMediaTypeRegistry(), getDefaultDetectors(loader, excludeDetectors)); this.loader = loader; + this.mimeTypes = types; + this.textDetector = new TextDetector(); this.excludedClasses = excludeDetectors != null ? Collections.unmodifiableCollection(new ArrayList<>(excludeDetectors)) : Collections.emptySet(); + this.spoolTypes = getDefaultSpoolTypes(); } public DefaultDetector(MimeTypes types, ServiceLoader loader) { - this(types, loader, Collections.EMPTY_SET); + this(types, loader, Collections.emptySet()); } public DefaultDetector(MimeTypes types, ClassLoader loader) { @@ -78,6 +96,16 @@ public class DefaultDetector extends CompositeDetector { this(MimeTypes.getDefaultMimeTypes()); } + private static Set<MediaType> getDefaultSpoolTypes() { + Set<MediaType> types = new HashSet<>(); + types.add(MediaType.application("zip")); + types.add(MediaType.application("x-tika-msoffice")); + types.add(MediaType.application("x-tika-ooxml")); + types.add(MediaType.application("pdf")); + types.add(MediaType.application("x-bplist")); + return types; + } + /** * Finds all statically loadable detectors and sort the list by name, * rather than discovery order. Detectors are used in the given order, @@ -86,11 +114,13 @@ public class DefaultDetector extends CompositeDetector { * <p> * If an {@link OverrideDetector} is loaded, it takes precedence over * all other detectors. + * <p> + * Note: MimeTypes is handled separately in the detect() method, not included here. * * @param loader service loader * @return ordered list of statically loadable detectors */ - private static List<Detector> getDefaultDetectors(MimeTypes types, ServiceLoader loader, + private static List<Detector> getDefaultDetectors(ServiceLoader loader, Collection<Class<? extends Detector>> excludeDetectors) { List<Detector> detectors = @@ -111,16 +141,104 @@ public class DefaultDetector extends CompositeDetector { Detector detector = detectors.remove(overrideIndex); detectors.add(0, detector); } - // Finally the Tika MimeTypes as a fallback - detectors.add(types); return detectors; } + @Override + public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) + throws IOException { + // 1. Magic detection via MimeTypes + MediaType magicType = mimeTypes.detect(tis, metadata, parseContext); + metadata.set(TikaCoreProperties.CONTENT_TYPE_MAGIC_DETECTED, magicType.toString()); + + // 2. Spool if needed for random access formats + if (tis != null && shouldSpool(magicType)) { + try { + tis.getFile(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + // 3. Run other detectors (container detectors, etc.) + MediaType detectedType = super.detect(tis, metadata, parseContext); + + // 4. Text detection - only if still unknown + MediaType textType = null; + if (MediaType.OCTET_STREAM.equals(detectedType) && + MediaType.OCTET_STREAM.equals(magicType)) { + textType = textDetector.detect(tis, metadata, parseContext); + } + + // 5. Return most specific + return mostSpecific(magicType, detectedType, textType); + } + + private boolean shouldSpool(MediaType type) { + if (spoolTypes == null || spoolTypes.isEmpty()) { + return false; + } + // Check exact match + if (spoolTypes.contains(type)) { + return true; + } + // Check base type (without parameters) + MediaType baseType = type.getBaseType(); + if (spoolTypes.contains(baseType)) { + return true; + } + // Check if type is a specialization of any spool type + MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); + for (MediaType spoolType : spoolTypes) { + if (registry.isSpecializationOf(type, spoolType)) { + return true; + } + } + return false; + } + + private MediaType mostSpecific(MediaType magicType, MediaType detectedType, MediaType textType) { + MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); + + // Collect non-null, non-octet-stream candidates + MediaType best = MediaType.OCTET_STREAM; + + // Start with magic type as baseline if valid + if (magicType != null && !MediaType.OCTET_STREAM.equals(magicType)) { + best = magicType; + } + + // Container detectors may find more specific types (e.g., OLE -> msword) + // or less specific (e.g., commons-compress tar vs magic gtar) + // Use the registry to determine which is more specific + if (detectedType != null && !MediaType.OCTET_STREAM.equals(detectedType)) { + if (MediaType.OCTET_STREAM.equals(best)) { + best = detectedType; + } else if (registry.isSpecializationOf(detectedType, best)) { + // detectedType is more specific than best + best = detectedType; + } else if (!registry.isSpecializationOf(best, detectedType)) { + // Neither is a specialization of the other - prefer container detection + // for unrelated types (e.g., different format families) + best = detectedType; + } + // else: best is already more specific than detectedType, keep best + } + + // Text detection as fallback only if still unknown + if (MediaType.OCTET_STREAM.equals(best) && textType != null && + !MediaType.OCTET_STREAM.equals(textType)) { + best = textType; + } + + return best; + } + @Override public List<Detector> getDetectors() { if (loader != null && loader.isDynamic()) { List<Detector> detectors = loader.loadDynamicServiceProviders(Detector.class); - if (detectors.size() > 0) { + if (!detectors.isEmpty()) { detectors.addAll(super.getDetectors()); return detectors; } else { @@ -140,4 +258,24 @@ public class DefaultDetector extends CompositeDetector { public Collection<Class<? extends Detector>> getExcludedClasses() { return excludedClasses; } + + /** + * Sets the media types that should be spooled to a temp file before + * container detection. This enables random access for formats like + * ZIP, OLE, and PDF. + * + * @param spoolTypes set of media types to spool + */ + public void setSpoolTypes(Set<MediaType> spoolTypes) { + this.spoolTypes = spoolTypes; + } + + /** + * Returns the media types that are spooled to temp files. + * + * @return set of media types to spool + */ + public Set<MediaType> getSpoolTypes() { + return spoolTypes; + } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index a2ac99c2bb..b89323fc11 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -229,6 +229,13 @@ public interface TikaCoreProperties { */ Property CONTENT_TYPE_PARSER_OVERRIDE = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Parser-Override"); + /** + * This is set by DefaultDetector to store the result of MimeTypes (magic byte) + * detection. This allows downstream detectors to use it as a hint without + * re-running magic detection. + */ + Property CONTENT_TYPE_MAGIC_DETECTED = + Property.internalText(HttpHeaders.CONTENT_TYPE + "-Magic-Detected"); /** * @see DublinCore#FORMAT */ diff --git a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java index 80b6315e93..ac5b0b077f 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java @@ -62,8 +62,9 @@ public class BodyContentHandlerTest extends TikaTest { @Test public void testLimit() throws Exception { //TIKA-2668 - java 11-ea + // Note: limit is 16 to account for metadata overhead (each metadata field adds a newline) Parser p = new MockParser(); - WriteOutContentHandler handler = new WriteOutContentHandler(15); + WriteOutContentHandler handler = new WriteOutContentHandler(16); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); Parser[] parsers = new Parser[1]; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java index b7e6752511..5ff9e0cf15 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java @@ -26,6 +26,7 @@ import java.util.List; import java.util.Set; import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.BoundedInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -98,10 +99,17 @@ public class AppleSingleFileParser implements Parser { long diff = contentFieldInfo.offset - bytesRead; IOUtils.skipFully(tis, diff); if (ex.shouldParseEmbedded(embeddedMetadata)) { - // TODO: we should probably add a readlimiting wrapper around this - // stream to ensure that not more than contentFieldInfo.length bytes - // are read - ex.parseEmbedded(tis, xhtml, embeddedMetadata, context, true); + // Use BoundedInputStream to limit bytes read, then spool to temp file + // for complete isolation from parent stream (reset() goes to embedded start) + BoundedInputStream bounded = + BoundedInputStream.builder() + .setInputStream(tis) + .setMaxCount(contentFieldInfo.length) + .get(); + try (TikaInputStream inner = TikaInputStream.get(bounded)) { + inner.getPath(); + ex.parseEmbedded(inner, xhtml, embeddedMetadata, context, true); + } } } xhtml.endDocument(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java index 0edcb0a6b3..0337729955 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java @@ -18,7 +18,10 @@ package org.apache.tika.parser.crypto; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.math.BigInteger; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.security.NoSuchProviderException; @@ -55,8 +58,10 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -167,10 +172,27 @@ public class TSDParser implements Parser { EmbeddedDocumentExtractor edx = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (edx.shouldParseEmbedded(metadata)) { - try { + try (TemporaryResources tmp = new TemporaryResources()) { cmsTimeStampedDataParser = new CMSTimeStampedDataParser(stream); - try (TikaInputStream inner = TikaInputStream.get(cmsTimeStampedDataParser.getContent())) { + // Spool content to temp file, catching any EOF from truncated files + Path tempFile = tmp.createTempFile(); + try (InputStream content = cmsTimeStampedDataParser.getContent(); + OutputStream out = Files.newOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int n; + while ((n = content.read(buffer)) != -1) { + out.write(buffer, 0, n); + } + } catch (IOException e) { + // Truncated file - record exception and work with what we got + metadata.set(TikaCoreProperties.EMBEDDED_EXCEPTION, + e.getClass().getName() + ": " + e.getMessage()); + LOG.debug("Error reading TSD content (possibly truncated)", e); + } + + // Parse whatever we managed to extract + try (TikaInputStream inner = TikaInputStream.get(tempFile)) { edx.parseEmbedded(inner, handler, metadata, context, true); } @@ -180,17 +202,13 @@ public class TSDParser implements Parser { WriteLimitReachedException.throwIfWriteLimitReached(ex); LOG.error("Error in TSDParser.parseTSDContent {}", ex.getMessage()); } finally { - this.closeCMSParser(cmsTimeStampedDataParser); - } - } - } - - private void closeCMSParser(CMSTimeStampedDataParser cmsTimeStampedDataParser) { - if (cmsTimeStampedDataParser != null) { - try { - cmsTimeStampedDataParser.close(); - } catch (IOException ex) { - LOG.error("Error in TSDParser.closeCMSParser {}", ex.getMessage()); + if (cmsTimeStampedDataParser != null) { + try { + cmsTimeStampedDataParser.close(); + } catch (IOException e) { + LOG.debug("Error closing CMSTimeStampedDataParser", e); + } + } } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 72753ce022..a18c27c6c5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -237,7 +237,7 @@ public class PDFParserTest extends TikaTest { assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get("pdf:encrypted")); //pdf:encrypted, X-Parsed-By and Content-Type - assertEquals(8, metadata.names().length, "very little metadata should be parsed"); + assertEquals(9, metadata.names().length, "very little metadata should be parsed"); assertEquals(0, handler.toString().length()); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java index 2e70fe8315..d0718c9ac5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java @@ -59,6 +59,7 @@ public class WARCParserTest extends TikaTest { Set<String> fieldsToIgnore = new HashSet<>(); fieldsToIgnore.add("X-TIKA:parse_time_millis"); fieldsToIgnore.add("Content-Type"); + fieldsToIgnore.add("Content-Type-Magic-Detected"); assertMetadataListEquals(metadataList, gzMetadataList, fieldsToIgnore); assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java index e419b18e59..cec44d39a9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java @@ -32,7 +32,7 @@ public class TestDetectorLoading { //integration test - detectors should be sorted alphabetically by class name Detector detector = TikaLoader.loadDefault().loadDetectors(); List<Detector> detectors = ((CompositeDetector) detector).getDetectors(); - assertEquals(8, detectors.size()); + assertEquals(7, detectors.size()); // Sorted alphabetically by full class name (all are org.apache.tika.*) assertEquals("org.apache.tika.detect.MatroskaDetector", detectors.get(0).getClass().getName()); assertEquals("org.apache.tika.detect.apple.BPListDetector", detectors.get(1).getClass().getName()); @@ -44,6 +44,5 @@ public class TestDetectorLoading { assertEquals("org.apache.tika.detect.ole.MiscOLEDetector", detectors.get(5).getClass().getName()); assertEquals("org.apache.tika.detect.zip.DefaultZipContainerDetector", detectors.get(6).getClass().getName()); - assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(7).getClass().getName()); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java index 140a82d5f0..47f8c65593 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java @@ -41,10 +41,12 @@ public class TSDParserTest extends TikaTest { //make sure that embedded file appears in list //and make sure embedded exception is recorded List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd", parseContext); + debug(list); assertEquals(2, list.size()); assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE)); assertNotNull(list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); - assertContains("org.apache.pdfbox.io.RandomAccessReadBuffer.<init>", + // Exception occurs during TSD content extraction (truncated file) + assertContains("EOFException", list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION)); }
