This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 363378f7fe TIKA-4749 - improve inline handling of metadata only (#2866)
363378f7fe is described below
commit 363378f7fe98de6521c3e1bb74baa622eb472579
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jun 3 16:40:17 2026 -0400
TIKA-4749 - improve inline handling of metadata only (#2866)
---
.../org/apache/tika/parser/AutoDetectParser.java | 4 +++
.../org/apache/tika/parser/MetadataOnlyParse.java | 35 +++++++++++++++++++++
.../tika/parser/pdf/image/ImageGraphicsEngine.java | 12 +++-----
.../org/apache/tika/parser/pdf/PDFParserTest.java | 36 ++++++++++++++++++++--
4 files changed, 78 insertions(+), 9 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 6867c622d2..6cff66c0b5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -160,6 +160,10 @@ public class AutoDetectParser extends CompositeParser {
// don't leak into CONTENT_TYPE
metadata.set(Metadata.CONTENT_TYPE,
EmbeddedDocumentUtil.normalizeMediaType(type.toString()));
+ // Metadata-only pseudo-parse: register the entry, skip the content parse.
+ if (context.get(MetadataOnlyParse.class) != null) {
+ return;
+ }
//check for zero-byte inputstream
if (tis.getOpenContainer() == null) {
if (autoDetectParserConfig.getThrowOnZeroBytes()) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/MetadataOnlyParse.java b/tika-core/src/main/java/org/apache/tika/parser/MetadataOnlyParse.java
new file mode 100644
index 0000000000..26840cf9bc
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/MetadataOnlyParse.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+/**
+ * ParseContext marker telling {@link AutoDetectParser} to register the embedded
+ * entry but skip the content parse. Set by metadata-only passes that pseudo-parse
+ * a placeholder stream only to register an entry. Independent of throwOnZeroBytes.
+ */
+public final class MetadataOnlyParse {
+
+ /**
+ * Singleton instance indicating the current parse should not dispatch to a
+ * content parser.
+ */
+ public static final MetadataOnlyParse INSTANCE = new MetadataOnlyParse();
+
+ private MetadataOnlyParse() {
+ // Private constructor for singleton
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index 72f95d8a1d..f77c317236 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -57,7 +57,6 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
-import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.BoundedInputStream;
@@ -65,6 +64,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
+import org.apache.tika.parser.MetadataOnlyParse;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pdf.PDMetadataExtractor;
@@ -448,16 +448,14 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth());
metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight());
//TODO: what else can we extract from the PDImage without rendering?
- ZeroByteFileException.IgnoreZeroByteFileException before =
- parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class);
+ //Register the image's metadata entry without decoding it (marker skips the parse).
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
- ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
+ parseContext.set(MetadataOnlyParse.class, MetadataOnlyParse.INSTANCE);
embeddedDocumentExtractor.parseEmbedded(tis,
new EmbeddedContentHandler(xhtml), metadata, parseContext, false);
} finally {
- //replace whatever was there before
- parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before);
+ //clear so it can't leak to the next image
+ parseContext.set(MetadataOnlyParse.class, null);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ccae004def..b8d3265b8f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -47,7 +47,6 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Font;
@@ -62,7 +61,9 @@ import org.apache.tika.metadata.XMPMM;
import org.apache.tika.metadata.XMPPDF;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.AutoDetectParserConfig;
import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.MetadataOnlyParse;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
@@ -1358,7 +1359,7 @@ public class PDFParserTest extends TikaTest {
config.setExtractInlineImageMetadataOnly(true);
context.set(PDFParserConfig.class, config);
List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf", context);
- assertNull(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class));
+ assertNull(context.get(MetadataOnlyParse.class));
assertEquals(2, metadataList.size());
assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE));
assertEquals("/image-0.png",
@@ -1368,6 +1369,37 @@ public class PDFParserTest extends TikaTest {
assertEquals("image-0.png",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
}
+ @Test
+ public void testExtractInlineImageMetadataThrowOnZeroBytesFalse() throws Exception {
+ //TIKA-4749: in metadata-only mode the inline image is registered via a
+ //placeholder pseudo-parse. With throwOnZeroBytes=false that placeholder used
+ //to be handed to a real parser (image/OCR), recording a spurious embedded
+ //exception. The MetadataOnlyParse marker must make it skip the parse instead.
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractInlineImageMetadataOnly(true);
+ context.set(PDFParserConfig.class, config);
+
+ AutoDetectParser p = new AutoDetectParser();
+ AutoDetectParserConfig adpc = new AutoDetectParserConfig();
+ adpc.setThrowOnZeroBytes(false);
+ p.setAutoDetectParserConfig(adpc);
+
+ List<Metadata> metadataList =
+ getRecursiveMetadata("testOCR.pdf", p, new Metadata(), context, false);
+ assertNull(context.get(MetadataOnlyParse.class));
+ assertEquals(2, metadataList.size());
+ Metadata image = metadataList.get(1);
+ assertEquals("image/png", image.get(Metadata.CONTENT_TYPE));
+ assertEquals(261, (int) image.getInt(Metadata.IMAGE_LENGTH));
+ assertEquals(934, (int) image.getInt(Metadata.IMAGE_WIDTH));
+ //the placeholder must not be dispatched to any content parser. Without the
+ //fix it is (EmptyParser here; ImageParser+TesseractOCRParser when tesseract
+ //is installed, which is what records the spurious embedded exception).
+ assertEquals(0, image.getValues(TikaCoreProperties.TIKA_PARSED_BY).length);
+ assertNull(image.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+ }
+
/**
* Simple class to count end of document events. If functionality is useful,
* move to org.apache.tika in src/test